- Added `scripts/auto_categorize_skills.py` to analyze skill names and descriptions, auto-assigning categories based on keyword matching.
- Updated the category distribution to show counts and sort categories by skill count in the Home page dropdown.
- Created documentation in `docs/CATEGORIZATION_IMPLEMENTATION.md` and `docs/SMART_AUTO_CATEGORIZATION.md` detailing the new categorization process and usage.
- Introduced `scripts/fix_year_2025_to_2026.py` to update all skill dates from 2025 to 2026.
- Enhanced the user experience by moving "uncategorized" to the bottom of the category list and displaying skill counts in the dropdown.
131 lines
5.1 KiB
Python
131 lines
5.1 KiB
Python
import os
|
|
import json
|
|
import re
|
|
import sys
|
|
|
|
import yaml
|
|
|
|
# Ensure UTF-8 output for Windows compatibility: the legacy console code
# page (e.g. cp1252) cannot encode the emoji used in status messages below.
# reconfigure() (Python 3.7+) changes the encoding of the existing stream
# in place, preserving its line buffering and any references already taken
# to sys.stdout/sys.stderr — unlike wrapping .buffer in a new TextIOWrapper,
# which replaces the stream and defaults to block buffering.
if sys.platform == 'win32':
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')
def parse_frontmatter(content):
    """
    Parse the YAML frontmatter block at the start of *content*.

    Unquoted values containing '@' (package specs, e-mail addresses) are
    ambiguous or invalid YAML, so each matching "key: value" line is wrapped
    in double quotes before parsing. Single values and comma-separated lists
    are handled by quoting the entire remainder of the line.

    Returns:
        dict: the frontmatter mapping. Returns {} when there is no
        frontmatter, when parsing fails, or when the YAML parses to a
        non-mapping (e.g. a bare scalar) — callers rely on membership
        tests and item access, which would misbehave on a str/list.
    """
    fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
    if not fm_match:
        return {}

    yaml_text = fm_match.group(1)

    # Process line by line to handle values containing @ and commas.
    sanitized_lines = []
    for line in yaml_text.splitlines():
        # Match "key: value" (handles keys with dashes like 'package-name').
        match = re.match(r'^(\s*[\w-]+):\s*(.*)$', line)
        if match:
            key, val = match.groups()
            val_s = val.strip()
            # If the value contains @ and isn't already quoted, wrap the
            # whole string in double quotes so YAML treats it as one scalar.
            if '@' in val_s and not (val_s.startswith('"') or val_s.startswith("'")):
                # Escape any existing double quotes within the value string.
                safe_val = val_s.replace('"', '\\"')
                line = f'{key}: "{safe_val}"'
        sanitized_lines.append(line)

    sanitized_yaml = '\n'.join(sanitized_lines)

    try:
        parsed = yaml.safe_load(sanitized_yaml)
    except yaml.YAMLError as e:
        print(f"⚠️ YAML parsing error: {e}")
        return {}

    # safe_load can yield a scalar or list for malformed frontmatter; only
    # pass through real mappings so callers can safely index the result.
    return parsed if isinstance(parsed, dict) else {}
def _extract_description(markdown_body):
    """Return the first non-heading paragraph of *markdown_body*, or "".

    Heading lines ('#'-prefixed) and blank lines before any text are
    skipped; collection stops at the first blank line or heading after
    text has been gathered. The joined paragraph is truncated to 250
    characters to keep index entries compact.
    """
    desc_lines = []
    for line in markdown_body.split('\n'):
        if line.startswith('#') or not line.strip():
            if desc_lines:
                break
            continue
        desc_lines.append(line.strip())
    if not desc_lines:
        return ""
    return " ".join(desc_lines)[:250].strip()


def generate_index(skills_dir, output_file):
    """
    Build a JSON index of every skill found under *skills_dir*.

    Each directory containing a SKILL.md contributes one entry with id,
    path, category, name, description, risk, source and date_added.
    Frontmatter values take priority; the folder structure supplies the
    category fallback and the first body paragraph the description
    fallback (legacy support).

    Args:
        skills_dir: root directory to walk for SKILL.md files.
        output_file: path of the JSON index to write.

    Returns:
        list[dict]: the sorted skill entries that were written.
    """
    print(f"🏗️ Generating index from: {skills_dir}")
    skills = []

    for root, dirs, files in os.walk(skills_dir):
        # Skip .disabled or hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        if "SKILL.md" not in files:
            continue

        skill_path = os.path.join(root, "SKILL.md")
        dir_name = os.path.basename(root)
        parent_dir = os.path.basename(os.path.dirname(root))

        # Default values; frontmatter overrides them below. Skills living
        # directly under the top-level "skills" folder get no
        # folder-derived category.
        skill_info = {
            "id": dir_name,
            "path": os.path.relpath(root, os.path.dirname(skills_dir)),
            "category": parent_dir if parent_dir != "skills" else None,
            "name": dir_name.replace("-", " ").title(),
            "description": "",
            "risk": "unknown",
            "source": "unknown",
            "date_added": None
        }

        try:
            with open(skill_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f"⚠️ Error reading {skill_path}: {e}")
            continue

        # Parse metadata. Guard against frontmatter that parses to a
        # non-mapping (e.g. a bare string) — membership tests and item
        # access below require a dict.
        metadata = parse_frontmatter(content)
        if not isinstance(metadata, dict):
            metadata = {}

        # Merge metadata (frontmatter takes priority).
        for field in ("name", "description", "risk", "source", "date_added"):
            if field in metadata:
                skill_info[field] = metadata[field]

        # Category: prefer frontmatter, then folder structure, then default.
        if "category" in metadata:
            skill_info["category"] = metadata["category"]
        elif skill_info["category"] is None:
            skill_info["category"] = "uncategorized"

        # Fallback for description if missing in frontmatter (legacy support):
        # strip the frontmatter and take the first body paragraph.
        if not skill_info["description"]:
            body = content
            fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
            if fm_match:
                body = content[fm_match.end():].strip()
            skill_info["description"] = _extract_description(body)

        skills.append(skill_info)

    # Deterministic ordering: display name first, id as tie-breaker.
    skills.sort(key=lambda x: (x["name"].lower(), x["id"].lower()))

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps emoji / non-ASCII names readable in the
        # UTF-8 index instead of being \uXXXX-escaped.
        json.dump(skills, f, indent=2, ensure_ascii=False)

    print(f"✅ Generated rich index with {len(skills)} skills at: {output_file}")
    return skills
if __name__ == "__main__":
    # The repo root is the parent of the scripts/ directory holding this file.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    generate_index(
        os.path.join(repo_root, "skills"),
        os.path.join(repo_root, "skills_index.json"),
    )