Consolidate the repository into clearer apps, tools, and layered docs areas so contributors can navigate and maintain it more reliably. Align validation, metadata sync, and CI around the same canonical workflow to reduce drift across local checks and GitHub Actions.
260 lines
9.7 KiB
Python
260 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Auto-categorize skills based on their names and descriptions.
|
|
Removes "uncategorized" by intelligently assigning categories.
|
|
|
|
Usage:
|
|
python auto_categorize_skills.py
|
|
python auto_categorize_skills.py --dry-run (shows what would change)
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import sys
|
|
import argparse
|
|
import yaml
|
|
from _project_paths import find_repo_root
|
|
|
|
# Ensure UTF-8 output for Windows compatibility.
# The report printed by this script contains emoji; on Windows the standard
# streams are switched to UTF-8 up front so printing them does not depend on
# the console's default encoding.
if sys.platform == 'win32':
    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name, None)
        _reconfigure = getattr(_stream, 'reconfigure', None)
        if _reconfigure is not None:
            # Change the encoding in place. This keeps the stream's existing
            # buffering settings, whereas a fresh TextIOWrapper defaults to
            # line_buffering=False and leaves two wrappers on one buffer.
            _reconfigure(encoding='utf-8')
        elif hasattr(_stream, 'buffer'):
            # The stream was replaced by an object without reconfigure():
            # fall back to wrapping its underlying binary buffer.
            import io
            setattr(sys, _stream_name, io.TextIOWrapper(
                _stream.buffer,
                encoding='utf-8',
                line_buffering=getattr(_stream, 'line_buffering', False),
            ))
        # Otherwise the stream is None or has no binary buffer: leave it
        # untouched instead of failing at import time.
|
|
|
|
# Category keywords mapping.
#
# Maps each category name to the lower-case keywords (single words or short
# phrases) that count as evidence for it. categorize_skill() scores a skill
# against every list and picks the highest-scoring category; because ties are
# resolved in favour of the category that appears first, the ORDER of the
# entries below is significant.
#
# Each keyword must appear at most once per category: a repeated entry would
# be scored twice and silently over-weight that keyword.
CATEGORY_KEYWORDS = {
    'web-development': [
        'react', 'vue', 'angular', 'svelte', 'nextjs', 'gatsby', 'remix',
        'html', 'css', 'javascript', 'typescript', 'frontend', 'web', 'tailwind',
        'bootstrap', 'sass', 'less', 'webpack', 'vite', 'rollup', 'parcel',
        'rest api', 'graphql', 'http', 'fetch', 'axios', 'cors',
        'responsive', 'seo', 'accessibility', 'a11y', 'pwa', 'progressive',
        'dom', 'jsx', 'tsx', 'component', 'router', 'routing'
    ],
    'backend': [
        'nodejs', 'node.js', 'express', 'fastapi', 'django', 'flask',
        'spring', 'java', 'python', 'golang', 'rust', 'c#', 'csharp',
        'dotnet', '.net', 'laravel', 'php', 'ruby', 'rails',
        'server', 'backend', 'api', 'rest', 'graphql', 'database',
        'sql', 'mongodb', 'postgres', 'mysql', 'redis', 'cache',
        'authentication', 'auth', 'jwt', 'oauth', 'session',
        'middleware', 'routing', 'controller', 'model'
    ],
    'database': [
        'database', 'sql', 'postgres', 'postgresql', 'mysql', 'mariadb',
        'mongodb', 'nosql', 'firestore', 'dynamodb', 'cassandra',
        'elasticsearch', 'redis', 'memcached', 'graphql', 'prisma',
        'orm', 'query', 'migration', 'schema', 'index'
    ],
    'ai-ml': [
        'ai', 'artificial intelligence', 'machine learning', 'ml',
        'deep learning', 'neural', 'tensorflow', 'pytorch', 'scikit',
        'nlp', 'computer vision', 'cv', 'llm', 'gpt', 'bert',
        'classification', 'regression', 'clustering', 'transformer',
        'embedding', 'vector', 'training', 'model'
    ],
    'devops': [
        'devops', 'docker', 'kubernetes', 'k8s', 'ci/cd', 'git',
        'github', 'gitlab', 'jenkins', 'gitlab-ci', 'github actions',
        'aws', 'azure', 'gcp', 'terraform', 'ansible', 'vagrant',
        'deploy', 'deployment', 'container', 'orchestration',
        'monitoring', 'logging', 'prometheus', 'grafana'
    ],
    'cloud': [
        'aws', 'amazon', 'azure', 'gcp', 'google cloud', 'cloud',
        'ec2', 's3', 'lambda', 'cloudformation', 'terraform',
        'serverless', 'functions', 'storage', 'cdn', 'distributed'
    ],
    'security': [
        'security', 'encryption', 'cryptography', 'ssl', 'tls',
        'hashing', 'bcrypt', 'jwt', 'oauth', 'authentication',
        'authorization', 'firewall', 'penetration', 'audit',
        'vulnerability', 'privacy', 'gdpr', 'compliance'
    ],
    'testing': [
        'test', 'testing', 'jest', 'mocha', 'jasmine', 'pytest',
        'unittest', 'cypress', 'selenium', 'puppeteer', 'e2e',
        'unit test', 'integration', 'coverage', 'ci/cd'
    ],
    'mobile': [
        'mobile', 'android', 'ios', 'react native', 'flutter',
        'swift', 'kotlin', 'objective-c', 'app', 'native',
        'cross-platform', 'expo', 'cordova', 'xamarin'
    ],
    'game-development': [
        'game', 'unity', 'unreal', 'godot', 'canvas', 'webgl',
        'threejs', 'babylon', 'phaser', 'sprite', 'physics',
        'collision', '2d', '3d', 'shader', 'rendering'
    ],
    'data-science': [
        'data', 'analytics', 'science', 'pandas', 'numpy', 'scipy',
        'jupyter', 'notebook', 'visualization', 'matplotlib', 'plotly',
        'statistics', 'correlation', 'regression', 'clustering'
    ],
    'automation': [
        'automation', 'scripting', 'selenium', 'puppeteer', 'robot',
        'workflow', 'scheduled', 'trigger', 'integration'
    ],
    'content': [
        'markdown', 'documentation', 'content', 'blog', 'writing',
        'seo', 'meta', 'schema', 'og', 'twitter', 'description'
    ]
}
|
|
|
|
def categorize_skill(skill_name, description):
    """
    Pick the best-matching category for a skill from its name and description.

    Every keyword in CATEGORY_KEYWORDS is looked up in the lower-cased
    "<name> <description>" text. A whole-word (or whole-phrase) hit scores
    2 points, a hit that is only part of a longer word scores 1, and the
    category with the highest total wins. On a tie the category listed first
    in CATEGORY_KEYWORDS is returned.

    Args:
        skill_name: Skill name; ``None`` is treated as empty text.
        description: Skill description; ``None`` is treated as empty text.

    Returns:
        The winning category name, or ``None`` if no keyword matched at all.
    """
    # An empty YAML value arrives as None. Formatting it directly would put
    # the literal word "none" into the text that is searched for keywords.
    name_text = '' if skill_name is None else str(skill_name)
    description_text = '' if description is None else str(description)
    combined_text = f"{name_text} {description_text}".lower()

    # Score each category based on keyword matches
    scores = {}
    for category, keywords in CATEGORY_KEYWORDS.items():
        score = 0
        # dict.fromkeys() drops repeated keywords while keeping their order,
        # so a keyword listed twice is not counted twice.
        for keyword in dict.fromkeys(keywords):
            # \b only matches between a word and a non-word character, so it
            # is applied only on the edges where the keyword itself starts or
            # ends with a word character. With an unconditional \b, 'c#' in
            # "c# basics" or '.net' in "use .net" was never a whole-word hit.
            leading = r'\b' if re.match(r'\w', keyword) else ''
            trailing = r'\b' if re.search(r'\w\Z', keyword) else ''
            if re.search(leading + re.escape(keyword) + trailing, combined_text):
                score += 2
            elif keyword in combined_text:
                # Weaker evidence: the keyword occurs inside a longer word.
                score += 1

        if score > 0:
            scores[category] = score

    if not scores:
        return None

    # max() returns the first maximal key, i.e. the earliest category in
    # CATEGORY_KEYWORDS order when several categories share the top score.
    return max(scores, key=scores.get)
|
|
|
|
def _print_categorization_report(skills, categorized_count, already_categorized,
                                 failed_count, dry_run):
    """Print the summary counts, up to ten sample changes, and the run-mode footer."""
    print("\n" + "="*70)
    print("AUTO-CATEGORIZATION REPORT")
    print("="*70)
    print("\n📊 Summary:")
    print(f" ✅ Categorized: {categorized_count}")
    print(f" ⏭️ Already categorized: {already_categorized}")
    print(f" ❌ Failed to categorize: {failed_count}")
    print(f" 📈 Total processed: {len(skills)}")

    if categorized_count > 0:
        print("\n📋 Sample changes:")
        # Filter first, then truncate: slicing the full list before filtering
        # would show no samples whenever the first ten entries are not updates.
        updates = [skill for skill in skills if skill['action'] == 'UPDATE']
        for skill in updates[:10]:
            print(f" • {skill['id']}")
            print(f" {skill['current']} → {skill['new']}")

    if dry_run:
        print("\n🔍 DRY RUN MODE - No changes made")
    else:
        print("\n💾 Changes saved to SKILL.md files")


def auto_categorize(skills_dir, dry_run=False):
    """
    Auto-categorize skills and update their SKILL.md files.

    Walks ``skills_dir`` (skipping directories whose name starts with '.')
    and, for every directory containing a ``SKILL.md`` with a ``---``
    delimited YAML frontmatter block, assigns a category via
    categorize_skill() when the skill has no category or is 'uncategorized'.
    Skills that already carry another category are left untouched. A report
    is printed at the end.

    When a file is updated, its whole frontmatter is re-serialized with
    ``yaml.dump``, so the frontmatter's original formatting may change; the
    body after the frontmatter is written back unchanged.

    Args:
        skills_dir: Root directory to search for skill folders.
        dry_run: If true, compute and report the changes without writing files.

    Returns:
        Number of skills that were assigned a new category (or, with
        ``dry_run``, that would have been).
    """
    skills = []
    categorized_count = 0
    already_categorized = 0
    failed_count = 0

    for root, dirs, files in os.walk(skills_dir):
        # Prune in place so os.walk does not descend into hidden directories.
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        if "SKILL.md" not in files:
            continue

        skill_path = os.path.join(root, "SKILL.md")
        skill_id = os.path.basename(root)

        # Per-file boundary: one unreadable or malformed skill must not abort
        # the whole batch, so any error is reported and the walk continues.
        try:
            with open(skill_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract frontmatter and body
            fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
            if not fm_match:
                continue

            fm_text = fm_match.group(1)
            body = content[fm_match.end():]

            try:
                metadata = yaml.safe_load(fm_text) or {}
            except yaml.YAMLError as e:
                print(f"⚠️ {skill_id}: YAML error - {e}")
                continue

            # Frontmatter that parses to a list or scalar has no keys to read
            # or update; report it explicitly instead of failing on .get().
            if not isinstance(metadata, dict):
                print(f"⚠️ {skill_id}: frontmatter is not a mapping - skipped")
                continue

            # `or` also covers keys that are present but empty (YAML null),
            # which .get(key, default) would return as None.
            skill_name = metadata.get('name') or skill_id
            description = metadata.get('description') or ''
            current_category = metadata.get('category', 'uncategorized')

            # Skip if already has a meaningful category
            if current_category and current_category != 'uncategorized':
                already_categorized += 1
                skills.append({
                    'id': skill_id,
                    'name': skill_name,
                    'current': current_category,
                    'action': 'SKIP'
                })
                continue

            # Try to auto-categorize
            new_category = categorize_skill(skill_name, description)

            if not new_category:
                skills.append({
                    'id': skill_id,
                    'name': skill_name,
                    'current': current_category,
                    'action': 'FAILED'
                })
                failed_count += 1
                continue

            skills.append({
                'id': skill_id,
                'name': skill_name,
                'current': current_category,
                'new': new_category,
                'action': 'UPDATE'
            })

            if not dry_run:
                metadata['category'] = new_category
                new_fm = yaml.dump(metadata, sort_keys=False, allow_unicode=True, width=1000).strip()
                new_content = f"---\n{new_fm}\n---" + body

                with open(skill_path, 'w', encoding='utf-8') as f:
                    f.write(new_content)

            # Counted in dry-run mode as well, so the report shows what
            # would change.
            categorized_count += 1

        except Exception as e:
            print(f"❌ Error processing {skill_id}: {str(e)}")

    # Print report
    _print_categorization_report(
        skills, categorized_count, already_categorized, failed_count, dry_run
    )

    return categorized_count
|
|
|
|
def main(argv=None):
    """
    Command-line entry point: parse arguments and categorize the repo's skills.

    The skills directory is ``<repo root>/skills``, where the repo root is
    whatever ``find_repo_root(__file__)`` returns (helper imported from
    ``_project_paths``; presumably it locates the repository containing this
    script - confirm against that module).

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Raises:
        SystemExit: If the skills directory does not exist, or on argument
            errors / ``--help`` as raised by argparse.
    """
    parser = argparse.ArgumentParser(
        description="Auto-categorize skills based on content",
        # Keep the epilog's line breaks so the examples print one per line.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python auto_categorize_skills.py --dry-run
  python auto_categorize_skills.py
"""
    )

    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be changed without making changes')

    args = parser.parse_args(argv)

    base_dir = str(find_repo_root(__file__))
    skills_path = os.path.join(base_dir, "skills")

    # os.walk() yields nothing for a missing directory, which would produce
    # an all-zero "successful" report; fail loudly instead.
    if not os.path.isdir(skills_path):
        sys.exit(f"❌ Skills directory not found: {skills_path}")

    auto_categorize(skills_path, dry_run=args.dry_run)
|
|
|
|
# Run the categorizer only when this file is executed as a script, so that
# importing the module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|