feat: Implement intelligent auto-categorization for skills
- Added `scripts/auto_categorize_skills.py` to analyze skill names and descriptions, auto-assigning categories based on keyword matching. - Updated category distribution to show counts and sort categories by skill count in the Home page dropdown. - Created documentation in `docs/CATEGORIZATION_IMPLEMENTATION.md` and `docs/SMART_AUTO_CATEGORIZATION.md` detailing the new categorization process and usage. - Introduced `scripts/fix_year_2025_to_2026.py` to update all skill dates from 2025 to 2026. - Enhanced user experience by moving "uncategorized" to the bottom of the category list and displaying skill counts in the dropdown.
This commit is contained in:
275
scripts/auto_categorize_skills.py
Normal file
275
scripts/auto_categorize_skills.py
Normal file
@@ -0,0 +1,275 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Auto-categorize skills based on their names and descriptions.
|
||||
Removes "uncategorized" by intelligently assigning categories.
|
||||
|
||||
Usage:
|
||||
python auto_categorize_skills.py
|
||||
python auto_categorize_skills.py --dry-run (shows what would change)
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
# Ensure UTF-8 output for Windows compatibility
|
||||
if sys.platform == 'win32':
|
||||
import io
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
|
||||
|
||||
# Category keywords mapping
|
||||
CATEGORY_KEYWORDS = {
|
||||
'web-development': [
|
||||
'react', 'vue', 'angular', 'svelte', 'nextjs', 'gatsby', 'remix',
|
||||
'html', 'css', 'javascript', 'typescript', 'frontend', 'web', 'tailwind',
|
||||
'bootstrap', 'sass', 'less', 'webpack', 'vite', 'rollup', 'parcel',
|
||||
'rest api', 'graphql', 'http', 'fetch', 'axios', 'cors',
|
||||
'responsive', 'seo', 'accessibility', 'a11y', 'pwa', 'progressive',
|
||||
'dom', 'jsx', 'tsx', 'component', 'router', 'routing'
|
||||
],
|
||||
'backend': [
|
||||
'nodejs', 'node.js', 'express', 'fastapi', 'django', 'flask',
|
||||
'spring', 'java', 'python', 'golang', 'rust', 'c#', 'csharp',
|
||||
'dotnet', '.net', 'laravel', 'php', 'ruby', 'rails',
|
||||
'server', 'backend', 'api', 'rest', 'graphql', 'database',
|
||||
'sql', 'mongodb', 'postgres', 'mysql', 'redis', 'cache',
|
||||
'authentication', 'auth', 'jwt', 'oauth', 'session',
|
||||
'middleware', 'routing', 'controller', 'model'
|
||||
],
|
||||
'database': [
|
||||
'database', 'sql', 'postgres', 'postgresql', 'mysql', 'mariadb',
|
||||
'mongodb', 'nosql', 'firestore', 'dynamodb', 'cassandra',
|
||||
'elasticsearch', 'redis', 'memcached', 'graphql', 'prisma',
|
||||
'orm', 'query', 'migration', 'schema', 'index'
|
||||
],
|
||||
'ai-ml': [
|
||||
'ai', 'artificial intelligence', 'machine learning', 'ml',
|
||||
'deep learning', 'neural', 'tensorflow', 'pytorch', 'scikit',
|
||||
'nlp', 'computer vision', 'cv', 'llm', 'gpt', 'bert',
|
||||
'classification', 'regression', 'clustering', 'transformer',
|
||||
'embedding', 'vector', 'embedding', 'training', 'model'
|
||||
],
|
||||
'devops': [
|
||||
'devops', 'docker', 'kubernetes', 'k8s', 'ci/cd', 'git',
|
||||
'github', 'gitlab', 'jenkins', 'gitlab-ci', 'github actions',
|
||||
'aws', 'azure', 'gcp', 'terraform', 'ansible', 'vagrant',
|
||||
'deploy', 'deployment', 'container', 'orchestration',
|
||||
'monitoring', 'logging', 'prometheus', 'grafana'
|
||||
],
|
||||
'cloud': [
|
||||
'aws', 'amazon', 'azure', 'gcp', 'google cloud', 'cloud',
|
||||
'ec2', 's3', 'lambda', 'cloudformation', 'terraform',
|
||||
'serverless', 'functions', 'storage', 'cdn', 'distributed'
|
||||
],
|
||||
'security': [
|
||||
'security', 'encryption', 'cryptography', 'ssl', 'tls',
|
||||
'hashing', 'bcrypt', 'jwt', 'oauth', 'authentication',
|
||||
'authorization', 'firewall', 'penetration', 'audit',
|
||||
'vulnerability', 'privacy', 'gdpr', 'compliance'
|
||||
],
|
||||
'testing': [
|
||||
'test', 'testing', 'jest', 'mocha', 'jasmine', 'pytest',
|
||||
'unittest', 'cypress', 'selenium', 'puppeteer', 'e2e',
|
||||
'unit test', 'integration', 'coverage', 'ci/cd'
|
||||
],
|
||||
'mobile': [
|
||||
'mobile', 'android', 'ios', 'react native', 'flutter',
|
||||
'swift', 'kotlin', 'objective-c', 'app', 'native',
|
||||
'cross-platform', 'expo', 'cordova', 'xamarin'
|
||||
],
|
||||
'game-development': [
|
||||
'game', 'unity', 'unreal', 'godot', 'canvas', 'webgl',
|
||||
'threejs', 'babylon', 'phaser', 'sprite', 'physics',
|
||||
'collision', '2d', '3d', 'shader', 'rendering'
|
||||
],
|
||||
'data-science': [
|
||||
'data', 'analytics', 'science', 'pandas', 'numpy', 'scipy',
|
||||
'jupyter', 'notebook', 'visualization', 'matplotlib', 'plotly',
|
||||
'statistics', 'correlation', 'regression', 'clustering'
|
||||
],
|
||||
'automation': [
|
||||
'automation', 'scripting', 'selenium', 'puppeteer', 'robot',
|
||||
'workflow', 'automation', 'scheduled', 'trigger', 'integration'
|
||||
],
|
||||
'content': [
|
||||
'markdown', 'documentation', 'content', 'blog', 'writing',
|
||||
'seo', 'meta', 'schema', 'og', 'twitter', 'description'
|
||||
]
|
||||
}
|
||||
|
||||
def categorize_skill(skill_name, description):
|
||||
"""
|
||||
Intelligently categorize a skill based on name and description.
|
||||
Returns the best matching category or None if no match.
|
||||
"""
|
||||
combined_text = f"{skill_name} {description}".lower()
|
||||
|
||||
# Score each category based on keyword matches
|
||||
scores = {}
|
||||
for category, keywords in CATEGORY_KEYWORDS.items():
|
||||
score = 0
|
||||
for keyword in keywords:
|
||||
# Prefer exact phrase matches with word boundaries
|
||||
if re.search(r'\b' + re.escape(keyword) + r'\b', combined_text):
|
||||
score += 2
|
||||
elif keyword in combined_text:
|
||||
score += 1
|
||||
|
||||
if score > 0:
|
||||
scores[category] = score
|
||||
|
||||
# Return the category with highest score
|
||||
if scores:
|
||||
best_category = max(scores, key=scores.get)
|
||||
return best_category
|
||||
|
||||
return None
|
||||
|
||||
def auto_categorize(skills_dir, dry_run=False):
|
||||
"""Auto-categorize skills and update generate_index.py"""
|
||||
skills = []
|
||||
categorized_count = 0
|
||||
already_categorized = 0
|
||||
failed_count = 0
|
||||
|
||||
for root, dirs, files in os.walk(skills_dir):
|
||||
dirs[:] = [d for d in dirs if not d.startswith('.')]
|
||||
|
||||
if "SKILL.md" in files:
|
||||
skill_path = os.path.join(root, "SKILL.md")
|
||||
skill_id = os.path.basename(root)
|
||||
|
||||
try:
|
||||
with open(skill_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
# Extract name and description from frontmatter
|
||||
fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
|
||||
if not fm_match:
|
||||
continue
|
||||
|
||||
fm_text = fm_match.group(1)
|
||||
metadata = {}
|
||||
for line in fm_text.split('\n'):
|
||||
if ':' in line and not line.strip().startswith('#'):
|
||||
key, val = line.split(':', 1)
|
||||
metadata[key.strip()] = val.strip().strip('"').strip("'")
|
||||
|
||||
skill_name = metadata.get('name', skill_id)
|
||||
description = metadata.get('description', '')
|
||||
current_category = metadata.get('category', 'uncategorized')
|
||||
|
||||
# Skip if already has a meaningful category
|
||||
if current_category and current_category != 'uncategorized':
|
||||
already_categorized += 1
|
||||
skills.append({
|
||||
'id': skill_id,
|
||||
'name': skill_name,
|
||||
'current': current_category,
|
||||
'action': 'SKIP'
|
||||
})
|
||||
continue
|
||||
|
||||
# Try to auto-categorize
|
||||
new_category = categorize_skill(skill_name, description)
|
||||
|
||||
if new_category:
|
||||
skills.append({
|
||||
'id': skill_id,
|
||||
'name': skill_name,
|
||||
'current': current_category,
|
||||
'new': new_category,
|
||||
'action': 'UPDATE'
|
||||
})
|
||||
|
||||
if not dry_run:
|
||||
# Update the SKILL.md file - add or replace category
|
||||
fm_start = content.find('---')
|
||||
fm_end = content.find('---', fm_start + 3)
|
||||
|
||||
if fm_start >= 0 and fm_end > fm_start:
|
||||
frontmatter = content[fm_start:fm_end+3]
|
||||
body = content[fm_end+3:]
|
||||
|
||||
# Check if category exists in frontmatter
|
||||
if 'category:' in frontmatter:
|
||||
# Replace existing category
|
||||
new_frontmatter = re.sub(
|
||||
r'category:\s*\w+',
|
||||
f'category: {new_category}',
|
||||
frontmatter
|
||||
)
|
||||
else:
|
||||
# Add category before the closing ---
|
||||
new_frontmatter = frontmatter.replace(
|
||||
'\n---',
|
||||
f'\ncategory: {new_category}\n---'
|
||||
)
|
||||
|
||||
new_content = new_frontmatter + body
|
||||
with open(skill_path, 'w', encoding='utf-8') as f:
|
||||
f.write(new_content)
|
||||
|
||||
categorized_count += 1
|
||||
else:
|
||||
skills.append({
|
||||
'id': skill_id,
|
||||
'name': skill_name,
|
||||
'current': current_category,
|
||||
'action': 'FAILED'
|
||||
})
|
||||
failed_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error processing {skill_id}: {str(e)}")
|
||||
|
||||
# Print report
|
||||
print("\n" + "="*70)
|
||||
print("AUTO-CATEGORIZATION REPORT")
|
||||
print("="*70)
|
||||
print(f"\n📊 Summary:")
|
||||
print(f" ✅ Categorized: {categorized_count}")
|
||||
print(f" ⏭️ Already categorized: {already_categorized}")
|
||||
print(f" ❌ Failed to categorize: {failed_count}")
|
||||
print(f" 📈 Total processed: {len(skills)}")
|
||||
|
||||
if categorized_count > 0:
|
||||
print(f"\n📋 Sample changes:")
|
||||
for skill in skills[:10]:
|
||||
if skill['action'] == 'UPDATE':
|
||||
print(f" • {skill['id']}")
|
||||
print(f" {skill['current']} → {skill['new']}")
|
||||
|
||||
if dry_run:
|
||||
print(f"\n🔍 DRY RUN MODE - No changes made")
|
||||
else:
|
||||
print(f"\n💾 Changes saved to SKILL.md files")
|
||||
|
||||
return categorized_count
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Auto-categorize skills based on content",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python auto_categorize_skills.py --dry-run
|
||||
python auto_categorize_skills.py
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument('--dry-run', action='store_true',
|
||||
help='Show what would be changed without making changes')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
skills_path = os.path.join(base_dir, "skills")
|
||||
|
||||
auto_categorize(skills_path, dry_run=args.dry_run)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
53
scripts/fix_year_2025_to_2026.py
Normal file
53
scripts/fix_year_2025_to_2026.py
Normal file
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Update all skill dates from 2025 to 2026.
|
||||
Fixes the year mismatch issue.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# Ensure UTF-8 output for Windows compatibility
|
||||
if sys.platform == 'win32':
|
||||
import io
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
|
||||
|
||||
def update_dates(skills_dir):
|
||||
"""Update all dates from 2025 to 2026"""
|
||||
updated_count = 0
|
||||
|
||||
for root, dirs, files in os.walk(skills_dir):
|
||||
dirs[:] = [d for d in dirs if not d.startswith('.')]
|
||||
|
||||
if "SKILL.md" in files:
|
||||
skill_path = os.path.join(root, "SKILL.md")
|
||||
skill_id = os.path.basename(root)
|
||||
|
||||
try:
|
||||
with open(skill_path, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
# Replace 2025 with 2026 in date_added field
|
||||
if 'date_added: "2025-' in content:
|
||||
new_content = content.replace('date_added: "2025-', 'date_added: "2026-')
|
||||
|
||||
with open(skill_path, 'w', encoding='utf-8') as f:
|
||||
f.write(new_content)
|
||||
|
||||
print(f"OK {skill_id}")
|
||||
updated_count += 1
|
||||
except Exception as e:
|
||||
print(f"Error updating {skill_id}: {str(e)}")
|
||||
|
||||
print(f"\nUpdated {updated_count} skills to 2026")
|
||||
return updated_count
|
||||
|
||||
if __name__ == "__main__":
|
||||
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
skills_path = os.path.join(base_dir, "skills")
|
||||
|
||||
print("Updating all dates from 2025 to 2026...\n")
|
||||
update_dates(skills_path)
|
||||
print("\nDone! Run: python scripts/generate_index.py")
|
||||
@@ -62,7 +62,7 @@ def generate_index(skills_dir, output_file):
|
||||
skill_info = {
|
||||
"id": dir_name,
|
||||
"path": os.path.relpath(root, os.path.dirname(skills_dir)),
|
||||
"category": parent_dir if parent_dir != "skills" else "uncategorized",
|
||||
"category": parent_dir if parent_dir != "skills" else None, # Will be overridden by frontmatter if present
|
||||
"name": dir_name.replace("-", " ").title(),
|
||||
"description": "",
|
||||
"risk": "unknown",
|
||||
@@ -80,13 +80,19 @@ def generate_index(skills_dir, output_file):
|
||||
# Parse Metadata
|
||||
metadata = parse_frontmatter(content)
|
||||
|
||||
# Merge Metadata
|
||||
# Merge Metadata (frontmatter takes priority)
|
||||
if "name" in metadata: skill_info["name"] = metadata["name"]
|
||||
if "description" in metadata: skill_info["description"] = metadata["description"]
|
||||
if "risk" in metadata: skill_info["risk"] = metadata["risk"]
|
||||
if "source" in metadata: skill_info["source"] = metadata["source"]
|
||||
if "date_added" in metadata: skill_info["date_added"] = metadata["date_added"]
|
||||
|
||||
# Category: prefer frontmatter, then folder structure, then default
|
||||
if "category" in metadata:
|
||||
skill_info["category"] = metadata["category"]
|
||||
elif skill_info["category"] is None:
|
||||
skill_info["category"] = "uncategorized"
|
||||
|
||||
# Fallback for description if missing in frontmatter (legacy support)
|
||||
if not skill_info["description"]:
|
||||
body = content
|
||||
|
||||
Reference in New Issue
Block a user