feat: Implement intelligent auto-categorization for skills

- Added `scripts/auto_categorize_skills.py` to analyze skill names and descriptions, auto-assigning categories based on keyword matching.
- Updated category distribution to show counts and sort categories by skill count in the Home page dropdown.
- Created documentation in `docs/CATEGORIZATION_IMPLEMENTATION.md` and `docs/SMART_AUTO_CATEGORIZATION.md` detailing the new categorization process and usage.
- Introduced `scripts/fix_year_2025_to_2026.py` to update all skill dates from 2025 to 2026.
- Enhanced user experience by moving "uncategorized" to the bottom of the category list and displaying skill counts in the dropdown.
2026-02-26 12:52:03 +01:00
parent f8123cb5a9
commit 8de886a2ff
8 changed files with 4258 additions and 3520 deletions
--- a/scripts/auto_categorize_skills.py
+++ b/scripts/auto_categorize_skills.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+"""
+Auto-categorize skills based on their names and descriptions.
+Removes "uncategorized" by intelligently assigning categories.
+
+Usage:
+  python auto_categorize_skills.py
+  python auto_categorize_skills.py --dry-run (shows what would change)
+"""
+
+import os
+import re
+import json
+import sys
+import argparse
+
+# Windows only: re-wrap stdout and stderr so printed text is encoded as UTF-8
+if sys.platform == 'win32':
+    import io
+    for _name in ('stdout', 'stderr'):
+        setattr(sys, _name, io.TextIOWrapper(getattr(sys, _name).buffer, encoding='utf-8'))
+
+# Category slug -> keywords scored by categorize_skill(); list each keyword once per category
+CATEGORY_KEYWORDS = {
+    'web-development': [
+        'react', 'vue', 'angular', 'svelte', 'nextjs', 'gatsby', 'remix',
+        'html', 'css', 'javascript', 'typescript', 'frontend', 'web', 'tailwind',
+        'bootstrap', 'sass', 'less', 'webpack', 'vite', 'rollup', 'parcel',
+        'rest api', 'graphql', 'http', 'fetch', 'axios', 'cors',
+        'responsive', 'seo', 'accessibility', 'a11y', 'pwa', 'progressive',
+        'dom', 'jsx', 'tsx', 'component', 'router', 'routing'
+    ],
+    'backend': [
+        'nodejs', 'node.js', 'express', 'fastapi', 'django', 'flask',
+        'spring', 'java', 'python', 'golang', 'rust', 'c#', 'csharp',
+        'dotnet', '.net', 'laravel', 'php', 'ruby', 'rails',
+        'server', 'backend', 'api', 'rest', 'graphql', 'database',
+        'sql', 'mongodb', 'postgres', 'mysql', 'redis', 'cache',
+        'authentication', 'auth', 'jwt', 'oauth', 'session',
+        'middleware', 'routing', 'controller', 'model'
+    ],
+    'database': [
+        'database', 'sql', 'postgres', 'postgresql', 'mysql', 'mariadb',
+        'mongodb', 'nosql', 'firestore', 'dynamodb', 'cassandra',
+        'elasticsearch', 'redis', 'memcached', 'graphql', 'prisma',
+        'orm', 'query', 'migration', 'schema', 'index'
+    ],
+    'ai-ml': [
+        'ai', 'artificial intelligence', 'machine learning', 'ml',
+        'deep learning', 'neural', 'tensorflow', 'pytorch', 'scikit',
+        'nlp', 'computer vision', 'cv', 'llm', 'gpt', 'bert',
+        'classification', 'regression', 'clustering', 'transformer',
+        'embedding', 'vector', 'training', 'model'
+    ],
+    'devops': [
+        'devops', 'docker', 'kubernetes', 'k8s', 'ci/cd', 'git',
+        'github', 'gitlab', 'jenkins', 'gitlab-ci', 'github actions',
+        'aws', 'azure', 'gcp', 'terraform', 'ansible', 'vagrant',
+        'deploy', 'deployment', 'container', 'orchestration',
+        'monitoring', 'logging', 'prometheus', 'grafana'
+    ],
+    'cloud': [
+        'aws', 'amazon', 'azure', 'gcp', 'google cloud', 'cloud',
+        'ec2', 's3', 'lambda', 'cloudformation', 'terraform',
+        'serverless', 'functions', 'storage', 'cdn', 'distributed'
+    ],
+    'security': [
+        'security', 'encryption', 'cryptography', 'ssl', 'tls',
+        'hashing', 'bcrypt', 'jwt', 'oauth', 'authentication',
+        'authorization', 'firewall', 'penetration', 'audit',
+        'vulnerability', 'privacy', 'gdpr', 'compliance'
+    ],
+    'testing': [
+        'test', 'testing', 'jest', 'mocha', 'jasmine', 'pytest',
+        'unittest', 'cypress', 'selenium', 'puppeteer', 'e2e',
+        'unit test', 'integration', 'coverage', 'ci/cd'
+    ],
+    'mobile': [
+        'mobile', 'android', 'ios', 'react native', 'flutter',
+        'swift', 'kotlin', 'objective-c', 'app', 'native',
+        'cross-platform', 'expo', 'cordova', 'xamarin'
+    ],
+    'game-development': [
+        'game', 'unity', 'unreal', 'godot', 'canvas', 'webgl',
+        'threejs', 'babylon', 'phaser', 'sprite', 'physics',
+        'collision', '2d', '3d', 'shader', 'rendering'
+    ],
+    'data-science': [
+        'data', 'analytics', 'science', 'pandas', 'numpy', 'scipy',
+        'jupyter', 'notebook', 'visualization', 'matplotlib', 'plotly',
+        'statistics', 'correlation', 'regression', 'clustering'
+    ],
+    'automation': [
+        'automation', 'scripting', 'selenium', 'puppeteer', 'robot',
+        'workflow', 'scheduled', 'trigger', 'integration'
+    ],
+    'content': [
+        'markdown', 'documentation', 'content', 'blog', 'writing',
+        'seo', 'meta', 'schema', 'og', 'twitter', 'description'
+    ]
+}
+
+def categorize_skill(skill_name, description):
+    """
+    Pick the best-matching category for a skill from its name and description.
+
+    Every keyword in CATEGORY_KEYWORDS is looked up in the lower-cased
+    "name description" text: a whole-token hit scores 2, a hit that is only
+    part of a longer word scores 1. Returns the highest-scoring category
+    (the first one in CATEGORY_KEYWORDS on a tie) or None if nothing matched.
+    """
+    # A missing name/description is treated as empty text, not the string "None".
+    combined_text = f"{skill_name or ''} {description or ''}".lower()
+
+    scores = {}
+    for category, keywords in CATEGORY_KEYWORDS.items():
+        score = 0
+        for keyword in keywords:
+            # Lookarounds instead of \b: \b needs a word character on one side,
+            # so it fails after 'c#' or before '.net' when they stand between
+            # spaces, and those keywords could not earn the whole-token score.
+            if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', combined_text):
+                score += 2
+            elif keyword in combined_text:
+                score += 1
+        if score > 0:
+            scores[category] = score
+
+    return max(scores, key=scores.get) if scores else None
+
+def _parse_frontmatter_fields(fm_text):
+    """
+    Parse the flat `key: value` lines of a frontmatter block into a dict.
+
+    Lines without a colon and lines starting with `#` are ignored; values have
+    surrounding whitespace and quote characters stripped.
+    """
+    fields = {}
+    for line in fm_text.split('\n'):
+        if ':' in line and not line.strip().startswith('#'):
+            key, val = line.split(':', 1)
+            fields[key.strip()] = val.strip().strip('"').strip("'")
+    return fields
+
+
+def _with_category(content, fm_match, category):
+    """
+    Return `content` with `category: <category>` set in its frontmatter.
+
+    `fm_match` is the frontmatter regex match taken on `content`; an existing
+    category line is replaced, otherwise a new line is appended to the block.
+    """
+    fm_text = fm_match.group(1)
+    # Match whole `category:` lines so quoted or empty values are replaced
+    # cleanly and "category:" text inside other fields (a description, a
+    # `subcategory:` key) is left alone.
+    category_line = re.compile(r'^([ \t]*)category[ \t]*:.*$', re.MULTILINE)
+    if category_line.search(fm_text):
+        fm_text = category_line.sub(rf'\g<1>category: {category}', fm_text)
+    else:
+        fm_text = f'{fm_text}\ncategory: {category}'
+    return content[:fm_match.start(1)] + fm_text + content[fm_match.end(1):]
+
+
+def auto_categorize(skills_dir, dry_run=False):
+    """
+    Assign a category to every uncategorized skill under `skills_dir`.
+
+    Walks the tree (skipping dot-directories), reads each SKILL.md frontmatter
+    and, when the category is missing, empty or "uncategorized", asks
+    categorize_skill() for one. Unless `dry_run` is set, the category is
+    written back into the SKILL.md frontmatter. Prints a summary report and
+    returns the number of skills that were (or would be) categorized.
+    """
+    skills = []
+    categorized_count = 0
+    already_categorized = 0
+    failed_count = 0
+
+    for root, dirs, files in os.walk(skills_dir):
+        # Prune hidden directories in place so os.walk does not descend into them.
+        dirs[:] = [d for d in dirs if not d.startswith('.')]
+        if "SKILL.md" not in files:
+            continue
+
+        skill_path = os.path.join(root, "SKILL.md")
+        skill_id = os.path.basename(root)
+        try:
+            with open(skill_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Files whose text does not open with a `---` ... `---` block are not counted.
+            fm_match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
+            if not fm_match:
+                continue
+
+            metadata = _parse_frontmatter_fields(fm_match.group(1))
+            skill_name = metadata.get('name', skill_id)
+            description = metadata.get('description', '')
+            current_category = metadata.get('category', 'uncategorized')
+            entry = {'id': skill_id, 'name': skill_name, 'current': current_category}
+
+            # Skip if already has a meaningful category
+            if current_category and current_category != 'uncategorized':
+                already_categorized += 1
+                skills.append({**entry, 'action': 'SKIP'})
+                continue
+
+            # Try to auto-categorize
+            new_category = categorize_skill(skill_name, description)
+            if not new_category:
+                failed_count += 1
+                skills.append({**entry, 'action': 'FAILED'})
+                continue
+
+            if not dry_run:
+                # Build the new text first so a failure cannot leave a truncated file.
+                new_content = _with_category(content, fm_match, new_category)
+                with open(skill_path, 'w', encoding='utf-8') as f:
+                    f.write(new_content)
+            categorized_count += 1
+            skills.append({**entry, 'new': new_category, 'action': 'UPDATE'})
+        except Exception as e:
+            # Best effort: report the problem and carry on with the next skill.
+            print(f"❌ Error processing {skill_id}: {str(e)}")
+
+    # Print report
+    print("\n" + "="*70)
+    print("AUTO-CATEGORIZATION REPORT")
+    print("="*70)
+    print(f"\n📊 Summary:")
+    print(f"   ✅ Categorized: {categorized_count}")
+    print(f"   ⏭️  Already categorized: {already_categorized}")
+    print(f"   ❌ Failed to categorize: {failed_count}")
+    print(f"   📈 Total processed: {len(skills)}")
+
+    # Filter first, then slice, so samples appear even when the first ten
+    # processed skills were all skipped.
+    updates = [skill for skill in skills if skill['action'] == 'UPDATE']
+    if updates:
+        print(f"\n📋 Sample changes:")
+        for skill in updates[:10]:
+            print(f"   • {skill['id']}")
+            print(f"     {skill['current']} → {skill['new']}")
+
+    if dry_run:
+        print(f"\n🔍 DRY RUN MODE - No changes made")
+    else:
+        print(f"\n💾 Changes saved to SKILL.md files")
+
+    return categorized_count
+
+def main():
+    """Command-line entry point: parse flags, then categorize `<parent>/skills`."""
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="Auto-categorize skills based on content",
+        epilog="""
+Examples:
+  python auto_categorize_skills.py --dry-run
+  python auto_categorize_skills.py
+        """,
+    )
+    cli.add_argument('--dry-run', action='store_true',
+                     help='Show what would be changed without making changes')
+    options = cli.parse_args()
+
+    # The skills folder sits next to this script's directory.
+    scripts_dir = os.path.dirname(os.path.abspath(__file__))
+    skills_path = os.path.join(os.path.dirname(scripts_dir), "skills")
+    auto_categorize(skills_path, dry_run=options.dry_run)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/fix_year_2025_to_2026.py
+++ b/scripts/fix_year_2025_to_2026.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Update all skill dates from 2025 to 2026.
+Fixes the year mismatch issue.
+"""
+
+import os
+import re
+import sys
+
+# Windows only: re-wrap stdout and stderr so printed text is encoded as UTF-8
+if sys.platform == 'win32':
+    import io
+    for _name in ('stdout', 'stderr'):
+        setattr(sys, _name, io.TextIOWrapper(getattr(sys, _name).buffer, encoding='utf-8'))
+
+def update_dates(skills_dir, old_year="2025", new_year="2026"):
+    """
+    Change the year of `date_added` values from `old_year` to `new_year`.
+
+    Edits every SKILL.md under `skills_dir` in place (quoted or unquoted dates,
+    line endings preserved) and returns the number of files changed.
+    """
+    pattern = re.compile(r'''(date_added:[ \t]*["']?)''' + re.escape(str(old_year)) + '-')
+    updated_count = 0
+    for root, dirs, files in os.walk(skills_dir):
+        dirs[:] = [d for d in dirs if not d.startswith('.')]
+        if "SKILL.md" in files:
+            skill_path = os.path.join(root, "SKILL.md")
+            skill_id = os.path.basename(root)
+            try:
+                with open(skill_path, 'r', encoding='utf-8', newline='') as f:
+                    content = f.read()
+                # \g<1> rather than \1: a bare \1 would run into the year digits.
+                new_content, hits = pattern.subn(rf'\g<1>{new_year}-', content)
+                if hits:
+                    with open(skill_path, 'w', encoding='utf-8', newline='') as f:
+                        f.write(new_content)
+                    print(f"OK {skill_id}")
+                    updated_count += 1
+            except Exception as e:
+                print(f"Error updating {skill_id}: {str(e)}")
+
+    print(f"\nUpdated {updated_count} skills to {new_year}")
+    return updated_count
+
+if __name__ == "__main__":
+    # Resolve the skills folder that sits beside this script's directory.
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    skills_root = os.path.join(os.path.dirname(script_dir), "skills")
+    print("Updating all dates from 2025 to 2026...\n")
+    update_dates(skills_root)
+    print("\nDone! Run: python scripts/generate_index.py")
--- a/scripts/generate_index.py
+++ b/scripts/generate_index.py
@@ -62,7 +62,7 @@ def generate_index(skills_dir, output_file):
            skill_info = {
                "id": dir_name,
                "path": os.path.relpath(root, os.path.dirname(skills_dir)),
-                "category": parent_dir if parent_dir != "skills" else "uncategorized",
+                "category": parent_dir if parent_dir != "skills" else None,  # Will be overridden by frontmatter if present
                "name": dir_name.replace("-", " ").title(),
                "description": "",
                "risk": "unknown",
@@ -80,13 +80,19 @@ def generate_index(skills_dir, output_file):
            # Parse Metadata
            metadata = parse_frontmatter(content)
            
-            # Merge Metadata
+            # Merge Metadata (frontmatter takes priority)
            if "name" in metadata: skill_info["name"] = metadata["name"]
            if "description" in metadata: skill_info["description"] = metadata["description"]
            if "risk" in metadata: skill_info["risk"] = metadata["risk"]
            if "source" in metadata: skill_info["source"] = metadata["source"]
            if "date_added" in metadata: skill_info["date_added"] = metadata["date_added"]
            
+            # Category: prefer frontmatter, then folder structure, then default
+            if "category" in metadata:
+                skill_info["category"] = metadata["category"]
+            elif skill_info["category"] is None:
+                skill_info["category"] = "uncategorized"
+            
            # Fallback for description if missing in frontmatter (legacy support)
            if not skill_info["description"]:
                body = content