diff --git a/skill_categorization/docs/maintainers/smart-auto-categorization.md b/skill_categorization/docs/maintainers/smart-auto-categorization.md new file mode 100644 index 00000000..be81f110 --- /dev/null +++ b/skill_categorization/docs/maintainers/smart-auto-categorization.md @@ -0,0 +1,223 @@ +# Smart Auto-Categorization Guide + +## Overview + +The skill collection now uses intelligent auto-categorization to eliminate "uncategorized" and organize skills into meaningful categories based on their content. + +`tools/scripts/generate_index.py` now classifies skills at index-build time and writes two explainability fields to every record in `skills_index.json`: +- `category_confidence` (numeric confidence score) +- `category_reason` (how the category was selected) + +## Current Status + +✅ Current repository indexed through the generated catalog +- Most skills are in meaningful categories +- A smaller tail still needs manual review or better keyword coverage +- 13 primary categories (listed in the table below) +- Categories sorted by skill count (most first) + +## Category Distribution + +| Category | Count | Examples | +|----------|-------|----------| +| Backend | 164 | Node.js, Django, Express, FastAPI | +| Web Development | 107 | React, Vue, Tailwind, CSS | +| Automation | 103 | Workflow, Scripting, RPA | +| DevOps | 83 | Docker, Kubernetes, CI/CD, Git | +| AI/ML | 79 | TensorFlow, PyTorch, NLP, LLM | +| Content | 47 | Documentation, SEO, Writing | +| Database | 44 | SQL, MongoDB, PostgreSQL | +| Testing | 38 | Jest, Cypress, Unit Testing | +| Security | 36 | Encryption, Authentication | +| Cloud | 33 | AWS, Azure, GCP | +| Mobile | 21 | React Native, Flutter, iOS | +| Game Dev | 15 | Unity, WebGL, 3D | +| Data Science | 14 | Pandas, NumPy, Analytics | + +## How It Works + +### 1. 
**Keyword-Based Analysis** +The system analyzes skill names and descriptions for keywords: +- **Backend**: nodejs, express, fastapi, django, server, api, database +- **Web Dev**: react, vue, angular, frontend, css, html, tailwind +- **AI/ML**: ai, machine learning, tensorflow, nlp, gpt +- **DevOps**: docker, kubernetes, ci/cd, deploy +- And more... + +### 2. **Priority System** +Frontmatter category > Detected Keywords > Fallback (uncategorized) + +If a skill already has a category in frontmatter, that's preserved. + +When no known keyword category matches, the index builder (`generate_index.py`) derives a dynamic category from the skill id tokens instead of falling back to "uncategorized"; its full order is frontmatter category > parent folder name > detected keywords > id-derived category > `general`. This keeps the system open-ended and allows new categories without maintaining a fixed allow-list. + +### 3. **Scope-Based Matching** +- Exact phrase matches weighted 2x higher than partial matches +- Uses word boundaries to avoid false positives + +## Using the Auto-Categorization + +### Run on Uncategorized Skills +```bash +python tools/scripts/auto_categorize_skills.py +``` + +### Build index with explainable auto-categorization +```bash +python tools/scripts/generate_index.py +``` + +### Preview Changes First (Dry Run) +```bash +python tools/scripts/auto_categorize_skills.py --dry-run +``` + +### Output +``` +====================================================================== +AUTO-CATEGORIZATION REPORT +====================================================================== + +Summary: + ✅ Categorized: 776 + ⏭️ Already categorized: 46 + ❌ Failed to categorize: 124 + 📈 Total processed: full repository + +Sample changes: + • 3d-web-experience + uncategorized → web-development + • ab-test-setup + uncategorized → testing + • agent-framework-azure-ai-py + uncategorized → backend +``` + +## Web App Improvements + +### Category Filter +**Before:** +- Unordered list including "uncategorized" +- No indication of category size + +**After:** +- Categories sorted by skill count (most first, "uncategorized" last) +- Shows count: "Backend (164)" 
"Web Development (107)" +- Much easier to browse + +### Example Dropdowns + +**Sorted Order:** +1. All Categories +2. Backend (164) +3. Web Development (107) +4. Automation (103) +5. DevOps (83) +6. AI/ML (79) +7. ... more categories ... +8. Uncategorized (126) ← at the end + +## For Skill Creators + +### When Adding a New Skill + +Include category in frontmatter: +```yaml +--- +name: my-skill +description: "..." +category: web-development +date_added: "2026-03-06" +--- +``` + +### If You're Not Sure + +The system will automatically categorize on next index regeneration: +```bash +python tools/scripts/generate_index.py +``` + +## Keyword Reference + +Available auto-categorization keywords by category: + +**Backend**: nodejs, node.js, express, fastapi, django, flask, spring, java, python, golang, rust, server, api, rest, graphql, database, sql, mongodb + +**Web Development**: react, vue, angular, html, css, javascript, typescript, frontend, tailwind, bootstrap, webpack, vite, pwa, responsive, seo + +**Database**: database, sql, postgres, mysql, mongodb, firestore, redis, orm, schema + +**AI/ML**: ai, machine learning, ml, tensorflow, pytorch, nlp, llm, gpt, transformer, embedding, training + +**DevOps**: docker, kubernetes, ci/cd, git, jenkins, terraform, ansible, deploy, container, monitoring + +**Cloud**: aws, azure, gcp, serverless, lambda, storage, cdn + +**Security**: encryption, cryptography, jwt, oauth, authentication, authorization, vulnerability + +**Testing**: test, jest, mocha, pytest, cypress, selenium, unit test, e2e + +**Mobile**: mobile, react native, flutter, ios, android, swift, kotlin + +**Automation**: automation, workflow, scripting, robot, trigger, integration + +**Game Development**: game, unity, unreal, godot, threejs, 2d, 3d, physics + +**Data Science**: data, analytics, pandas, numpy, statistics, visualization + +## Customization + +### Add Custom Keywords + +Edit 
[`tools/scripts/auto_categorize_skills.py`](../../tools/scripts/auto_categorize_skills.py): + +```python +CATEGORY_KEYWORDS = { + 'your-category': [ + 'keyword1', 'keyword2', 'exact phrase', 'another-keyword' + ], + # ... other categories +} +``` + +Then re-run: +```bash +python tools/scripts/auto_categorize_skills.py +python tools/scripts/generate_index.py +``` + +## Troubleshooting + +### "Failed to categorize" Skills + +Some skills may be too generic or unique. You can: + +1. **Manually set category** in the skill's frontmatter: +```yaml +category: your-chosen-category +``` + +2. **Add keywords** to CATEGORY_KEYWORDS config + +3. **Move to folder** if it fits a broader category: +``` +skills/backend/my-new-skill/SKILL.md +``` + +### Regenerating Index + +After making changes to SKILL.md files: +```bash +python tools/scripts/generate_index.py +``` + +This will: +- Parse frontmatter categories +- Fall back to folder structure, then to keyword and skill-id inference +- Generate new skills_index.json +- Copy to apps/web-app/public/skills.json + +--- + +**Result**: Much cleaner category filter with smart, meaningful organization! 
# =============================================================================
# File: skill_categorization/tools/scripts/generate_index.py  (new file)
# =============================================================================
"""Generate ``skills_index.json`` with explainable auto-categorization.

Walks a skills directory, reads every ``SKILL.md``, merges its YAML
frontmatter over path-derived defaults, and assigns each skill a category
together with two explainability fields: ``category_confidence`` and
``category_reason``.
"""
import datetime
import json
import os
import re
import sys

import yaml
from _project_paths import find_repo_root

# Ensure UTF-8 output for Windows compatibility
if sys.platform == 'win32':
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')


# Built-in keyword taxonomy: category -> keywords searched for in the skill's
# id, name, description and body. Dict order matters: on a score tie the
# category listed first wins.
CATEGORY_KEYWORDS = {
    "web-engineering": [
        "react", "vue", "angular", "svelte", "nextjs", "tailwind", "frontend",
        "html", "css", "browser", "web", "dom", "accessibility", "seo",
    ],
    "backend": [
        "backend", "api", "server", "fastapi", "django", "flask", "express",
        "spring", "node", "golang", "rust", "php", "laravel",
    ],
    "database": [
        "database", "sql", "postgres", "mysql", "mongodb", "redis", "dynamodb",
        "orm", "schema", "query",
    ],
    "ai-ml": [
        "llm", "gpt", "ai", "machine learning", "deep learning", "pytorch",
        "tensorflow", "embedding", "rag", "transformer", "model",
    ],
    "cloud-devops": [
        "docker", "kubernetes", "k8s", "ci/cd", "github actions", "terraform",
        "ansible", "aws", "azure", "gcp", "deployment", "devops", "serverless",
    ],
    "security": [
        "security", "owasp", "audit", "vulnerability", "threat", "penetration",
        "authentication", "authorization", "jwt", "oauth", "compliance",
    ],
    "testing-qa": [
        "test", "testing", "pytest", "jest", "cypress", "playwright", "quality",
        "regression", "coverage", "e2e",
    ],
    "mobile": [
        "android", "ios", "react native", "flutter", "swift", "kotlin", "mobile",
    ],
    "data-engineering": [
        "etl", "pipeline", "airflow", "spark", "warehouse", "analytics", "data",
    ],
    "research": [
        "research", "manuscript", "systematic review", "meta-analysis", "grade",
        "consort", "prisma", "study",
    ],
    "bioinformatics": [
        "genomics", "proteomics", "rna", "sequencing", "variant", "phylogenetics",
        "biopython", "single-cell", "biomedical",
    ],
    "geospatial": [
        "geospatial", "gis", "spatial", "remote sensing", "raster", "vector",
    ],
    "finance": [
        "finance", "trading", "portfolio", "risk", "market", "economic", "treasury",
    ],
}

# Skill-id tokens that carry no category signal and are ignored when a
# category is derived from the id.
STOPWORD_TOKENS = {
    "skill", "skills", "tool", "tools", "builder", "expert", "guide", "workflow",
    "workflows", "system", "systems", "analysis", "integration", "development",
    "testing", "management", "engineer", "engineering", "automation", "framework",
    "advanced", "modern", "official", "pro", "starter", "setup", "patterns",
    "using", "with", "for", "and", "the", "a", "an", "v2", "v3", "ts", "py", "dotnet",
}

# Leading id tokens that are combined with the following token to form a
# two-part dynamic category (e.g. "azure-storage").
_VENDOR_PREFIXES = frozenset({
    "azure", "aws", "google", "github", "gitlab", "slack", "discord", "shopify",
    "wordpress", "odoo", "notion", "expo", "react", "nextjs", "kubernetes",
})

# Leading "---" fenced YAML frontmatter; no MULTILINE flag, so it only matches
# at the very start of the file.
_FRONTMATTER_RE = re.compile(r'^---\s*\n(.*?)\n---', re.DOTALL)


def normalize_category(value):
    """Normalize a category value to lowercase kebab-case.

    Whitespace, underscores and slashes are treated as word separators and
    become single hyphens (so ``"AI/ML"`` yields ``"ai-ml"``, matching the
    built-in taxonomy key); every other character outside ``[a-z0-9-]`` is
    dropped.

    Returns:
        The normalized string, or ``None`` when *value* is ``None`` or
        normalizes to an empty string.
    """
    if value is None:
        return None
    text = str(value).strip().lower()
    if not text:
        return None
    text = re.sub(r"[\s_/]+", "-", text)
    text = re.sub(r"[^a-z0-9-]", "", text)
    text = re.sub(r"-+", "-", text).strip("-")
    return text or None


def infer_dynamic_category(skill_id):
    """Infer a category dynamically from skill id tokens.

    This allows new categories without a fixed allow-list. The id is split on
    non-alphanumeric characters; stopwords and tokens shorter than three
    characters are discarded.

    Returns:
        A ``(category, confidence, reason)`` tuple:

        * ``("<prefix>-<next>", 0.42, "derived-from-id-prefix:...")`` when the
          first remaining token is a known vendor/platform prefix and a second
          token exists;
        * ``("<last token>", 0.34, "derived-from-id-token:...")`` otherwise,
          when any token remains;
        * ``("general", 0.20, "fallback:general")`` when nothing usable is left.
    """
    raw_tokens = [
        token for token in re.split(r"[^a-z0-9]+", str(skill_id).lower()) if token
    ]
    tokens = [
        token for token in raw_tokens
        if token not in STOPWORD_TOKENS and len(token) >= 3
    ]

    if len(tokens) >= 2 and tokens[0] in _VENDOR_PREFIXES:
        category = normalize_category(f"{tokens[0]}-{tokens[1]}")
        if category:
            return category, 0.42, f"derived-from-id-prefix:{tokens[0]}-{tokens[1]}"

    if tokens:
        category = normalize_category(tokens[-1])
        if category:
            return category, 0.34, f"derived-from-id-token:{tokens[-1]}"

    return "general", 0.20, "fallback:general"


def infer_category(skill_info, metadata, body_text):
    """Infer category, confidence, and reason with deterministic priority rules.

    Priority order:

    1. ``metadata["category"]`` (frontmatter)  -> confidence ``1.0``
    2. ``skill_info["category"]`` (folder)     -> confidence ``0.95``
    3. best-scoring ``CATEGORY_KEYWORDS`` entry over id + name + description +
       body. A whole-word hit scores 3; a plain substring hit scores 1 and is
       only tried for keywords of five or more characters. Confidence is
       ``0.45 + 0.05 * score`` capped at ``0.92``.
    4. :func:`infer_dynamic_category` on the skill id.

    ``"uncategorized"`` in steps 1-2 is treated as "not set".

    Args:
        skill_info: Mapping with (optionally) ``id``, ``name``, ``description``
            and ``category`` keys.
        metadata: Parsed frontmatter mapping.
        body_text: Markdown body after the frontmatter; ``None`` is treated as
            empty.

    Returns:
        A ``(category, confidence, reason)`` tuple.
    """
    explicit_category = normalize_category(metadata.get("category"))
    if explicit_category and explicit_category != "uncategorized":
        return explicit_category, 1.0, "frontmatter:category"

    parent_category = normalize_category(skill_info.get("category"))
    if parent_category and parent_category != "uncategorized":
        return parent_category, 0.95, "path:folder"

    combined_text = " ".join(
        [
            str(skill_info.get("id", "")),
            str(skill_info.get("name", "")),
            str(skill_info.get("description", "")),
            str(body_text or ""),
        ]
    ).lower()

    best_category = None
    best_score = 0
    best_hits = []

    for category, keywords in CATEGORY_KEYWORDS.items():
        score = 0
        hits = []
        for keyword in keywords:
            # r"\b" is the regex word boundary. The previous r"\\b" matched a
            # literal backslash followed by "b", so this branch never fired
            # and keywords shorter than five characters could never match.
            if re.search(r"\b" + re.escape(keyword) + r"\b", combined_text):
                score += 3
                hits.append(keyword)
            elif len(keyword) >= 5 and keyword in combined_text:
                score += 1
                hits.append(keyword)

        # Strict ">" keeps the first category on ties (dict order).
        if score > best_score:
            best_category = category
            best_score = score
            best_hits = hits

    if best_category and best_score > 0:
        confidence = min(0.92, 0.45 + (0.05 * best_score))
        reason_hits = ",".join(best_hits[:3]) if best_hits else "keyword-match"
        return best_category, round(confidence, 2), f"keyword-match:{reason_hits}"

    return infer_dynamic_category(str(skill_info.get("id", "")))


def parse_frontmatter(content):
    """Parse the leading YAML frontmatter of *content* into a dict.

    Unquoted values containing ``@`` are wrapped in double quotes before
    parsing (the whole value, so a comma-separated list becomes one string).

    Returns:
        The frontmatter mapping, or ``{}`` when there is no frontmatter, the
        YAML is invalid (a warning is printed), or the YAML document is not a
        mapping.
    """
    fm_match = _FRONTMATTER_RE.search(content)
    if not fm_match:
        return {}

    yaml_text = fm_match.group(1)

    # Process line by line to handle values containing @ and commas
    sanitized_lines = []
    for line in yaml_text.splitlines():
        # Match "key: value" (handles keys with dashes like 'package-name')
        match = re.match(r'^(\s*[\w-]+):\s*(.*)$', line)
        if match:
            key, val = match.groups()
            val_s = val.strip()
            # If value contains @ and isn't already quoted, wrap the whole string in double quotes
            if '@' in val_s and not (val_s.startswith('"') or val_s.startswith("'")):
                # Backslash is the escape character inside a YAML double-quoted
                # scalar, so escape it first, then any embedded double quotes.
                safe_val = val_s.replace('\\', '\\\\').replace('"', '\\"')
                line = f'{key}: "{safe_val}"'
        sanitized_lines.append(line)

    sanitized_yaml = '\n'.join(sanitized_lines)

    try:
        parsed = yaml.safe_load(sanitized_yaml)
    except yaml.YAMLError as e:
        print(f"⚠️ YAML parsing error: {e}")
        return {}

    # safe_load returns whatever the document is (list, str, None, ...);
    # callers rely on mapping methods, so anything else counts as "no metadata".
    return parsed if isinstance(parsed, dict) else {}


def _first_paragraph(body, limit=250):
    """Return the first non-heading paragraph of *body*, joined on spaces and cut to *limit* chars."""
    desc_lines = []
    for line in body.split('\n'):
        if line.startswith('#') or not line.strip():
            if desc_lines:
                break
            continue
        desc_lines.append(line.strip())
    return " ".join(desc_lines)[:limit].strip()


def generate_index(skills_dir, output_file):
    """Build the skills index and write it to *output_file* as JSON.

    Every directory under *skills_dir* containing a ``SKILL.md`` becomes one
    record; directories whose name starts with ``.`` are not descended into.
    Frontmatter values override the path-derived defaults, and the category
    fields are filled in by :func:`infer_category`.

    Args:
        skills_dir: Root directory to walk.
        output_file: Path of the JSON file to write (UTF-8, ``\\n`` newlines).

    Returns:
        The list of skill records, sorted by lower-cased name, then id.
    """
    print(f"🏗️ Generating index from: {skills_dir}")
    skills = []

    for root, dirs, files in os.walk(skills_dir):
        # Skip .disabled or hidden directories
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        if "SKILL.md" not in files:
            continue

        skill_path = os.path.join(root, "SKILL.md")
        dir_name = os.path.basename(root)
        parent_dir = os.path.basename(os.path.dirname(root))

        # Default values
        rel_path = os.path.relpath(root, os.path.dirname(skills_dir))
        skill_info = {
            "id": dir_name,
            # Force forward slashes for cross-platform JSON compatibility
            "path": rel_path.replace(os.sep, '/'),
            # Folder-derived category; replaced below by the inferred one
            "category": parent_dir if parent_dir != "skills" else None,
            "category_confidence": None,
            "category_reason": None,
            "name": dir_name.replace("-", " ").title(),
            "description": "",
            "risk": "unknown",
            "source": "unknown",
            "date_added": None
        }

        try:
            with open(skill_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"⚠️ Error reading {skill_path}: {e}")
            continue

        # Parse Metadata
        metadata = parse_frontmatter(content)

        body = content
        fm_match = _FRONTMATTER_RE.search(content)
        if fm_match:
            body = content[fm_match.end():].strip()

        # Merge Metadata (frontmatter takes priority). A key that is present
        # but empty parses as None; keep the default in that case.
        for key in ("name", "description", "risk", "source", "date_added"):
            if metadata.get(key) is not None:
                skill_info[key] = metadata[key]

        # An unquoted YAML date parses as datetime.date (or datetime), which
        # json.dump cannot serialize; store it as ISO-8601 text instead.
        if isinstance(skill_info["date_added"], datetime.date):
            skill_info["date_added"] = skill_info["date_added"].isoformat()

        # Category: frontmatter, then folder, then keywords, then skill id
        inferred_category, confidence, reason = infer_category(skill_info, metadata, body)
        skill_info["category"] = inferred_category or "uncategorized"
        skill_info["category_confidence"] = confidence
        skill_info["category_reason"] = reason

        # Fallback for description if missing in frontmatter (legacy support)
        if not skill_info["description"]:
            description = _first_paragraph(body)
            if description:
                skill_info["description"] = description

        skills.append(skill_info)

    # Sort by name, then id; str() guards against non-string YAML names (e.g. numbers)
    skills.sort(key=lambda x: (str(x["name"]).lower(), str(x["id"]).lower()))

    with open(output_file, 'w', encoding='utf-8', newline='\n') as f:
        json.dump(skills, f, indent=2)

    print(f"✅ Generated rich index with {len(skills)} skills at: {output_file}")
    return skills


if __name__ == "__main__":
    base_dir = str(find_repo_root(__file__))
    skills_path = os.path.join(base_dir, "skills")
    output_path = os.path.join(base_dir, "skills_index.json")
    generate_index(skills_path, output_path)


# =============================================================================
# File: skill_categorization/tools/scripts/tests/test_generate_index_categorization.py  (new file)
# =============================================================================
#!/usr/bin/env python3

import sys
from pathlib import Path

# Make the sibling scripts directory importable so generate_index resolves.
sys.path.append(str(Path(__file__).resolve().parents[1]))

from generate_index import infer_category, normalize_category


def assert_equal(actual, expected, label):
    """Assert ``actual == expected`` with a message naming the failing check."""
    assert actual == expected, f"{label}: expected={expected!r} actual={actual!r}"


def run_tests():
    """Exercise normalization and each branch of the category priority chain."""
    category = normalize_category("Cloud DevOps")
    assert_equal(category, "cloud-devops", "normalize category")

    # "/" is a separator, so display names line up with the taxonomy keys.
    category = normalize_category("AI/ML")
    assert_equal(category, "ai-ml", "normalize slash-separated category")

    skill_info = {
        "id": "secure-api-gateway",
        "category": "uncategorized",
        "name": "secure-api-gateway",
        "description": "Harden OAuth and JWT authentication flows for APIs.",
    }
    metadata = {"category": "security"}
    category, confidence, reason = infer_category(skill_info, metadata, "")
    assert_equal(category, "security", "frontmatter category")
    assert_equal(confidence, 1.0, "frontmatter confidence")
    assert reason.startswith("frontmatter:"), reason

    skill_info = {
        "id": "kubernetes-rollout-checker",
        "category": "uncategorized",
        "name": "kubernetes-rollout-checker",
        "description": "Validate container rollouts and deployment health.",
    }
    metadata = {}
    category, confidence, reason = infer_category(skill_info, metadata, "")
    assert_equal(category, "cloud-devops", "keyword category")
    assert confidence >= 0.5, confidence
    assert reason.startswith("keyword-match:"), reason

    # Regression: keywords shorter than five characters only match through
    # the whole-word branch, which requires a real \b word boundary.
    skill_info = {
        "id": "llm-rag-notes",
        "category": "uncategorized",
        "name": "llm-rag-notes",
        "description": "Notes on RAG with an LLM.",
    }
    metadata = {}
    category, confidence, reason = infer_category(skill_info, metadata, "")
    assert_equal(category, "ai-ml", "short whole-word keywords")
    assert_equal(reason, "keyword-match:llm,rag", "short keyword reason")

    skill_info = {
        "id": "vendorx-hyperflux",
        "category": "uncategorized",
        "name": "vendorx-hyperflux",
        "description": "Internal helper without known taxonomy keywords.",
    }
    metadata = {}
    category, confidence, reason = infer_category(skill_info, metadata, "")
    assert_equal(category, "hyperflux", "dynamic category")
    assert confidence > 0.0, confidence
    assert reason.startswith("derived-from-id-token:"), reason

    print("ok")


if __name__ == "__main__":
    run_tests()