meta(index): Improve safe skill categorization
This commit is contained in:
@@ -14,6 +14,246 @@ if sys.platform == 'win32':
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
||||
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
|
||||
|
||||
|
||||
# Keyword rules for conservative category inference.
#
# Each rule scores a category from keyword hits in a skill's id/name/description:
#   - "keywords": whole-token matches score 2, substring matches score 1.
#   - "strong_keywords": optional subset of "keywords" that scores higher
#     (3 per token hit, 4 for multi-word substring hits) because the term is
#     unambiguous for that category.
# Keywords must be lowercase and appear at most once per rule — duplicates
# would double-count a single mention and inflate that category's score.
CATEGORY_RULES = [
    {
        "name": "security",
        "keywords": [
            "security", "auth", "authentication", "authorization", "oauth", "jwt",
            "cryptography", "encryption", "vulnerability", "threat", "pentest",
            "xss", "sqli", "gdpr", "pci", "compliance",
        ],
    },
    {
        "name": "testing",
        "keywords": [
            "test", "testing", "tdd", "qa", "e2e", "playwright", "cypress",
            "pytest", "jest", "benchmark", "evaluation", "end to end",
        ],
        "strong_keywords": ["playwright", "cypress", "pytest", "jest", "e2e", "end to end"],
    },
    {
        "name": "automation",
        "keywords": [
            "automation", "workflow", "trigger", "integration", "slack",
            "airtable", "calendar", "gmail", "google", "hubspot", "notion",
            "zendesk", "stripe", "shopify", "sendgrid", "clickup", "n8n",
            "zapier", "make", "zoom",
        ],
    },
    {
        "name": "devops",
        "keywords": [
            "docker", "kubernetes", "k8s", "helm", "terraform", "deploy",
            "deployment", "cicd", "gitops", "observability", "monitoring",
            "grafana", "prometheus", "incident", "sre", "tracing",
        ],
    },
    {
        "name": "cloud",
        # NOTE: "azure" was previously listed twice, which double-counted
        # azure mentions; each keyword must be unique within a rule.
        "keywords": [
            "aws", "azure", "gcp", "cloud", "serverless", "lambda", "storage",
            "functions", "cdn", "azd",
        ],
    },
    {
        "name": "database",
        "keywords": [
            "database", "sql", "postgres", "postgresql", "mysql", "mongodb",
            "redis", "orm", "schema", "migration", "query", "prisma",
        ],
    },
    {
        "name": "ai-ml",
        "keywords": [
            "ai", "ml", "llm", "agent", "agents", "gpt", "embedding",
            "vector", "rag", "prompt", "model", "training", "inference",
            "pytorch", "tensorflow", "hugging", "openai",
        ],
    },
    {
        "name": "mobile",
        "keywords": [
            "mobile", "android", "ios", "swift", "swiftui", "kotlin",
            "flutter", "expo", "react native", "app store", "play store",
            "jetpack compose",
        ],
    },
    {
        "name": "game-development",
        "keywords": [
            "game", "unity", "unreal", "godot", "threejs", "3d", "2d",
            "shader", "rendering", "webgl", "physics",
        ],
    },
    {
        "name": "web-development",
        "keywords": [
            "web", "frontend", "react", "nextjs", "vue", "angular", "svelte",
            "tailwind", "css", "html", "browser", "extension", "component",
            "ui", "ux", "javascript", "typescript",
        ],
    },
    {
        "name": "backend",
        "keywords": [
            "backend", "api", "fastapi", "django", "flask", "express",
            "node", "server", "middleware", "graphql", "rest",
        ],
    },
    {
        "name": "data-science",
        "keywords": [
            "data", "analytics", "pandas", "numpy", "statistics",
            "matplotlib", "plotly", "seaborn", "scipy", "notebook",
        ],
    },
    {
        "name": "content",
        "keywords": [
            "content", "copy", "copywriting", "writing", "documentation",
            "transcription", "transcribe", "seo", "blog", "markdown",
        ],
    },
    {
        "name": "business",
        "keywords": [
            "business", "product", "market", "sales", "finance", "startup",
            "legal", "customer", "competitive", "pricing", "kpi",
        ],
    },
    {
        "name": "architecture",
        "keywords": [
            "architecture", "adr", "microservices", "ddd", "domain",
            "cqrs", "saga", "patterns",
        ],
    },
]
|
||||
|
||||
# High-confidence skill-id prefix -> category mappings.
#
# Matched top-to-bottom with str.startswith; the FIRST matching prefix wins,
# so more specific prefixes must be listed before any shorter prefix they
# start with. (A previous "fp-ts-" entry was unreachable because the shorter
# "fp-" entry above it matched first; it has been removed.)
FAMILY_CATEGORY_RULES = [
    ("azure-", "cloud"),
    ("aws-", "cloud"),
    ("gcp-", "cloud"),
    ("apify-", "automation"),
    ("google-", "automation"),
    ("n8n-", "automation"),
    ("makepad-", "development"),
    ("robius-", "development"),
    ("avalonia-", "development"),
    ("hig-", "development"),
    ("fp-", "development"),
    ("threejs-", "web-development"),
    ("react-", "web-development"),
    ("vue-", "web-development"),
    ("angular-", "web-development"),
    ("browser-", "web-development"),
    ("expo-", "mobile"),
    ("swiftui-", "mobile"),
    ("android-", "mobile"),
    ("ios-", "mobile"),
    ("hugging-face-", "ai-ml"),
    ("agent-", "ai-ml"),
    ("agents-", "ai-ml"),
    ("ai-", "ai-ml"),
    ("claude-", "ai-ml"),
    ("context-", "ai-ml"),
    ("fal-", "ai-ml"),
    ("yann-", "ai-ml"),
    ("llm-", "ai-ml"),
    ("rag-", "ai-ml"),
    ("embedding-", "ai-ml"),
    ("odoo-", "business"),
    ("product-", "business"),
    ("data-", "data-science"),
    ("wiki-", "content"),
    ("documentation-", "content"),
    ("copy", "content"),
    ("audio-", "content"),
    ("video-", "content"),
    ("api-", "backend"),
    ("django-", "backend"),
    ("fastapi-", "backend"),
    ("backend-", "backend"),
    ("python-", "development"),
    ("bash-", "development"),
    ("code-", "development"),
    ("codebase-", "development"),
    ("error-", "development"),
    ("framework-", "development"),
    ("debugging-", "development"),
    ("javascript-", "development"),
    ("go-", "development"),
    ("performance-", "development"),
    ("dbos-", "development"),
    ("conductor-", "workflow"),
    ("workflow-", "workflow"),
    ("create-", "workflow"),
    ("git-", "workflow"),
    ("github-", "workflow"),
    ("gitlab-", "workflow"),
    ("skill-", "meta"),
    ("cc-skill-", "meta"),
    ("tdd-", "testing"),
    ("test-", "testing"),
    ("security-", "security"),
    ("database-", "database"),
    ("c4-", "architecture"),
    ("deployment-", "devops"),
    ("incident-", "devops"),
    ("terraform-", "devops"),
]
|
||||
|
||||
|
||||
# Compiled once so repeated calls skip the regex-cache lookup.
_TOKEN_RE = re.compile(r"[a-z0-9]+")


def tokenize(text):
    """Return the lowercase alphanumeric runs found in *text*, in order."""
    return _TOKEN_RE.findall(text.lower())
|
||||
|
||||
|
||||
def infer_category(skill_id, skill_name, description):
    """Conservatively infer a category for a skill, or None when unsure.

    Resolution order:
      1. ``FAMILY_CATEGORY_RULES`` prefix match on the skill id (first match
         wins — high confidence, no scoring needed).
      2. Keyword scoring against ``CATEGORY_RULES`` over the combined
         id + name + description text.

    Returns the winning category name, or ``None`` when the keyword signal is
    too weak or too ambiguous, so callers can fall back to "uncategorized".
    """
    # Guard all three inputs the same way; previously only skill_name and
    # description were guarded, so a non-str skill_id crashed at .startswith.
    normalized_id = skill_id if isinstance(skill_id, str) else ""

    # 1) High-confidence family prefixes (first match wins).
    for prefix, category in FAMILY_CATEGORY_RULES:
        if normalized_id.startswith(prefix):
            return category

    normalized_name = skill_name if isinstance(skill_name, str) else ""
    normalized_description = description if isinstance(description, str) else ""
    combined_text = f"{normalized_id} {normalized_name} {normalized_description}".lower()
    token_set = set(tokenize(combined_text))
    scores = {}

    # 2) Score every rule; only rules with at least one hit are ranked.
    for rule in CATEGORY_RULES:
        score = 0
        strong_keywords = {keyword.lower() for keyword in rule.get("strong_keywords", [])}
        for keyword in rule["keywords"]:
            keyword_lower = keyword.lower()
            if " " in keyword_lower:
                # Multi-word keywords can only match as substrings.
                if keyword_lower in combined_text:
                    score += 4 if keyword_lower in strong_keywords else 3
                continue

            if keyword_lower in token_set:
                # Whole-token match is a stronger signal than a substring hit.
                score += 3 if keyword_lower in strong_keywords else 2
            elif keyword_lower in combined_text:
                score += 1

        if score > 0:
            scores[rule["name"]] = score

    if not scores:
        return None

    # Rank by score descending, then name ascending for deterministic ties.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    best_category, best_score = ranked[0]
    second_score = ranked[1][1] if len(ranked) > 1 else 0

    # Require a minimum absolute score...
    if best_score < 4:
        return None

    # ...and a clear margin over the runner-up unless the winner is very strong.
    if best_score < 8 and (best_score - second_score) < 2:
        return None

    return best_category
|
||||
|
||||
def normalize_yaml_value(value):
|
||||
if isinstance(value, Mapping):
|
||||
return {key: normalize_yaml_value(val) for key, val in value.items()}
|
||||
@@ -109,11 +349,16 @@ def generate_index(skills_dir, output_file):
|
||||
if "source" in metadata: skill_info["source"] = metadata["source"]
|
||||
if "date_added" in metadata: skill_info["date_added"] = metadata["date_added"]
|
||||
|
||||
# Category: prefer frontmatter, then folder structure, then default
|
||||
# Category: prefer frontmatter, then folder structure, then conservative inference
|
||||
if "category" in metadata:
|
||||
skill_info["category"] = metadata["category"]
|
||||
elif skill_info["category"] is None:
|
||||
skill_info["category"] = "uncategorized"
|
||||
inferred_category = infer_category(
|
||||
skill_info["id"],
|
||||
skill_info["name"],
|
||||
skill_info["description"],
|
||||
)
|
||||
skill_info["category"] = inferred_category or "uncategorized"
|
||||
|
||||
# Fallback for description if missing in frontmatter (legacy support)
|
||||
if not skill_info["description"]:
|
||||
|
||||
@@ -23,6 +23,7 @@ const LOCAL_TEST_COMMANDS = [
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_missing_skill_metadata.py")],
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_missing_skill_sections.py")],
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_truncated_descriptions.py")],
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_generate_index_categories.py")],
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_microsoft_skills_security.py")],
|
||||
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_validate_skills_headings.py")],
|
||||
];
|
||||
|
||||
99
tools/scripts/tests/test_generate_index_categories.py
Normal file
99
tools/scripts/tests/test_generate_index_categories.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import importlib.util
|
||||
import pathlib
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
|
||||
# Repo root is three directories above this test file (tools/scripts/tests).
REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
sys.path.insert(0, str(REPO_ROOT / "tools" / "scripts"))


def load_module(module_path: str, module_name: str):
    """Load the file at *module_path* (relative to the repo root) as *module_name*."""
    target = REPO_ROOT / module_path
    spec = importlib.util.spec_from_file_location(module_name, target)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded


# Import under a distinct name so we don't collide with an installed module.
generate_index = load_module("tools/scripts/generate_index.py", "generate_index_categories")
|
||||
|
||||
|
||||
class GenerateIndexCategoryTests(unittest.TestCase):
    """Behavioral checks for conservative category inference in generate_index."""

    @staticmethod
    def _write_skill(directory, contents):
        """Create *directory* (and parents) and write a SKILL.md with *contents*."""
        directory.mkdir(parents=True)
        (directory / "SKILL.md").write_text(contents, encoding="utf-8")

    def test_infer_category_returns_none_for_weak_signal(self):
        """A skill with no strong keyword evidence must stay unclassified."""
        result = generate_index.infer_category(
            "mystery-skill",
            "Mystery Skill",
            "General-purpose guidance for assorted tasks.",
        )
        self.assertIsNone(result)

    def test_infer_category_detects_security_skill(self):
        """Multiple security keywords should classify the skill as security."""
        result = generate_index.infer_category(
            "web-security-testing",
            "Web Security Testing",
            "Identify vulnerabilities, auth flaws, and threat scenarios for web applications.",
        )
        self.assertEqual(result, "security")

    def test_infer_category_uses_family_prefix_when_high_confidence(self):
        """A known family prefix ("apify-") wins without keyword scoring."""
        result = generate_index.infer_category(
            "apify-market-research",
            "Apify Market Research",
            "Research markets using Apify actors.",
        )
        self.assertEqual(result, "automation")

    def test_infer_category_maps_workflow_family_prefixes(self):
        """The "github-" family prefix maps to the workflow category."""
        result = generate_index.infer_category(
            "github-actions-templates",
            "GitHub Actions Templates",
            "Production-ready workflow patterns for GitHub automation.",
        )
        self.assertEqual(result, "workflow")

    def test_infer_category_maps_development_family_prefixes(self):
        """The "javascript-" family prefix maps to the development category."""
        result = generate_index.infer_category(
            "javascript-mastery",
            "JavaScript Mastery",
            "Essential JavaScript concepts for developers.",
        )
        self.assertEqual(result, "development")

    def test_generate_index_prefers_frontmatter_then_parent_then_inference(self):
        """Category precedence: frontmatter, then parent folder, then inference."""
        with tempfile.TemporaryDirectory() as temp_dir:
            workspace = pathlib.Path(temp_dir)
            skills_dir = workspace / "skills"
            output_file = workspace / "skills_index.json"

            # Explicit frontmatter category wins over everything else.
            self._write_skill(
                skills_dir / "explicit-skill",
                "---\nname: explicit-skill\ncategory: custom\n---\nbody\n",
            )
            # No frontmatter category: the parent folder name is used.
            self._write_skill(
                skills_dir / "bundles" / "nested-skill",
                "---\nname: nested-skill\ndescription: Example\n---\nbody\n",
            )
            # Top-level with no category: keyword inference applies.
            self._write_skill(
                skills_dir / "playwright-skill",
                "---\nname: playwright-skill\ndescription: End-to-end test automation with Playwright and browser workflows.\n---\nbody\n",
            )

            skills = generate_index.generate_index(str(skills_dir), str(output_file))
            categories = {skill["id"]: skill["category"] for skill in skills}

            self.assertEqual(categories["explicit-skill"], "custom")
            self.assertEqual(categories["nested-skill"], "bundles")
            self.assertEqual(categories["playwright-skill"], "testing")
|
||||
|
||||
|
||||
# Allow running this test module directly (without the pytest/unittest runner).
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user