meta(index): Improve safe skill categorization

This commit is contained in:
sickn33
2026-03-20 09:33:54 +01:00
parent 054565490e
commit 515423b80d
4 changed files with 1037 additions and 692 deletions

View File

@@ -14,6 +14,246 @@ if sys.platform == 'win32':
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
# Keyword rules used by infer_category() to score free text per category.
#
# Each rule is a dict with:
#   "name"            - the category emitted when the rule wins.
#   "keywords"        - terms that add to the category score. Entries that
#                       contain a space are matched as phrases; the others
#                       are matched as whole tokens, with a weaker substring
#                       fallback.
#   "strong_keywords" - optional subset of "keywords" that earns one extra
#                       point on a whole-token or phrase match.
#
# Every keyword contributes once per listing, so a keyword must not be
# repeated inside a rule: a duplicate silently doubles its weight.
CATEGORY_RULES = [
    {
        "name": "security",
        "keywords": [
            "security", "auth", "authentication", "authorization", "oauth", "jwt",
            "cryptography", "encryption", "vulnerability", "threat", "pentest",
            "xss", "sqli", "gdpr", "pci", "compliance",
        ],
    },
    {
        "name": "testing",
        "keywords": [
            "test", "testing", "tdd", "qa", "e2e", "playwright", "cypress",
            "pytest", "jest", "benchmark", "evaluation", "end to end",
        ],
        "strong_keywords": ["playwright", "cypress", "pytest", "jest", "e2e", "end to end"],
    },
    {
        "name": "automation",
        "keywords": [
            "automation", "workflow", "trigger", "integration", "slack",
            "airtable", "calendar", "gmail", "google", "hubspot", "notion",
            "zendesk", "stripe", "shopify", "sendgrid", "clickup", "n8n",
            "zapier", "make", "zoom",
        ],
    },
    {
        "name": "devops",
        "keywords": [
            "docker", "kubernetes", "k8s", "helm", "terraform", "deploy",
            "deployment", "cicd", "gitops", "observability", "monitoring",
            "grafana", "prometheus", "incident", "sre", "tracing",
        ],
    },
    {
        "name": "cloud",
        "keywords": [
            # "azure" used to be listed twice here, which counted a single
            # mention of Azure twice in the cloud score.
            "aws", "azure", "gcp", "cloud", "serverless", "lambda", "storage",
            "functions", "cdn", "azd",
        ],
    },
    {
        "name": "database",
        "keywords": [
            "database", "sql", "postgres", "postgresql", "mysql", "mongodb",
            "redis", "orm", "schema", "migration", "query", "prisma",
        ],
    },
    {
        "name": "ai-ml",
        "keywords": [
            "ai", "ml", "llm", "agent", "agents", "gpt", "embedding",
            "vector", "rag", "prompt", "model", "training", "inference",
            "pytorch", "tensorflow", "hugging", "openai",
        ],
    },
    {
        "name": "mobile",
        "keywords": [
            "mobile", "android", "ios", "swift", "swiftui", "kotlin",
            "flutter", "expo", "react native", "app store", "play store",
            "jetpack compose",
        ],
    },
    {
        "name": "game-development",
        "keywords": [
            "game", "unity", "unreal", "godot", "threejs", "3d", "2d",
            "shader", "rendering", "webgl", "physics",
        ],
    },
    {
        "name": "web-development",
        "keywords": [
            "web", "frontend", "react", "nextjs", "vue", "angular", "svelte",
            "tailwind", "css", "html", "browser", "extension", "component",
            "ui", "ux", "javascript", "typescript",
        ],
    },
    {
        "name": "backend",
        "keywords": [
            "backend", "api", "fastapi", "django", "flask", "express",
            "node", "server", "middleware", "graphql", "rest",
        ],
    },
    {
        "name": "data-science",
        "keywords": [
            "data", "analytics", "pandas", "numpy", "statistics",
            "matplotlib", "plotly", "seaborn", "scipy", "notebook",
        ],
    },
    {
        "name": "content",
        "keywords": [
            "content", "copy", "copywriting", "writing", "documentation",
            "transcription", "transcribe", "seo", "blog", "markdown",
        ],
    },
    {
        "name": "business",
        "keywords": [
            "business", "product", "market", "sales", "finance", "startup",
            "legal", "customer", "competitive", "pricing", "kpi",
        ],
    },
    {
        "name": "architecture",
        "keywords": [
            "architecture", "adr", "microservices", "ddd", "domain",
            "cqrs", "saga", "patterns",
        ],
    },
]
# (skill-id prefix, category) pairs consulted by infer_category() before any
# keyword scoring. The list is scanned in order and the FIRST prefix that the
# skill id starts with wins, so a more specific prefix must be listed before
# any shorter prefix it begins with (e.g. "fp-ts-" before "fp-"); otherwise
# the specific entry can never match.
FAMILY_CATEGORY_RULES = [
    ("azure-", "cloud"),
    ("aws-", "cloud"),
    ("gcp-", "cloud"),
    ("apify-", "automation"),
    ("google-", "automation"),
    ("n8n-", "automation"),
    ("makepad-", "development"),
    ("robius-", "development"),
    ("avalonia-", "development"),
    ("hig-", "development"),
    # Specific before generic: "fp-ts-..." ids also start with "fp-".
    ("fp-ts-", "development"),
    ("fp-", "development"),
    ("threejs-", "web-development"),
    ("react-", "web-development"),
    ("vue-", "web-development"),
    ("angular-", "web-development"),
    ("browser-", "web-development"),
    ("expo-", "mobile"),
    ("swiftui-", "mobile"),
    ("android-", "mobile"),
    ("ios-", "mobile"),
    ("hugging-face-", "ai-ml"),
    ("agent-", "ai-ml"),
    ("agents-", "ai-ml"),
    ("ai-", "ai-ml"),
    ("claude-", "ai-ml"),
    ("context-", "ai-ml"),
    ("fal-", "ai-ml"),
    ("yann-", "ai-ml"),
    ("llm-", "ai-ml"),
    ("rag-", "ai-ml"),
    ("embedding-", "ai-ml"),
    ("odoo-", "business"),
    ("product-", "business"),
    ("data-", "data-science"),
    ("wiki-", "content"),
    ("documentation-", "content"),
    # NOTE(review): no trailing hyphen, so this matches every id that begins
    # with "copy" (e.g. "copywriting-..."), not only "copy-..." - presumably
    # intentional; confirm.
    ("copy", "content"),
    ("audio-", "content"),
    ("video-", "content"),
    ("api-", "backend"),
    ("django-", "backend"),
    ("fastapi-", "backend"),
    ("backend-", "backend"),
    ("python-", "development"),
    ("bash-", "development"),
    ("code-", "development"),
    ("codebase-", "development"),
    ("error-", "development"),
    ("framework-", "development"),
    ("debugging-", "development"),
    ("javascript-", "development"),
    ("go-", "development"),
    ("performance-", "development"),
    ("dbos-", "development"),
    ("conductor-", "workflow"),
    ("workflow-", "workflow"),
    ("create-", "workflow"),
    ("git-", "workflow"),
    ("github-", "workflow"),
    ("gitlab-", "workflow"),
    ("skill-", "meta"),
    ("cc-skill-", "meta"),
    ("tdd-", "testing"),
    ("test-", "testing"),
    ("security-", "security"),
    ("database-", "database"),
    ("c4-", "architecture"),
    ("deployment-", "devops"),
    ("incident-", "devops"),
    ("terraform-", "devops"),
]
def tokenize(text):
    """Return the lowercase alphanumeric tokens of *text*, in order.

    The text is lowercased first; every maximal run of ``a-z`` / ``0-9``
    characters becomes one token and everything else acts as a separator.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]
def infer_category(skill_id, skill_name, description):
    """Conservatively infer a category for a skill, or return None.

    Resolution order:

    1. If ``skill_id`` starts with a prefix from ``FAMILY_CATEGORY_RULES``,
       the category of the first matching prefix is returned immediately.
    2. Otherwise every rule in ``CATEGORY_RULES`` is scored against the
       lowercased id, name and description:

       * phrase keyword (contains a space): 3 points, 4 if strong;
       * keyword equal to a whole token: 2 points, 3 if strong;
       * keyword found only as a substring: 1 point.

    3. The best-scoring category is returned only when the signal is
       convincing: its score must be at least 4, and when it is below 8 it
       must also lead the runner-up by at least 2 points. Ties in score are
       ordered by category name.

    Args:
        skill_id: Skill identifier; non-string values are treated as "".
        skill_name: Skill display name; non-string values are treated as "".
        description: Skill description; non-string values are treated as "".

    Returns:
        The inferred category name, or None when no rule matches or the
        evidence is too weak or ambiguous.
    """
    # Normalize the id like the other fields so a missing id cannot crash
    # the prefix check below.
    normalized_id = skill_id if isinstance(skill_id, str) else ""
    for prefix, category in FAMILY_CATEGORY_RULES:
        if normalized_id.startswith(prefix):
            return category

    normalized_name = skill_name if isinstance(skill_name, str) else ""
    normalized_description = description if isinstance(description, str) else ""
    combined_text = f"{normalized_id} {normalized_name} {normalized_description}".lower()
    tokens = tokenize(combined_text)
    token_set = set(tokens)
    # Space-joined token stream: lets phrase keywords such as "end to end" or
    # "react native" match hyphenated or otherwise punctuated spellings
    # ("end-to-end", "react-native-..."), which the raw text never contains.
    token_text = " ".join(tokens)

    scores = {}
    for rule in CATEGORY_RULES:
        strong_keywords = {keyword.lower() for keyword in rule.get("strong_keywords", [])}
        score = 0
        for keyword in rule["keywords"]:
            keyword_lower = keyword.lower()
            is_strong = keyword_lower in strong_keywords
            if " " in keyword_lower:
                # Phrase keywords never fall back to token/substring scoring.
                if keyword_lower in combined_text or keyword_lower in token_text:
                    score += 4 if is_strong else 3
            elif keyword_lower in token_set:
                score += 3 if is_strong else 2
            elif keyword_lower in combined_text:
                # Weak evidence: the keyword only occurs inside a longer word.
                score += 1
        if score > 0:
            scores[rule["name"]] = score

    if not scores:
        return None

    # Highest score first; equal scores are ordered by category name.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    best_category, best_score = ranked[0]
    second_score = ranked[1][1] if len(ranked) > 1 else 0

    if best_score < 4:
        return None
    # A mid-strength winner must clearly beat the runner-up.
    if best_score < 8 and (best_score - second_score) < 2:
        return None
    return best_category
def normalize_yaml_value(value):
if isinstance(value, Mapping):
return {key: normalize_yaml_value(val) for key, val in value.items()}
@@ -109,11 +349,16 @@ def generate_index(skills_dir, output_file):
if "source" in metadata: skill_info["source"] = metadata["source"]
if "date_added" in metadata: skill_info["date_added"] = metadata["date_added"]
# Category: prefer frontmatter, then folder structure, then default
# Category: prefer frontmatter, then folder structure, then conservative inference
if "category" in metadata:
skill_info["category"] = metadata["category"]
elif skill_info["category"] is None:
skill_info["category"] = "uncategorized"
inferred_category = infer_category(
skill_info["id"],
skill_info["name"],
skill_info["description"],
)
skill_info["category"] = inferred_category or "uncategorized"
# Fallback for description if missing in frontmatter (legacy support)
if not skill_info["description"]:

View File

@@ -23,6 +23,7 @@ const LOCAL_TEST_COMMANDS = [
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_missing_skill_metadata.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_missing_skill_sections.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_fix_truncated_descriptions.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_generate_index_categories.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_microsoft_skills_security.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_validate_skills_headings.py")],
];

View File

@@ -0,0 +1,99 @@
import importlib.util
import pathlib
import sys
import tempfile
import unittest
# Fourth ancestor directory of this file (parents[0] is the directory that
# contains it); presumably the repository root, as the name suggests.
REPO_ROOT = pathlib.Path(__file__).resolve().parents[3]
# Put tools/scripts at the front of sys.path so modules in that directory can
# be imported by bare name (presumably needed by the script loaded below).
sys.path.insert(0, str(REPO_ROOT / "tools" / "scripts"))
def load_module(module_path: str, module_name: str):
    """Execute the Python file at ``REPO_ROOT / module_path`` as a module.

    Args:
        module_path: Path of the source file, relative to ``REPO_ROOT``.
        module_name: Name given to the resulting module object.

    Returns:
        The module object after its source has been executed. This function
        does not itself add the module to ``sys.modules``.
    """
    source_file = REPO_ROOT / module_path
    module_spec = importlib.util.spec_from_file_location(module_name, source_file)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
# Load the script under test straight from its file path, under a dedicated
# module name ("generate_index_categories").
generate_index = load_module("tools/scripts/generate_index.py", "generate_index_categories")
class GenerateIndexCategoryTests(unittest.TestCase):
    """Tests for infer_category() and for category precedence in generate_index()."""

    @staticmethod
    def _write_skill(skill_dir, skill_md_text):
        """Create *skill_dir* (with parents) and write its SKILL.md file."""
        skill_dir.mkdir(parents=True)
        (skill_dir / "SKILL.md").write_text(skill_md_text, encoding="utf-8")

    def test_infer_category_returns_none_for_weak_signal(self):
        """Text without meaningful keyword hits yields no category."""
        call_args = (
            "mystery-skill",
            "Mystery Skill",
            "General-purpose guidance for assorted tasks.",
        )
        inferred = generate_index.infer_category(*call_args)
        self.assertIsNone(inferred)

    def test_infer_category_detects_security_skill(self):
        """Several security keywords outweigh the competing categories."""
        call_args = (
            "web-security-testing",
            "Web Security Testing",
            "Identify vulnerabilities, auth flaws, and threat scenarios for web applications.",
        )
        inferred = generate_index.infer_category(*call_args)
        self.assertEqual(inferred, "security")

    def test_infer_category_uses_family_prefix_when_high_confidence(self):
        """An "apify-" id maps to automation via the family prefix rules."""
        call_args = (
            "apify-market-research",
            "Apify Market Research",
            "Research markets using Apify actors.",
        )
        inferred = generate_index.infer_category(*call_args)
        self.assertEqual(inferred, "automation")

    def test_infer_category_maps_workflow_family_prefixes(self):
        """A "github-" id maps to the workflow category."""
        call_args = (
            "github-actions-templates",
            "GitHub Actions Templates",
            "Production-ready workflow patterns for GitHub automation.",
        )
        inferred = generate_index.infer_category(*call_args)
        self.assertEqual(inferred, "workflow")

    def test_infer_category_maps_development_family_prefixes(self):
        """A "javascript-" id maps to the development category."""
        call_args = (
            "javascript-mastery",
            "JavaScript Mastery",
            "Essential JavaScript concepts for developers.",
        )
        inferred = generate_index.infer_category(*call_args)
        self.assertEqual(inferred, "development")

    def test_generate_index_prefers_frontmatter_then_parent_then_inference(self):
        """Frontmatter category wins, then the parent folder, then inference."""
        with tempfile.TemporaryDirectory() as temp_dir:
            root = pathlib.Path(temp_dir)
            skills_dir = root / "skills"
            output_file = root / "skills_index.json"

            # (skill directory, SKILL.md contents), created in this order.
            fixtures = [
                (
                    skills_dir / "explicit-skill",
                    "---\nname: explicit-skill\ncategory: custom\n---\nbody\n",
                ),
                (
                    skills_dir / "bundles" / "nested-skill",
                    "---\nname: nested-skill\ndescription: Example\n---\nbody\n",
                ),
                (
                    skills_dir / "playwright-skill",
                    "---\nname: playwright-skill\ndescription: End-to-end test automation with Playwright and browser workflows.\n---\nbody\n",
                ),
            ]
            for skill_dir, skill_md_text in fixtures:
                self._write_skill(skill_dir, skill_md_text)

            skills = generate_index.generate_index(str(skills_dir), str(output_file))
            categories = {}
            for skill in skills:
                categories[skill["id"]] = skill["category"]

            self.assertEqual(categories["explicit-skill"], "custom")
            self.assertEqual(categories["nested-skill"], "bundles")
            self.assertEqual(categories["playwright-skill"], "testing")
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()