meta(risk): Sync conservative legacy labels

Add a maintainer script to safely promote high-confidence legacy risk labels from unknown to concrete values, cover it with tests, and regenerate the canonical skill artifacts and plugin copies. This reduces the legacy unknown backlog without forcing noisy classifications that still need manual review.
This commit is contained in:
sickn33
2026-03-29 10:45:21 +02:00
parent eb3df2a577
commit 0db870eb11
582 changed files with 1241 additions and 935 deletions

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import re
import sys
from collections import Counter
from pathlib import Path
from _project_paths import find_repo_root
from _safe_files import is_safe_regular_file
from risk_classifier import suggest_risk
from validate_skills import configure_utf8_output, parse_frontmatter
# Matches a frontmatter block: an opening `---` line, the frontmatter text
# (captured as group 1), and the closing `---`. re.MULTILINE is not set, so
# `^` anchors only at the very start of the searched string.
FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)

# Whole-word, case-insensitive vocabulary that vetoes a "safe" promotion in
# choose_synced_risk: mutation verbs, credential terms and HTTP write methods.
# NOTE: because of re.IGNORECASE the HTTP method names also match the ordinary
# lowercase words "post", "put", "patch" and "delete".
SAFE_BLOCKLIST_PATTERN = re.compile(
    r"\b(?:"
    r"create|write|overwrite|append|modify|update|delete|remove|deploy|publish|"
    r"push|commit|merge|install|token|secret|password|oauth|api[_ -]?key|"
    r"POST|PUT|PATCH|DELETE"
    r")\b",
    re.IGNORECASE,
)

# A "critical" suggestion is applied only when at least one of its reasons is
# in this set (see choose_synced_risk).
STRONG_CRITICAL_REASONS = {
    "curl pipes into a shell",
    "wget pipes into a shell",
    "PowerShell invoke-expression",
    "destructive filesystem delete",
    "git mutation",
    "package publication",
    "deployment or infrastructure mutation",
}

# A "safe" suggestion is applied only when every one of its reasons is in this
# set (see choose_synced_risk).
SAFE_ALLOWED_REASONS = {
    "non-mutating command example",
    "contains fenced examples",
    "read-only or diagnostic language",
    "technical or integration language",
}

# Reason that must be present before an "offensive" suggestion is applied.
EXPLICIT_OFFENSIVE_REASON = "explicit offensive disclaimer"
def strip_frontmatter(content: str) -> tuple[str, str] | None:
    """Split *content* into its frontmatter text and everything after it.

    Returns:
        ``(frontmatter, rest)`` where ``frontmatter`` is the text captured
        between the opening and closing ``---`` delimiters and ``rest`` is
        whatever follows the closing delimiter, or ``None`` when
        ``FRONTMATTER_PATTERN`` does not match.
    """
    found = FRONTMATTER_PATTERN.search(content)
    if found is None:
        return None
    inner_text = found.group(1)
    remainder = content[found.end():]
    return inner_text, remainder
def replace_risk_value(content: str, new_risk: str) -> str:
    """Return *content* with the first frontmatter ``risk:`` line set to *new_risk*.

    Only the ``risk:`` line is rewritten; every other character of the
    document, including the delimiter lines, is preserved exactly.

    Args:
        content: Full text of a skill file.
        new_risk: Value to write after ``risk:``.

    Returns:
        The updated text, or *content* unchanged when there is no frontmatter
        block or no line in it starts with ``risk:``.
    """
    match = FRONTMATTER_PATTERN.search(content)
    if match is None:
        return content

    # Split on "\n" only. str.splitlines() also breaks on characters such as
    # U+2028, form feed and NEL, and re-joining with "\n" would then rewrite
    # those characters inside unrelated values. split("\n") round-trips the
    # text exactly, including a blank line just before the closing delimiter.
    lines = match.group(1).split("\n")
    for index, line in enumerate(lines):
        if line.strip().startswith("risk:"):
            indent = line[: len(line) - len(line.lstrip())]
            # Keep a carriage return if the text uses CRLF line endings.
            line_ending = "\r" if line.endswith("\r") else ""
            lines[index] = f"{indent}risk: {new_risk}{line_ending}"
            break
    else:
        return content

    # Splice the edited frontmatter back between the untouched prefix
    # (opening delimiter line) and suffix (closing delimiter plus body).
    return content[: match.start(1)] + "\n".join(lines) + content[match.end(1):]
def choose_synced_risk(content: str, metadata: dict[str, object] | None) -> tuple[str, tuple[str, ...]] | None:
    """Decide whether a ``risk: unknown`` skill can be promoted to a concrete label.

    The suggestion from ``suggest_risk`` is accepted only when it clears the
    gate for its label:

    * ``offensive`` - the reasons include ``EXPLICIT_OFFENSIVE_REASON``.
    * ``critical`` - at least one reason is in ``STRONG_CRITICAL_REASONS``.
    * ``safe`` - there is at least one reason, every reason is in
      ``SAFE_ALLOWED_REASONS``, and ``SAFE_BLOCKLIST_PATTERN`` matches nowhere
      in *content*.

    Returns:
        ``(risk, reasons)`` for an accepted suggestion, otherwise ``None``
        (also when *metadata* is empty or its ``risk`` is not ``"unknown"``).
    """
    if not metadata:
        return None
    if metadata.get("risk") != "unknown":
        return None

    suggestion = suggest_risk(content, metadata)
    reasons = tuple(suggestion.reasons)
    reason_set = set(reasons)
    suggested = suggestion.risk

    if suggested == "offensive":
        accepted = EXPLICIT_OFFENSIVE_REASON in reason_set
        return ("offensive", reasons) if accepted else None

    if suggested == "critical":
        accepted = bool(reason_set & STRONG_CRITICAL_REASONS)
        return ("critical", reasons) if accepted else None

    if suggested == "safe":
        # Checks run cheapest first and short-circuit; the regex scan is last.
        accepted = (
            bool(reason_set)
            and reason_set <= SAFE_ALLOWED_REASONS
            and SAFE_BLOCKLIST_PATTERN.search(content) is None
        )
        return ("safe", reasons) if accepted else None

    # Any other suggested label is never applied automatically.
    return None
def update_skill_file(skill_path: Path) -> tuple[bool, str | None, tuple[str, ...]]:
    """Rewrite the ``risk:`` label of one skill file when a promotion applies.

    Returns:
        ``(True, new_risk, reasons)`` when the file was rewritten, otherwise
        ``(False, None, ())``. Nothing is written when the path fails
        ``is_safe_regular_file``, when ``choose_synced_risk`` declines, or
        when the replacement leaves the text unchanged.
    """
    unchanged: tuple[bool, str | None, tuple[str, ...]] = (False, None, ())

    if not is_safe_regular_file(skill_path):
        return unchanged

    original_text = skill_path.read_text(encoding="utf-8")
    metadata, _unused = parse_frontmatter(original_text, skill_path.as_posix())

    decision = choose_synced_risk(original_text, metadata)
    if decision is None:
        return unchanged

    new_risk, reasons = decision
    rewritten = replace_risk_value(original_text, new_risk)
    if rewritten != original_text:
        skill_path.write_text(rewritten, encoding="utf-8")
        return True, new_risk, reasons
    return unchanged
def iter_skill_files(skills_dir: Path):
    """Yield the path of every ``SKILL.md`` found beneath *skills_dir*.

    Directories whose name starts with ``.`` are not descended into.
    """
    for current_dir, subdirs, filenames in os.walk(skills_dir):
        # Assign through the slice so os.walk sees the pruned list and skips
        # dot-directories on the way down.
        visible = [name for name in subdirs if not name.startswith(".")]
        subdirs[:] = visible
        if "SKILL.md" not in filenames:
            continue
        yield Path(current_dir, "SKILL.md")
def main() -> int:
    """Sync ``risk: unknown`` labels for every ``SKILL.md`` under ``<repo>/skills``.

    With ``--dry-run`` nothing is written: each file that a real run would
    rewrite is listed instead. Without it, each file is handed to
    ``update_skill_file``. Both modes print one ``SYNC`` line per affected
    file followed by a summary.

    Returns:
        Process exit status, always ``0``.
    """
    configure_utf8_output()
    parser = argparse.ArgumentParser(
        description="Conservatively sync legacy risk: unknown labels to concrete values.",
    )
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing files.")
    args = parser.parse_args()

    repo_root = find_repo_root(__file__)
    skills_dir = repo_root / "skills"

    updated_count = 0
    by_risk: Counter[str] = Counter()

    for skill_path in iter_skill_files(skills_dir):
        if args.dry_run:
            # Apply every gate update_skill_file applies, so the preview
            # lists exactly the files a real run would rewrite.
            if not is_safe_regular_file(skill_path):
                continue
            content = skill_path.read_text(encoding="utf-8")
            metadata, _ = parse_frontmatter(content, skill_path.as_posix())
            decision = choose_synced_risk(content, metadata)
            if decision is None:
                continue
            new_risk, reasons = decision
            if replace_risk_value(content, new_risk) == content:
                # The rewrite would be a no-op, which a real run does not count.
                continue
        else:
            # update_skill_file reads and classifies the file itself, so the
            # file is processed once rather than pre-read here.
            changed, applied_risk, reasons = update_skill_file(skill_path)
            if not changed or applied_risk is None:
                continue
            new_risk = applied_risk

        rel_path = skill_path.relative_to(repo_root)
        print(f"SYNC {rel_path} [risk={new_risk}; reasons={', '.join(reasons[:3])}]")
        updated_count += 1
        by_risk[new_risk] += 1

    print(f"\nUpdated: {updated_count}")
    if updated_count:
        print(f"By risk: {dict(sorted(by_risk.items()))}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -39,6 +39,7 @@ const LOCAL_TEST_COMMANDS = [
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_microsoft_skills_security.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_repo_metadata.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_contributors.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_sync_risk_labels.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_validation_warning_budget.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_whatsapp_config_logging_security.py")],
[path.join(TOOL_SCRIPTS, "run-python.js"), path.join(TOOL_TESTS, "test_maintainer_audit.py")],

View File

@@ -0,0 +1,129 @@
import importlib.util
import sys
import tempfile
import unittest
from pathlib import Path
# parents[3] is the fourth ancestor of this file (three directories above the
# one containing it) - presumably the repository root; confirm against the
# actual location of this test file.
REPO_ROOT = Path(__file__).resolve().parents[3]
TOOLS_SCRIPTS_DIR = REPO_ROOT / "tools" / "scripts"
# Put the scripts directory first on sys.path, presumably so the script under
# test can resolve its sibling imports (e.g. `_project_paths`) by bare name.
# The membership check avoids inserting a duplicate entry.
if str(TOOLS_SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(TOOLS_SCRIPTS_DIR))
def load_module(relative_path: str, module_name: str):
    """Load the source file at ``REPO_ROOT / relative_path`` as *module_name*.

    The module is registered in ``sys.modules`` before it is executed, as in
    the importlib recipe for importing a source file directly.

    Args:
        relative_path: Path of the source file, relative to ``REPO_ROOT``.
        module_name: Name to register the module under in ``sys.modules``.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be created for the file.
    """
    module_path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # Validate before module_from_spec dereferences the spec; an explicit
    # raise also survives `python -O`, unlike an assert.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable; restore whatever
        # was registered under this name before.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
# Module under test, loaded straight from its file path and registered as
# "sync_risk_labels_test" rather than under its own file name - presumably to
# avoid clashing with a regular import of the script; confirm with the author.
sync_risk_labels = load_module(
    "tools/scripts/sync_risk_labels.py",
    "sync_risk_labels_test",
)
class SyncRiskLabelsTests(unittest.TestCase):
    """Checks for the promotion gates and the file rewrite in sync_risk_labels."""

    def test_choose_synced_risk_promotes_git_mutation_to_critical(self):
        """A body showing git commit/push is promoted to critical."""
        skill_text = """---
name: commit
description: commit changes safely
risk: unknown
source: community
---
Use `git commit` and `git push` once the branch is ready.
"""
        frontmatter = {"name": "commit", "description": "commit changes safely", "risk": "unknown"}

        outcome = sync_risk_labels.choose_synced_risk(skill_text, frontmatter)

        self.assertIsNotNone(outcome)
        assert outcome is not None
        risk, reasons = outcome
        self.assertEqual(risk, "critical")
        self.assertIn("git mutation", reasons)

    def test_choose_synced_risk_promotes_read_only_skill_to_safe(self):
        """Purely descriptive review/analysis text is promoted to safe."""
        skill_text = """---
name: seo-fundamentals
description: Learn the core principles of SEO.
risk: unknown
source: community
---
## Overview
Review search quality signals and analyze page structure.
"""
        frontmatter = {"name": "seo-fundamentals", "description": "Learn the core principles of SEO.", "risk": "unknown"}

        outcome = sync_risk_labels.choose_synced_risk(skill_text, frontmatter)

        self.assertIsNotNone(outcome)
        assert outcome is not None
        risk = outcome[0]
        self.assertEqual(risk, "safe")

    def test_choose_synced_risk_keeps_unknown_when_safe_text_mentions_install(self):
        """Text that mentions "install" is not promoted."""
        skill_text = """---
name: package-setup
description: Explain how to inspect package setup.
risk: unknown
source: community
---
Use this skill to analyze package setup and install dependencies if needed.
"""
        frontmatter = {"name": "package-setup", "description": "Explain how to inspect package setup.", "risk": "unknown"}

        outcome = sync_risk_labels.choose_synced_risk(skill_text, frontmatter)

        self.assertIsNone(outcome)

    def test_choose_synced_risk_requires_explicit_disclaimer_for_offensive(self):
        """Pentest wording without an explicit disclaimer is not promoted."""
        skill_text = """---
name: pentest-checklist
description: penetration testing checklist
risk: unknown
source: community
---
Plan a penetration testing engagement and define red team scope.
"""
        frontmatter = {"name": "pentest-checklist", "description": "penetration testing checklist", "risk": "unknown"}

        outcome = sync_risk_labels.choose_synced_risk(skill_text, frontmatter)

        self.assertIsNone(outcome)

    def test_update_skill_file_rewrites_frontmatter(self):
        """update_skill_file rewrites the risk line of the file on disk."""
        with tempfile.TemporaryDirectory() as scratch_dir:
            target = Path(scratch_dir) / "SKILL.md"
            target.write_text(
                """---
name: commit
description: commit changes safely
risk: unknown
source: community
---
Use `git commit` before `git push`.
""",
                encoding="utf-8",
            )

            was_changed, applied_risk, applied_reasons = sync_risk_labels.update_skill_file(target)

            self.assertTrue(was_changed)
            self.assertEqual(applied_risk, "critical")
            self.assertIn("git mutation", applied_reasons)

            # Read back inside the context manager, before the directory is removed.
            on_disk = target.read_text(encoding="utf-8")
            self.assertIn("risk: critical", on_disk)
            self.assertNotIn("risk: unknown", on_disk)
if __name__ == "__main__":
unittest.main()