Add a maintainers script to safely promote high-confidence legacy risk labels from unknown to concrete values, cover it with tests, and regenerate the canonical skill artifacts and plugin copies. This reduces the legacy unknown backlog without forcing noisy classifications that still need manual review.
174 lines
5.3 KiB
Python
174 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
from _project_paths import find_repo_root
|
|
from _safe_files import is_safe_regular_file
|
|
from risk_classifier import suggest_risk
|
|
from validate_skills import configure_utf8_output, parse_frontmatter
|
|
|
|
|
|
# Captures the text between the opening ``---`` line at the very start of the
# document and the next ``---`` that begins a line. DOTALL lets the capture
# span multiple lines; without MULTILINE, ``^`` only matches at offset 0.
FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---", flags=re.DOTALL)

# Words that veto a promotion to "safe" when they appear anywhere in a skill
# (see choose_synced_risk). Matching is case-insensitive, so the upper-case
# HTTP verbs also match their lower-case spellings.
_SAFE_BLOCKLIST_TERMS = (
    "create",
    "write",
    "overwrite",
    "append",
    "modify",
    "update",
    "delete",
    "remove",
    "deploy",
    "publish",
    "push",
    "commit",
    "merge",
    "install",
    "token",
    "secret",
    "password",
    "oauth",
    r"api[_ -]?key",
    "POST",
    "PUT",
    "PATCH",
    "DELETE",
)
SAFE_BLOCKLIST_PATTERN = re.compile(
    r"\b(?:" + "|".join(_SAFE_BLOCKLIST_TERMS) + r")\b",
    flags=re.IGNORECASE,
)

# A "critical" suggestion is only accepted when at least one of these reasons
# is among the classifier's reasons.
STRONG_CRITICAL_REASONS = {
    "curl pipes into a shell",
    "wget pipes into a shell",
    "PowerShell invoke-expression",
    "destructive filesystem delete",
    "git mutation",
    "package publication",
    "deployment or infrastructure mutation",
}

# A "safe" suggestion is only accepted when every classifier reason is one of
# these.
SAFE_ALLOWED_REASONS = {
    "non-mutating command example",
    "contains fenced examples",
    "read-only or diagnostic language",
    "technical or integration language",
}

# An "offensive" suggestion is only accepted when this reason is present.
EXPLICIT_OFFENSIVE_REASON = "explicit offensive disclaimer"
|
|
|
|
|
|
def strip_frontmatter(content: str) -> tuple[str, str] | None:
    """Split *content* into its frontmatter text and the remainder.

    Returns:
        ``(frontmatter, rest)`` where ``frontmatter`` is the text captured
        between the ``---`` delimiters and ``rest`` is everything following
        the closing ``---``; ``None`` when FRONTMATTER_PATTERN does not match.
    """
    found = FRONTMATTER_PATTERN.search(content)
    if found is None:
        return None

    inner_text = found.group(1)
    remainder = content[found.end():]
    return inner_text, remainder
|
|
|
|
|
|
def replace_risk_value(content: str, new_risk: str) -> str:
    """Return *content* with its frontmatter ``risk:`` line set to *new_risk*.

    Only the first frontmatter line that starts with ``risk:`` (after
    stripping surrounding whitespace) is rewritten; its leading indentation is
    kept and the rest of that line is replaced. The frontmatter is re-emitted
    between plain ``---`` delimiter lines. *content* is returned unchanged
    when there is no frontmatter or no ``risk:`` line.
    """
    parts = strip_frontmatter(content)
    if parts is None:
        return content

    header_text, remainder = parts
    header_lines = header_text.splitlines()

    # Locate the first risk entry; None means the frontmatter has none.
    target = next(
        (
            position
            for position, text in enumerate(header_lines)
            if text.strip().startswith("risk:")
        ),
        None,
    )
    if target is None:
        return content

    current = header_lines[target]
    leading = current[: len(current) - len(current.lstrip())]
    header_lines[target] = f"{leading}risk: {new_risk}"

    return "---\n" + "\n".join(header_lines) + "\n---" + remainder
|
|
|
|
|
|
def choose_synced_risk(content: str, metadata: dict[str, object] | None) -> tuple[str, tuple[str, ...]] | None:
    """Decide whether a skill's ``risk: unknown`` label can be promoted.

    Args:
        content: Full text of the skill file.
        metadata: Parsed frontmatter, or ``None``/empty when unavailable.

    Returns:
        ``(risk, reasons)`` when the classifier suggestion passes the
        conservative gate for its risk level, otherwise ``None``. Skills whose
        current risk is anything other than ``"unknown"`` are never touched.
    """
    if not metadata:
        return None
    if metadata.get("risk") != "unknown":
        return None

    suggestion = suggest_risk(content, metadata)
    reasons = tuple(suggestion.reasons)
    distinct_reasons = set(reasons)
    suggested = suggestion.risk

    if suggested == "offensive":
        # Requires the explicit disclaimer reason.
        if EXPLICIT_OFFENSIVE_REASON in distinct_reasons:
            return "offensive", reasons
        return None

    if suggested == "critical":
        # Requires at least one strong critical reason.
        if distinct_reasons & STRONG_CRITICAL_REASONS:
            return "critical", reasons
        return None

    if suggested == "safe":
        # Requires a non-empty set of allowed reasons and no blocklisted word
        # anywhere in the content.
        vetoed = (
            not distinct_reasons
            or not distinct_reasons.issubset(SAFE_ALLOWED_REASONS)
            or SAFE_BLOCKLIST_PATTERN.search(content) is not None
        )
        return None if vetoed else ("safe", reasons)

    return None
|
|
|
|
|
|
def update_skill_file(skill_path: Path) -> tuple[bool, str | None, tuple[str, ...]]:
    """Rewrite the ``risk`` label of one skill file when a promotion applies.

    Returns:
        ``(True, new_risk, reasons)`` after the file has been rewritten, or
        ``(False, None, ())`` when the path fails ``is_safe_regular_file``,
        no promotion is chosen, or the rewrite would leave the text unchanged.
    """
    untouched: tuple[bool, str | None, tuple[str, ...]] = (False, None, ())

    if not is_safe_regular_file(skill_path):
        return untouched

    original_text = skill_path.read_text(encoding="utf-8")
    metadata, _ = parse_frontmatter(original_text, skill_path.as_posix())

    decision = choose_synced_risk(original_text, metadata)
    if decision is None:
        return untouched
    new_risk, reasons = decision

    rewritten_text = replace_risk_value(original_text, new_risk)
    if rewritten_text == original_text:
        # Nothing to write, e.g. no ``risk:`` line could be located.
        return untouched

    skill_path.write_text(rewritten_text, encoding="utf-8")
    return True, new_risk, reasons
|
|
|
|
|
|
def iter_skill_files(skills_dir: Path):
    """Yield the path of every ``SKILL.md`` found under *skills_dir*.

    Directories whose name starts with ``.`` are pruned and never descended
    into. Sub-directories are visited in sorted name order so the sequence of
    yielded paths (and any report built from it) is reproducible across
    filesystems. A missing *skills_dir* yields nothing.

    Args:
        skills_dir: Root directory to search.

    Yields:
        ``Path`` objects pointing at each ``SKILL.md`` file.
    """
    for root, dirs, files in os.walk(skills_dir):
        # Assigning to dirs[:] edits the list os.walk recurses into: this both
        # drops hidden directories and fixes the traversal order.
        dirs[:] = sorted(name for name in dirs if not name.startswith("."))
        if "SKILL.md" in files:
            yield Path(root) / "SKILL.md"
|
|
|
|
|
|
def _preview_skill_file(skill_path: Path) -> tuple[bool, str | None, tuple[str, ...]]:
    """Report what ``update_skill_file`` would do to *skill_path*, without writing.

    Applies the same checks in the same order as ``update_skill_file`` so a
    dry run lists exactly the files a real run would rewrite.

    Returns:
        ``(True, new_risk, reasons)`` when the file would be rewritten,
        otherwise ``(False, None, ())``.
    """
    if not is_safe_regular_file(skill_path):
        return False, None, ()

    content = skill_path.read_text(encoding="utf-8")
    metadata, _ = parse_frontmatter(content, skill_path.as_posix())
    decision = choose_synced_risk(content, metadata)
    if decision is None:
        return False, None, ()

    new_risk, reasons = decision
    # A decision only counts when the risk line can actually be rewritten.
    if replace_risk_value(content, new_risk) == content:
        return False, None, ()
    return True, new_risk, reasons


def main() -> int:
    """Sync ``risk: unknown`` labels across ``<repo>/skills`` and print a report.

    With ``--dry-run`` nothing is written; the same ``SYNC`` lines and totals
    are printed for the files that would be changed.

    Returns:
        Always ``0``.
    """
    configure_utf8_output()

    parser = argparse.ArgumentParser(
        description="Conservatively sync legacy risk: unknown labels to concrete values.",
    )
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing files.")
    args = parser.parse_args()

    repo_root = find_repo_root(__file__)
    skills_dir = repo_root / "skills"

    # Both callables share one return shape, so the loop below is identical
    # for preview and real runs and each file is read and classified once.
    process = _preview_skill_file if args.dry_run else update_skill_file

    updated_count = 0
    by_risk: Counter[str] = Counter()

    for skill_path in iter_skill_files(skills_dir):
        changed, applied_risk, applied_reasons = process(skill_path)
        if not changed or applied_risk is None:
            continue

        rel_path = skill_path.relative_to(repo_root)
        # Only the first three reasons are shown to keep each line short.
        print(f"SYNC {rel_path} [risk={applied_risk}; reasons={', '.join(applied_reasons[:3])}]")
        updated_count += 1
        by_risk[applied_risk] += 1

    print(f"\nUpdated: {updated_count}")
    if updated_count:
        print(f"By risk: {dict(sorted(by_risk.items()))}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|