Add repo-wide auditing and targeted repair scripts for skill metadata. Fix truncated descriptions automatically, keep heading normalization conservative, and remove synthetic boilerplate sections that degrade editorial quality while regenerating repo indexes and catalogs. Fixes #365
251 lines
7.8 KiB
Python
251 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from _project_paths import find_repo_root
|
|
from validate_skills import configure_utf8_output, parse_frontmatter
|
|
|
|
|
|
# A description counts as truncated when it ends in "..." or the single
# "…" character, optionally followed by trailing whitespace.
ELLIPSIS_PATTERN = re.compile(r"(?:\.\.\.|…)\s*$")

# Default upper bound (in characters) used when clamping a description.
MAX_DESCRIPTION_LENGTH = 300

# Paragraphs shorter than this are not considered as replacement descriptions.
MIN_PARAGRAPH_LENGTH = 40

# A "key:" token at the very start of the tested string.
TOP_LEVEL_KEY_PATTERN = re.compile(r"^[A-Za-z0-9_-]+:\s*")

# The leading "---" ... "---" block; group(1) is the text between the fences.
# No MULTILINE flag, so "^" only matches at the start of the content.
FRONTMATTER_PATTERN = re.compile(r"^---\s*\n(.*?)\n---", re.DOTALL)

# Runs of Markdown emphasis / inline-code markers.
MARKDOWN_DECORATION_PATTERN = re.compile(r"[*_`]+")

# Anything shaped like an HTML tag.
HTML_TAG_PATTERN = re.compile(r"<[^>]+>")

# Any run of whitespace, including newlines.
MULTISPACE_PATTERN = re.compile(r"\s+")
|
|
|
|
|
|
def strip_frontmatter(content: str) -> str:
    """Return *content* without its leading frontmatter block.

    When ``FRONTMATTER_PATTERN`` does not match, the content is returned
    untouched. Otherwise everything after the closing fence is returned
    with leading whitespace removed.
    """
    found = FRONTMATTER_PATTERN.search(content)
    return content if found is None else content[found.end():].lstrip()
|
|
|
|
|
|
def normalize_text(text: str) -> str:
    """Reduce a Markdown fragment to plain single-spaced text.

    Steps, in order: trim, drop a leading blockquote marker, remove
    Markdown decoration characters, remove HTML-looking tags, collapse
    whitespace runs to one space, and trim again.
    """
    # Only a blockquote marker at the very start of the fragment is removed.
    cleaned = re.sub(r"^\s*>+\s?", "", text.strip())

    substitutions = (
        (MARKDOWN_DECORATION_PATTERN, ""),
        (HTML_TAG_PATTERN, ""),
        (MULTISPACE_PATTERN, " "),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()
|
|
|
|
|
|
# Lines that are structure rather than prose: ATX headings, bullet items
# ("-", "*", "+"), ordered-list items with any number ("7. ", "12) "), and
# table rows. The marker forms follow CommonMark's list-item rules.
_NON_PROSE_LINE_PATTERN = re.compile(r"^(?:#|[-*+]\s|\d{1,9}[.)]\s|\|)")


def split_candidate_paragraphs(body: str) -> list[str]:
    """Split a Markdown body into normalized prose paragraphs.

    Consecutive prose lines are joined with a single space into one
    paragraph. A paragraph ends at a blank line, a code fence, a heading,
    a list item, or a table row; those lines themselves (and everything
    inside fenced code blocks) are discarded.

    Args:
        body: Markdown text, normally the file content after frontmatter.

    Returns:
        The paragraphs in document order, each passed through
        ``normalize_text``; paragraphs that normalize to an empty string
        are dropped.
    """
    paragraphs: list[str] = []
    current: list[str] = []
    in_code_block = False

    def flush() -> None:
        # Close the paragraph being accumulated, if any.
        if current:
            paragraphs.append(" ".join(current))
            current.clear()

    for raw_line in body.splitlines():
        stripped = raw_line.strip()

        if stripped.startswith("```"):
            in_code_block = not in_code_block
            flush()
            continue

        if in_code_block:
            # Nothing can be pending here: the opening fence flushed it.
            continue

        if not stripped or _NON_PROSE_LINE_PATTERN.match(stripped):
            flush()
            continue

        current.append(stripped)

    flush()

    # Normalize each paragraph exactly once, then drop the empty results.
    normalized = (normalize_text(paragraph) for paragraph in paragraphs)
    return [paragraph for paragraph in normalized if paragraph]
|
|
|
|
|
|
def is_usable_paragraph(paragraph: str) -> bool:
    """Tell whether *paragraph* may serve as a replacement description.

    A paragraph is rejected when it is shorter than
    ``MIN_PARAGRAPH_LENGTH`` or when it starts (case-insensitively) with
    one of the known label / boilerplate prefixes.
    """
    if len(paragraph) < MIN_PARAGRAPH_LENGTH:
        return False

    rejected_prefixes = (
        "role:",
        "works well with:",
        "capabilities:",
        "patterns:",
        "anti-patterns:",
        "this skill is applicable to execute the workflow",
    )
    return not paragraph.lower().startswith(rejected_prefixes)
|
|
|
|
|
|
def normalize_for_match(text: str) -> str:
    """Lower-case *text* and keep only the characters ``a-z`` and ``0-9``.

    The result is a compact key for comparing two strings while ignoring
    case, spacing, and punctuation.
    """
    lowered = text.lower()
    kept_runs = re.split(r"[^a-z0-9]+", lowered)
    return "".join(kept_runs)
|
|
|
|
|
|
def pick_candidate(description: str, body: str) -> str | None:
    """Choose the body paragraph that should replace a truncated description.

    The trailing ellipsis is removed from *description* and what remains
    is compared, via ``normalize_for_match``, against every usable
    paragraph of *body*. The first paragraph containing that text wins;
    if none does (or nothing remains of the description), the first
    usable paragraph is returned. Returns ``None`` when the body has no
    usable paragraph.
    """
    usable = list(filter(is_usable_paragraph, split_candidate_paragraphs(body)))
    if not usable:
        return None

    fallback = usable[0]
    needle = normalize_for_match(ELLIPSIS_PATTERN.sub("", description).strip())
    if not needle:
        return fallback

    # Substring containment also covers the "paragraph starts with it" case.
    containing = (paragraph for paragraph in usable if needle in normalize_for_match(paragraph))
    return next(containing, fallback)
|
|
|
|
|
|
def clamp_description(text: str, max_length: int = MAX_DESCRIPTION_LENGTH) -> str:
    """Normalize *text* and shorten it to at most *max_length* characters.

    Text that already fits is returned as normalized. Otherwise the cut
    is made, in order of preference, after the last sentence terminator
    (".", "!" or "?" followed by a space) inside the limit, at the last
    space inside the limit, or hard at *max_length*.
    """
    cleaned = normalize_text(text)
    if len(cleaned) <= max_length:
        return cleaned

    # The search window is one character wider than the limit so that a
    # separator sitting right at the boundary can still be found.
    window_end = max_length + 1

    sentence_end = max(cleaned.rfind(marker, 0, window_end) for marker in (". ", "! ", "? "))
    if sentence_end != -1:
        # Keep the punctuation mark, drop the space that follows it.
        return cleaned[: sentence_end + 1].strip()

    word_break = cleaned.rfind(" ", 0, window_end)
    cut = max_length if word_break == -1 else word_break
    return cleaned[:cut].strip()
|
|
|
|
|
|
# Characters that must be written as escape sequences inside a YAML
# double-quoted scalar. A raw line break would otherwise be folded into a
# space by the YAML parser, silently changing the value.
_YAML_DOUBLE_QUOTED_ESCAPES = str.maketrans(
    {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
)


def escape_yaml_string(text: str) -> str:
    """Escape *text* for use between double quotes in YAML.

    Backslashes and double quotes are backslash-escaped; newline,
    carriage return, and tab are written as ``\\n``, ``\\r`` and ``\\t``.
    The translation is a single pass, so inserted backslashes are never
    escaped a second time.

    Args:
        text: The raw string value.

    Returns:
        The escaped text, without the surrounding quotes.
    """
    return text.translate(_YAML_DOUBLE_QUOTED_ESCAPES)
|
|
|
|
|
|
def replace_description(frontmatter_text: str, new_description: str) -> str:
    """Replace the ``description`` entry of a frontmatter block.

    The entry is rewritten as a single-line double-quoted scalar. Any
    continuation lines of the old value (everything up to the next key
    at the same or a shallower indentation, blank lines included) are
    removed along with it.

    A ``description:`` key at column 0 is preferred. Only when none
    exists is the first indented ``description:`` line used, and in that
    case its leading whitespace is kept so the replacement stays at the
    same nesting level.

    Args:
        frontmatter_text: Text between the ``---`` fences.
        new_description: Unescaped replacement value.

    Returns:
        The updated frontmatter text, lines joined with ``"\\n"``.

    Raises:
        ValueError: If no ``description:`` line is present.
    """
    lines = frontmatter_text.splitlines()

    matches = [
        position
        for position, line in enumerate(lines)
        if re.match(r"^\s*description:\s*", line)
    ]
    if not matches:
        raise ValueError("Description field not found in frontmatter.")

    # A nested "description:" (e.g. under another mapping) may appear
    # before the real top-level key; do not let it win.
    top_level = [position for position in matches if lines[position].startswith("description:")]
    index = top_level[0] if top_level else matches[0]

    target = lines[index]
    leading = target[: len(target) - len(target.lstrip())]
    current_indent = len(target) - len(target.lstrip(" "))
    replacement = f'{leading}description: "{escape_yaml_string(new_description)}"'

    # Walk past the old value's continuation lines.
    end_index = index + 1
    while end_index < len(lines):
        candidate = lines[end_index]
        stripped = candidate.strip()
        candidate_indent = len(candidate) - len(candidate.lstrip(" "))
        # Blank lines never end the value; a key at the same or a
        # shallower indentation does.
        if stripped and candidate_indent <= current_indent and TOP_LEVEL_KEY_PATTERN.match(stripped):
            break
        end_index += 1

    updated = lines[:index] + [replacement] + lines[end_index:]
    return "\n".join(updated)
|
|
|
|
|
|
def update_skill_file(skill_path: Path) -> tuple[bool, str | None]:
    """Repair the truncated description of one skill file in place.

    Returns ``(True, new_description)`` after the file has been
    rewritten. Returns ``(False, None)`` when there is nothing to do:
    no frontmatter, no parsed metadata, a description that is missing or
    does not end in an ellipsis, no usable body paragraph, or a
    replacement that would not change anything.
    """
    no_change = (False, None)

    original_text = skill_path.read_text(encoding="utf-8")
    frontmatter = FRONTMATTER_PATTERN.search(original_text)
    if frontmatter is None:
        return no_change

    metadata, _ = parse_frontmatter(original_text, skill_path.as_posix())
    if not metadata:
        return no_change

    description = metadata.get("description")
    if not isinstance(description, str):
        return no_change
    if ELLIPSIS_PATTERN.search(description.strip()) is None:
        return no_change

    candidate = pick_candidate(description, strip_frontmatter(original_text))
    if not candidate:
        return no_change

    new_description = clamp_description(candidate)
    if not new_description or new_description == normalize_text(description):
        return no_change

    new_frontmatter = replace_description(frontmatter.group(1), new_description)
    remainder = original_text[frontmatter.end():]
    rewritten = f"---\n{new_frontmatter}\n---{remainder}"
    if rewritten == original_text:
        return no_change

    skill_path.write_text(rewritten, encoding="utf-8")
    return True, new_description
|
|
|
|
|
|
def main() -> int:
    """Walk ``<repo>/skills`` and repair truncated SKILL.md descriptions.

    Every directory containing a ``SKILL.md`` whose frontmatter
    description ends in an ellipsis is processed; directories whose name
    starts with "." are not descended into. One ``FIX`` or ``SKIP`` line
    is printed per processed file, followed by the totals.

    With ``--dry-run`` nothing is written. The dry-run applies the same
    "replacement equals current description" check as
    ``update_skill_file`` so that it reports SKIP where a real run would.

    Returns:
        Always ``0``.
    """
    configure_utf8_output()

    parser = argparse.ArgumentParser(description="Repair truncated SKILL.md frontmatter descriptions.")
    parser.add_argument("--dry-run", action="store_true", help="Report planned fixes without writing files.")
    args = parser.parse_args()

    repo_root = find_repo_root(__file__)
    skills_dir = repo_root / "skills"

    fixed = 0
    skipped = 0
    for root, dirs, files in os.walk(skills_dir):
        # Prune hidden directories in place so os.walk does not enter them.
        dirs[:] = [directory for directory in dirs if not directory.startswith(".")]
        if "SKILL.md" not in files:
            continue

        skill_path = Path(root) / "SKILL.md"
        content = skill_path.read_text(encoding="utf-8")
        metadata, _ = parse_frontmatter(content, skill_path.as_posix())
        description = metadata.get("description") if metadata else None
        if not isinstance(description, str) or not ELLIPSIS_PATTERN.search(description.strip()):
            continue

        relative_path = skill_path.relative_to(repo_root)

        candidate = pick_candidate(description, strip_frontmatter(content))
        if not candidate:
            skipped += 1
            print(f"SKIP {relative_path}")
            continue

        if args.dry_run:
            new_description = clamp_description(candidate)
            # Same no-op test update_skill_file applies before writing.
            if not new_description or new_description == normalize_text(description):
                skipped += 1
                print(f"SKIP {relative_path}")
                continue
            fixed += 1
            print(f"FIX {relative_path} -> {new_description}")
            continue

        changed, _ = update_skill_file(skill_path)
        if changed:
            fixed += 1
            print(f"FIX {relative_path}")
        else:
            skipped += 1
            print(f"SKIP {relative_path}")

    print(f"\nFixed: {fixed}")
    print(f"Skipped: {skipped}")
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|