Files
antigravity-skills-reference/tools/scripts/convert_html_to_markdown.py
sickn33 344854e9e5 fix(security): Address remaining scanning alerts
Tighten the remaining high-signal security findings by switching the todo example to a standard Express rate limiter, removing sensitive metadata from boilerplate logging, and replacing fragile HTML tag filtering with parser-based conversion.

Co-Authored-By: Claude <noreply@anthropic.com>
2026-03-18 18:15:49 +01:00

430 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Convert skills with HTML content to clean markdown.
Attempts to download raw markdown files from GitHub, extracts content from HTML if needed,
or creates minimal markdown content as fallback.
"""
import json
import re
import sys
import urllib.request
import urllib.error
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin
class MarkdownHTMLParser(HTMLParser):
    """Convert a constrained subset of HTML into markdown without regex tag stripping.

    Supported: block containers (article/main/div/section), <br>, <p>,
    h1-h3 headings, ul/ol/li lists, <a> links, <pre> code fences and
    inline <code>.  <script>/<style> subtrees are dropped entirely.
    Feed HTML via feed() and collect the result with get_markdown().
    """

    def __init__(self) -> None:
        # convert_charrefs=True makes HTMLParser decode character and entity
        # references itself, so handle_data() receives already-plain text.
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []               # accumulated markdown fragments
        self._ignored_tag: Optional[str] = None   # tag whose subtree is being skipped
        self._ignored_depth = 0                   # nesting depth of the ignored tag
        self._current_link: Optional[str] = None  # href of the currently open <a>
        self._list_depth = 0                      # ul/ol nesting, drives li indent
        self._in_pre = False                      # True inside a ``` fence

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        if self._ignored_tag:
            # Track nesting so we resume only at the matching close tag.
            if tag == self._ignored_tag:
                self._ignored_depth += 1
            return
        if tag in {"script", "style"}:
            self._ignored_tag = tag
            self._ignored_depth = 1
            return
        attrs_dict = dict(attrs)
        if tag in {"article", "main", "div", "section"}:
            self._append("\n")
        elif tag == "br":
            self._append("\n")
        elif tag == "p":
            self._append("\n\n")
        elif tag in {"h1", "h2", "h3"}:
            prefix = {"h1": "# ", "h2": "## ", "h3": "### "}[tag]
            self._append(f"\n\n{prefix}")
        elif tag in {"ul", "ol"}:
            self._list_depth += 1
            self._append("\n")
        elif tag == "li":
            indent = " " * max(0, self._list_depth - 1)
            self._append(f"\n{indent}- ")
        elif tag == "a":
            self._current_link = attrs_dict.get("href")
            self._append("[")
        elif tag == "pre":
            self._in_pre = True
            self._append("\n\n```\n")
        elif tag == "code" and not self._in_pre:
            # <code> inside <pre> is already covered by the fence.
            self._append("`")

    def handle_endtag(self, tag: str) -> None:
        if self._ignored_tag:
            if tag == self._ignored_tag:
                self._ignored_depth -= 1
                if self._ignored_depth == 0:
                    self._ignored_tag = None
            return
        if tag in {"h1", "h2", "h3", "p"}:
            self._append("\n")
        elif tag in {"ul", "ol"}:
            # Clamp at 0 to survive unbalanced close tags.
            self._list_depth = max(0, self._list_depth - 1)
            self._append("\n")
        elif tag == "a":
            href = self._current_link or ""
            self._append(f"]({href})")
            self._current_link = None
        elif tag == "pre":
            self._in_pre = False
            self._append("\n```\n")
        elif tag == "code" and not self._in_pre:
            self._append("`")

    def handle_data(self, data: str) -> None:
        if self._ignored_tag or not data:
            return
        # BUGFIX: convert_charrefs=True already decoded entity references,
        # so calling html.unescape() here again double-decoded text such as
        # a literal "&amp;amp;" into "&" instead of "&amp;".
        self._append(data)

    def get_markdown(self) -> str:
        """Return the accumulated markdown with excess blank lines collapsed."""
        markdown = "".join(self._parts)
        # Collapse runs of 3+ newlines into a single blank line.
        # NOTE(review): this also collapses blank-line runs inside ```
        # fences - confirm if code blocks must be preserved verbatim.
        markdown = re.sub(r"\n{3,}", "\n\n", markdown)
        return markdown.strip()

    def _append(self, text: str) -> None:
        # Skip empty fragments to keep _parts compact.
        if text:
            self._parts.append(text)
def parse_frontmatter(content: str) -> Optional[Dict]:
    """Parse a leading ``--- ... ---`` frontmatter block into a flat dict.

    Values are stripped of surrounding whitespace and quotes.  Returns
    None when the content does not start with a frontmatter block.
    """
    block = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
    if block is None:
        return None
    parsed: Dict = {}
    for raw_line in block.group(1).split('\n'):
        if ':' not in raw_line:
            continue
        key, _, value = raw_line.partition(':')
        parsed[key.strip()] = value.strip().strip('"').strip("'")
    return parsed
def has_html_content(content: str) -> bool:
    """Return True when *content* looks like a saved HTML page.

    Counts lines outside ``` fences that match known HTML/GitHub-asset
    markers; more than five such lines flags the file as HTML-polluted.
    """
    markers = [
        re.compile(p, re.IGNORECASE)
        for p in (
            r'<!DOCTYPE\s+html',
            r'<html\s',
            r'github\.githubassets\.com',
            r'github-cloud\.s3\.amazonaws\.com',
        )
    ]
    inside_fence = False
    hits = 0
    for line in content.split('\n'):
        # Fence delimiters toggle state and are never counted themselves.
        if line.strip().startswith('```'):
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue
        if any(marker.search(line) for marker in markers):
            hits += 1
    return hits > 5
def build_raw_github_url(source_url: str) -> Optional[str]:
    """Convert a GitHub tree/blob URL into a raw URL for SKILL.md.

    Returns None for empty or non-github.com URLs.  (Fix: removed the
    dead ``variations[0] if variations else None`` tail - the list was a
    non-empty literal, so the conditional and the extra entries could
    never be used.)
    """
    if not source_url or 'github.com' not in source_url:
        return None
    # Handle tree URLs: https://github.com/org/repo/tree/<ref>/<path>
    if '/tree/' in source_url:
        parts = source_url.split('/tree/')
        if len(parts) == 2:
            return f"{parts[0]}/raw/{parts[1]}/SKILL.md"
    # Handle blob URLs: they already point at a file; swap /blob/ for /raw/.
    if '/blob/' in source_url:
        return source_url.replace('/blob/', '/raw/')
    # Bare directory/repo URL: guess SKILL.md at its root.
    # NOTE(review): this yields an HTML page URL, not raw content;
    # download_raw_markdown() rejects HTML responses, so failure is benign.
    if source_url.endswith('/'):
        source_url = source_url.rstrip('/')
    return f"{source_url}/SKILL.md"
def download_raw_markdown(url: str) -> Tuple[bool, Optional[str]]:
    """Fetch *url* and return ``(True, text)`` for a markdown response.

    Responses that look like HTML pages are rejected.  All failures
    (HTTP errors, timeouts, decode errors) yield ``(False, None)`` -
    this is a deliberate best-effort download.
    """
    try:
        request = urllib.request.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (compatible; AntigravitySkillsConverter/1.0)')
        with urllib.request.urlopen(request, timeout=15) as response:
            if response.status == 200:
                body = response.read().decode('utf-8')
                # Reject HTML masquerading as markdown (e.g. error pages).
                if not has_html_content(body):
                    return True, body
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return False, None
    except Exception:
        # Best-effort: any other failure is treated as "not available".
        pass
    return False, None
def extract_markdown_from_html(html_content: str) -> Optional[str]:
    """Pull readable markdown out of a saved GitHub HTML page.

    Tries the common content containers in priority order and returns
    the first conversion yielding more than 100 non-whitespace-stripped
    characters; None when nothing substantial is found.
    """
    container_patterns = (
        r'<article[^>]*>(.*?)</article>',
        r'<main[^>]*>(.*?)</main>',
        r'<div[^>]*class="[^"]*markdown[^"]*"[^>]*>(.*?)</div>',
        r'<div[^>]*class="[^"]*readme[^"]*"[^>]*>(.*?)</div>',
    )
    for pattern in container_patterns:
        found = re.search(pattern, html_content, re.DOTALL | re.IGNORECASE)
        if not found:
            continue
        candidate = convert_html_to_markdown(found.group(1))
        # Require substantial output so boilerplate-only matches are skipped.
        if candidate and len(candidate.strip()) > 100:
            return candidate
    return None
def convert_html_to_markdown(html: str) -> str:
    """Run *html* through MarkdownHTMLParser and return the markdown text."""
    converter = MarkdownHTMLParser()
    converter.feed(html)
    # close() flushes any buffered partial input before we read the result.
    converter.close()
    return converter.get_markdown()
def create_minimal_markdown(metadata: Dict, source_url: str) -> str:
    """Build fallback SKILL.md body text from frontmatter metadata alone.

    Used when neither a raw download nor HTML extraction produced usable
    content; the result links back to *source_url* for the full version.
    """
    skill_name = metadata.get('name', 'skill')
    summary = metadata.get('description', '')
    usage_line = f"Use this skill when you need to {summary.lower()}."
    heading = skill_name.replace('-', ' ').title()
    return f"""# {heading}
## Overview
{summary}
## When to Use This Skill
{usage_line}
## Instructions
This skill provides guidance and patterns for {summary.lower()}.
## Resources
For more information, see the [source repository]({source_url}).
"""
def convert_skill(skill_path: Path) -> Dict:
    """Convert a single skill from HTML to markdown.

    Tries three strategies in order, stopping at the first success:
      1. 'raw_download'     - fetch SKILL.md from the GitHub raw URL.
      2. 'html_extraction'  - parse markdown out of the saved HTML page.
      3. 'minimal_creation' - synthesize a stub from frontmatter metadata.

    Returns a result dict with keys: skill, method, success, error.
    Side effect: overwrites *skill_path* in place on success.
    """
    skill_name = skill_path.parent.name
    result = {
        'skill': skill_name,
        'method': None,
        'success': False,
        'error': None
    }
    try:
        content = skill_path.read_text(encoding='utf-8')
    except Exception as e:
        result['error'] = f"Failed to read file: {e}"
        return result
    # Parse frontmatter; without it we cannot recover the source URL.
    metadata = parse_frontmatter(content)
    if not metadata:
        result['error'] = "No frontmatter found"
        return result
    source_url = metadata.get('source', '')
    # Extract the raw frontmatter text so it can be re-attached verbatim
    # to rebuilt content in methods 2 and 3.
    frontmatter_match = re.search(r'^(---\s*\n.*?\n---)', content, re.DOTALL)
    frontmatter = frontmatter_match.group(1) if frontmatter_match else ''
    # NOTE(review): when_to_use_content is captured but never used below -
    # presumably intended for reinsertion into rebuilt content; confirm.
    when_to_use_match = re.search(r'##\s+When to Use.*?\n(.*?)(?=\n<!DOCTYPE|\n##|\Z)', content, re.DOTALL | re.IGNORECASE)
    when_to_use_content = when_to_use_match.group(1).strip() if when_to_use_match else None
    # Try method 1: Download raw markdown from GitHub.
    raw_url = build_raw_github_url(source_url)
    if raw_url:
        success, raw_content = download_raw_markdown(raw_url)
        if success and raw_content:
            # Preserve frontmatter from original.
            # NOTE(review): if the downloaded file has no frontmatter the
            # download is silently discarded and we fall through to
            # method 2 - confirm this is intended.
            raw_metadata = parse_frontmatter(raw_content)
            if raw_metadata:
                # Merge metadata (keep original source; default risk to safe).
                raw_metadata['source'] = source_url
                raw_metadata['risk'] = metadata.get('risk', 'safe')
                # Rebuild frontmatter, quoting values containing spaces or
                # colons so the result stays parseable as simple YAML.
                new_frontmatter = '---\n'
                for key, value in raw_metadata.items():
                    if isinstance(value, str) and (' ' in value or ':' in value):
                        new_frontmatter += f'{key}: "{value}"\n'
                    else:
                        new_frontmatter += f'{key}: {value}\n'
                new_frontmatter += '---\n'
                # Remove the downloaded file's own frontmatter before merging.
                raw_content_no_fm = re.sub(r'^---\s*\n.*?\n---\s*\n', '', raw_content, flags=re.DOTALL)
                new_content = new_frontmatter + '\n' + raw_content_no_fm
                skill_path.write_text(new_content, encoding='utf-8')
                result['method'] = 'raw_download'
                result['success'] = True
                return result
    # Try method 2: Extract markdown out of the saved HTML page.
    if has_html_content(content):
        markdown_content = extract_markdown_from_html(content)
        # Require substantial output before accepting the extraction.
        if markdown_content and len(markdown_content.strip()) > 100:
            # Rebuild with the original frontmatter on top.
            new_content = frontmatter + '\n\n' + markdown_content
            skill_path.write_text(new_content, encoding='utf-8')
            result['method'] = 'html_extraction'
            result['success'] = True
            return result
    # Method 3: Create minimal stub content from metadata (always succeeds).
    minimal_content = create_minimal_markdown(metadata, source_url)
    new_content = frontmatter + '\n\n' + minimal_content
    skill_path.write_text(new_content, encoding='utf-8')
    result['method'] = 'minimal_creation'
    result['success'] = True
    return result
def main():
    """CLI entry point: find, back up, and convert HTML-polluted skills.

    Side effects: rewrites SKILL.md files in place, mirrors originals
    into a skills_backup_html/ tree, and writes a JSON report to
    html_conversion_results.json.
    """
    # Resolve directories relative to this script's location.
    base_dir = Path(__file__).parent.parent
    skills_dir = base_dir / "skills"
    # Find skills with HTML content
    print("🔍 Identifying skills with HTML content...")
    skills_with_html = []
    for skill_dir in skills_dir.iterdir():
        # Skip plain files and hidden directories.
        if not skill_dir.is_dir() or skill_dir.name.startswith('.'):
            continue
        skill_file = skill_dir / 'SKILL.md'
        if not skill_file.exists():
            continue
        try:
            content = skill_file.read_text(encoding='utf-8')
            if has_html_content(content):
                skills_with_html.append(skill_file)
        except Exception:
            # Unreadable/undecodable files are simply skipped.
            continue
    print(f"✅ Found {len(skills_with_html)} skills with HTML content\n")
    if not skills_with_html:
        print("No skills with HTML content found.")
        return
    # Create backup directory before any file is rewritten.
    backup_dir = base_dir / "skills_backup_html"
    backup_dir.mkdir(exist_ok=True)
    print(f"📦 Creating backups in: {backup_dir}")
    for skill_file in skills_with_html:
        # Mirror the skill's directory name inside the backup tree.
        backup_path = backup_dir / skill_file.parent.name / 'SKILL.md'
        backup_path.parent.mkdir(parents=True, exist_ok=True)
        # Byte-for-byte copy so backups are unaffected by encoding issues.
        backup_path.write_bytes(skill_file.read_bytes())
    print("✅ Backups created\n")
    # Convert each skill
    print(f"🔄 Converting {len(skills_with_html)} skills...\n")
    results = []
    for i, skill_file in enumerate(skills_with_html, 1):
        skill_name = skill_file.parent.name
        print(f"[{i}/{len(skills_with_html)}] {skill_name}")
        result = convert_skill(skill_file)
        results.append(result)
        if result['success']:
            print(f" ✅ Converted using method: {result['method']}")
        else:
            print(f" ❌ Failed: {result.get('error', 'Unknown error')}")
        print()
    # Summary
    print("=" * 60)
    print("📊 Conversion Summary:")
    print(f" Total skills: {len(skills_with_html)}")
    print(f" ✅ Successful: {sum(1 for r in results if r['success'])}")
    print(f" ❌ Failed: {sum(1 for r in results if not r['success'])}")
    # Tally successes per conversion method.
    methods = {}
    for r in results:
        if r['success']:
            method = r['method']
            methods[method] = methods.get(method, 0) + 1
    print(f"\n Methods used:")
    for method, count in methods.items():
        print(f"{method}: {count}")
    # Save report alongside the skills tree for later inspection.
    report = {
        'total_skills': len(skills_with_html),
        'successful': sum(1 for r in results if r['success']),
        'failed': sum(1 for r in results if not r['success']),
        'results': results,
        'backup_location': str(backup_dir)
    }
    report_file = base_dir / "html_conversion_results.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n💾 Report saved to: {report_file}")
    print(f"📦 Backups saved to: {backup_dir}")


if __name__ == "__main__":
    main()