#!/usr/bin/env python3
"""
Convert skills with HTML content to clean markdown.
Attempts to download raw markdown files from GitHub, extracts content from HTML if needed,
or creates minimal markdown content as fallback.
"""
import json
import re
import sys
import urllib.request
import urllib.error
from pathlib import Path
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin
def parse_frontmatter(content: str) -> Optional[Dict]:
    """Parse a leading YAML frontmatter block into a flat key/value dict.

    Only simple ``key: value`` lines are understood (no nested YAML).
    Surrounding single/double quotes are stripped from values.
    Returns None when the document has no ``--- ... ---`` block at the top.
    """
    match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
    if match is None:
        return None
    metadata: Dict = {}
    for raw_line in match.group(1).split('\n'):
        if ':' not in raw_line:
            continue
        key, _, value = raw_line.partition(':')
        metadata[key.strip()] = value.strip().strip('"').strip("'")
    return metadata
def has_html_content(content: str) -> bool:
    """Return True when *content* looks like a saved HTML page, not markdown.

    Counts occurrences of common HTML document-structure tags; more than
    five hits means the file is almost certainly an HTML page rather than
    a raw SKILL.md.  A stray embedded tag or two in genuine markdown stays
    below the threshold.

    NOTE(review): the original pattern list was corrupted in this file
    (truncated to ``r' 5``); this detector was reconstructed from the
    surviving ``> 5`` threshold and the call sites, which use it to reject
    HTML responses — confirm against the original script if available.
    """
    html_patterns = [
        r'<!DOCTYPE\s+html',
        r'<html[\s>]',
        r'<head[\s>]',
        r'<body[\s>]',
        r'<meta[\s>]',
        r'<div[\s>]',
        r'<script[\s>]',
        r'<link[\s>]',
    ]
    hits = sum(
        len(re.findall(pattern, content, re.IGNORECASE))
        for pattern in html_patterns
    )
    return hits > 5
def build_raw_github_url(source_url: str) -> Optional[str]:
    """Convert a GitHub tree/blob URL into a raw-content URL for SKILL.md.

    Handles three shapes:
      * ``.../tree/<branch>/<path>``  -> ``.../raw/<branch>/<path>/SKILL.md``
      * ``.../blob/<branch>/<file>``  -> ``.../raw/<branch>/<file>``
      * bare repo/directory URL       -> ``<url>/SKILL.md``

    Returns None for empty or non-github.com URLs.
    """
    if not source_url or 'github.com' not in source_url:
        return None
    # Normalize BEFORE the pattern checks; the original stripped the
    # trailing slash only on the fallback path, so a tree URL ending in
    # '/' produced a double slash in the constructed raw URL.
    source_url = source_url.rstrip('/')
    # Tree URLs: https://github.com/org/repo/tree/branch/path
    if '/tree/' in source_url:
        base, _, path = source_url.partition('/tree/')
        return f"{base}/raw/{path}/SKILL.md"
    # Blob URLs already point at a file; just switch to the raw view.
    if '/blob/' in source_url:
        return source_url.replace('/blob/', '/raw/')
    # Bare repo/directory URL: assume SKILL.md lives at its root.
    # (The original built a list of fallback variations but always
    # returned the first element — the dead code is removed here.)
    return f"{source_url}/SKILL.md"
def download_raw_markdown(url: str) -> Tuple[bool, Optional[str]]:
    """Fetch *url* and return ``(True, text)`` when it is real markdown.

    Any network failure, non-200 status, 404, or HTML response yields
    ``(False, None)``.  This helper is deliberately best-effort and
    never raises: callers fall back to other conversion methods.
    """
    try:
        request = urllib.request.Request(url)
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (compatible; AntigravitySkillsConverter/1.0)',
        )
        with urllib.request.urlopen(request, timeout=15) as response:
            if response.status == 200:
                body = response.read().decode('utf-8')
                # Reject responses that are actually HTML pages
                # (e.g. a GitHub 'not found' or login page served as 200).
                if not has_html_content(body):
                    return True, body
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return False, None
    except Exception:
        # Best-effort by design: any other failure means "not available".
        pass
    return False, None
def extract_markdown_from_html(html_content: str) -> Optional[str]:
    """Best-effort conversion of a saved GitHub HTML page back to markdown.

    NOTE(review): the original regex list was corrupted in this file; the
    conversions below were reconstructed from the surviving fragments
    (pre/code -> fenced block, inline code, links, list items, paragraphs,
    tag stripping, blank-line collapse) — confirm against the original.
    """
    html = html_content
    # Prefer the rendered markdown body when the page wraps it in <article>.
    body_match = re.search(
        r'<article[^>]*>(.*?)</article>', html, flags=re.DOTALL | re.IGNORECASE
    )
    if body_match:
        html = body_match.group(1)
    # Code blocks: <pre><code>...</code></pre> -> fenced block.
    html = re.sub(
        r'<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
        r'```\n\1\n```', html, flags=re.DOTALL | re.IGNORECASE,
    )
    # Inline code.
    html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Links: <a href="url">text</a> -> [text](url)
    html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Headings h6..h1 -> markdown hashes (longest first to keep counts exact).
    for level in range(6, 0, -1):
        html = re.sub(
            rf'<h{level}[^>]*>(.*?)</h{level}>',
            '#' * level + r' \1\n', html, flags=re.DOTALL | re.IGNORECASE,
        )
    # List items.
    html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1\n', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Paragraphs.
    html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Remove any remaining HTML tags, then collapse excess blank lines.
    html = re.sub(r'<[^>]+>', '', html)
    html = re.sub(r'\n{3,}', '\n\n', html)
    html = html.strip()
    return html or None


def create_minimal_markdown(metadata: Dict, source_url: str) -> str:
    """Create minimal markdown content from frontmatter metadata alone.

    Used as the last-resort conversion method when neither a raw download
    nor HTML extraction produced usable content.
    """
    name = metadata.get('name', 'skill')
    description = metadata.get('description', '')
    when_to_use = f"Use this skill when you need to {description.lower()}."
    title = name.replace('-', ' ').title()
    return f"""# {title}

## Overview

{description}

## When to Use This Skill

{when_to_use}

## Instructions

This skill provides guidance and patterns for {description.lower()}.

## Resources

For more information, see the [source repository]({source_url}).
"""


def convert_skill(skill_path: Path) -> Dict:
    """Convert a single SKILL.md whose body is HTML into clean markdown.

    Tries, in order: downloading the raw file from GitHub (method
    ``raw_download``), extracting markdown from the saved HTML
    (``html_extraction``), and generating minimal content from the
    frontmatter metadata (``minimal_creation``).  The original
    frontmatter block is preserved verbatim.  Returns a result dict:
    ``{'skill', 'method', 'success', 'error'}``.
    """
    result = {
        'skill': skill_path.parent.name,
        'method': None,
        'success': False,
        'error': None,
    }
    try:
        content = skill_path.read_text(encoding='utf-8')
    except Exception as e:
        result['error'] = f"Failed to read file: {e}"
        return result

    metadata = parse_frontmatter(content)
    if not metadata:
        result['error'] = "No frontmatter found"
        return result
    source_url = metadata.get('source', '')

    # Keep the existing frontmatter block exactly as it was.
    frontmatter_match = re.search(r'^(---\s*\n.*?\n---)', content, re.DOTALL)
    frontmatter = frontmatter_match.group(1) if frontmatter_match else ''

    # Method 1: fetch the raw markdown straight from GitHub.
    raw_url = build_raw_github_url(source_url)
    if raw_url:
        ok, raw_markdown = download_raw_markdown(raw_url)
        if ok and raw_markdown:
            # Strip any frontmatter in the downloaded file; ours wins.
            body = re.sub(r'^---\s*\n.*?\n---\s*\n?', '', raw_markdown,
                          flags=re.DOTALL)
            skill_path.write_text(frontmatter + '\n\n' + body.strip() + '\n',
                                  encoding='utf-8')
            result['method'] = 'raw_download'
            result['success'] = True
            return result

    # Method 2: salvage markdown out of the saved HTML page.  The length
    # threshold rejects extractions that are just navigation residue.
    markdown_content = extract_markdown_from_html(content)
    if markdown_content and len(markdown_content) > 100:
        skill_path.write_text(frontmatter + '\n\n' + markdown_content,
                              encoding='utf-8')
        result['method'] = 'html_extraction'
        result['success'] = True
        return result

    # Method 3: fall back to minimal generated content.
    minimal_content = create_minimal_markdown(metadata, source_url)
    skill_path.write_text(frontmatter + '\n\n' + minimal_content,
                          encoding='utf-8')
    result['method'] = 'minimal_creation'
    result['success'] = True
    return result


def main():
    """Find skills whose SKILL.md is HTML, back them up, convert, report."""
    base_dir = Path(__file__).parent.parent
    skills_dir = base_dir / "skills"

    print("Identifying skills with HTML content...")
    skills_with_html = []
    for skill_dir in sorted(skills_dir.iterdir()):
        if not skill_dir.is_dir() or skill_dir.name.startswith('.'):
            continue
        skill_file = skill_dir / 'SKILL.md'
        if not skill_file.exists():
            continue
        try:
            content = skill_file.read_text(encoding='utf-8')
        except Exception:
            continue
        if has_html_content(content):
            skills_with_html.append(skill_file)

    print(f"Found {len(skills_with_html)} skills with HTML content\n")
    if not skills_with_html:
        print("No skills with HTML content found.")
        return

    # Back up every file before rewriting it in place.
    backup_dir = base_dir / "skills_backup_html"
    backup_dir.mkdir(exist_ok=True)
    print(f"Creating backups in: {backup_dir}")
    for skill_file in skills_with_html:
        backup_path = backup_dir / skill_file.parent.name / 'SKILL.md'
        backup_path.parent.mkdir(parents=True, exist_ok=True)
        backup_path.write_bytes(skill_file.read_bytes())
    print("Backups created\n")

    print(f"Converting {len(skills_with_html)} skills...\n")
    results = []
    for i, skill_file in enumerate(skills_with_html, 1):
        print(f"[{i}/{len(skills_with_html)}] {skill_file.parent.name}")
        result = convert_skill(skill_file)
        results.append(result)
        if result['success']:
            print(f"  Converted using method: {result['method']}")
        else:
            print(f"  Failed: {result.get('error', 'Unknown error')}")
        print()

    successful = sum(1 for r in results if r['success'])
    failed = len(results) - successful
    print("=" * 60)
    print("Conversion Summary:")
    print(f"  Total skills: {len(skills_with_html)}")
    print(f"  Successful: {successful}")
    print(f"  Failed: {failed}")

    methods: Dict[str, int] = {}
    for r in results:
        if r['success']:
            methods[r['method']] = methods.get(r['method'], 0) + 1
    print("\n  Methods used:")
    for method, count in methods.items():
        print(f"    - {method}: {count}")

    report = {
        'total_skills': len(skills_with_html),
        'successful': successful,
        'failed': failed,
        'results': results,
        'backup_location': str(backup_dir),
    }
    report_file = base_dir / "html_conversion_results.json"
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\nReport saved to: {report_file}")
    print(f"Backups saved to: {backup_dir}")


if __name__ == "__main__":
    main()