#!/usr/bin/env python3
"""
Convert skills with HTML content to clean markdown.
Attempts to download raw markdown files from GitHub, extracts content from HTML if needed,
or creates minimal markdown content as fallback.
"""
import json
import re
import sys
import urllib.request
import urllib.error
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse, urljoin
class MarkdownHTMLParser(HTMLParser):
    """Convert a constrained subset of HTML into markdown without regex tag stripping.

    Handled tags: h1-h3, p, br, ul/ol/li, a, pre, and inline code.
    <script>/<style> subtrees are dropped entirely. Call feed() with HTML,
    then get_markdown() for the normalized result.
    """

    def __init__(self) -> None:
        # convert_charrefs=True: HTMLParser decodes entities before
        # handle_data is called, so data arrives already unescaped.
        super().__init__(convert_charrefs=True)
        self._parts: list[str] = []               # accumulated markdown fragments
        self._ignored_tag: Optional[str] = None   # tag whose subtree is being dropped
        self._ignored_depth = 0                   # nesting depth inside the ignored tag
        self._current_link: Optional[str] = None  # href of the currently open <a>
        self._list_depth = 0                      # current <ul>/<ol> nesting level
        self._in_pre = False                      # True while inside a <pre> block

    def handle_starttag(self, tag: str, attrs: list[tuple[str, Optional[str]]]) -> None:
        if self._ignored_tag:
            # Track nesting so we resume output at the matching close tag.
            if tag == self._ignored_tag:
                self._ignored_depth += 1
            return
        if tag in {"script", "style"}:
            self._ignored_tag = tag
            self._ignored_depth = 1
            return
        attrs_dict = dict(attrs)
        if tag in {"article", "main", "div", "section"}:
            self._append("\n")
        elif tag == "br":
            self._append("\n")
        elif tag == "p":
            self._append("\n\n")
        elif tag in {"h1", "h2", "h3"}:
            prefix = {"h1": "# ", "h2": "## ", "h3": "### "}[tag]
            self._append(f"\n\n{prefix}")
        elif tag in {"ul", "ol"}:
            self._list_depth += 1
            self._append("\n")
        elif tag == "li":
            # Indent nested list items two spaces per level.
            indent = "  " * max(0, self._list_depth - 1)
            self._append(f"\n{indent}- ")
        elif tag == "a":
            self._current_link = attrs_dict.get("href")
            self._append("[")
        elif tag == "pre":
            self._in_pre = True
            self._append("\n\n```\n")
        elif tag == "code" and not self._in_pre:
            # <code> inside <pre> is already covered by the fence.
            self._append("`")

    def handle_endtag(self, tag: str) -> None:
        if self._ignored_tag:
            if tag == self._ignored_tag:
                self._ignored_depth -= 1
                if self._ignored_depth == 0:
                    self._ignored_tag = None
            return
        if tag in {"h1", "h2", "h3", "p"}:
            self._append("\n")
        elif tag in {"ul", "ol"}:
            self._list_depth = max(0, self._list_depth - 1)
            self._append("\n")
        elif tag == "a":
            href = self._current_link or ""
            self._append(f"]({href})")
            self._current_link = None
        elif tag == "pre":
            self._in_pre = False
            self._append("\n```\n")
        elif tag == "code" and not self._in_pre:
            self._append("`")

    def handle_data(self, data: str) -> None:
        if self._ignored_tag or not data:
            return
        # BUGFIX: with convert_charrefs=True the parser has already decoded
        # character references, so calling html.unescape() here double-
        # unescaped text (e.g. "&amp;lt;" in the source became "<").
        self._append(data)

    def get_markdown(self) -> str:
        """Return the accumulated markdown, with blank-line runs collapsed."""
        markdown = "".join(self._parts)
        markdown = re.sub(r"\n{3,}", "\n\n", markdown)
        return markdown.strip()

    def _append(self, text: str) -> None:
        if text:
            self._parts.append(text)
def parse_frontmatter(content: str) -> Optional[Dict]:
    """Parse YAML frontmatter.

    Returns a dict of key/value pairs from a leading ``---`` block, or
    None when the content does not start with frontmatter. Values are
    stripped of surrounding quotes. Only flat ``key: value`` lines are
    understood (no nested YAML).
    """
    match = re.search(r'^---\s*\n(.*?)\n---', content, re.DOTALL)
    if match is None:
        return None
    metadata: Dict[str, str] = {}
    for raw_line in match.group(1).split('\n'):
        if ':' not in raw_line:
            continue
        key, _, value = raw_line.partition(':')
        metadata[key.strip()] = value.strip().strip('"').strip("'")
    return metadata
def has_html_content(content: str) -> bool:
    """Check if content contains HTML document structure.

    NOTE(review): the original pattern list was corrupted in this file
    (the regex literals were stripped); this is a conservative
    reconstruction. Heuristic: raw markdown may embed the odd tag, but a
    rendered HTML *document* (e.g. a GitHub page) contains several
    structural tags. TODO confirm against the original threshold.
    """
    html_patterns = [
        r'<!DOCTYPE\s+html',
        r'<html[\s>]',
        r'<head[\s>]',
        r'<body[\s>]',
        r'<script[\s>]',
        r'<meta[\s>]',
        r'<div[\s>]',
        r'<link[\s>]',
    ]
    matches = sum(
        1 for pattern in html_patterns
        if re.search(pattern, content, re.IGNORECASE)
    )
    # More than a couple of distinct structural tags means this is an
    # HTML page, not raw markdown with incidental inline HTML.
    return matches > 2
def build_raw_github_url(source_url: str) -> Optional[str]:
    """Convert GitHub tree/blob URL to raw URL.

    Returns None for empty or non-GitHub URLs. For directory-style URLs
    the result is a best-effort guess that SKILL.md sits alongside.
    """
    if not source_url or 'github.com' not in source_url:
        return None
    # BUGFIX: normalize up front — previously a trailing slash on a
    # tree/blob URL produced a '//SKILL.md' double slash in the result.
    source_url = source_url.rstrip('/')
    # Handle tree URLs: https://github.com/org/repo/tree/main/path
    if '/tree/' in source_url:
        base, _, ref_path = source_url.partition('/tree/')
        if ref_path:
            return f"{base}/raw/{ref_path}/SKILL.md"
    # Handle blob URLs: https://github.com/org/repo/blob/main/path/SKILL.md
    if '/blob/' in source_url:
        return source_url.replace('/blob/', '/raw/')
    # Directory URL fallback: try SKILL.md directly alongside.
    # (The old `variations[0] if variations else None` over a literal
    # non-empty list was dead code — only the first candidate was used.)
    return f"{source_url}/SKILL.md"
def download_raw_markdown(url: str) -> Tuple[bool, Optional[str]]:
    """Attempt to download raw markdown file.

    Returns (True, content) when the URL yields a 200 response whose body
    does not look like an HTML page; (False, None) on any failure. All
    network/decoding errors are deliberately swallowed — callers fall back
    to other strategies.
    """
    try:
        request = urllib.request.Request(url)
        request.add_header(
            'User-Agent',
            'Mozilla/5.0 (compatible; AntigravitySkillsConverter/1.0)',
        )
        with urllib.request.urlopen(request, timeout=15) as response:
            if response.status == 200:
                body = response.read().decode('utf-8')
                # A 200 can still serve an HTML page (soft redirect,
                # rendered view); only accept markdown-looking content.
                if not has_html_content(body):
                    return True, body
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return False, None
    except Exception:
        # Best effort: any other problem falls through to failure.
        pass
    return False, None
def extract_markdown_from_html(html_content: str) -> Optional[str]:
"""Extract markdown content from GitHub HTML page."""
# Try to find markdown content in common GitHub page structures
patterns = [
r'