#!/usr/bin/env python3
"""
Documentation to Claude Skill Converter

Single tool to scrape any documentation and create high-quality Claude skills.

Usage:
    python3 doc_scraper.py --interactive
    python3 doc_scraper.py --config configs/godot.json
    python3 doc_scraper.py --url https://react.dev/ --name react
"""

import os
import sys
import json
import time
import re
import argparse
import hashlib
import subprocess
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlparse
from collections import deque, defaultdict

import requests
from bs4 import BeautifulSoup


class DocToSkillConverter:
    """Crawl a documentation site and build a skill directory from it.

    The converter is driven by a ``config`` dict. Keys read by this class:

    * ``name`` (required) - skill name; also used for the output directories.
    * ``base_url`` (required) - only URLs starting with this prefix are crawled.
    * ``start_urls`` - optional list of crawl entry points (default: base_url).
    * ``url_patterns`` - ``{'include': [...], 'exclude': [...]}`` substrings.
    * ``selectors`` - CSS selectors ``title``, ``main_content``, ``code_blocks``.
    * ``rate_limit`` - seconds to sleep after each request (default 0.5).
    * ``max_pages`` - crawl budget (default 500).
    * ``categories`` - optional ``{category: [keywords]}`` mapping.
    * ``description`` - optional text for the SKILL.md front matter.
    """

    def __init__(self, config):
        """Store the config, initialise crawl state and create output dirs."""
        self.config = config
        self.name = config['name']
        self.base_url = config['base_url']

        # Paths
        self.data_dir = f"output/{self.name}_data"
        self.skill_dir = f"output/{self.name}"

        # State
        self.visited_urls = set()
        # Support multiple starting URLs; an empty/null list falls back to
        # the base URL so the crawl always has an entry point.
        start_urls = config.get('start_urls') or [self.base_url]
        self.pending_urls = deque(start_urls)
        # Every URL ever enqueued. Gives O(1) duplicate checks instead of
        # scanning the pending deque for each discovered link.
        self._queued_urls = set(start_urls)
        self.pages = []

        # Create directories
        os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

    def is_valid_url(self, url):
        """Return True if ``url`` is under base_url and passes the patterns.

        Patterns are plain substrings: when include patterns are configured
        at least one must occur in the URL, and no exclude pattern may occur.
        """
        if not url.startswith(self.base_url):
            return False

        url_patterns = self.config.get('url_patterns', {})

        # Include patterns
        includes = url_patterns.get('include', [])
        if includes and not any(pattern in url for pattern in includes):
            return False

        # Exclude patterns
        excludes = url_patterns.get('exclude', [])
        if any(pattern in url for pattern in excludes):
            return False

        return True

    def extract_content(self, soup, url):
        """Extract title, headings, code, patterns, text and links of a page.

        Args:
            soup: parsed document supporting ``select_one``.
            url: the page URL; relative links are resolved against it.

        Returns:
            A dict with keys ``url``, ``title``, ``content``, ``headings``,
            ``code_samples``, ``patterns`` and ``links``. When the main
            content selector matches nothing, only ``url`` and ``title`` are
            filled in.
        """
        page = {
            'url': url,
            'title': '',
            'content': '',
            'headings': [],
            'code_samples': [],
            'patterns': [],
            'links': []
        }

        selectors = self.config.get('selectors', {})

        # Extract title
        title_elem = soup.select_one(selectors.get('title', 'title'))
        if title_elem:
            page['title'] = self.clean_text(title_elem.get_text())

        # Find main content
        main_selector = selectors.get('main_content', 'div[role="main"]')
        main = soup.select_one(main_selector)
        if not main:
            print(f"⚠ No content: {url}")
            return page

        # Extract headings with their level and anchor id
        for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = self.clean_text(h.get_text())
            if text:
                page['headings'].append({
                    'level': h.name,
                    'text': text,
                    'id': h.get('id', '')
                })

        # Extract code with language detection
        code_selector = selectors.get('code_blocks', 'pre code')
        for code_elem in main.select(code_selector):
            code = code_elem.get_text()
            if len(code.strip()) > 10:  # Skip trivial snippets
                lang = self.detect_language(code_elem, code)
                page['code_samples'].append({
                    'code': code.strip(),
                    'language': lang
                })

        # Extract common code patterns
        page['patterns'] = self.extract_patterns(main, page['code_samples'])

        # Extract paragraphs
        paragraphs = []
        for p in main.find_all('p'):
            text = self.clean_text(p.get_text())
            if text and len(text) > 20:  # Skip very short paragraphs
                paragraphs.append(text)
        page['content'] = '\n\n'.join(paragraphs)

        # Extract links. Fragments are dropped because "page#a" and "page#b"
        # are the same document; keeping them would crawl it repeatedly.
        seen_links = set()
        for link in main.find_all('a', href=True):
            href, _fragment = urldefrag(urljoin(url, link['href']))
            if href not in seen_links and self.is_valid_url(href):
                seen_links.add(href)
                page['links'].append(href)

        return page

    @staticmethod
    def _language_from_classes(classes):
        """Return the language named by a ``language-*``/``lang-*`` class.

        Returns None when no class carries one of the prefixes (or the name
        after the prefix is empty).
        """
        if isinstance(classes, str):
            classes = classes.split()
        for cls in classes or []:
            for prefix in ('language-', 'lang-'):
                if prefix in cls:
                    name = cls.replace(prefix, '')
                    if name:
                        return name
        return None

    def detect_language(self, elem, code):
        """Detect the programming language of a code block.

        The element's own classes are checked first, then those of a parent
        ``<pre>``; if neither names a language, simple keyword heuristics on
        ``code`` are used. Returns ``'unknown'`` when nothing matches.
        """
        # Check class attribute
        lang = self._language_from_classes(elem.get('class', []))
        if lang:
            return lang

        # Check parent pre element
        parent = elem.parent
        if parent and parent.name == 'pre':
            lang = self._language_from_classes(parent.get('class', []))
            if lang:
                return lang

        # Heuristic detection (order matters: first match wins)
        if 'import ' in code and 'from ' in code:
            return 'python'
        if 'const ' in code or 'let ' in code or '=>' in code:
            return 'javascript'
        if 'func ' in code and 'var ' in code:
            return 'gdscript'
        if 'def ' in code and ':' in code:
            return 'python'
        if '#include' in code or 'int main' in code:
            return 'cpp'

        return 'unknown'

    def extract_patterns(self, main, code_samples):
        """Extract up to five "description + following code" patterns.

        A ``<p>`` or ``<div>`` whose text contains a marker such as
        ``example:`` or ``usage:`` is paired with the next ``<pre>``/``<code>``
        element in the document. ``code_samples`` is accepted for interface
        compatibility and is not used.
        """
        markers = ('example:', 'pattern:', 'usage:', 'typical use')
        patterns = []

        for elem in main.find_all(['p', 'div']):
            text = elem.get_text().lower()
            if any(marker in text for marker in markers):
                # Get the code that follows
                next_code = elem.find_next(['pre', 'code'])
                if next_code:
                    patterns.append({
                        'description': self.clean_text(elem.get_text()),
                        'code': next_code.get_text().strip()
                    })

        return patterns[:5]  # Keep only the first five patterns found

    def clean_text(self, text):
        """Collapse every whitespace run to one space and strip the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def save_page(self, page):
        """Write ``page`` as JSON into ``<data_dir>/pages``.

        The file name is the sanitised title (max 50 chars) plus a short hash
        of the URL, so pages with identical titles do not overwrite each other.
        """
        # md5 is used only to derive a short, stable file-name suffix.
        url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
        safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
        safe_title = re.sub(r'[-\s]+', '_', safe_title)

        filename = f"{safe_title}_{url_hash}.json"
        filepath = os.path.join(self.data_dir, "pages", filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(page, f, indent=2, ensure_ascii=False)

    def scrape_page(self, url):
        """Fetch, parse and store one page, then enqueue its new links.

        Any error is reported and swallowed so that a single bad page does
        not abort the crawl. The rate-limit pause is applied after failures
        as well as successes.
        """
        try:
            print(f" {url}")

            headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'}
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            page = self.extract_content(soup, url)

            self.save_page(page)
            self.pages.append(page)

            # Add new URLs (each URL is enqueued at most once)
            for link in page['links']:
                if link not in self._queued_urls:
                    self._queued_urls.add(link)
                    self.pending_urls.append(link)
        except Exception as e:  # deliberate best-effort: keep crawling
            print(f" ✗ Error: {e}")

        # Rate limiting - also after a failure, so an erroring server is not
        # hit again immediately.
        time.sleep(self.config.get('rate_limit', 0.5))

    def scrape_all(self):
        """Crawl from the pending queue until it is empty or the budget is hit.

        The budget (``max_pages``) counts attempted URLs, including failed
        ones. The summary file is written even if the crawl is interrupted,
        so partial data can still be detected and reused later.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING: {self.name}")
        print(f"{'='*60}")
        print(f"Base URL: {self.base_url}")
        print(f"Output: {self.data_dir}\n")

        max_pages = self.config.get('max_pages', 500)

        try:
            while self.pending_urls and len(self.visited_urls) < max_pages:
                url = self.pending_urls.popleft()

                if url in self.visited_urls:
                    continue

                self.visited_urls.add(url)
                self.scrape_page(url)

                if len(self.visited_urls) % 10 == 0:
                    print(f" [{len(self.visited_urls)} pages]")

            print(f"\n✅ Scraped {len(self.visited_urls)} pages")
        finally:
            self.save_summary()

    def save_summary(self):
        """Write ``summary.json`` listing title and URL of each scraped page."""
        summary = {
            'name': self.name,
            'total_pages': len(self.pages),
            'base_url': self.base_url,
            'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages]
        }

        with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

    def load_scraped_data(self):
        """Load all page JSON files from ``<data_dir>/pages``.

        Files are read in sorted name order so repeated builds produce the
        same output. Unreadable or malformed files are reported and skipped.
        Returns an empty list when the directory does not exist.
        """
        pages_dir = Path(self.data_dir) / "pages"
        if not pages_dir.exists():
            return []

        pages = []
        for json_file in sorted(pages_dir.glob("*.json")):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    pages.append(json.load(f))
            except (OSError, ValueError) as e:  # ValueError covers bad JSON
                print(f"⚠ Error loading {json_file}: {e}")

        return pages

    def smart_categorize(self, pages):
        """Assign each page to the first category whose keywords score >= 2.

        A keyword scores 3 when found in the URL, 2 in the title and 1 in the
        first 500 characters of the content. Pages matching no category go to
        ``'other'``. Categories come from the config, or are inferred from
        the URLs when none are configured. Empty categories are dropped.

        Returns:
            dict mapping category name to a list of page dicts.
        """
        category_defs = self.config.get('categories', {})

        # Default smart categories if none provided
        if not category_defs:
            category_defs = self.infer_categories(pages)

        categories = {cat: [] for cat in category_defs}
        categories['other'] = []

        for page in pages:
            url = page['url'].lower()
            title = page['title'].lower()
            content = page.get('content', '').lower()[:500]  # First 500 chars

            for cat, keywords in category_defs.items():
                score = 0
                for keyword in keywords:
                    keyword = keyword.lower()
                    if keyword in url:
                        score += 3
                    if keyword in title:
                        score += 2
                    if keyword in content:
                        score += 1

                if score >= 2:  # Threshold for categorization
                    categories[cat].append(page)
                    break
            else:
                # No category reached the threshold
                categories['other'].append(page)

        # Remove empty categories
        return {name: members for name, members in categories.items() if members}

    def infer_categories(self, pages):
        """Infer ``{category: [keywords]}`` from the pages' URL paths.

        The eight most frequent path segments (ignoring ``en``, ``stable``,
        ``latest`` and ``docs``) that occur at least three times each become
        a category keyed by themselves. ``tutorials`` and ``api`` defaults
        are added when the URLs suggest them.
        """
        ignored = {'en', 'stable', 'latest', 'docs'}
        urls = [page['url'] for page in pages]

        url_segments = defaultdict(int)
        for url in urls:
            for seg in urlparse(url).path.split('/'):
                if seg and seg not in ignored:
                    url_segments[seg] += 1

        # Top segments become categories
        top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8]

        categories = {}
        for seg, count in top_segments:
            if count >= 3:  # At least 3 pages
                categories[seg] = [seg]

        # Add common defaults
        if 'tutorial' not in categories and any('tutorial' in url for url in urls):
            categories['tutorials'] = ['tutorial', 'guide', 'getting-started']

        if 'api' not in categories and any('api' in url or 'reference' in url for url in urls):
            categories['api'] = ['api', 'reference', 'class']

        return categories

    def generate_quick_reference(self, pages):
        """Collect up to 15 distinct patterns with code shorter than 300 chars.

        Patterns are taken in page order; a pattern whose code was already
        seen is skipped.
        """
        quick_ref = []
        seen_codes = set()

        for page in pages:
            for pattern in page.get('patterns', []):
                code = pattern['code']
                if code in seen_codes or len(code) >= 300:
                    continue
                quick_ref.append(pattern)
                seen_codes.add(code)
                if len(quick_ref) >= 15:
                    return quick_ref

        return quick_ref

    @staticmethod
    def _sample_parts(sample):
        """Return ``(language, code)`` for a code sample.

        Samples are normally ``{'code': ..., 'language': ...}`` dicts; a bare
        string is treated as code of unknown language.
        """
        if isinstance(sample, str):
            return 'unknown', sample
        return sample.get('language', 'unknown'), sample.get('code', '')

    def create_reference_file(self, category, pages):
        """Write ``references/<category>.md`` describing ``pages``.

        For each page: title, URL, a table of contents from the first ten
        headings, up to 2500 characters of text and up to four code samples
        (600 characters each). Does nothing when ``pages`` is empty.
        """
        if not pages:
            return

        lines = []
        lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n")
        lines.append(f"**Pages:** {len(pages)}\n")
        lines.append("---\n")

        for page in pages:
            lines.append(f"## {page['title']}\n")
            lines.append(f"**URL:** {page['url']}\n")

            # Table of contents from headings
            if page.get('headings'):
                lines.append("**Contents:**")
                for h in page['headings'][:10]:
                    level = int(h['level'][1]) if len(h['level']) > 1 else 1
                    # Two spaces per level: Markdown needs at least that much
                    # for an item to render as a nested list. h1/h2 are
                    # top-level entries.
                    indent = "  " * max(0, level - 2)
                    lines.append(f"{indent}- {h['text']}")
                lines.append("")

            # Content
            if page.get('content'):
                content = page['content'][:2500]
                if len(page['content']) > 2500:
                    content += "\n\n*[Content truncated]*"
                lines.append(content)
                lines.append("")

            # Code examples with language
            if page.get('code_samples'):
                lines.append("**Examples:**\n")
                for i, sample in enumerate(page['code_samples'][:4], 1):
                    lang, code = self._sample_parts(sample)
                    lines.append(f"Example {i} ({lang}):")
                    lines.append(f"```{lang}")
                    lines.append(code[:600])
                    if len(code) > 600:
                        lines.append("...")
                    lines.append("```\n")

            lines.append("---\n")

        filepath = os.path.join(self.skill_dir, "references", f"{category}.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

        print(f" ✓ {category}.md ({len(pages)} pages)")

    def _collect_example_codes(self, categories, limit=10):
        """Pick up to ``limit`` short code samples with a known language.

        Looks at the first three pages of each category and the first two
        samples of each page; only code shorter than 200 characters counts.
        """
        examples = []
        for pages in categories.values():
            for page in pages[:3]:
                for sample in page.get('code_samples', [])[:2]:
                    lang, code = self._sample_parts(sample)
                    if len(code) < 200 and lang != 'unknown':
                        examples.append((lang, code))
                        if len(examples) >= limit:
                            return examples
        return examples

    def create_enhanced_skill_md(self, categories, quick_ref):
        """Write ``SKILL.md`` with quick-reference patterns and real examples.

        Args:
            categories: ``{category: [page, ...]}`` as returned by
                ``smart_categorize``.
            quick_ref: list of pattern dicts (``description``/``code``); the
                first eight are included.
        """
        description = self.config.get('description', f'Comprehensive assistance with {self.name}')

        # Extract actual code examples from docs
        example_codes = self._collect_example_codes(categories)

        content = f"""---
name: {self.name}
description: {description}
---

# {self.name.title()} Skill

Comprehensive assistance with {self.name} development, generated from official documentation.

## When to Use This Skill

This skill should be triggered when:
- Working with {self.name}
- Asking about {self.name} features or APIs
- Implementing {self.name} solutions
- Debugging {self.name} code
- Learning {self.name} best practices

## Quick Reference

### Common Patterns

"""

        # Add actual quick reference patterns
        if quick_ref:
            for i, pattern in enumerate(quick_ref[:8], 1):
                content += f"**Pattern {i}:** {pattern.get('description', 'Example pattern')}\n\n"
                content += "```\n"
                content += pattern.get('code', '')[:300]
                content += "\n```\n\n"
        else:
            content += "*Quick reference patterns will be added as you use the skill.*\n\n"

        # Add example codes from docs
        if example_codes:
            content += "### Example Code Patterns\n\n"
            for i, (lang, code) in enumerate(example_codes[:5], 1):
                content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n"

        content += """## Reference Files

This skill includes comprehensive documentation in `references/`:

"""

        for cat in sorted(categories.keys()):
            content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n"

        content += """
Use `view` to read specific reference files when detailed information is needed.

## Working with This Skill

### For Beginners
Start with the getting_started or tutorials reference files for foundational concepts.

### For Specific Features
Use the appropriate category reference file (api, guides, etc.) for detailed information.

### For Code Examples
The quick reference section above contains common patterns extracted from the official docs.

## Resources

### references/
Organized documentation extracted from official sources. These files contain:
- Detailed explanations
- Code examples with language annotations
- Links to original documentation
- Table of contents for quick navigation

### scripts/
Add helper scripts here for common automation tasks.

### assets/
Add templates, boilerplate, or example projects here.

## Notes

- This skill was automatically generated from official documentation
- Reference files preserve the structure and examples from source docs
- Code examples include language detection for better syntax highlighting
- Quick reference patterns are extracted from common usage examples in the docs

## Updating

To refresh this skill with updated documentation:
1. Re-run the scraper with the same configuration
2. The skill will be rebuilt with the latest information
"""

        filepath = os.path.join(self.skill_dir, "SKILL.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        print(f" ✓ SKILL.md (enhanced with {len(example_codes)} examples)")

    def create_index(self, categories):
        """Write ``references/index.md`` listing each category and its size."""
        lines = []
        lines.append(f"# {self.name.title()} Documentation Index\n")
        lines.append("## Categories\n")

        for cat, pages in sorted(categories.items()):
            lines.append(f"### {cat.replace('_', ' ').title()}")
            lines.append(f"**File:** `{cat}.md`")
            lines.append(f"**Pages:** {len(pages)}\n")

        filepath = os.path.join(self.skill_dir, "references", "index.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

        print(" ✓ index.md")

    def build_skill(self):
        """Build the skill directory from previously scraped page files.

        Returns:
            True on success, False when no scraped data could be loaded.
        """
        print(f"\n{'='*60}")
        print(f"BUILDING SKILL: {self.name}")
        print(f"{'='*60}\n")

        # Load data
        print("Loading scraped data...")
        pages = self.load_scraped_data()

        if not pages:
            print("✗ No scraped data found!")
            return False

        print(f" ✓ Loaded {len(pages)} pages\n")

        # Categorize
        print("Categorizing pages...")
        categories = self.smart_categorize(pages)
        print(f" ✓ Created {len(categories)} categories\n")

        # Generate quick reference
        print("Generating quick reference...")
        quick_ref = self.generate_quick_reference(pages)
        print(f" ✓ Extracted {len(quick_ref)} patterns\n")

        # Create reference files
        print("Creating reference files...")
        for cat, cat_pages in categories.items():
            self.create_reference_file(cat, cat_pages)

        # Create index
        self.create_index(categories)
        print()

        # Create enhanced SKILL.md
        print("Creating SKILL.md...")
        self.create_enhanced_skill_md(categories, quick_ref)

        print(f"\n✅ Skill built: {self.skill_dir}/")
        return True


def load_config(config_path):
    """Load a JSON configuration file and return it as a dict."""
    with open(config_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _prompt_number(prompt, default, cast):
    """Ask for a number until the answer parses; empty input gives ``default``."""
    while True:
        raw = input(prompt).strip()
        if not raw:
            return default
        try:
            return cast(raw)
        except ValueError:
            print(f" Invalid number: {raw}")


def interactive_config():
    """Build a configuration dict by prompting the user on stdin."""
    print("\n" + "="*60)
    print("Documentation to Skill Converter")
    print("="*60 + "\n")

    config = {}

    # Basic info
    config['name'] = input("Skill name (e.g., 'react', 'godot'): ").strip()
    config['description'] = input("Skill description: ").strip()
    config['base_url'] = input("Base URL (e.g., https://docs.example.com/): ").strip()

    if not config['base_url'].endswith('/'):
        config['base_url'] += '/'

    # Selectors
    print("\nCSS Selectors (press Enter for defaults):")
    selectors = {}
    selectors['main_content'] = input(" Main content [div[role='main']]: ").strip() or "div[role='main']"
    selectors['title'] = input(" Title [title]: ").strip() or "title"
    selectors['code_blocks'] = input(" Code blocks [pre code]: ").strip() or "pre code"
    config['selectors'] = selectors

    # URL patterns
    print("\nURL Patterns (comma-separated, optional):")
    include = input(" Include: ").strip()
    exclude = input(" Exclude: ").strip()
    config['url_patterns'] = {
        'include': [p.strip() for p in include.split(',') if p.strip()],
        'exclude': [p.strip() for p in exclude.split(',') if p.strip()]
    }

    # Settings (re-prompt instead of crashing on a non-numeric answer)
    config['rate_limit'] = _prompt_number("\nRate limit (seconds) [0.5]: ", 0.5, float)
    config['max_pages'] = _prompt_number("Max pages [500]: ", 500, int)

    return config


def check_existing_data(name):
    """Report whether a scrape summary for ``name`` exists.

    Returns:
        ``(True, total_pages)`` when ``output/<name>_data/summary.json``
        exists, otherwise ``(False, 0)``.
    """
    summary_path = f"output/{name}_data/summary.json"
    if os.path.exists(summary_path):
        # The summary is written as UTF-8, so read it back the same way.
        with open(summary_path, 'r', encoding='utf-8') as f:
            summary = json.load(f)
        return True, summary.get('total_pages', 0)
    return False, 0


def _run_enhancer(script, skill_path, extra_args=(), success_message=None):
    """Run an enhancement script on the built skill; never raises on failure.

    The script path is resolved relative to the current directory. A missing
    script is reported with the manual command instead of being launched.
    """
    if not os.path.exists(script):
        print(f"\n⚠ {script} not found. Run manually:")
        print(f" python3 {script} {skill_path}")
        return False

    # Use the interpreter running this script rather than whatever
    # "python3" happens to be on PATH.
    command = [sys.executable or 'python3', script, skill_path, *extra_args]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError):
        print("\n⚠ Enhancement failed, but skill was still built")
        return False

    if success_message:
        print(success_message)
    return True


def main():
    """Command-line entry point: configure, scrape, build, optionally enhance."""
    parser = argparse.ArgumentParser(
        description='Convert documentation websites to Claude skills',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument('--interactive', '-i', action='store_true',
                        help='Interactive configuration mode')
    parser.add_argument('--config', '-c', type=str,
                        help='Load configuration from file (e.g., configs/godot.json)')
    parser.add_argument('--name', type=str,
                        help='Skill name')
    parser.add_argument('--url', type=str,
                        help='Base documentation URL')
    parser.add_argument('--description', '-d', type=str,
                        help='Skill description')
    parser.add_argument('--skip-scrape', action='store_true',
                        help='Skip scraping, use existing data')
    parser.add_argument('--enhance', action='store_true',
                        help='Enhance SKILL.md using Claude API after building (requires API key)')
    parser.add_argument('--enhance-local', action='store_true',
                        help='Enhance SKILL.md using Claude Code in new terminal (no API key needed)')
    parser.add_argument('--api-key', type=str,
                        help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)')

    args = parser.parse_args()

    # Get configuration
    if args.config:
        config = load_config(args.config)
    elif args.interactive or not (args.name and args.url):
        config = interactive_config()
    else:
        config = {
            'name': args.name,
            'description': args.description or f'Comprehensive assistance with {args.name}',
            'base_url': args.url,
            'selectors': {
                'main_content': "div[role='main']",
                'title': 'title',
                'code_blocks': 'pre code'
            },
            'url_patterns': {'include': [], 'exclude': []},
            'rate_limit': 0.5,
            'max_pages': 500
        }

    # Check for existing data
    exists, page_count = check_existing_data(config['name'])

    if exists and not args.skip_scrape:
        print(f"\n✓ Found existing data: {page_count} pages")
        response = input("Use existing data? (y/n): ").strip().lower()
        if response == 'y':
            args.skip_scrape = True

    # Create converter
    converter = DocToSkillConverter(config)

    # Scrape or skip
    if not args.skip_scrape:
        try:
            converter.scrape_all()
        except KeyboardInterrupt:
            print("\n\nScraping interrupted.")
            response = input("Continue with skill building? (y/n): ").strip().lower()
            if response != 'y':
                return
    else:
        print("\n⏭️ Skipping scrape, using existing data")

    # Build skill
    success = converter.build_skill()

    if not success:
        sys.exit(1)

    skill_path = f"output/{config['name']}/"

    # Optional enhancement with Claude API
    if args.enhance:
        print(f"\n{'='*60}")
        print("ENHANCING SKILL.MD WITH CLAUDE API")
        print(f"{'='*60}\n")

        # NOTE: a key passed as an argument is visible in the process list;
        # setting ANTHROPIC_API_KEY in the environment avoids that.
        extra_args = ['--api-key', args.api_key] if args.api_key else []
        _run_enhancer('enhance_skill.py', skill_path, extra_args,
                      success_message="\n✅ Enhancement complete!")

    # Optional enhancement with Claude Code (local, no API key)
    if args.enhance_local:
        print(f"\n{'='*60}")
        print("ENHANCING SKILL.MD WITH CLAUDE CODE (LOCAL)")
        print(f"{'='*60}\n")

        _run_enhancer('enhance_skill_local.py', skill_path)

    print("\n📦 Package your skill:")
    print(f" python3 /mnt/skills/examples/skill-creator/scripts/package_skill.py output/{config['name']}/")

    if not args.enhance and not args.enhance_local:
        print("\n💡 Optional: Enhance SKILL.md with Claude:")
        print(f" API-based: python3 enhance_skill.py output/{config['name']}/")
        print(" or re-run with: --enhance")
        print(f" Local (no API key): python3 enhance_skill_local.py output/{config['name']}/")
        print(" or re-run with: --enhance-local")


if __name__ == "__main__":
    main()