class RouterGenerator:
    """Generate a router ("hub") skill that points at specialized sub-skills.

    The generator loads every sub-skill config, derives a router name and a
    keyword -> sub-skill routing table from them, and can then render a
    router ``SKILL.md`` plus a router config JSON file.

    Attributes are plain instance attributes:
        config_paths: ``Path`` objects for the sub-skill config files.
        configs: the parsed JSON content of each config, in the same order.
        router_name: explicit name, or the one inferred from the first config.
        base_config: the first config; used as the template for shared
            settings (description, base_url, selectors, ...).
    """

    def __init__(self, config_paths: List[str], router_name: str = None):
        """Load all sub-skill configs and pick the router name.

        Args:
            config_paths: paths of the sub-skill config JSON files.
            router_name: optional explicit router name; when falsy the name
                is inferred via :meth:`infer_router_name`.

        Raises:
            ValueError: if ``config_paths`` is empty (there would be no
                config to use as the template).
        """
        if not config_paths:
            raise ValueError("At least one sub-skill config path is required")

        self.config_paths = [Path(p) for p in config_paths]
        self.configs = [self.load_config(p) for p in self.config_paths]
        self.router_name = router_name or self.infer_router_name()
        self.base_config = self.configs[0]  # Use first as template

    def load_config(self, path: Path) -> Dict[str, Any]:
        """Read and parse one JSON config file.

        On an unreadable file or invalid JSON an error line is printed and
        the process exits with status 1 (this is a CLI helper).
        """
        try:
            # Config files are JSON, so read them as UTF-8 instead of relying
            # on the platform's locale encoding.
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"❌ Error loading {path}: {e}")
            sys.exit(1)

    def infer_router_name(self) -> str:
        """Infer the router name from the first sub-skill's name.

        Returns the part of the first config's ``name`` before its first
        dash (the whole name when it has no dash), or ``"router"`` when
        there are no configs at all.
        """
        names = [cfg['name'] for cfg in self.configs]
        if not names:
            return "router"

        # split() without a dash present yields the whole name unchanged.
        return names[0].split('-')[0]

    def extract_routing_keywords(self) -> Dict[str, List[str]]:
        """Build the ``sub-skill name -> keywords`` routing table.

        Keywords are the config's category names followed by the topic part
        of the skill name (everything after the first dash). Duplicates are
        dropped while keeping first-seen order, because a config produced by
        a category split has a category key equal to its name suffix.
        """
        routing = {}

        for config in self.configs:
            name = config['name']
            keywords = list(config.get('categories', {}))

            _, dash, topic = name.partition('-')
            if dash:
                keywords.append(topic)

            # dict.fromkeys de-duplicates and preserves insertion order.
            routing[name] = list(dict.fromkeys(keywords))

        return routing

    def generate_skill_md(self) -> str:
        """Render the router ``SKILL.md`` content as a single string.

        NOTE: the "Examples" section is fixed template text (it mentions
        2D sprites, GDScript and physics) and is only parameterized by the
        router name; it is not derived from the loaded sub-skills.
        """
        routing_keywords = self.extract_routing_keywords()
        title = self.router_name.replace('-', ' ').title()
        when_to_use = self.base_config.get(
            'description',
            f'Use for {self.router_name} development and programming.'
        )

        # Collect the pieces and join once instead of growing a string.
        parts = [f"""# {title} Documentation (Router)

## When to Use This Skill

{when_to_use}

This is a router skill that directs your questions to specialized sub-skills for efficient, focused assistance.

## How It Works

This skill analyzes your question and activates the appropriate specialized skill(s):

"""]

        # List sub-skills, dropping a leading "<router name> - " from each
        # description. The comparison is case-insensitive and the prefix is
        # sliced off, so a description that merely starts with "<name> -"
        # without the full separator is left untouched instead of failing.
        prefix = f"{self.router_name} - "
        for config in self.configs:
            desc = config.get('description', '')
            if desc[:len(prefix)].lower() == prefix.lower():
                desc = desc[len(prefix):]
            parts.append(f"### {config['name']}\n{desc}\n\n")

        # Routing logic
        parts.append("""## Routing Logic

The router analyzes your question for topic keywords and activates relevant skills:

**Keywords → Skills:**
""")
        parts.extend(
            f"- {', '.join(keywords)} → **{skill_name}**\n"
            for skill_name, keywords in routing_keywords.items()
        )

        # Quick reference
        parts.append(f"""

## Quick Reference

For quick answers, this router provides basic overview information. For detailed documentation, the specialized skills contain comprehensive references.

### Getting Started

1. Ask your question naturally - mention the topic area
2. The router will activate the appropriate skill(s)
3. You'll receive focused, detailed answers from specialized documentation

### Examples

**Question:** "How do I create a 2D sprite?"
**Activates:** {self.router_name}-2d skill

**Question:** "GDScript function syntax"
**Activates:** {self.router_name}-scripting skill

**Question:** "Physics collision handling in 3D"
**Activates:** {self.router_name}-3d + {self.router_name}-physics skills

### All Available Skills

""")

        # List all skills
        parts.extend(f"- **{config['name']}**\n" for config in self.configs)

        parts.append("""

## Need Help?

Simply ask your question and mention the topic. The router will find the right specialized skill for you!

---

*This is a router skill. For complete documentation, see the specialized skills listed above.*
""")

        return "".join(parts)

    def create_router_config(self) -> Dict[str, Any]:
        """Build the router's config dict from the template (first) config.

        Raises:
            KeyError: if the template config has no ``base_url``.
        """
        base = self.base_config
        return {
            "name": self.router_name,
            "description": base.get('description', f'{self.router_name.title()} documentation router'),
            "base_url": base['base_url'],
            "selectors": base.get('selectors', {}),
            "url_patterns": base.get('url_patterns', {}),
            "rate_limit": base.get('rate_limit', 0.5),
            "max_pages": 500,  # Router only scrapes overview pages
            "_router": True,
            "_sub_skills": [cfg['name'] for cfg in self.configs],
            "_routing_keywords": self.extract_routing_keywords(),
        }

    # The return annotation is quoted on purpose: the module imports only
    # Dict/List/Any from typing, so an unquoted ``Tuple[...]`` is evaluated
    # when the class body runs and raises NameError.
    def generate(self, output_dir: Path = None) -> "tuple[Path, Path]":
        """Write the router ``SKILL.md`` and the router config to disk.

        Args:
            output_dir: directory for the router config JSON. Defaults to
                the directory of the first sub-skill config.

        Returns:
            ``(config_path, skill_path)`` where ``config_path`` is
            ``<output_dir>/<router_name>.json`` and ``skill_path`` is
            ``<output_dir>/../output/<router_name>/SKILL.md``.
        """
        if output_dir is None:
            output_dir = self.config_paths[0].parent

        output_dir = Path(output_dir)
        # A custom --output-dir may not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Generate SKILL.md. It contains non-ASCII characters (the arrow in
        # the routing table), so the encoding is pinned to UTF-8.
        skill_md = self.generate_skill_md()
        skill_path = output_dir.parent / f"output/{self.router_name}/SKILL.md"
        skill_path.parent.mkdir(parents=True, exist_ok=True)

        with open(skill_path, 'w', encoding='utf-8') as f:
            f.write(skill_md)

        # Generate config
        router_config = self.create_router_config()
        config_path = output_dir / f"{self.router_name}.json"

        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(router_config, f, indent=2)

        return config_path, skill_path
def package_skill(skill_dir: Path) -> bool:
    """Package a single skill by running the sibling ``package_skill.py``.

    The packager script is looked up next to this file and executed with
    the current interpreter, with its output captured.

    Args:
        skill_dir: directory of the skill to package.

    Returns:
        True when the packager exits with status 0, False otherwise
        (including when the process could not be started).
    """
    packager = Path(__file__).parent / "package_skill.py"

    try:
        result = subprocess.run(
            [sys.executable, str(packager), str(skill_dir)],
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"❌ Error packaging {skill_dir}: {e}")
        return False

    if result.returncode == 0:
        return True

    # The output is captured, so without this a failed run would leave the
    # user with no hint about what went wrong.
    details = (result.stderr or result.stdout or "").strip()
    for line in details.splitlines():
        print(f"   {line}")
    return False
class ConfigSplitter:
    """Split one large documentation config into several focused configs.

    Supported strategies are ``none``, ``category``, ``router`` (category
    split plus a router config) and ``size``; ``auto`` picks one of them
    from the config's ``max_pages`` and ``categories``.
    """

    # Top-level keys that describe how to split the parent config; they are
    # removed from every generated child config.
    _SPLIT_ONLY_KEYS = ('split_strategy', 'split_config')

    def __init__(self, config_path: str, strategy: str = "auto", target_pages: int = 5000):
        """Load the config at ``config_path`` and remember the split options.

        Args:
            config_path: path of the config JSON file to split.
            strategy: requested strategy (see :meth:`get_split_strategy`).
            target_pages: ``max_pages`` value given to each child config.
        """
        self.config_path = Path(config_path)
        self.strategy = strategy
        self.target_pages = target_pages
        self.config = self.load_config()
        self.base_name = self.config['name']

    def load_config(self) -> Dict[str, Any]:
        """Load configuration from ``self.config_path``.

        Prints an error and exits with status 1 when the file is missing or
        is not valid JSON.
        """
        try:
            with open(self.config_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"❌ Error: Config file not found: {self.config_path}")
            sys.exit(1)
        except json.JSONDecodeError as e:
            print(f"❌ Error: Invalid JSON in config file: {e}")
            sys.exit(1)

    def get_split_strategy(self) -> str:
        """Determine the split strategy to use.

        Precedence: a ``split_strategy`` in the config other than ``"none"``
        wins over the constructor argument. Otherwise ``"auto"`` is resolved
        from ``max_pages`` (default 500) and the presence/number of
        categories, and any other requested strategy is returned as-is.
        """
        config_strategy = self.config.get('split_strategy', "none")
        if config_strategy != "none":
            return config_strategy

        if self.strategy != "auto":
            return self.strategy

        max_pages = self.config.get('max_pages', 500)
        has_categories = 'categories' in self.config

        if max_pages < 5000:
            print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed")
            return "none"
        if max_pages < 10000 and has_categories:
            print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended")
            return "category"
        if has_categories and len(self.config['categories']) >= 3:
            print(f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended")
            return "router"

        print(f"ℹ️ Large documentation ({max_pages} pages) - size-based split")
        return "size"

    def _make_child(self, name: str, description: str) -> Dict[str, Any]:
        """Return an independent copy of the parent config for one child."""
        import copy  # local import: this module does not import copy at file level

        # A deep copy is required: nested values such as url_patterns are
        # edited per child and must not be shared with the parent or with
        # sibling configs.
        child = copy.deepcopy(self.config)
        child['name'] = name
        child['description'] = description
        for key in self._SPLIT_ONLY_KEYS:
            child.pop(key, None)
        return child

    def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]:
        """Create one child config per category.

        When ``split_config.split_by_categories`` is set in the config, only
        those categories are used. Each child keeps just its own category,
        gets the category keywords that start with ``/`` appended to its
        ``url_patterns.include`` list (order-preserving, de-duplicated), and
        has ``max_pages`` replaced by ``target_pages`` when the key exists.

        Args:
            create_router: also create a router config and put it first.

        Returns:
            The list of generated configs. The parent config is not modified.
        """
        if 'categories' not in self.config:
            print("❌ Error: No categories defined in config")
            sys.exit(1)

        categories = self.config['categories']
        split_categories = self.config.get('split_config', {}).get('split_by_categories')

        # If specific categories specified, use only those
        if split_categories:
            categories = {k: v for k, v in categories.items() if k in split_categories}

        configs = []

        for category_name, keywords in categories.items():
            label = category_name.replace('_', ' ').title()
            new_config = self._make_child(
                f"{self.base_name}-{category_name}",
                f"{self.base_name.capitalize()} - {label}. {self.config.get('description', '')}",
            )

            # Focus the URL patterns on this category: path-like keywords
            # (those starting with '/') become additional include patterns.
            url_patterns = new_config.get('url_patterns', {})
            includes = list(url_patterns.get('include', []))
            includes.extend(keyword for keyword in keywords if keyword.startswith('/'))

            if includes:
                # dict.fromkeys keeps the original order, so the generated
                # JSON is stable from run to run.
                url_patterns['include'] = list(dict.fromkeys(includes))
                new_config['url_patterns'] = url_patterns

            # Keep only this category
            new_config['categories'] = {category_name: list(keywords)}

            # Adjust max_pages estimate
            if 'max_pages' in new_config:
                new_config['max_pages'] = self.target_pages

            configs.append(new_config)

        print(f"✅ Created {len(configs)} category-based configs")

        # Optionally create router config
        if create_router:
            router_config = self.create_router_config(configs)
            configs.insert(0, router_config)
            print(f"✅ Created router config: {router_config['name']}")

        return configs

    def split_by_size(self) -> List[Dict[str, Any]]:
        """Create ``ceil(max_pages / target_pages)`` numbered part configs.

        NOTE: the parts differ only in name and description; every part
        keeps the parent's URL scope and gets ``max_pages = target_pages``.
        """
        if self.target_pages <= 0:
            # Avoid a ZeroDivisionError / nonsensical negative split count.
            print(f"❌ Error: target pages must be positive, got {self.target_pages}")
            sys.exit(1)

        max_pages = self.config.get('max_pages', 500)
        num_splits = (max_pages + self.target_pages - 1) // self.target_pages  # ceiling division

        configs = []
        for part_num in range(1, num_splits + 1):
            new_config = self._make_child(
                f"{self.base_name}-part{part_num}",
                f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}",
            )
            new_config['max_pages'] = self.target_pages
            configs.append(new_config)

        print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
        return configs

    def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Create a router config that references the given sub-skills.

        The router name is ``split_config.router_name`` when present, else
        the parent's name.

        NOTE: with the default name the router file is ``<name>.json``; when
        it is saved next to a parent config with that same file name, the
        parent file is overwritten.

        Raises:
            KeyError: if the parent config has no ``base_url``.
        """
        router_name = self.config.get('split_config', {}).get('router_name', self.base_name)

        return {
            "name": router_name,
            "description": self.config.get('description', ''),
            "base_url": self.config['base_url'],
            # Optional, like the other scraper settings below.
            "selectors": self.config.get('selectors', {}),
            "url_patterns": self.config.get('url_patterns', {}),
            "rate_limit": self.config.get('rate_limit', 0.5),
            "max_pages": 500,  # Router only needs overview pages
            "_router": True,
            "_sub_skills": [cfg['name'] for cfg in sub_configs],
            "_routing_keywords": {
                cfg['name']: list(cfg.get('categories', {}).keys())
                for cfg in sub_configs
            },
        }

    def split(self) -> List[Dict[str, Any]]:
        """Resolve the strategy, print a banner and run the matching split.

        Returns the generated configs; for strategy ``none`` that is a list
        holding only the unmodified parent config. An unknown strategy
        prints an error and exits with status 1.
        """
        strategy = self.get_split_strategy()

        print(f"\n{'='*60}")
        print(f"CONFIG SPLITTER: {self.base_name}")
        print(f"{'='*60}")
        print(f"Strategy: {strategy}")
        print(f"Target pages per skill: {self.target_pages}")
        print("")

        if strategy == "none":
            print("ℹ️ No splitting required")
            return [self.config]

        if strategy == "category":
            return self.split_by_category(create_router=False)

        if strategy == "router":
            create_router = self.config.get('split_config', {}).get('create_router', True)
            return self.split_by_category(create_router=create_router)

        if strategy == "size":
            return self.split_by_size()

        print(f"❌ Error: Unknown strategy: {strategy}")
        sys.exit(1)

    def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]:
        """Write each config to ``<output_dir>/<config name>.json``.

        Args:
            configs: configs to save.
            output_dir: target directory (created if missing). Defaults to
                the directory of the parent config file.

        Returns:
            The paths of the written files, in the order of ``configs``.
        """
        if output_dir is None:
            output_dir = self.config_path.parent

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        saved_files = []

        for config in configs:
            filepath = output_dir / f"{config['name']}.json"

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(config, f, indent=2)

            saved_files.append(filepath)
            print(f" 💾 Saved: {filepath}")

        return saved_files
+ help='Show what would be created without saving files' + ) + + args = parser.parse_args() + + # Create splitter + splitter = ConfigSplitter(args.config, args.strategy, args.target_pages) + + # Split config + configs = splitter.split() + + if args.dry_run: + print(f"\n{'='*60}") + print("DRY RUN - No files saved") + print(f"{'='*60}") + print(f"Would create {len(configs)} config files:") + for cfg in configs: + is_router = cfg.get('_router', False) + router_marker = " (ROUTER)" if is_router else "" + print(f" 📄 {cfg['name']}.json{router_marker}") + else: + print(f"\n{'='*60}") + print("SAVING CONFIGS") + print(f"{'='*60}") + saved_files = splitter.save_configs(configs, args.output_dir) + + print(f"\n{'='*60}") + print("NEXT STEPS") + print(f"{'='*60}") + print("1. Review generated configs") + print("2. Scrape each config:") + for filepath in saved_files: + print(f" python3 cli/doc_scraper.py --config {filepath}") + print("3. Package skills:") + print(" python3 cli/package_multi.py configs/-*.json") + print("") + + +if __name__ == "__main__": + main() diff --git a/configs/godot-large-example.json b/configs/godot-large-example.json new file mode 100644 index 0000000..a4d04b9 --- /dev/null +++ b/configs/godot-large-example.json @@ -0,0 +1,63 @@ +{ + "name": "godot", + "description": "Godot Engine game development. 
Use for Godot projects, GDScript/C# coding, scene setup, node systems, 2D/3D development, physics, animation, UI, shaders, or any Godot-specific questions.", + "base_url": "https://docs.godotengine.org/en/stable/", + "start_urls": [ + "https://docs.godotengine.org/en/stable/getting_started/introduction/index.html", + "https://docs.godotengine.org/en/stable/tutorials/scripting/gdscript/index.html", + "https://docs.godotengine.org/en/stable/tutorials/2d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/3d/index.html", + "https://docs.godotengine.org/en/stable/tutorials/physics/index.html", + "https://docs.godotengine.org/en/stable/tutorials/animation/index.html", + "https://docs.godotengine.org/en/stable/classes/index.html" + ], + "selectors": { + "main_content": "div[role='main']", + "title": "title", + "code_blocks": "pre" + }, + "url_patterns": { + "include": [ + "/getting_started/", + "/tutorials/", + "/classes/" + ], + "exclude": [ + "/genindex.html", + "/search.html", + "/_static/", + "/_sources/" + ] + }, + "categories": { + "getting_started": ["introduction", "getting_started", "first", "your_first"], + "scripting": ["scripting", "gdscript", "c#", "csharp"], + "2d": ["/2d/", "sprite", "canvas", "tilemap"], + "3d": ["/3d/", "spatial", "mesh", "3d_"], + "physics": ["physics", "collision", "rigidbody", "characterbody"], + "animation": ["animation", "tween", "animationplayer"], + "ui": ["ui", "control", "gui", "theme"], + "shaders": ["shader", "material", "visual_shader"], + "audio": ["audio", "sound"], + "networking": ["networking", "multiplayer", "rpc"], + "export": ["export", "platform", "deploy"] + }, + "rate_limit": 0.5, + "max_pages": 40000, + + "_comment": "=== NEW: Split Strategy Configuration ===", + "split_strategy": "router", + "split_config": { + "target_pages_per_skill": 5000, + "create_router": true, + "split_by_categories": ["scripting", "2d", "3d", "physics", "shaders"], + "router_name": "godot", + "parallel_scraping": true + }, + + 
"_comment2": "=== NEW: Checkpoint Configuration ===", + "checkpoint": { + "enabled": true, + "interval": 1000 + } +} diff --git a/docs/LARGE_DOCUMENTATION.md b/docs/LARGE_DOCUMENTATION.md new file mode 100644 index 0000000..bff2bc5 --- /dev/null +++ b/docs/LARGE_DOCUMENTATION.md @@ -0,0 +1,431 @@ +# Handling Large Documentation Sites (10K+ Pages) + +Complete guide for scraping and managing large documentation sites with Skill Seeker. + +--- + +## Table of Contents + +- [When to Split Documentation](#when-to-split-documentation) +- [Split Strategies](#split-strategies) +- [Quick Start](#quick-start) +- [Detailed Workflows](#detailed-workflows) +- [Best Practices](#best-practices) +- [Examples](#examples) +- [Troubleshooting](#troubleshooting) + +--- + +## When to Split Documentation + +### Size Guidelines + +| Documentation Size | Recommendation | Strategy | +|-------------------|----------------|----------| +| < 5,000 pages | **One skill** | No splitting needed | +| 5,000 - 10,000 pages | **Consider splitting** | Category-based | +| 10,000 - 30,000 pages | **Recommended** | Router + Categories | +| 30,000+ pages | **Strongly recommended** | Router + Categories | + +### Why Split Large Documentation? + +**Benefits:** +- ✅ Faster scraping (parallel execution) +- ✅ More focused skills (better Claude performance) +- ✅ Easier maintenance (update one topic at a time) +- ✅ Better user experience (precise answers) +- ✅ Avoids context window limits + +**Trade-offs:** +- ⚠️ Multiple skills to manage +- ⚠️ Initial setup more complex +- ⚠️ Router adds one extra skill + +--- + +## Split Strategies + +### 1. **No Split** (One Big Skill) +**Best for:** Small to medium documentation (< 5K pages) + +```bash +# Just use the config as-is +python3 cli/doc_scraper.py --config configs/react.json +``` + +**Pros:** Simple, one skill to maintain +**Cons:** Can be slow for large docs, may hit limits + +--- + +### 2. 
**Category Split** (Multiple Focused Skills) +**Best for:** 5K-15K pages with clear topic divisions + +```bash +# Auto-split by categories +python3 cli/split_config.py configs/godot.json --strategy category + +# Creates: +# - godot-scripting.json +# - godot-2d.json +# - godot-3d.json +# - godot-physics.json +# - etc. +``` + +**Pros:** Focused skills, clear separation +**Cons:** User must know which skill to use + +--- + +### 3. **Router + Categories** (Intelligent Hub) ⭐ RECOMMENDED +**Best for:** 10K+ pages, best user experience + +```bash +# Create router + sub-skills +python3 cli/split_config.py configs/godot.json --strategy router + +# Creates: +# - godot.json (router/hub) +# - godot-scripting.json +# - godot-2d.json +# - etc. +``` + +**Pros:** Best of both worlds, intelligent routing, natural UX +**Cons:** Slightly more complex setup + +--- + +### 4. **Size-Based Split** +**Best for:** Docs without clear categories + +```bash +# Split every 5000 pages +python3 cli/split_config.py configs/bigdocs.json --strategy size --target-pages 5000 + +# Creates: +# - bigdocs-part1.json +# - bigdocs-part2.json +# - bigdocs-part3.json +# - etc. +``` + +**Pros:** Simple, predictable +**Cons:** May split related topics + +--- + +## Quick Start + +### Option 1: Automatic (Recommended) + +```bash +# 1. Create config +python3 cli/doc_scraper.py --interactive +# Name: godot +# URL: https://docs.godotengine.org +# ... fill in prompts ... + +# 2. Estimate pages (discovers it's large) +python3 cli/estimate_pages.py configs/godot.json +# Output: ⚠️ 40,000 pages detected - splitting recommended + +# 3. Auto-split with router +python3 cli/split_config.py configs/godot.json --strategy router + +# 4. Scrape all sub-skills +for config in configs/godot-*.json; do + python3 cli/doc_scraper.py --config $config & +done +wait + +# 5. Generate router +python3 cli/generate_router.py configs/godot-*.json + +# 6. Package all +python3 cli/package_multi.py output/godot*/ + +# 7. 
Upload all .zip files to Claude +``` + +--- + +### Option 2: Manual Control + +```bash +# 1. Define split in config +nano configs/godot.json + +# Add: +{ + "split_strategy": "router", + "split_config": { + "target_pages_per_skill": 5000, + "create_router": true, + "split_by_categories": ["scripting", "2d", "3d", "physics"] + } +} + +# 2. Split +python3 cli/split_config.py configs/godot.json + +# 3. Continue as above... +``` + +--- + +## Detailed Workflows + +### Workflow 1: Router + Categories (40K Pages) + +**Scenario:** Godot documentation (40,000 pages) + +**Step 1: Estimate** +```bash +python3 cli/estimate_pages.py configs/godot.json + +# Output: +# Estimated: 40,000 pages +# Recommended: Split into 8 skills (5K each) +``` + +**Step 2: Split Configuration** +```bash +python3 cli/split_config.py configs/godot.json --strategy router --target-pages 5000 + +# Creates: +# configs/godot.json (router) +# configs/godot-scripting.json (5K pages) +# configs/godot-2d.json (8K pages) +# configs/godot-3d.json (10K pages) +# configs/godot-physics.json (6K pages) +# configs/godot-shaders.json (11K pages) +``` + +**Step 3: Scrape Sub-Skills (Parallel)** +```bash +# Open multiple terminals or use background jobs +python3 cli/doc_scraper.py --config configs/godot-scripting.json & +python3 cli/doc_scraper.py --config configs/godot-2d.json & +python3 cli/doc_scraper.py --config configs/godot-3d.json & +python3 cli/doc_scraper.py --config configs/godot-physics.json & +python3 cli/doc_scraper.py --config configs/godot-shaders.json & + +# Wait for all to complete +wait + +# Time: 4-8 hours (parallel) vs 20-40 hours (sequential) +``` + +**Step 4: Generate Router** +```bash +python3 cli/generate_router.py configs/godot-*.json + +# Creates: +# output/godot/SKILL.md (router skill) +``` + +**Step 5: Package All** +```bash +python3 cli/package_multi.py output/godot*/ + +# Creates: +# output/godot.zip (router) +# output/godot-scripting.zip +# output/godot-2d.zip +# output/godot-3d.zip +# 
output/godot-physics.zip +# output/godot-shaders.zip +``` + +**Step 6: Upload to Claude** +Upload all 6 .zip files to Claude. The router will intelligently direct queries to the right sub-skill! + +--- + +### Workflow 2: Category Split Only (15K Pages) + +**Scenario:** Vue.js documentation (15,000 pages) + +**No router needed - just focused skills:** + +```bash +# 1. Split +python3 cli/split_config.py configs/vue.json --strategy category + +# 2. Scrape each +for config in configs/vue-*.json; do + python3 cli/doc_scraper.py --config $config +done + +# 3. Package +python3 cli/package_multi.py output/vue*/ + +# 4. Upload all to Claude +``` + +**Result:** 5 focused Vue skills (components, reactivity, routing, etc.) + +--- + +## Best Practices + +### 1. **Choose Target Size Wisely** + +```bash +# Small focused skills (3K-5K pages) - more skills, very focused +python3 cli/split_config.py config.json --target-pages 3000 + +# Medium skills (5K-8K pages) - balanced (RECOMMENDED) +python3 cli/split_config.py config.json --target-pages 5000 + +# Larger skills (8K-10K pages) - fewer skills, broader +python3 cli/split_config.py config.json --target-pages 8000 +``` + +### 2. **Use Parallel Scraping** + +```bash +# Serial (slow - 40 hours) +for config in configs/godot-*.json; do + python3 cli/doc_scraper.py --config $config +done + +# Parallel (fast - 8 hours) ⭐ +for config in configs/godot-*.json; do + python3 cli/doc_scraper.py --config $config & +done +wait +``` + +### 3. **Test Before Full Scrape** + +```bash +# Test with limited pages first +nano configs/godot-2d.json +# Set: "max_pages": 50 + +python3 cli/doc_scraper.py --config configs/godot-2d.json + +# If output looks good, increase to full +``` + +### 4. 
**Use Checkpoints for Long Scrapes** + +```bash +# Enable checkpoints in config +{ + "checkpoint": { + "enabled": true, + "interval": 1000 + } +} + +# If scrape fails, resume +python3 cli/doc_scraper.py --config config.json --resume +``` + +--- + +## Examples + +### Example 1: AWS Documentation (Hypothetical 50K Pages) + +```bash +# 1. Split by AWS services +python3 cli/split_config.py configs/aws.json --strategy router --target-pages 5000 + +# Creates ~10 skills: +# - aws (router) +# - aws-compute (EC2, Lambda) +# - aws-storage (S3, EBS) +# - aws-database (RDS, DynamoDB) +# - etc. + +# 2. Scrape in parallel (overnight) +# 3. Upload all skills to Claude +# 4. User asks "How do I create an S3 bucket?" +# 5. Router activates aws-storage skill +# 6. Focused, accurate answer! +``` + +### Example 2: Microsoft Docs (100K+ Pages) + +```bash +# Too large even with splitting - use selective categories + +# Only scrape key topics +python3 cli/split_config.py configs/microsoft.json --strategy category + +# Edit configs to include only: +# - microsoft-azure (Azure docs only) +# - microsoft-dotnet (.NET docs only) +# - microsoft-typescript (TS docs only) + +# Skip less relevant sections +``` + +--- + +## Troubleshooting + +### Issue: "Splitting creates too many skills" + +**Solution:** Increase target size or combine categories + +```bash +# Instead of 5K per skill, use 8K +python3 cli/split_config.py config.json --target-pages 8000 + +# Or manually combine categories in config +``` + +### Issue: "Router not routing correctly" + +**Solution:** Check routing keywords in router SKILL.md + +```bash +# Review router +cat output/godot/SKILL.md + +# Update keywords if needed +nano output/godot/SKILL.md +``` + +### Issue: "Parallel scraping fails" + +**Solution:** Reduce parallelism or check rate limits + +```bash +# Scrape 2-3 at a time instead of all +python3 cli/doc_scraper.py --config config1.json & +python3 cli/doc_scraper.py --config config2.json & +wait + +python3 
cli/doc_scraper.py --config config3.json & +python3 cli/doc_scraper.py --config config4.json & +wait +``` + +--- + +## Summary + +**For 40K+ Page Documentation:** + +1. ✅ **Estimate first**: `python3 cli/estimate_pages.py config.json` +2. ✅ **Split with router**: `python3 cli/split_config.py config.json --strategy router` +3. ✅ **Scrape in parallel**: Multiple terminals or background jobs +4. ✅ **Generate router**: `python3 cli/generate_router.py configs/*-*.json` +5. ✅ **Package all**: `python3 cli/package_multi.py output/*/` +6. ✅ **Upload to Claude**: All .zip files + +**Result:** Intelligent, fast, focused skills that work seamlessly together! + +--- + +**Questions? See:** +- [Main README](../README.md) +- [MCP Setup Guide](MCP_SETUP.md) +- [Enhancement Guide](ENHANCEMENT.md) diff --git a/mcp/server.py b/mcp/server.py index 2134cd4..69b4c8d 100644 --- a/mcp/server.py +++ b/mcp/server.py @@ -150,6 +150,53 @@ async def list_tools() -> list[Tool]: "required": ["config_path"], }, ), + Tool( + name="split_config", + description="Split large documentation config into multiple focused skills. For 10K+ page documentation.", + inputSchema={ + "type": "object", + "properties": { + "config_path": { + "type": "string", + "description": "Path to config JSON file (e.g., configs/godot.json)", + }, + "strategy": { + "type": "string", + "description": "Split strategy: auto, none, category, router, size (default: auto)", + "default": "auto", + }, + "target_pages": { + "type": "integer", + "description": "Target pages per skill (default: 5000)", + "default": 5000, + }, + "dry_run": { + "type": "boolean", + "description": "Preview without saving files (default: false)", + "default": False, + }, + }, + "required": ["config_path"], + }, + ), + Tool( + name="generate_router", + description="Generate router/hub skill for split documentation. 
async def generate_router_tool(args: dict) -> list[TextContent]:
    """Generate a router skill for split documentation.

    Expands ``args["config_pattern"]`` as a glob, then runs
    ``generate_router.py`` from ``CLI_DIR`` on the matching files, passing
    ``--name`` when ``args["router_name"]`` is set.

    Returns:
        A single text item: the script's stdout on success, its stderr and
        stdout on failure, or a message when the pattern matches no files.
    """
    import glob

    config_pattern = args["config_pattern"]
    router_name = args.get("router_name")

    # glob returns matches in arbitrary order. Sorting keeps the order of the
    # files handed to the generator (and therefore which one comes first)
    # the same from run to run.
    config_files = sorted(glob.glob(config_pattern))

    if not config_files:
        return [TextContent(type="text", text=f"❌ No config files match pattern: {config_pattern}")]

    # Run generate_router.py
    cmd = [
        sys.executable,
        str(CLI_DIR / "generate_router.py"),
        *config_files,
    ]

    if router_name:
        cmd.extend(["--name", router_name])

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        return [TextContent(type="text", text=result.stdout)]

    return [TextContent(type="text", text=f"Error: {result.stderr}\n\n{result.stdout}")]