#!/usr/bin/env python3 """ Config Splitter for Large Documentation Sites Splits large documentation configs into multiple smaller, focused skill configs. Supports multiple splitting strategies: category-based, size-based, and automatic. """ import argparse import json import sys from collections import defaultdict from pathlib import Path from typing import Any class ConfigSplitter: """Splits large documentation configs into multiple focused configs""" def __init__(self, config_path: str, strategy: str = "auto", target_pages: int = 5000): self.config_path = Path(config_path) self.strategy = strategy self.target_pages = target_pages self.config = self.load_config() self.base_name = self.config["name"] def load_config(self) -> dict[str, Any]: """Load configuration from file""" try: with open(self.config_path) as f: return json.load(f) except FileNotFoundError: print(f"❌ Error: Config file not found: {self.config_path}") sys.exit(1) except json.JSONDecodeError as e: print(f"❌ Error: Invalid JSON in config file: {e}") sys.exit(1) def is_unified_config(self) -> bool: """Check if this is a unified multi-source config""" return "sources" in self.config def get_split_strategy(self) -> str: """Determine split strategy""" # For unified configs, default to source-based splitting if self.is_unified_config(): if self.strategy == "auto": num_sources = len(self.config.get("sources", [])) if num_sources <= 1: print("ℹ️ Single source unified config - no splitting needed") return "none" else: print( f"ℹ️ Multi-source unified config ({num_sources} sources) - source split recommended" ) return "source" # For unified configs, only 'source' and 'none' strategies are valid elif self.strategy in ["source", "none"]: return self.strategy else: print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs") print("ℹ️ Using 'source' strategy instead") return "source" # Check if strategy is defined in config (documentation configs) if "split_strategy" in self.config: config_strategy = self.config["split_strategy"] if config_strategy != "none": return config_strategy # Use provided strategy or auto-detect (documentation configs) if self.strategy == "auto": max_pages = self.config.get("max_pages", 500) if max_pages < 5000: print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed") return "none" elif max_pages < 10000 and "categories" in self.config: print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended") return "category" elif "categories" in self.config and len(self.config["categories"]) >= 3: print( f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended" ) return "router" else: print(f"ℹ️ Large documentation ({max_pages} pages) - size-based split") return "size" return self.strategy def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]: """Split config by categories""" if "categories" not in self.config: print("❌ Error: No categories defined in config") sys.exit(1) categories = self.config["categories"] split_categories = self.config.get("split_config", {}).get("split_by_categories") # If specific categories specified, use only those if split_categories: categories = {k: v for k, v in categories.items() if k in split_categories} configs = [] for category_name, keywords in categories.items(): # Create new config for this category new_config = self.config.copy() new_config["name"] = f"{self.base_name}-{category_name}" new_config["description"] = ( f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}" ) # Update URL patterns to focus on this category url_patterns = new_config.get("url_patterns", {}) # Add category keywords to includes includes = url_patterns.get("include", []) for keyword in keywords: if keyword.startswith("/"): includes.append(keyword) if includes: url_patterns["include"] = list(set(includes)) new_config["url_patterns"] = url_patterns # Keep only this category new_config["categories"] = {category_name: keywords} # Remove split config from child if "split_strategy" in new_config: del new_config["split_strategy"] if "split_config" in new_config: del new_config["split_config"] # Adjust max_pages estimate if "max_pages" in new_config: new_config["max_pages"] = self.target_pages configs.append(new_config) print(f"✅ Created {len(configs)} category-based configs") # Optionally create router config if create_router: router_config = self.create_router_config(configs) configs.insert(0, router_config) print(f"✅ Created router config: {router_config['name']}") return configs def split_by_size(self) -> list[dict[str, Any]]: """Split config by size (page count)""" max_pages = self.config.get("max_pages", 500) num_splits = (max_pages + self.target_pages - 1) // self.target_pages configs = [] for i in range(num_splits): new_config = self.config.copy() part_num = i + 1 new_config["name"] = f"{self.base_name}-part{part_num}" new_config["description"] = ( f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}" ) new_config["max_pages"] = self.target_pages # Remove split config from child if "split_strategy" in new_config: del new_config["split_strategy"] if "split_config" in new_config: del new_config["split_config"] configs.append(new_config) print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)") return configs def split_by_source(self) -> list[dict[str, Any]]: """Split unified config by source type""" if not self.is_unified_config(): print("❌ Error: Config is not a unified config (missing 'sources' key)") sys.exit(1) sources = self.config.get("sources", []) if not sources: print("❌ Error: No sources defined in unified config") sys.exit(1) configs = [] source_type_counts = defaultdict(int) for source in sources: source_type = source.get("type", "unknown") source_type_counts[source_type] += 1 count = source_type_counts[source_type] # Create new config for this source new_config = { "name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""), "description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}", "sources": [source], # Single source per config } # Copy merge_mode if it exists if "merge_mode" in self.config: new_config["merge_mode"] = self.config["merge_mode"] configs.append(new_config) print(f"✅ Created {len(configs)} source-based configs") # Show breakdown by source type for source_type, count in source_type_counts.items(): print(f" 📄 {count}x {source_type}") return configs def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]: """Create a router config that references sub-skills""" router_name = self.config.get("split_config", {}).get("router_name", self.base_name) router_config = { "name": router_name, "description": self.config.get("description", ""), "base_url": self.config["base_url"], "selectors": self.config["selectors"], "url_patterns": self.config.get("url_patterns", {}), "rate_limit": self.config.get("rate_limit", 0.5), "max_pages": 500, # Router only needs overview pages "_router": True, "_sub_skills": [cfg["name"] for cfg in sub_configs], "_routing_keywords": { cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs }, } return router_config def split(self) -> list[dict[str, Any]]: """Execute split based on strategy""" strategy = self.get_split_strategy() config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION" print(f"\n{'=' * 60}") print(f"CONFIG SPLITTER: {self.base_name} ({config_type})") print(f"{'=' * 60}") print(f"Strategy: {strategy}") if not self.is_unified_config(): print(f"Target pages per skill: {self.target_pages}") print("") if strategy == "none": print("ℹ️ No splitting required") return [self.config] elif strategy == "source": return self.split_by_source() elif strategy == "category": return self.split_by_category(create_router=False) elif strategy == "router": create_router = self.config.get("split_config", {}).get("create_router", True) return self.split_by_category(create_router=create_router) elif strategy == "size": return self.split_by_size() else: print(f"❌ Error: Unknown strategy: {strategy}") sys.exit(1) def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]: """Save configs to files""" if output_dir is None: output_dir = self.config_path.parent output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) saved_files = [] for config in configs: filename = f"{config['name']}.json" filepath = output_dir / filename with open(filepath, "w") as f: json.dump(config, f, indent=2) saved_files.append(filepath) print(f" 💾 Saved: {filepath}") return saved_files def main(): parser = argparse.ArgumentParser( description="Split large documentation configs into multiple focused skills", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Auto-detect strategy python3 split_config.py configs/godot.json # Use category-based split python3 split_config.py configs/godot.json --strategy category # Use router + categories python3 split_config.py configs/godot.json --strategy router # Custom target size python3 split_config.py configs/godot.json --target-pages 3000 # Dry run (don't save files) python3 split_config.py configs/godot.json --dry-run Split Strategies: none - No splitting (single skill) auto - Automatically choose best strategy source - Split unified configs by source type (docs, github, pdf) category - Split by categories defined in config router - Create router + category-based sub-skills size - Split by page count Config Types: Documentation - Single base_url config (supports: category, router, size) Unified - Multi-source config (supports: source) """, ) parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)") parser.add_argument( "--strategy", choices=["auto", "none", "source", "category", "router", "size"], default="auto", help="Splitting strategy (default: auto)", ) parser.add_argument( "--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)" ) parser.add_argument( "--output-dir", help="Output directory for configs (default: same as input)" ) parser.add_argument( "--dry-run", action="store_true", help="Show what would be created without saving files" ) args = parser.parse_args() # Create splitter splitter = ConfigSplitter(args.config, args.strategy, args.target_pages) # Split config configs = splitter.split() if args.dry_run: print(f"\n{'=' * 60}") print("DRY RUN - No files saved") print(f"{'=' * 60}") print(f"Would create {len(configs)} config files:") for cfg in configs: is_router = cfg.get("_router", False) router_marker = " (ROUTER)" if is_router else "" print(f" 📄 {cfg['name']}.json{router_marker}") else: print(f"\n{'=' * 60}") print("SAVING CONFIGS") print(f"{'=' * 60}") saved_files = splitter.save_configs(configs, args.output_dir) print(f"\n{'=' * 60}") print("NEXT STEPS") print(f"{'=' * 60}") print("1. Review generated configs") print("2. Scrape each config:") for filepath in saved_files: print(f" skill-seekers scrape --config {filepath}") print("3. Package skills:") print(" skill-seekers-package-multi configs/-*.json") print("") if __name__ == "__main__": main()