391 lines
14 KiB
Python
391 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Config Splitter for Large Documentation Sites
|
||
|
||
Splits large documentation configs into multiple smaller, focused skill configs.
|
||
Supports multiple splitting strategies: category-based, size-based, and automatic.
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import sys
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
|
||
class ConfigSplitter:
|
||
"""Splits large documentation configs into multiple focused configs"""
|
||
|
||
def __init__(self, config_path: str, strategy: str = "auto", target_pages: int = 5000):
|
||
self.config_path = Path(config_path)
|
||
self.strategy = strategy
|
||
self.target_pages = target_pages
|
||
self.config = self.load_config()
|
||
self.base_name = self.config["name"]
|
||
|
||
def load_config(self) -> dict[str, Any]:
|
||
"""Load configuration from file"""
|
||
try:
|
||
with open(self.config_path) as f:
|
||
return json.load(f)
|
||
except FileNotFoundError:
|
||
print(f"❌ Error: Config file not found: {self.config_path}")
|
||
sys.exit(1)
|
||
except json.JSONDecodeError as e:
|
||
print(f"❌ Error: Invalid JSON in config file: {e}")
|
||
sys.exit(1)
|
||
|
||
def is_unified_config(self) -> bool:
|
||
"""Check if this is a unified multi-source config"""
|
||
return "sources" in self.config
|
||
|
||
def get_split_strategy(self) -> str:
|
||
"""Determine split strategy"""
|
||
# For unified configs, default to source-based splitting
|
||
if self.is_unified_config():
|
||
if self.strategy == "auto":
|
||
num_sources = len(self.config.get("sources", []))
|
||
if num_sources <= 1:
|
||
print("ℹ️ Single source unified config - no splitting needed")
|
||
return "none"
|
||
else:
|
||
print(
|
||
f"ℹ️ Multi-source unified config ({num_sources} sources) - source split recommended"
|
||
)
|
||
return "source"
|
||
# For unified configs, only 'source' and 'none' strategies are valid
|
||
elif self.strategy in ["source", "none"]:
|
||
return self.strategy
|
||
else:
|
||
print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs")
|
||
print("ℹ️ Using 'source' strategy instead")
|
||
return "source"
|
||
|
||
# Check if strategy is defined in config (documentation configs)
|
||
if "split_strategy" in self.config:
|
||
config_strategy = self.config["split_strategy"]
|
||
if config_strategy != "none":
|
||
return config_strategy
|
||
|
||
# Use provided strategy or auto-detect (documentation configs)
|
||
if self.strategy == "auto":
|
||
max_pages = self.config.get("max_pages", 500)
|
||
|
||
if max_pages < 5000:
|
||
print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed")
|
||
return "none"
|
||
elif max_pages < 10000 and "categories" in self.config:
|
||
print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended")
|
||
return "category"
|
||
elif "categories" in self.config and len(self.config["categories"]) >= 3:
|
||
print(
|
||
f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended"
|
||
)
|
||
return "router"
|
||
else:
|
||
print(f"ℹ️ Large documentation ({max_pages} pages) - size-based split")
|
||
return "size"
|
||
|
||
return self.strategy
|
||
|
||
def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]:
|
||
"""Split config by categories"""
|
||
if "categories" not in self.config:
|
||
print("❌ Error: No categories defined in config")
|
||
sys.exit(1)
|
||
|
||
categories = self.config["categories"]
|
||
split_categories = self.config.get("split_config", {}).get("split_by_categories")
|
||
|
||
# If specific categories specified, use only those
|
||
if split_categories:
|
||
categories = {k: v for k, v in categories.items() if k in split_categories}
|
||
|
||
configs = []
|
||
|
||
for category_name, keywords in categories.items():
|
||
# Create new config for this category
|
||
new_config = self.config.copy()
|
||
new_config["name"] = f"{self.base_name}-{category_name}"
|
||
new_config["description"] = (
|
||
f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
|
||
)
|
||
|
||
# Update URL patterns to focus on this category
|
||
url_patterns = new_config.get("url_patterns", {})
|
||
|
||
# Add category keywords to includes
|
||
includes = url_patterns.get("include", [])
|
||
for keyword in keywords:
|
||
if keyword.startswith("/"):
|
||
includes.append(keyword)
|
||
|
||
if includes:
|
||
url_patterns["include"] = list(set(includes))
|
||
new_config["url_patterns"] = url_patterns
|
||
|
||
# Keep only this category
|
||
new_config["categories"] = {category_name: keywords}
|
||
|
||
# Remove split config from child
|
||
if "split_strategy" in new_config:
|
||
del new_config["split_strategy"]
|
||
if "split_config" in new_config:
|
||
del new_config["split_config"]
|
||
|
||
# Adjust max_pages estimate
|
||
if "max_pages" in new_config:
|
||
new_config["max_pages"] = self.target_pages
|
||
|
||
configs.append(new_config)
|
||
|
||
print(f"✅ Created {len(configs)} category-based configs")
|
||
|
||
# Optionally create router config
|
||
if create_router:
|
||
router_config = self.create_router_config(configs)
|
||
configs.insert(0, router_config)
|
||
print(f"✅ Created router config: {router_config['name']}")
|
||
|
||
return configs
|
||
|
||
def split_by_size(self) -> list[dict[str, Any]]:
|
||
"""Split config by size (page count)"""
|
||
max_pages = self.config.get("max_pages", 500)
|
||
num_splits = (max_pages + self.target_pages - 1) // self.target_pages
|
||
|
||
configs = []
|
||
|
||
for i in range(num_splits):
|
||
new_config = self.config.copy()
|
||
part_num = i + 1
|
||
new_config["name"] = f"{self.base_name}-part{part_num}"
|
||
new_config["description"] = (
|
||
f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
|
||
)
|
||
new_config["max_pages"] = self.target_pages
|
||
|
||
# Remove split config from child
|
||
if "split_strategy" in new_config:
|
||
del new_config["split_strategy"]
|
||
if "split_config" in new_config:
|
||
del new_config["split_config"]
|
||
|
||
configs.append(new_config)
|
||
|
||
print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
|
||
return configs
|
||
|
||
def split_by_source(self) -> list[dict[str, Any]]:
|
||
"""Split unified config by source type"""
|
||
if not self.is_unified_config():
|
||
print("❌ Error: Config is not a unified config (missing 'sources' key)")
|
||
sys.exit(1)
|
||
|
||
sources = self.config.get("sources", [])
|
||
if not sources:
|
||
print("❌ Error: No sources defined in unified config")
|
||
sys.exit(1)
|
||
|
||
configs = []
|
||
source_type_counts = defaultdict(int)
|
||
|
||
for source in sources:
|
||
source_type = source.get("type", "unknown")
|
||
source_type_counts[source_type] += 1
|
||
count = source_type_counts[source_type]
|
||
|
||
# Create new config for this source
|
||
new_config = {
|
||
"name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
|
||
"description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
|
||
"sources": [source], # Single source per config
|
||
}
|
||
|
||
# Copy merge_mode if it exists
|
||
if "merge_mode" in self.config:
|
||
new_config["merge_mode"] = self.config["merge_mode"]
|
||
|
||
configs.append(new_config)
|
||
|
||
print(f"✅ Created {len(configs)} source-based configs")
|
||
|
||
# Show breakdown by source type
|
||
for source_type, count in source_type_counts.items():
|
||
print(f" 📄 {count}x {source_type}")
|
||
|
||
return configs
|
||
|
||
def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]:
|
||
"""Create a router config that references sub-skills"""
|
||
router_name = self.config.get("split_config", {}).get("router_name", self.base_name)
|
||
|
||
router_config = {
|
||
"name": router_name,
|
||
"description": self.config.get("description", ""),
|
||
"base_url": self.config["base_url"],
|
||
"selectors": self.config["selectors"],
|
||
"url_patterns": self.config.get("url_patterns", {}),
|
||
"rate_limit": self.config.get("rate_limit", 0.5),
|
||
"max_pages": 500, # Router only needs overview pages
|
||
"_router": True,
|
||
"_sub_skills": [cfg["name"] for cfg in sub_configs],
|
||
"_routing_keywords": {
|
||
cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs
|
||
},
|
||
}
|
||
|
||
return router_config
|
||
|
||
def split(self) -> list[dict[str, Any]]:
|
||
"""Execute split based on strategy"""
|
||
strategy = self.get_split_strategy()
|
||
|
||
config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION"
|
||
print(f"\n{'=' * 60}")
|
||
print(f"CONFIG SPLITTER: {self.base_name} ({config_type})")
|
||
print(f"{'=' * 60}")
|
||
print(f"Strategy: {strategy}")
|
||
if not self.is_unified_config():
|
||
print(f"Target pages per skill: {self.target_pages}")
|
||
print("")
|
||
|
||
if strategy == "none":
|
||
print("ℹ️ No splitting required")
|
||
return [self.config]
|
||
|
||
elif strategy == "source":
|
||
return self.split_by_source()
|
||
|
||
elif strategy == "category":
|
||
return self.split_by_category(create_router=False)
|
||
|
||
elif strategy == "router":
|
||
create_router = self.config.get("split_config", {}).get("create_router", True)
|
||
return self.split_by_category(create_router=create_router)
|
||
|
||
elif strategy == "size":
|
||
return self.split_by_size()
|
||
|
||
else:
|
||
print(f"❌ Error: Unknown strategy: {strategy}")
|
||
sys.exit(1)
|
||
|
||
def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]:
|
||
"""Save configs to files"""
|
||
if output_dir is None:
|
||
output_dir = self.config_path.parent
|
||
|
||
output_dir = Path(output_dir)
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
saved_files = []
|
||
|
||
for config in configs:
|
||
filename = f"{config['name']}.json"
|
||
filepath = output_dir / filename
|
||
|
||
with open(filepath, "w") as f:
|
||
json.dump(config, f, indent=2)
|
||
|
||
saved_files.append(filepath)
|
||
print(f" 💾 Saved: {filepath}")
|
||
|
||
return saved_files
|
||
|
||
|
||
def main():
|
||
parser = argparse.ArgumentParser(
|
||
description="Split large documentation configs into multiple focused skills",
|
||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||
epilog="""
|
||
Examples:
|
||
# Auto-detect strategy
|
||
python3 split_config.py configs/godot.json
|
||
|
||
# Use category-based split
|
||
python3 split_config.py configs/godot.json --strategy category
|
||
|
||
# Use router + categories
|
||
python3 split_config.py configs/godot.json --strategy router
|
||
|
||
# Custom target size
|
||
python3 split_config.py configs/godot.json --target-pages 3000
|
||
|
||
# Dry run (don't save files)
|
||
python3 split_config.py configs/godot.json --dry-run
|
||
|
||
Split Strategies:
|
||
none - No splitting (single skill)
|
||
auto - Automatically choose best strategy
|
||
source - Split unified configs by source type (docs, github, pdf)
|
||
category - Split by categories defined in config
|
||
router - Create router + category-based sub-skills
|
||
size - Split by page count
|
||
|
||
Config Types:
|
||
Documentation - Single base_url config (supports: category, router, size)
|
||
Unified - Multi-source config (supports: source)
|
||
""",
|
||
)
|
||
|
||
parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)")
|
||
|
||
parser.add_argument(
|
||
"--strategy",
|
||
choices=["auto", "none", "source", "category", "router", "size"],
|
||
default="auto",
|
||
help="Splitting strategy (default: auto)",
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)"
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--output-dir", help="Output directory for configs (default: same as input)"
|
||
)
|
||
|
||
parser.add_argument(
|
||
"--dry-run", action="store_true", help="Show what would be created without saving files"
|
||
)
|
||
|
||
args = parser.parse_args()
|
||
|
||
# Create splitter
|
||
splitter = ConfigSplitter(args.config, args.strategy, args.target_pages)
|
||
|
||
# Split config
|
||
configs = splitter.split()
|
||
|
||
if args.dry_run:
|
||
print(f"\n{'=' * 60}")
|
||
print("DRY RUN - No files saved")
|
||
print(f"{'=' * 60}")
|
||
print(f"Would create {len(configs)} config files:")
|
||
for cfg in configs:
|
||
is_router = cfg.get("_router", False)
|
||
router_marker = " (ROUTER)" if is_router else ""
|
||
print(f" 📄 {cfg['name']}.json{router_marker}")
|
||
else:
|
||
print(f"\n{'=' * 60}")
|
||
print("SAVING CONFIGS")
|
||
print(f"{'=' * 60}")
|
||
saved_files = splitter.save_configs(configs, args.output_dir)
|
||
|
||
print(f"\n{'=' * 60}")
|
||
print("NEXT STEPS")
|
||
print(f"{'=' * 60}")
|
||
print("1. Review generated configs")
|
||
print("2. Scrape each config:")
|
||
for filepath in saved_files:
|
||
print(f" skill-seekers scrape --config {filepath}")
|
||
print("3. Package skills:")
|
||
print(" skill-seekers-package-multi configs/<name>-*.json")
|
||
print("")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|