This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -6,12 +6,12 @@ Splits large documentation configs into multiple smaller, focused skill configs.
Supports multiple splitting strategies: category-based, size-based, and automatic.
"""
import argparse
import json
import sys
import argparse
from pathlib import Path
from typing import Dict, List, Any, Tuple
from collections import defaultdict
from pathlib import Path
from typing import Any
class ConfigSplitter:
@@ -22,12 +22,12 @@ class ConfigSplitter:
self.strategy = strategy
self.target_pages = target_pages
self.config = self.load_config()
self.base_name = self.config['name']
self.base_name = self.config["name"]
def load_config(self) -> Dict[str, Any]:
def load_config(self) -> dict[str, Any]:
"""Load configuration from file"""
try:
with open(self.config_path, 'r') as f:
with open(self.config_path) as f:
return json.load(f)
except FileNotFoundError:
print(f"❌ Error: Config file not found: {self.config_path}")
@@ -38,45 +38,45 @@ class ConfigSplitter:
def is_unified_config(self) -> bool:
    """Check if this is a unified multi-source config.

    Returns:
        True when the loaded config has a top-level "sources" key
        (whatever its value, including an empty list), False otherwise.
    """
    # Only the presence of the key is tested; callers that need at least
    # one source check the list contents themselves.
    return "sources" in self.config
def get_split_strategy(self) -> str:
"""Determine split strategy"""
# For unified configs, default to source-based splitting
if self.is_unified_config():
if self.strategy == "auto":
num_sources = len(self.config.get('sources', []))
num_sources = len(self.config.get("sources", []))
if num_sources <= 1:
print(f" Single source unified config - no splitting needed")
print(" Single source unified config - no splitting needed")
return "none"
else:
print(f" Multi-source unified config ({num_sources} sources) - source split recommended")
return "source"
# For unified configs, only 'source' and 'none' strategies are valid
elif self.strategy in ['source', 'none']:
elif self.strategy in ["source", "none"]:
return self.strategy
else:
print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs")
print(f" Using 'source' strategy instead")
print(" Using 'source' strategy instead")
return "source"
# Check if strategy is defined in config (documentation configs)
if 'split_strategy' in self.config:
config_strategy = self.config['split_strategy']
if "split_strategy" in self.config:
config_strategy = self.config["split_strategy"]
if config_strategy != "none":
return config_strategy
# Use provided strategy or auto-detect (documentation configs)
if self.strategy == "auto":
max_pages = self.config.get('max_pages', 500)
max_pages = self.config.get("max_pages", 500)
if max_pages < 5000:
print(f" Small documentation ({max_pages} pages) - no splitting needed")
return "none"
elif max_pages < 10000 and 'categories' in self.config:
elif max_pages < 10000 and "categories" in self.config:
print(f" Medium documentation ({max_pages} pages) - category split recommended")
return "category"
elif 'categories' in self.config and len(self.config['categories']) >= 3:
elif "categories" in self.config and len(self.config["categories"]) >= 3:
print(f" Large documentation ({max_pages} pages) - router + categories recommended")
return "router"
else:
@@ -85,14 +85,14 @@ class ConfigSplitter:
return self.strategy
def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]:
def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]:
"""Split config by categories"""
if 'categories' not in self.config:
if "categories" not in self.config:
print("❌ Error: No categories defined in config")
sys.exit(1)
categories = self.config['categories']
split_categories = self.config.get('split_config', {}).get('split_by_categories')
categories = self.config["categories"]
split_categories = self.config.get("split_config", {}).get("split_by_categories")
# If specific categories specified, use only those
if split_categories:
@@ -103,34 +103,36 @@ class ConfigSplitter:
for category_name, keywords in categories.items():
# Create new config for this category
new_config = self.config.copy()
new_config['name'] = f"{self.base_name}-{category_name}"
new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
new_config["name"] = f"{self.base_name}-{category_name}"
new_config["description"] = (
f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
)
# Update URL patterns to focus on this category
url_patterns = new_config.get('url_patterns', {})
url_patterns = new_config.get("url_patterns", {})
# Add category keywords to includes
includes = url_patterns.get('include', [])
includes = url_patterns.get("include", [])
for keyword in keywords:
if keyword.startswith('/'):
if keyword.startswith("/"):
includes.append(keyword)
if includes:
url_patterns['include'] = list(set(includes))
new_config['url_patterns'] = url_patterns
url_patterns["include"] = list(set(includes))
new_config["url_patterns"] = url_patterns
# Keep only this category
new_config['categories'] = {category_name: keywords}
new_config["categories"] = {category_name: keywords}
# Remove split config from child
if 'split_strategy' in new_config:
del new_config['split_strategy']
if 'split_config' in new_config:
del new_config['split_config']
if "split_strategy" in new_config:
del new_config["split_strategy"]
if "split_config" in new_config:
del new_config["split_config"]
# Adjust max_pages estimate
if 'max_pages' in new_config:
new_config['max_pages'] = self.target_pages
if "max_pages" in new_config:
new_config["max_pages"] = self.target_pages
configs.append(new_config)
@@ -144,9 +146,9 @@ class ConfigSplitter:
return configs
def split_by_size(self) -> List[Dict[str, Any]]:
def split_by_size(self) -> list[dict[str, Any]]:
"""Split config by size (page count)"""
max_pages = self.config.get('max_pages', 500)
max_pages = self.config.get("max_pages", 500)
num_splits = (max_pages + self.target_pages - 1) // self.target_pages
configs = []
@@ -154,28 +156,30 @@ class ConfigSplitter:
for i in range(num_splits):
new_config = self.config.copy()
part_num = i + 1
new_config['name'] = f"{self.base_name}-part{part_num}"
new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
new_config['max_pages'] = self.target_pages
new_config["name"] = f"{self.base_name}-part{part_num}"
new_config["description"] = (
f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
)
new_config["max_pages"] = self.target_pages
# Remove split config from child
if 'split_strategy' in new_config:
del new_config['split_strategy']
if 'split_config' in new_config:
del new_config['split_config']
if "split_strategy" in new_config:
del new_config["split_strategy"]
if "split_config" in new_config:
del new_config["split_config"]
configs.append(new_config)
print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
return configs
def split_by_source(self) -> List[Dict[str, Any]]:
def split_by_source(self) -> list[dict[str, Any]]:
"""Split unified config by source type"""
if not self.is_unified_config():
print("❌ Error: Config is not a unified config (missing 'sources' key)")
sys.exit(1)
sources = self.config.get('sources', [])
sources = self.config.get("sources", [])
if not sources:
print("❌ Error: No sources defined in unified config")
sys.exit(1)
@@ -184,20 +188,20 @@ class ConfigSplitter:
source_type_counts = defaultdict(int)
for source in sources:
source_type = source.get('type', 'unknown')
source_type = source.get("type", "unknown")
source_type_counts[source_type] += 1
count = source_type_counts[source_type]
# Create new config for this source
new_config = {
'name': f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
'description': f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
'sources': [source] # Single source per config
"name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
"description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
"sources": [source], # Single source per config
}
# Copy merge_mode if it exists
if 'merge_mode' in self.config:
new_config['merge_mode'] = self.config['merge_mode']
if "merge_mode" in self.config:
new_config["merge_mode"] = self.config["merge_mode"]
configs.append(new_config)
@@ -209,36 +213,33 @@ class ConfigSplitter:
return configs
def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]:
    """Create a router config that references sub-skills.

    Args:
        sub_configs: The split configs the router should point at. Each
            entry must have a "name" key; its optional "categories"
            mapping supplies the routing keywords for that sub-skill.

    Returns:
        A new config dict carrying the scraping settings of the loaded
        config plus the router-specific keys "_router", "_sub_skills"
        and "_routing_keywords".

    Raises:
        KeyError: If the loaded config has no "base_url" or "selectors",
            or a sub-config has no "name".
    """
    # An explicit router name in split_config wins over the base name.
    router_name = self.config.get("split_config", {}).get("router_name", self.base_name)

    router_config = {
        "name": router_name,
        "description": self.config.get("description", ""),
        "base_url": self.config["base_url"],
        "selectors": self.config["selectors"],
        "url_patterns": self.config.get("url_patterns", {}),
        "rate_limit": self.config.get("rate_limit", 0.5),
        "max_pages": 500,  # Router only needs overview pages
        "_router": True,
        "_sub_skills": [cfg["name"] for cfg in sub_configs],
        # Category names of each sub-config double as its routing keywords.
        "_routing_keywords": {cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs},
    }

    return router_config
def split(self) -> List[Dict[str, Any]]:
def split(self) -> list[dict[str, Any]]:
"""Execute split based on strategy"""
strategy = self.get_split_strategy()
config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION"
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"CONFIG SPLITTER: {self.base_name} ({config_type})")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"Strategy: {strategy}")
if not self.is_unified_config():
print(f"Target pages per skill: {self.target_pages}")
@@ -255,7 +256,7 @@ class ConfigSplitter:
return self.split_by_category(create_router=False)
elif strategy == "router":
create_router = self.config.get('split_config', {}).get('create_router', True)
create_router = self.config.get("split_config", {}).get("create_router", True)
return self.split_by_category(create_router=create_router)
elif strategy == "size":
@@ -265,7 +266,7 @@ class ConfigSplitter:
print(f"❌ Error: Unknown strategy: {strategy}")
sys.exit(1)
def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]:
def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]:
"""Save configs to files"""
if output_dir is None:
output_dir = self.config_path.parent
@@ -279,7 +280,7 @@ class ConfigSplitter:
filename = f"{config['name']}.json"
filepath = output_dir / filename
with open(filepath, 'w') as f:
with open(filepath, "w") as f:
json.dump(config, f, indent=2)
saved_files.append(filepath)
@@ -320,38 +321,23 @@ Split Strategies:
Config Types:
Documentation - Single base_url config (supports: category, router, size)
Unified - Multi-source config (supports: source)
"""
""",
)
parser.add_argument(
'config',
help='Path to config file (e.g., configs/godot.json)'
)
parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)")
parser.add_argument(
'--strategy',
choices=['auto', 'none', 'source', 'category', 'router', 'size'],
default='auto',
help='Splitting strategy (default: auto)'
"--strategy",
choices=["auto", "none", "source", "category", "router", "size"],
default="auto",
help="Splitting strategy (default: auto)",
)
parser.add_argument(
'--target-pages',
type=int,
default=5000,
help='Target pages per skill (default: 5000)'
)
parser.add_argument("--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)")
parser.add_argument(
'--output-dir',
help='Output directory for configs (default: same as input)'
)
parser.add_argument("--output-dir", help="Output directory for configs (default: same as input)")
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be created without saving files'
)
parser.add_argument("--dry-run", action="store_true", help="Show what would be created without saving files")
args = parser.parse_args()
@@ -362,23 +348,23 @@ Config Types:
configs = splitter.split()
if args.dry_run:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("DRY RUN - No files saved")
print(f"{'='*60}")
print(f"{'=' * 60}")
print(f"Would create {len(configs)} config files:")
for cfg in configs:
is_router = cfg.get('_router', False)
is_router = cfg.get("_router", False)
router_marker = " (ROUTER)" if is_router else ""
print(f" 📄 {cfg['name']}.json{router_marker}")
else:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("SAVING CONFIGS")
print(f"{'='*60}")
print(f"{'=' * 60}")
saved_files = splitter.save_configs(configs, args.output_dir)
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print("NEXT STEPS")
print(f"{'='*60}")
print(f"{'=' * 60}")
print("1. Review generated configs")
print("2. Scrape each config:")
for filepath in saved_files: