run ruff
This commit is contained in:
@@ -6,12 +6,12 @@ Splits large documentation configs into multiple smaller, focused skill configs.
|
||||
Supports multiple splitting strategies: category-based, size-based, and automatic.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
class ConfigSplitter:
|
||||
@@ -22,12 +22,12 @@ class ConfigSplitter:
|
||||
self.strategy = strategy
|
||||
self.target_pages = target_pages
|
||||
self.config = self.load_config()
|
||||
self.base_name = self.config['name']
|
||||
self.base_name = self.config["name"]
|
||||
|
||||
def load_config(self) -> Dict[str, Any]:
|
||||
def load_config(self) -> dict[str, Any]:
|
||||
"""Load configuration from file"""
|
||||
try:
|
||||
with open(self.config_path, 'r') as f:
|
||||
with open(self.config_path) as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"❌ Error: Config file not found: {self.config_path}")
|
||||
@@ -38,45 +38,45 @@ class ConfigSplitter:
|
||||
|
||||
def is_unified_config(self) -> bool:
|
||||
"""Check if this is a unified multi-source config"""
|
||||
return 'sources' in self.config
|
||||
return "sources" in self.config
|
||||
|
||||
def get_split_strategy(self) -> str:
|
||||
"""Determine split strategy"""
|
||||
# For unified configs, default to source-based splitting
|
||||
if self.is_unified_config():
|
||||
if self.strategy == "auto":
|
||||
num_sources = len(self.config.get('sources', []))
|
||||
num_sources = len(self.config.get("sources", []))
|
||||
if num_sources <= 1:
|
||||
print(f"ℹ️ Single source unified config - no splitting needed")
|
||||
print("ℹ️ Single source unified config - no splitting needed")
|
||||
return "none"
|
||||
else:
|
||||
print(f"ℹ️ Multi-source unified config ({num_sources} sources) - source split recommended")
|
||||
return "source"
|
||||
# For unified configs, only 'source' and 'none' strategies are valid
|
||||
elif self.strategy in ['source', 'none']:
|
||||
elif self.strategy in ["source", "none"]:
|
||||
return self.strategy
|
||||
else:
|
||||
print(f"⚠️ Warning: Strategy '{self.strategy}' not supported for unified configs")
|
||||
print(f"ℹ️ Using 'source' strategy instead")
|
||||
print("ℹ️ Using 'source' strategy instead")
|
||||
return "source"
|
||||
|
||||
# Check if strategy is defined in config (documentation configs)
|
||||
if 'split_strategy' in self.config:
|
||||
config_strategy = self.config['split_strategy']
|
||||
if "split_strategy" in self.config:
|
||||
config_strategy = self.config["split_strategy"]
|
||||
if config_strategy != "none":
|
||||
return config_strategy
|
||||
|
||||
# Use provided strategy or auto-detect (documentation configs)
|
||||
if self.strategy == "auto":
|
||||
max_pages = self.config.get('max_pages', 500)
|
||||
max_pages = self.config.get("max_pages", 500)
|
||||
|
||||
if max_pages < 5000:
|
||||
print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed")
|
||||
return "none"
|
||||
elif max_pages < 10000 and 'categories' in self.config:
|
||||
elif max_pages < 10000 and "categories" in self.config:
|
||||
print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended")
|
||||
return "category"
|
||||
elif 'categories' in self.config and len(self.config['categories']) >= 3:
|
||||
elif "categories" in self.config and len(self.config["categories"]) >= 3:
|
||||
print(f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended")
|
||||
return "router"
|
||||
else:
|
||||
@@ -85,14 +85,14 @@ class ConfigSplitter:
|
||||
|
||||
return self.strategy
|
||||
|
||||
def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]:
|
||||
def split_by_category(self, create_router: bool = False) -> list[dict[str, Any]]:
|
||||
"""Split config by categories"""
|
||||
if 'categories' not in self.config:
|
||||
if "categories" not in self.config:
|
||||
print("❌ Error: No categories defined in config")
|
||||
sys.exit(1)
|
||||
|
||||
categories = self.config['categories']
|
||||
split_categories = self.config.get('split_config', {}).get('split_by_categories')
|
||||
categories = self.config["categories"]
|
||||
split_categories = self.config.get("split_config", {}).get("split_by_categories")
|
||||
|
||||
# If specific categories specified, use only those
|
||||
if split_categories:
|
||||
@@ -103,34 +103,36 @@ class ConfigSplitter:
|
||||
for category_name, keywords in categories.items():
|
||||
# Create new config for this category
|
||||
new_config = self.config.copy()
|
||||
new_config['name'] = f"{self.base_name}-{category_name}"
|
||||
new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
|
||||
new_config["name"] = f"{self.base_name}-{category_name}"
|
||||
new_config["description"] = (
|
||||
f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}"
|
||||
)
|
||||
|
||||
# Update URL patterns to focus on this category
|
||||
url_patterns = new_config.get('url_patterns', {})
|
||||
url_patterns = new_config.get("url_patterns", {})
|
||||
|
||||
# Add category keywords to includes
|
||||
includes = url_patterns.get('include', [])
|
||||
includes = url_patterns.get("include", [])
|
||||
for keyword in keywords:
|
||||
if keyword.startswith('/'):
|
||||
if keyword.startswith("/"):
|
||||
includes.append(keyword)
|
||||
|
||||
if includes:
|
||||
url_patterns['include'] = list(set(includes))
|
||||
new_config['url_patterns'] = url_patterns
|
||||
url_patterns["include"] = list(set(includes))
|
||||
new_config["url_patterns"] = url_patterns
|
||||
|
||||
# Keep only this category
|
||||
new_config['categories'] = {category_name: keywords}
|
||||
new_config["categories"] = {category_name: keywords}
|
||||
|
||||
# Remove split config from child
|
||||
if 'split_strategy' in new_config:
|
||||
del new_config['split_strategy']
|
||||
if 'split_config' in new_config:
|
||||
del new_config['split_config']
|
||||
if "split_strategy" in new_config:
|
||||
del new_config["split_strategy"]
|
||||
if "split_config" in new_config:
|
||||
del new_config["split_config"]
|
||||
|
||||
# Adjust max_pages estimate
|
||||
if 'max_pages' in new_config:
|
||||
new_config['max_pages'] = self.target_pages
|
||||
if "max_pages" in new_config:
|
||||
new_config["max_pages"] = self.target_pages
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
@@ -144,9 +146,9 @@ class ConfigSplitter:
|
||||
|
||||
return configs
|
||||
|
||||
def split_by_size(self) -> List[Dict[str, Any]]:
|
||||
def split_by_size(self) -> list[dict[str, Any]]:
|
||||
"""Split config by size (page count)"""
|
||||
max_pages = self.config.get('max_pages', 500)
|
||||
max_pages = self.config.get("max_pages", 500)
|
||||
num_splits = (max_pages + self.target_pages - 1) // self.target_pages
|
||||
|
||||
configs = []
|
||||
@@ -154,28 +156,30 @@ class ConfigSplitter:
|
||||
for i in range(num_splits):
|
||||
new_config = self.config.copy()
|
||||
part_num = i + 1
|
||||
new_config['name'] = f"{self.base_name}-part{part_num}"
|
||||
new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
|
||||
new_config['max_pages'] = self.target_pages
|
||||
new_config["name"] = f"{self.base_name}-part{part_num}"
|
||||
new_config["description"] = (
|
||||
f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}"
|
||||
)
|
||||
new_config["max_pages"] = self.target_pages
|
||||
|
||||
# Remove split config from child
|
||||
if 'split_strategy' in new_config:
|
||||
del new_config['split_strategy']
|
||||
if 'split_config' in new_config:
|
||||
del new_config['split_config']
|
||||
if "split_strategy" in new_config:
|
||||
del new_config["split_strategy"]
|
||||
if "split_config" in new_config:
|
||||
del new_config["split_config"]
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)")
|
||||
return configs
|
||||
|
||||
def split_by_source(self) -> List[Dict[str, Any]]:
|
||||
def split_by_source(self) -> list[dict[str, Any]]:
|
||||
"""Split unified config by source type"""
|
||||
if not self.is_unified_config():
|
||||
print("❌ Error: Config is not a unified config (missing 'sources' key)")
|
||||
sys.exit(1)
|
||||
|
||||
sources = self.config.get('sources', [])
|
||||
sources = self.config.get("sources", [])
|
||||
if not sources:
|
||||
print("❌ Error: No sources defined in unified config")
|
||||
sys.exit(1)
|
||||
@@ -184,20 +188,20 @@ class ConfigSplitter:
|
||||
source_type_counts = defaultdict(int)
|
||||
|
||||
for source in sources:
|
||||
source_type = source.get('type', 'unknown')
|
||||
source_type = source.get("type", "unknown")
|
||||
source_type_counts[source_type] += 1
|
||||
count = source_type_counts[source_type]
|
||||
|
||||
# Create new config for this source
|
||||
new_config = {
|
||||
'name': f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
|
||||
'description': f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
|
||||
'sources': [source] # Single source per config
|
||||
"name": f"{self.base_name}-{source_type}" + (f"-{count}" if count > 1 else ""),
|
||||
"description": f"{self.base_name.capitalize()} - {source_type.title()} source. {self.config.get('description', '')}",
|
||||
"sources": [source], # Single source per config
|
||||
}
|
||||
|
||||
# Copy merge_mode if it exists
|
||||
if 'merge_mode' in self.config:
|
||||
new_config['merge_mode'] = self.config['merge_mode']
|
||||
if "merge_mode" in self.config:
|
||||
new_config["merge_mode"] = self.config["merge_mode"]
|
||||
|
||||
configs.append(new_config)
|
||||
|
||||
@@ -209,36 +213,33 @@ class ConfigSplitter:
|
||||
|
||||
return configs
|
||||
|
||||
def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
def create_router_config(self, sub_configs: list[dict[str, Any]]) -> dict[str, Any]:
|
||||
"""Create a router config that references sub-skills"""
|
||||
router_name = self.config.get('split_config', {}).get('router_name', self.base_name)
|
||||
router_name = self.config.get("split_config", {}).get("router_name", self.base_name)
|
||||
|
||||
router_config = {
|
||||
"name": router_name,
|
||||
"description": self.config.get('description', ''),
|
||||
"base_url": self.config['base_url'],
|
||||
"selectors": self.config['selectors'],
|
||||
"url_patterns": self.config.get('url_patterns', {}),
|
||||
"rate_limit": self.config.get('rate_limit', 0.5),
|
||||
"description": self.config.get("description", ""),
|
||||
"base_url": self.config["base_url"],
|
||||
"selectors": self.config["selectors"],
|
||||
"url_patterns": self.config.get("url_patterns", {}),
|
||||
"rate_limit": self.config.get("rate_limit", 0.5),
|
||||
"max_pages": 500, # Router only needs overview pages
|
||||
"_router": True,
|
||||
"_sub_skills": [cfg['name'] for cfg in sub_configs],
|
||||
"_routing_keywords": {
|
||||
cfg['name']: list(cfg.get('categories', {}).keys())
|
||||
for cfg in sub_configs
|
||||
}
|
||||
"_sub_skills": [cfg["name"] for cfg in sub_configs],
|
||||
"_routing_keywords": {cfg["name"]: list(cfg.get("categories", {}).keys()) for cfg in sub_configs},
|
||||
}
|
||||
|
||||
return router_config
|
||||
|
||||
def split(self) -> List[Dict[str, Any]]:
|
||||
def split(self) -> list[dict[str, Any]]:
|
||||
"""Execute split based on strategy"""
|
||||
strategy = self.get_split_strategy()
|
||||
|
||||
config_type = "UNIFIED" if self.is_unified_config() else "DOCUMENTATION"
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"CONFIG SPLITTER: {self.base_name} ({config_type})")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Strategy: {strategy}")
|
||||
if not self.is_unified_config():
|
||||
print(f"Target pages per skill: {self.target_pages}")
|
||||
@@ -255,7 +256,7 @@ class ConfigSplitter:
|
||||
return self.split_by_category(create_router=False)
|
||||
|
||||
elif strategy == "router":
|
||||
create_router = self.config.get('split_config', {}).get('create_router', True)
|
||||
create_router = self.config.get("split_config", {}).get("create_router", True)
|
||||
return self.split_by_category(create_router=create_router)
|
||||
|
||||
elif strategy == "size":
|
||||
@@ -265,7 +266,7 @@ class ConfigSplitter:
|
||||
print(f"❌ Error: Unknown strategy: {strategy}")
|
||||
sys.exit(1)
|
||||
|
||||
def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]:
|
||||
def save_configs(self, configs: list[dict[str, Any]], output_dir: Path = None) -> list[Path]:
|
||||
"""Save configs to files"""
|
||||
if output_dir is None:
|
||||
output_dir = self.config_path.parent
|
||||
@@ -279,7 +280,7 @@ class ConfigSplitter:
|
||||
filename = f"{config['name']}.json"
|
||||
filepath = output_dir / filename
|
||||
|
||||
with open(filepath, 'w') as f:
|
||||
with open(filepath, "w") as f:
|
||||
json.dump(config, f, indent=2)
|
||||
|
||||
saved_files.append(filepath)
|
||||
@@ -320,38 +321,23 @@ Split Strategies:
|
||||
Config Types:
|
||||
Documentation - Single base_url config (supports: category, router, size)
|
||||
Unified - Multi-source config (supports: source)
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'config',
|
||||
help='Path to config file (e.g., configs/godot.json)'
|
||||
)
|
||||
parser.add_argument("config", help="Path to config file (e.g., configs/godot.json)")
|
||||
|
||||
parser.add_argument(
|
||||
'--strategy',
|
||||
choices=['auto', 'none', 'source', 'category', 'router', 'size'],
|
||||
default='auto',
|
||||
help='Splitting strategy (default: auto)'
|
||||
"--strategy",
|
||||
choices=["auto", "none", "source", "category", "router", "size"],
|
||||
default="auto",
|
||||
help="Splitting strategy (default: auto)",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--target-pages',
|
||||
type=int,
|
||||
default=5000,
|
||||
help='Target pages per skill (default: 5000)'
|
||||
)
|
||||
parser.add_argument("--target-pages", type=int, default=5000, help="Target pages per skill (default: 5000)")
|
||||
|
||||
parser.add_argument(
|
||||
'--output-dir',
|
||||
help='Output directory for configs (default: same as input)'
|
||||
)
|
||||
parser.add_argument("--output-dir", help="Output directory for configs (default: same as input)")
|
||||
|
||||
parser.add_argument(
|
||||
'--dry-run',
|
||||
action='store_true',
|
||||
help='Show what would be created without saving files'
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would be created without saving files")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -362,23 +348,23 @@ Config Types:
|
||||
configs = splitter.split()
|
||||
|
||||
if args.dry_run:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("DRY RUN - No files saved")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print(f"Would create {len(configs)} config files:")
|
||||
for cfg in configs:
|
||||
is_router = cfg.get('_router', False)
|
||||
is_router = cfg.get("_router", False)
|
||||
router_marker = " (ROUTER)" if is_router else ""
|
||||
print(f" 📄 {cfg['name']}.json{router_marker}")
|
||||
else:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("SAVING CONFIGS")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
saved_files = splitter.save_configs(configs, args.output_dir)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"\n{'=' * 60}")
|
||||
print("NEXT STEPS")
|
||||
print(f"{'='*60}")
|
||||
print(f"{'=' * 60}")
|
||||
print("1. Review generated configs")
|
||||
print("2. Scrape each config:")
|
||||
for filepath in saved_files:
|
||||
|
||||
Reference in New Issue
Block a user