fix: unified scraper temp config uses unified format for doc_scraper (#317)
The unified scraper's _scrape_documentation() was creating temp configs in flat/legacy format (no "sources" key), causing doc_scraper's ConfigValidator to reject them. Wrap the temp config in unified format with a "sources" array. Also remove dead code branches and fix a pre-existing test that didn't clear GITHUB_TOKEN from the environment.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1970,8 +1970,6 @@ def load_config(config_path: str) -> dict[str, Any]:
|
||||
# Log config type
|
||||
if validator.is_unified:
|
||||
logger.debug("✓ Unified config format detected")
|
||||
else:
|
||||
logger.debug("✓ Legacy config format detected")
|
||||
except ValueError as e:
|
||||
logger.error("❌ Configuration validation errors in %s:", config_path)
|
||||
logger.error(" %s", str(e))
|
||||
|
||||
@@ -165,10 +165,6 @@ class UnifiedScraper:
|
||||
logger.info("PHASE 1: Scraping all sources")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if not self.validator.is_unified:
|
||||
logger.warning("Config is not unified format, converting...")
|
||||
self.config = self.validator.convert_legacy_to_unified()
|
||||
|
||||
sources = self.config.get("sources", [])
|
||||
|
||||
for i, source in enumerate(sources):
|
||||
@@ -220,9 +216,10 @@ class UnifiedScraper:
|
||||
|
||||
def _scrape_documentation(self, source: dict[str, Any]):
|
||||
"""Scrape documentation website."""
|
||||
# Create temporary config for doc scraper
|
||||
doc_config = {
|
||||
"name": f"{self.name}_docs",
|
||||
# Create temporary config for doc scraper in unified format
|
||||
# (doc_scraper's ConfigValidator requires "sources" key)
|
||||
doc_source = {
|
||||
"type": "documentation",
|
||||
"base_url": source["base_url"],
|
||||
"selectors": source.get("selectors", {}),
|
||||
"url_patterns": source.get("url_patterns", {}),
|
||||
@@ -233,14 +230,20 @@ class UnifiedScraper:
|
||||
|
||||
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
|
||||
if "llms_txt_url" in source:
|
||||
doc_config["llms_txt_url"] = source.get("llms_txt_url")
|
||||
doc_source["llms_txt_url"] = source["llms_txt_url"]
|
||||
|
||||
if "skip_llms_txt" in source:
|
||||
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
|
||||
doc_source["skip_llms_txt"] = source["skip_llms_txt"]
|
||||
|
||||
# Optional: support overriding start URLs
|
||||
if "start_urls" in source:
|
||||
doc_config["start_urls"] = source.get("start_urls")
|
||||
doc_source["start_urls"] = source["start_urls"]
|
||||
|
||||
doc_config = {
|
||||
"name": f"{self.name}_docs",
|
||||
"description": f"Documentation for {self.name}",
|
||||
"sources": [doc_source],
|
||||
}
|
||||
|
||||
# Write temporary config
|
||||
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
|
||||
|
||||
Reference in New Issue
Block a user