fix: unified scraper temp config uses unified format for doc_scraper (#317)

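The unified scraper's _scrape_documentation() was creating temp configs
in flat/legacy format (no "sources" key), causing doc_scraper's
ConfigValidator to reject them. Wrap the temp config in unified format,
with the documentation settings as the single entry of a "sources" array.
Also remove dead legacy-format code branches, and fix a pre-existing test
that didn't clear GITHUB_TOKEN from the environment (the env token takes
priority over the config token, so the assertion on the config token
could fail when GITHUB_TOKEN was set).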

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-20 22:35:12 +03:00
parent 4f87de6b56
commit f6131c6798
4 changed files with 29 additions and 15 deletions

View File

@@ -1970,8 +1970,6 @@ def load_config(config_path: str) -> dict[str, Any]:
# Log config type
if validator.is_unified:
logger.debug("✓ Unified config format detected")
else:
logger.debug("✓ Legacy config format detected")
except ValueError as e:
logger.error("❌ Configuration validation errors in %s:", config_path)
logger.error(" %s", str(e))

View File

@@ -165,10 +165,6 @@ class UnifiedScraper:
logger.info("PHASE 1: Scraping all sources")
logger.info("=" * 60)
if not self.validator.is_unified:
logger.warning("Config is not unified format, converting...")
self.config = self.validator.convert_legacy_to_unified()
sources = self.config.get("sources", [])
for i, source in enumerate(sources):
@@ -220,9 +216,10 @@ class UnifiedScraper:
def _scrape_documentation(self, source: dict[str, Any]):
"""Scrape documentation website."""
# Create temporary config for doc scraper
doc_config = {
"name": f"{self.name}_docs",
# Create temporary config for doc scraper in unified format
# (doc_scraper's ConfigValidator requires "sources" key)
doc_source = {
"type": "documentation",
"base_url": source["base_url"],
"selectors": source.get("selectors", {}),
"url_patterns": source.get("url_patterns", {}),
@@ -233,14 +230,20 @@ class UnifiedScraper:
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
if "llms_txt_url" in source:
doc_config["llms_txt_url"] = source.get("llms_txt_url")
doc_source["llms_txt_url"] = source["llms_txt_url"]
if "skip_llms_txt" in source:
doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
doc_source["skip_llms_txt"] = source["skip_llms_txt"]
# Optional: support overriding start URLs
if "start_urls" in source:
doc_config["start_urls"] = source.get("start_urls")
doc_source["start_urls"] = source["start_urls"]
doc_config = {
"name": f"{self.name}_docs",
"description": f"Documentation for {self.name}",
"sources": [doc_source],
}
# Write temporary config
temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")

View File

@@ -68,7 +68,12 @@ class TestGitHubScraperInitialization(unittest.TestCase):
"github_token": "test_token_123",
}
with patch("skill_seekers.cli.github_scraper.Github") as mock_github:
# Clear GITHUB_TOKEN env var so config token is used (env takes priority)
env = {k: v for k, v in os.environ.items() if k != "GITHUB_TOKEN"}
with (
patch.dict(os.environ, env, clear=True),
patch("skill_seekers.cli.github_scraper.Github") as mock_github,
):
_scraper = self.GitHubScraper(config)
mock_github.assert_called_once_with("test_token_123")

View File

@@ -224,7 +224,11 @@ class TestScrapeDocumentation:
mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="")
scraper._scrape_documentation(source)
assert any("llms_txt_url" in c for c in written_configs)
assert any(
"llms_txt_url" in s
for c in written_configs
for s in c.get("sources", [c])
)
def test_start_urls_forwarded_to_doc_config(self, tmp_path):
"""start_urls from source is forwarded to the temporary doc config."""
@@ -247,7 +251,11 @@ class TestScrapeDocumentation:
mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="")
scraper._scrape_documentation(source)
assert any("start_urls" in c for c in written_configs)
assert any(
"start_urls" in s
for c in written_configs
for s in c.get("sources", [c])
)
# ===========================================================================