fix: unified scraper temp config uses unified format for doc_scraper (#317)

The unified scraper's _scrape_documentation() was creating temp configs in the flat/legacy format (no "sources" key), which doc_scraper's ConfigValidator rejects. Wrap the temp config in the unified format with a "sources" array. Also remove dead code branches and fix a pre-existing test that did not clear GITHUB_TOKEN from the environment.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 22:35:12 +03:00
parent 4f87de6b56
commit f6131c6798
4 changed files with 29 additions and 15 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -1970,8 +1970,6 @@ def load_config(config_path: str) -> dict[str, Any]:
        # Log config type
        if validator.is_unified:
            logger.debug("✓ Unified config format detected")
-        else:
-            logger.debug("✓ Legacy config format detected")
    except ValueError as e:
        logger.error("❌ Configuration validation errors in %s:", config_path)
        logger.error("   %s", str(e))
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -165,10 +165,6 @@ class UnifiedScraper:
        logger.info("PHASE 1: Scraping all sources")
        logger.info("=" * 60)

-        if not self.validator.is_unified:
-            logger.warning("Config is not unified format, converting...")
-            self.config = self.validator.convert_legacy_to_unified()
-
        sources = self.config.get("sources", [])

        for i, source in enumerate(sources):
@@ -220,9 +216,10 @@ class UnifiedScraper:

    def _scrape_documentation(self, source: dict[str, Any]):
        """Scrape documentation website."""
-        # Create temporary config for doc scraper
-        doc_config = {
-            "name": f"{self.name}_docs",
+        # Create temporary config for doc scraper in unified format
+        # (doc_scraper's ConfigValidator requires "sources" key)
+        doc_source = {
+            "type": "documentation",
            "base_url": source["base_url"],
            "selectors": source.get("selectors", {}),
            "url_patterns": source.get("url_patterns", {}),
@@ -233,14 +230,20 @@ class UnifiedScraper:

        # Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
        if "llms_txt_url" in source:
-            doc_config["llms_txt_url"] = source.get("llms_txt_url")
+            doc_source["llms_txt_url"] = source["llms_txt_url"]

        if "skip_llms_txt" in source:
-            doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
+            doc_source["skip_llms_txt"] = source["skip_llms_txt"]

        # Optional: support overriding start URLs
        if "start_urls" in source:
-            doc_config["start_urls"] = source.get("start_urls")
+            doc_source["start_urls"] = source["start_urls"]
+
+        doc_config = {
+            "name": f"{self.name}_docs",
+            "description": f"Documentation for {self.name}",
+            "sources": [doc_source],
+        }

        # Write temporary config
        temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
--- a/tests/test_github_scraper.py
+++ b/tests/test_github_scraper.py
@@ -68,7 +68,12 @@ class TestGitHubScraperInitialization(unittest.TestCase):
            "github_token": "test_token_123",
        }

-        with patch("skill_seekers.cli.github_scraper.Github") as mock_github:
+        # Clear GITHUB_TOKEN env var so config token is used (env takes priority)
+        env = {k: v for k, v in os.environ.items() if k != "GITHUB_TOKEN"}
+        with (
+            patch.dict(os.environ, env, clear=True),
+            patch("skill_seekers.cli.github_scraper.Github") as mock_github,
+        ):
            _scraper = self.GitHubScraper(config)
            mock_github.assert_called_once_with("test_token_123")

--- a/tests/test_unified_scraper_orchestration.py
+++ b/tests/test_unified_scraper_orchestration.py
@@ -224,7 +224,11 @@ class TestScrapeDocumentation:
            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="")
            scraper._scrape_documentation(source)

-        assert any("llms_txt_url" in c for c in written_configs)
+        assert any(
+            "llms_txt_url" in s
+            for c in written_configs
+            for s in c.get("sources", [c])
+        )

    def test_start_urls_forwarded_to_doc_config(self, tmp_path):
        """start_urls from source is forwarded to the temporary doc config."""
@@ -247,7 +251,11 @@ class TestScrapeDocumentation:
            mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="")
            scraper._scrape_documentation(source)

-        assert any("start_urls" in c for c in written_configs)
+        assert any(
+            "start_urls" in s
+            for c in written_configs
+            for s in c.get("sources", [c])
+        )


 # ===========================================================================