diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index cbe4908..0fa3ba3 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -1970,8 +1970,6 @@ def load_config(config_path: str) -> dict[str, Any]: # Log config type if validator.is_unified: logger.debug("✓ Unified config format detected") - else: - logger.debug("✓ Legacy config format detected") except ValueError as e: logger.error("❌ Configuration validation errors in %s:", config_path) logger.error(" %s", str(e)) diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index c413176..02057c4 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -165,10 +165,6 @@ class UnifiedScraper: logger.info("PHASE 1: Scraping all sources") logger.info("=" * 60) - if not self.validator.is_unified: - logger.warning("Config is not unified format, converting...") - self.config = self.validator.convert_legacy_to_unified() - sources = self.config.get("sources", []) for i, source in enumerate(sources): @@ -220,9 +216,10 @@ class UnifiedScraper: def _scrape_documentation(self, source: dict[str, Any]): """Scrape documentation website.""" - # Create temporary config for doc scraper - doc_config = { - "name": f"{self.name}_docs", + # Create temporary config for doc scraper in unified format + # (doc_scraper's ConfigValidator requires "sources" key) + doc_source = { + "type": "documentation", "base_url": source["base_url"], "selectors": source.get("selectors", {}), "url_patterns": source.get("url_patterns", {}), @@ -233,14 +230,20 @@ class UnifiedScraper: # Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs) if "llms_txt_url" in source: - doc_config["llms_txt_url"] = source.get("llms_txt_url") + doc_source["llms_txt_url"] = source["llms_txt_url"] if "skip_llms_txt" in source: - doc_config["skip_llms_txt"] = source.get("skip_llms_txt") + doc_source["skip_llms_txt"] = source["skip_llms_txt"] # Optional: support overriding start URLs if "start_urls" in source: - doc_config["start_urls"] = source.get("start_urls") + doc_source["start_urls"] = source["start_urls"] + + doc_config = { + "name": f"{self.name}_docs", + "description": f"Documentation for {self.name}", + "sources": [doc_source], + } # Write temporary config temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json") diff --git a/tests/test_github_scraper.py b/tests/test_github_scraper.py index 149e171..9909233 100644 --- a/tests/test_github_scraper.py +++ b/tests/test_github_scraper.py @@ -68,7 +68,12 @@ class TestGitHubScraperInitialization(unittest.TestCase): "github_token": "test_token_123", } - with patch("skill_seekers.cli.github_scraper.Github") as mock_github: + # Clear GITHUB_TOKEN env var so config token is used (env takes priority) + env = {k: v for k, v in os.environ.items() if k != "GITHUB_TOKEN"} + with ( + patch.dict(os.environ, env, clear=True), + patch("skill_seekers.cli.github_scraper.Github") as mock_github, + ): _scraper = self.GitHubScraper(config) mock_github.assert_called_once_with("test_token_123") diff --git a/tests/test_unified_scraper_orchestration.py b/tests/test_unified_scraper_orchestration.py index e6d431f..02309fc 100644 --- a/tests/test_unified_scraper_orchestration.py +++ b/tests/test_unified_scraper_orchestration.py @@ -224,7 +224,11 @@ class TestScrapeDocumentation: mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") scraper._scrape_documentation(source) - assert any("llms_txt_url" in c for c in written_configs) + assert any( + "llms_txt_url" in s + for c in written_configs + for s in c.get("sources", [c]) + ) def test_start_urls_forwarded_to_doc_config(self, tmp_path): """start_urls from source is forwarded to the temporary doc config.""" @@ -247,7 +251,11 @@ class TestScrapeDocumentation: mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") scraper._scrape_documentation(source) - assert any("start_urls" in c for c in written_configs) + assert any( + "start_urls" in s + for c in written_configs + for s in c.get("sources", [c]) + ) # ===========================================================================