diff --git a/CHANGELOG.md b/CHANGELOG.md index 0765850..6c0e424 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **`docx` optional dependency group** — `pip install skill-seekers[docx]` (mammoth + python-docx) ### Fixed +- **Issue #300: Selector fallback & dry-run link discovery** — `create https://reactflow.dev/` now finds 20+ pages (was 1). Root causes: + - `extract_content()` extracted links after the early-return when no content selector matched, so they were never discovered. Moved link extraction before the early return. + - Dry-run extracted links from `main.find_all("a")` (main content only) instead of `soup.find_all("a")` (full page), missing navigation links. Fixed both sync and async dry-run paths. + - Async dry-run had no link extraction at all — only logged URLs. + - `get_configuration()` default used a CSS comma selector string that conflicted with the fallback loop. Removed `main_content` from defaults so `_find_main_content()` fallback kicks in. + - `create --config` with a simple web config (has `base_url`, no `sources`) incorrectly routed to `unified_scraper` which rejected it. Now peeks at JSON: routes `"sources"` configs to unified_scraper, `"base_url"` configs to doc_scraper. + - Selector fallback logic was duplicated in 3 places with `body` as ultimate fallback (masks failures). Extracted `FALLBACK_MAIN_SELECTORS` constant and `_find_main_content()` helper (no `body`). 
- **Reference file code truncation removed** — `codebase_scraper.py` no longer truncates code blocks to 500 chars in reference files (5 locations fixed) - **Enhancement code block limit replaced with token budget** — `enhance_skill_local.py` `summarize_reference()` now uses character-budget approach instead of arbitrary `[:5]` code block cap - **Dead variable removed** — `_target_lines` in `enhance_skill_local.py:309` was assigned but never used diff --git a/CLAUDE.md b/CLAUDE.md index 7f0250c..a1bbf04 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1835,15 +1835,17 @@ UNIVERSAL_ARGUMENTS = { ## 📚 Key Code Locations **Documentation Scraper** (`src/skill_seekers/cli/doc_scraper.py`): +- `FALLBACK_MAIN_SELECTORS` - Shared fallback CSS selectors for finding main content (no `body`) +- `_find_main_content()` - Centralized selector fallback: config selector → fallback list - `is_valid_url()` - URL validation -- `extract_content()` - Content extraction +- `extract_content()` - Content extraction (links extracted from full page before early return) - `detect_language()` - Code language detection - `extract_patterns()` - Pattern extraction - `smart_categorize()` - Smart categorization - `infer_categories()` - Category inference - `generate_quick_reference()` - Quick reference generation - `create_enhanced_skill_md()` - SKILL.md generation -- `scrape_all()` - Main scraping loop +- `scrape_all()` - Main scraping loop (dry-run extracts links from full page) - `main()` - Entry point **Codebase Analysis** (`src/skill_seekers/cli/`): @@ -2256,6 +2258,15 @@ The `scripts/` directory contains utility scripts: ## 🎉 Recent Achievements +**v3.1.4 (Unreleased) - "Selector Fallback & Dry-Run Fix":** +- 🐛 **Issue #300: `create https://reactflow.dev/` only found 1 page** — Now finds 20+ pages +- 🔧 **Centralized selector fallback** — `FALLBACK_MAIN_SELECTORS` constant + `_find_main_content()` helper replace 3 duplicated fallback loops +- 🔗 **Link extraction before early return** — 
`extract_content()` now discovers links even when no content selector matches +- 🔍 **Dry-run full-page link discovery** — Both sync and async dry-run paths extract links from the full page (was main-content-only or missing entirely) +- 🛣️ **Smart `create --config` routing** — Peeks at JSON to route `base_url` configs to doc_scraper and `sources` configs to unified_scraper +- 🧹 **Removed `body` fallback** — `body` matched everything, hiding real selector failures +- ✅ **Pre-existing test fixes** — `test_auto_fetch_enabled` (react.json exists locally) and `test_mcp_validate_legacy_config` (react.json is now unified format) + **v3.1.3 (Released) - "Unified Argument Interface":** - 🔧 **Unified Scraper Arguments** - All scrapers (scrape, github, analyze, pdf) now share a common argument contract via `add_all_standard_arguments(parser)` in `arguments/common.py` - 🐛 **Fix `create` Argument Forwarding** - `create --dry-run`, `create owner/repo --dry-run`, `create ./path --dry-run` all work now (previously crashed) diff --git a/configs/react.json b/configs/react.json new file mode 100644 index 0000000..42edb52 --- /dev/null +++ b/configs/react.json @@ -0,0 +1,69 @@ +{ + "name": "react", + "description": "Complete React knowledge base combining official documentation and React codebase insights. 
Use when working with React, understanding API changes, or debugging React internals.", + "version": "1.1.0", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://react.dev/", + "extract_api": true, + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": [], + "exclude": [ + "/blog/", + "/community/" + ] + }, + "categories": { + "getting_started": [ + "learn", + "installation", + "quick-start" + ], + "components": [ + "components", + "props", + "state" + ], + "hooks": [ + "hooks", + "usestate", + "useeffect", + "usecontext" + ], + "api": [ + "api", + "reference" + ], + "advanced": [ + "context", + "refs", + "portals", + "suspense" + ] + }, + "rate_limit": 0.5 + }, + { + "type": "github", + "repo": "facebook/react", + "enable_codebase_analysis": true, + "code_analysis_depth": "deep", + "fetch_issues": true, + "max_issues": 100, + "fetch_changelog": true, + "fetch_releases": true, + "file_patterns": [ + "packages/react/src/**/*.js", + "packages/react-dom/src/**/*.js" + ] + } + ], + "base_url": "https://react.dev/" +} \ No newline at end of file diff --git a/docs/reference/CONFIG_FORMAT.md b/docs/reference/CONFIG_FORMAT.md index 5166c8e..703cbf2 100644 --- a/docs/reference/CONFIG_FORMAT.md +++ b/docs/reference/CONFIG_FORMAT.md @@ -1,7 +1,7 @@ # Config Format Reference - Skill Seekers -> **Version:** 3.1.0 -> **Last Updated:** 2026-02-16 +> **Version:** 3.1.4 +> **Last Updated:** 2026-02-26 > **Complete JSON configuration specification** --- @@ -25,17 +25,21 @@ ## Overview -Skill Seekers uses JSON configuration files to define scraping targets. There are two types: +Skill Seekers uses JSON configuration files with a unified format. All configs use a `sources` array, even for single-source scraping. 
-| Type | Use Case | File | -|------|----------|------| -| **Single-Source** | One source (docs, GitHub, PDF, or local) | `*.json` | -| **Unified** | Multiple sources combined | `*-unified.json` | +> **Important:** Legacy configs without `sources` were removed in v2.11.0. All configs must use the unified format shown below. + +| Use Case | Example | +|----------|---------| +| **Single source** | `"sources": [{ "type": "documentation", ... }]` | +| **Multiple sources** | `"sources": [{ "type": "documentation", ... }, { "type": "github", ... }]` | --- ## Single-Source Config +Even for a single source, wrap it in a `sources` array. + ### Documentation Source For scraping documentation websites. @@ -43,33 +47,37 @@ For scraping documentation websites. ```json { "name": "react", - "base_url": "https://react.dev/", "description": "React - JavaScript library for building UIs", - - "start_urls": [ - "https://react.dev/learn", - "https://react.dev/reference/react" - ], - - "selectors": { - "main_content": "article", - "title": "h1", - "code_blocks": "pre code" - }, - - "url_patterns": { - "include": ["/learn/", "/reference/"], - "exclude": ["/blog/", "/community/"] - }, - - "categories": { - "getting_started": ["learn", "tutorial", "intro"], - "api": ["reference", "api", "hooks"] - }, - - "rate_limit": 0.5, - "max_pages": 300, - "merge_mode": "claude-enhanced" + "sources": [ + { + "type": "documentation", + "base_url": "https://react.dev/", + + "start_urls": [ + "https://react.dev/learn", + "https://react.dev/reference/react" + ], + + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + + "url_patterns": { + "include": ["/learn/", "/reference/"], + "exclude": ["/blog/", "/community/"] + }, + + "categories": { + "getting_started": ["learn", "tutorial", "intro"], + "api": ["reference", "api", "hooks"] + }, + + "rate_limit": 0.5, + "max_pages": 300 + } + ] } ``` @@ -99,27 +107,31 @@ For analyzing GitHub repositories. 
```json { "name": "react-github", - "type": "github", - "repo": "facebook/react", "description": "React GitHub repository analysis", - - "enable_codebase_analysis": true, - "code_analysis_depth": "deep", - - "fetch_issues": true, - "max_issues": 100, - "issue_labels": ["bug", "enhancement"], - - "fetch_releases": true, - "max_releases": 20, - - "fetch_changelog": true, - "analyze_commit_history": true, - - "file_patterns": ["*.js", "*.ts", "*.tsx"], - "exclude_patterns": ["*.test.js", "node_modules/**"], - - "rate_limit": 1.0 + "sources": [ + { + "type": "github", + "repo": "facebook/react", + + "enable_codebase_analysis": true, + "code_analysis_depth": "deep", + + "fetch_issues": true, + "max_issues": 100, + "issue_labels": ["bug", "enhancement"], + + "fetch_releases": true, + "max_releases": 20, + + "fetch_changelog": true, + "analyze_commit_history": true, + + "file_patterns": ["*.js", "*.ts", "*.tsx"], + "exclude_patterns": ["*.test.js", "node_modules/**"], + + "rate_limit": 1.0 + } + ] } ``` @@ -152,24 +164,28 @@ For extracting content from PDF files. ```json { "name": "product-manual", - "type": "pdf", - "pdf_path": "docs/manual.pdf", "description": "Product documentation manual", - - "enable_ocr": false, - "password": "", - - "extract_images": true, - "image_output_dir": "output/images/", - - "extract_tables": true, - "table_format": "markdown", - - "page_range": [1, 100], - "split_by_chapters": true, - - "chunk_size": 1000, - "chunk_overlap": 100 + "sources": [ + { + "type": "pdf", + "pdf_path": "docs/manual.pdf", + + "enable_ocr": false, + "password": "", + + "extract_images": true, + "image_output_dir": "output/images/", + + "extract_tables": true, + "table_format": "markdown", + + "page_range": [1, 100], + "split_by_chapters": true, + + "chunk_size": 1000, + "chunk_overlap": 100 + } + ] } ``` @@ -201,25 +217,29 @@ For analyzing local codebases. 
```json { "name": "my-project", - "type": "local", - "directory": "./my-project", "description": "Local project analysis", - - "languages": ["Python", "JavaScript"], - "file_patterns": ["*.py", "*.js"], - "exclude_patterns": ["*.pyc", "node_modules/**", ".git/**"], - - "analysis_depth": "comprehensive", - - "extract_api": true, - "extract_patterns": true, - "extract_test_examples": true, - "extract_how_to_guides": true, - "extract_config_patterns": true, - - "include_comments": true, - "include_docstrings": true, - "include_readme": true + "sources": [ + { + "type": "local", + "directory": "./my-project", + + "languages": ["Python", "JavaScript"], + "file_patterns": ["*.py", "*.js"], + "exclude_patterns": ["*.pyc", "node_modules/**", ".git/**"], + + "analysis_depth": "comprehensive", + + "extract_api": true, + "extract_patterns": true, + "extract_test_examples": true, + "extract_how_to_guides": true, + "extract_config_patterns": true, + + "include_comments": true, + "include_docstrings": true, + "include_readme": true + } + ] } ``` @@ -406,14 +426,25 @@ CSS selectors for content extraction from HTML: ### Default Selectors -If not specified, these defaults are used: +If `main_content` is not specified, the scraper tries these selectors in order until one matches: + +1. `main` +2. `div[role="main"]` +3. `article` +4. `[role="main"]` +5. `.content` +6. `.doc-content` +7. `#main-content` + +> **Tip:** Omit `main_content` from your config to let auto-detection work. +> Only specify it when auto-detection picks the wrong element. 
+ +Other defaults: | Element | Default Selector | |---------|-----------------| -| `main_content` | `article, main, .content, #content, [role='main']` | -| `title` | `h1, .page-title, title` | -| `code_blocks` | `pre code, code[class*="language-"]` | -| `navigation` | `nav, .sidebar, .toc` | +| `title` | `title` | +| `code_blocks` | `pre code` | --- @@ -494,29 +525,33 @@ Control which URLs are included or excluded: ```json { "name": "react", - "base_url": "https://react.dev/", "description": "React - JavaScript library for building UIs", - "start_urls": [ - "https://react.dev/learn", - "https://react.dev/reference/react", - "https://react.dev/reference/react-dom" - ], - "selectors": { - "main_content": "article", - "title": "h1", - "code_blocks": "pre code" - }, - "url_patterns": { - "include": ["/learn/", "/reference/", "/blog/"], - "exclude": ["/community/", "/search"] - }, - "categories": { - "getting_started": ["learn", "tutorial"], - "api": ["reference", "api"], - "blog": ["blog"] - }, - "rate_limit": 0.5, - "max_pages": 300 + "sources": [ + { + "type": "documentation", + "base_url": "https://react.dev/", + "start_urls": [ + "https://react.dev/learn", + "https://react.dev/reference/react", + "https://react.dev/reference/react-dom" + ], + "selectors": { + "main_content": "article", + "title": "h1", + "code_blocks": "pre code" + }, + "url_patterns": { + "include": ["/learn/", "/reference/"], + "exclude": ["/community/", "/search"] + }, + "categories": { + "getting_started": ["learn", "tutorial"], + "api": ["reference", "api"] + }, + "rate_limit": 0.5, + "max_pages": 300 + } + ] } ``` @@ -525,16 +560,20 @@ Control which URLs are included or excluded: ```json { "name": "django-github", - "type": "github", - "repo": "django/django", "description": "Django web framework source code", - "enable_codebase_analysis": true, - "code_analysis_depth": "deep", - "fetch_issues": true, - "max_issues": 100, - "fetch_releases": true, - "file_patterns": ["*.py"], - 
"exclude_patterns": ["tests/**", "docs/**"] + "sources": [ + { + "type": "github", + "repo": "django/django", + "enable_codebase_analysis": true, + "code_analysis_depth": "deep", + "fetch_issues": true, + "max_issues": 100, + "fetch_releases": true, + "file_patterns": ["*.py"], + "exclude_patterns": ["tests/**", "docs/**"] + } + ] } ``` @@ -572,15 +611,19 @@ Control which URLs are included or excluded: ```json { "name": "my-api", - "type": "local", - "directory": "./my-api-project", "description": "My REST API implementation", - "languages": ["Python"], - "file_patterns": ["*.py"], - "exclude_patterns": ["tests/**", "migrations/**"], - "analysis_depth": "comprehensive", - "extract_api": true, - "extract_test_examples": true + "sources": [ + { + "type": "local", + "directory": "./my-api-project", + "languages": ["Python"], + "file_patterns": ["*.py"], + "exclude_patterns": ["tests/**", "migrations/**"], + "analysis_depth": "comprehensive", + "extract_api": true, + "extract_test_examples": true + } + ] } ``` diff --git a/docs/user-guide/02-scraping.md b/docs/user-guide/02-scraping.md index 63e448a..37f436d 100644 --- a/docs/user-guide/02-scraping.md +++ b/docs/user-guide/02-scraping.md @@ -1,6 +1,6 @@ # Scraping Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.1.4** > **Complete guide to all scraping options** --- @@ -50,23 +50,30 @@ skill-seekers create --config fastapi ### Custom Configuration +All configs must use the unified format with a `sources` array (since v2.11.0): + ```bash # Create config file cat > configs/my-docs.json << 'EOF' { "name": "my-framework", - "base_url": "https://docs.example.com/", "description": "My framework documentation", - "max_pages": 200, - "rate_limit": 0.5, - "selectors": { - "main_content": "article", - "title": "h1" - }, - "url_patterns": { - "include": ["/docs/", "/api/"], - "exclude": ["/blog/", "/search"] - } + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "max_pages": 200, + 
"rate_limit": 0.5, + "selectors": { + "main_content": "article", + "title": "h1" + }, + "url_patterns": { + "include": ["/docs/", "/api/"], + "exclude": ["/blog/", "/search"] + } + } + ] } EOF @@ -74,6 +81,9 @@ EOF skill-seekers create --config configs/my-docs.json ``` +> **Note:** Omit `main_content` from `selectors` to let Skill Seekers auto-detect +> the best content element (`main`, `article`, `div[role="main"]`, etc.). + See [Config Format](../reference/CONFIG_FORMAT.md) for all options. ### Advanced Options @@ -331,14 +341,22 @@ skill-seekers resume **Solution:** ```bash -# Find correct selectors +# First, try without a main_content selector (auto-detection) +# The scraper tries: main, div[role="main"], article, .content, etc. +skill-seekers create --dry-run + +# If auto-detection fails, find the correct selector: curl -s | grep -i 'article\|main\|content' -# Update config +# Then specify it in your config's source: { - "selectors": { - "main_content": "div.content" // or "article", "main", etc. 
- } + "sources": [{ + "type": "documentation", + "base_url": "https://...", + "selectors": { + "main_content": "div.content" + } + }] } ``` diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py index 15e68a8..92f6b1b 100644 --- a/src/skill_seekers/cli/create_command.py +++ b/src/skill_seekers/cli/create_command.py @@ -603,9 +603,30 @@ Common Workflows: log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO) logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s") - # Validate source provided - if not args.source: - parser.error("source is required") + # Validate source provided (config file can serve as source) + if not args.source and not args.config: + parser.error("source is required (or use --config to specify a config file)") + + # If config is provided but no source, peek at the JSON to route correctly + if not args.source and args.config: + import json + + try: + with open(args.config) as f: + config_peek = json.load(f) + if "sources" in config_peek: + # Unified format → route to unified_scraper via config type detection + args.source = args.config + elif "base_url" in config_peek: + # Simple web config → route to doc_scraper by using the base_url + args.source = config_peek["base_url"] + # source will be detected as web URL; --config is already set + else: + parser.error("Config file must contain 'sources' (unified) or 'base_url' (web)") + except json.JSONDecodeError as e: + parser.error(f"Cannot parse config file as JSON: {e}") + except FileNotFoundError: + parser.error(f"Config file not found: {args.config}") # Execute create command command = CreateCommand(args) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 72c06df..9d59bf9 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -52,6 +52,18 @@ from skill_seekers.cli.utils import setup_logging # Configure logging logger 
= logging.getLogger(__name__) +# Shared fallback selectors for finding main content across all code paths. +# No 'body' — it matches everything and hides real selector failures. +FALLBACK_MAIN_SELECTORS = [ + "main", + 'div[role="main"]', + "article", + '[role="main"]', + ".content", + ".doc-content", + "#main-content", +] + def infer_description_from_docs( base_url: str, first_page_content: str | None = None, name: str = "" @@ -275,6 +287,35 @@ class DocToSkillConverter: except Exception as e: logger.warning("⚠️ Failed to clear checkpoint: %s", e) + def _find_main_content(self, soup: Any) -> tuple[Any, str | None]: + """Find the main content element using config selector with fallbacks. + + Tries the config-specified selector first, then falls back through + FALLBACK_MAIN_SELECTORS. Does NOT fall back to 'body' since that + matches everything and hides real selector failures. + + Args: + soup: BeautifulSoup parsed page + + Returns: + Tuple of (element, selector_used) or (None, None) if nothing matched + """ + selectors = self.config.get("selectors", {}) + main_selector = selectors.get("main_content") + + if main_selector: + main = soup.select_one(main_selector) + if main: + return main, main_selector + # Config selector didn't match — fall through to fallbacks + + for selector in FALLBACK_MAIN_SELECTORS: + main = soup.select_one(selector) + if main: + return main, selector + + return None, None + def extract_content(self, soup: Any, url: str) -> dict[str, Any]: """Extract content with improved code and pattern detection""" page = { @@ -294,9 +335,17 @@ class DocToSkillConverter: if title_elem: page["title"] = self.clean_text(title_elem.get_text()) - # Find main content - main_selector = selectors.get("main_content", 'div[role="main"]') - main = soup.select_one(main_selector) + # Extract links from entire page (always, even if main content not found). + # This allows discovery of navigation links outside the main content area. 
+ for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) + # Strip anchor fragments to avoid treating #anchors as separate pages + href = href.split("#")[0] + if self.is_valid_url(href) and href not in page["links"]: + page["links"].append(href) + + # Find main content using shared fallback logic + main, _selector_used = self._find_main_content(soup) if not main: logger.warning("⚠ No content: %s", url) @@ -329,15 +378,6 @@ class DocToSkillConverter: page["content"] = "\n\n".join(paragraphs) - # Extract links from entire page (not just main content) - # This allows discovery of navigation links outside the main content area - for link in soup.find_all("a", href=True): - href = urljoin(url, link["href"]) - # Strip anchor fragments to avoid treating #anchors as separate pages - href = href.split("#")[0] - if self.is_valid_url(href) and href not in page["links"]: - page["links"].append(href) - return page def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]: @@ -1070,16 +1110,13 @@ class DocToSkillConverter: response = requests.get(url, headers=headers, timeout=10) soup = BeautifulSoup(response.content, "html.parser") - main_selector = self.config.get("selectors", {}).get( - "main_content", 'div[role="main"]' - ) - main = soup.select_one(main_selector) - - if main: - for link in main.find_all("a", href=True): - href = urljoin(url, link["href"]) - if self.is_valid_url(href) and href not in self.visited_urls: - self.pending_urls.append(href) + # Discover links from full page (not just main content) + # to match real scrape path behaviour in extract_content() + for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) + href = href.split("#")[0] + if self.is_valid_url(href) and href not in self.visited_urls: + self.pending_urls.append(href) except Exception as e: # Failed to extract links in fast mode, continue anyway logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e) @@ -1249,6 
+1286,25 @@ class DocToSkillConverter: if unlimited or len(self.visited_urls) <= preview_limit: if self.dry_run: logger.info(" [Preview] %s", url) + # Discover links from full page (async dry-run) + try: + response = await client.get( + url, + headers={ + "User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)" + }, + timeout=10, + ) + soup = BeautifulSoup(response.content, "html.parser") + for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) + href = href.split("#")[0] + if self.is_valid_url(href) and href not in self.visited_urls: + self.pending_urls.append(href) + except Exception as e: + logger.warning( + "⚠️ Warning: Could not extract links from %s: %s", url, e + ) else: task = asyncio.create_task( self.scrape_page_async(url, semaphore, client) @@ -2039,7 +2095,6 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]: "description": args.description or f"Use when working with {args.name}", "base_url": effective_url, "selectors": { - "main_content": "div[role='main']", "title": "title", "code_blocks": "pre code", }, diff --git a/tests/test_config_fetcher.py b/tests/test_config_fetcher.py index 6cba6fc..2d39664 100644 --- a/tests/test_config_fetcher.py +++ b/tests/test_config_fetcher.py @@ -265,16 +265,16 @@ class TestResolveConfigPath: @patch("skill_seekers.cli.config_fetcher.fetch_config_from_api") def test_auto_fetch_enabled(self, mock_fetch, tmp_path): """Test that auto-fetch runs when enabled.""" - # Mock fetch to return a path - mock_config = tmp_path / "configs" / "react.json" + # Use a name that does NOT exist locally (react.json exists in configs/) + mock_config = tmp_path / "configs" / "obscure_framework.json" mock_config.parent.mkdir(exist_ok=True) - mock_config.write_text('{"name": "react"}') + mock_config.write_text('{"name": "obscure_framework"}') mock_fetch.return_value = mock_config - result = resolve_config_path("react.json", auto_fetch=True) + result = resolve_config_path("obscure_framework.json", 
auto_fetch=True) # Verify fetch was called - mock_fetch.assert_called_once_with("react", destination="configs") + mock_fetch.assert_called_once_with("obscure_framework", destination="configs") assert result is not None assert result.exists() diff --git a/tests/test_unified_mcp_integration.py b/tests/test_unified_mcp_integration.py index 98355aa..ad447a7 100644 --- a/tests/test_unified_mcp_integration.py +++ b/tests/test_unified_mcp_integration.py @@ -67,22 +67,30 @@ async def test_mcp_validate_legacy_config(): """Test that MCP can validate legacy configs""" print("\n✓ Testing MCP validate_config_tool with legacy config...") - # Use existing legacy config - config_path = "configs/react.json" + # Create a truly legacy config (no "sources" key — just base_url + selectors) + legacy_config = { + "name": "test-legacy", + "base_url": "https://example.com/", + "selectors": {"main_content": "main", "title": "h1", "code_blocks": "pre code"}, + "url_patterns": {"include": [], "exclude": []}, + "rate_limit": 0.5, + } - if not Path(config_path).exists(): - print(f" ⚠️ Skipping: {config_path} not found") - return + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(legacy_config, f) + config_path = f.name - args = {"config_path": config_path} - result = await validate_config_tool(args) + try: + args = {"config_path": config_path} + result = await validate_config_tool(args) - # Check result - text = result[0].text - assert "✅" in text, f"Expected success, got: {text}" - assert "Legacy" in text, f"Expected legacy format detected, got: {text}" + # Legacy configs are rejected since v2.11.0 — validator should detect the format + text = result[0].text + assert "LEGACY" in text.upper(), f"Expected legacy format detected, got: {text}" - print(" ✅ MCP correctly validates legacy config") + print(" ✅ MCP correctly detects legacy config format") + finally: + os.unlink(config_path) @pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not 
installed")