fix(#300): centralize selector fallback, fix dry-run link discovery, and smart --config routing

- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to
  eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links
  are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page)
  instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic
  kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at JSON to determine unified
  (sources array → unified_scraper) vs simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md
  to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-26 22:25:59 +03:00
parent b6d4dd8423
commit 4c8e16c8b1
9 changed files with 426 additions and 194 deletions

View File

@@ -603,9 +603,30 @@ Common Workflows:
log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO)
logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
# Validate source provided
if not args.source:
parser.error("source is required")
# Validate source provided (config file can serve as source)
if not args.source and not args.config:
parser.error("source is required (or use --config to specify a config file)")
# If config is provided but no source, peek at the JSON to route correctly
if not args.source and args.config:
import json
try:
with open(args.config) as f:
config_peek = json.load(f)
if "sources" in config_peek:
# Unified format → route to unified_scraper via config type detection
args.source = args.config
elif "base_url" in config_peek:
# Simple web config → route to doc_scraper by using the base_url
args.source = config_peek["base_url"]
# source will be detected as web URL; --config is already set
else:
parser.error("Config file must contain 'sources' (unified) or 'base_url' (web)")
except json.JSONDecodeError as e:
parser.error(f"Cannot parse config file as JSON: {e}")
except FileNotFoundError:
parser.error(f"Config file not found: {args.config}")
# Execute create command
command = CreateCommand(args)

View File

@@ -52,6 +52,18 @@ from skill_seekers.cli.utils import setup_logging
# Configure logging
logger = logging.getLogger(__name__)
# Ordered fallback selectors used everywhere we need to locate a page's
# main content region.  Tried top-to-bottom after the config-specified
# selector fails.  Deliberately excludes 'body': it matches every page,
# which would silently hide a broken or missing main_content selector.
FALLBACK_MAIN_SELECTORS = [
    "main",                # HTML5 semantic main element
    'div[role="main"]',    # common ARIA pattern on doc sites
    "article",
    '[role="main"]',       # ARIA role on any tag
    ".content",
    ".doc-content",
    "#main-content",
]
def infer_description_from_docs(
base_url: str, first_page_content: str | None = None, name: str = ""
@@ -275,6 +287,35 @@ class DocToSkillConverter:
except Exception as e:
logger.warning("⚠️ Failed to clear checkpoint: %s", e)
def _find_main_content(self, soup: Any) -> tuple[Any, str | None]:
    """Locate the main content element of a parsed page.

    The selector configured under ``selectors.main_content`` is tried
    first; when it is absent or matches nothing, each entry of
    ``FALLBACK_MAIN_SELECTORS`` is attempted in order.  ``<body>`` is
    deliberately never used as a last resort, since it matches every
    page and would mask genuinely broken selectors.

    Args:
        soup: BeautifulSoup-parsed page (anything exposing
            ``select_one``).

    Returns:
        Tuple of ``(element, selector_used)``, or ``(None, None)`` when
        no candidate selector matched anything.
    """
    configured = self.config.get("selectors", {}).get("main_content")
    # Build the candidate list: configured selector (if any) first,
    # then the shared fallbacks — one loop covers both phases.
    candidates = [configured] if configured else []
    candidates += FALLBACK_MAIN_SELECTORS
    for candidate in candidates:
        element = soup.select_one(candidate)
        # Truthiness (not `is not None`) kept to match bs4 semantics
        # used by the rest of this module.
        if element:
            return element, candidate
    return None, None
def extract_content(self, soup: Any, url: str) -> dict[str, Any]:
"""Extract content with improved code and pattern detection"""
page = {
@@ -294,9 +335,17 @@ class DocToSkillConverter:
if title_elem:
page["title"] = self.clean_text(title_elem.get_text())
# Find main content
main_selector = selectors.get("main_content", 'div[role="main"]')
main = soup.select_one(main_selector)
# Extract links from entire page (always, even if main content not found).
# This allows discovery of navigation links outside the main content area.
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split("#")[0]
if self.is_valid_url(href) and href not in page["links"]:
page["links"].append(href)
# Find main content using shared fallback logic
main, _selector_used = self._find_main_content(soup)
if not main:
logger.warning("⚠ No content: %s", url)
@@ -329,15 +378,6 @@ class DocToSkillConverter:
page["content"] = "\n\n".join(paragraphs)
# Extract links from entire page (not just main content)
# This allows discovery of navigation links outside the main content area
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split("#")[0]
if self.is_valid_url(href) and href not in page["links"]:
page["links"].append(href)
return page
def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]:
@@ -1070,16 +1110,13 @@ class DocToSkillConverter:
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.content, "html.parser")
main_selector = self.config.get("selectors", {}).get(
"main_content", 'div[role="main"]'
)
main = soup.select_one(main_selector)
if main:
for link in main.find_all("a", href=True):
href = urljoin(url, link["href"])
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
# Discover links from full page (not just main content)
# to match real scrape path behaviour in extract_content()
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
except Exception as e:
# Failed to extract links in fast mode, continue anyway
logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
@@ -1249,6 +1286,25 @@ class DocToSkillConverter:
if unlimited or len(self.visited_urls) <= preview_limit:
if self.dry_run:
logger.info(" [Preview] %s", url)
# Discover links from full page (async dry-run)
try:
response = await client.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"
},
timeout=10,
)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
except Exception as e:
logger.warning(
"⚠️ Warning: Could not extract links from %s: %s", url, e
)
else:
task = asyncio.create_task(
self.scrape_page_async(url, semaphore, client)
@@ -2039,7 +2095,6 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
"description": args.description or f"Use when working with {args.name}",
"base_url": effective_url,
"selectors": {
"main_content": "div[role='main']",
"title": "title",
"code_blocks": "pre code",
},