fix(#300): centralize selector fallback, fix dry-run link discovery, and smart --config routing

- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to
  eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links
  are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page)
  instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic
  kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at JSON to determine unified
  (sources array → unified_scraper) vs simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md
  to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-26 22:25:59 +03:00
parent b6d4dd8423
commit 4c8e16c8b1
9 changed files with 426 additions and 194 deletions

View File

@@ -603,9 +603,30 @@ Common Workflows:
log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO)
logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
# Validate source provided
if not args.source:
parser.error("source is required")
# Validate source provided (config file can serve as source)
if not args.source and not args.config:
parser.error("source is required (or use --config to specify a config file)")
# If config is provided but no source, peek at the JSON to route correctly
if not args.source and args.config:
import json
try:
with open(args.config) as f:
config_peek = json.load(f)
if "sources" in config_peek:
# Unified format → route to unified_scraper via config type detection
args.source = args.config
elif "base_url" in config_peek:
# Simple web config → route to doc_scraper by using the base_url
args.source = config_peek["base_url"]
# source will be detected as web URL; --config is already set
else:
parser.error("Config file must contain 'sources' (unified) or 'base_url' (web)")
except json.JSONDecodeError as e:
parser.error(f"Cannot parse config file as JSON: {e}")
except FileNotFoundError:
parser.error(f"Config file not found: {args.config}")
# Execute create command
command = CreateCommand(args)

View File

@@ -52,6 +52,18 @@ from skill_seekers.cli.utils import setup_logging
# Configure logging
logger = logging.getLogger(__name__)
# Ordered fallback selectors used everywhere we need to locate a page's
# main content region.  Tried top-to-bottom after the config-specified
# selector fails.  Deliberately excludes 'body': it matches every page,
# which would silently hide a broken or missing main_content selector.
FALLBACK_MAIN_SELECTORS = [
    "main",                # HTML5 semantic main element
    'div[role="main"]',    # common ARIA pattern on doc sites
    "article",
    '[role="main"]',       # ARIA role on any tag
    ".content",
    ".doc-content",
    "#main-content",
]
def infer_description_from_docs(
base_url: str, first_page_content: str | None = None, name: str = ""
@@ -275,6 +287,35 @@ class DocToSkillConverter:
except Exception as e:
logger.warning("⚠️ Failed to clear checkpoint: %s", e)
def _find_main_content(self, soup: Any) -> tuple[Any, str | None]:
    """Locate the main content element of a parsed page.

    The selector configured under ``selectors.main_content`` is tried
    first; when it is absent or matches nothing, each entry of
    ``FALLBACK_MAIN_SELECTORS`` is attempted in order.  ``<body>`` is
    deliberately never used as a last resort, since it matches every
    page and would mask genuinely broken selectors.

    Args:
        soup: BeautifulSoup-parsed page (anything exposing
            ``select_one``).

    Returns:
        Tuple of ``(element, selector_used)``, or ``(None, None)`` when
        no candidate selector matched anything.
    """
    configured = self.config.get("selectors", {}).get("main_content")
    # Build the candidate list: configured selector (if any) first,
    # then the shared fallbacks — one loop covers both phases.
    candidates = [configured] if configured else []
    candidates += FALLBACK_MAIN_SELECTORS
    for candidate in candidates:
        element = soup.select_one(candidate)
        # Truthiness (not `is not None`) kept to match bs4 semantics
        # used by the rest of this module.
        if element:
            return element, candidate
    return None, None
def extract_content(self, soup: Any, url: str) -> dict[str, Any]:
"""Extract content with improved code and pattern detection"""
page = {
@@ -294,9 +335,17 @@ class DocToSkillConverter:
if title_elem:
page["title"] = self.clean_text(title_elem.get_text())
# Find main content
main_selector = selectors.get("main_content", 'div[role="main"]')
main = soup.select_one(main_selector)
# Extract links from entire page (always, even if main content not found).
# This allows discovery of navigation links outside the main content area.
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split("#")[0]
if self.is_valid_url(href) and href not in page["links"]:
page["links"].append(href)
# Find main content using shared fallback logic
main, _selector_used = self._find_main_content(soup)
if not main:
logger.warning("⚠ No content: %s", url)
@@ -329,15 +378,6 @@ class DocToSkillConverter:
page["content"] = "\n\n".join(paragraphs)
# Extract links from entire page (not just main content)
# This allows discovery of navigation links outside the main content area
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
# Strip anchor fragments to avoid treating #anchors as separate pages
href = href.split("#")[0]
if self.is_valid_url(href) and href not in page["links"]:
page["links"].append(href)
return page
def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]:
@@ -1070,16 +1110,13 @@ class DocToSkillConverter:
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.content, "html.parser")
main_selector = self.config.get("selectors", {}).get(
"main_content", 'div[role="main"]'
)
main = soup.select_one(main_selector)
if main:
for link in main.find_all("a", href=True):
href = urljoin(url, link["href"])
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
# Discover links from full page (not just main content)
# to match real scrape path behaviour in extract_content()
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
except Exception as e:
# Failed to extract links in fast mode, continue anyway
logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
@@ -1249,6 +1286,25 @@ class DocToSkillConverter:
if unlimited or len(self.visited_urls) <= preview_limit:
if self.dry_run:
logger.info(" [Preview] %s", url)
# Discover links from full page (async dry-run)
try:
response = await client.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"
},
timeout=10,
)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0]
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
except Exception as e:
logger.warning(
"⚠️ Warning: Could not extract links from %s: %s", url, e
)
else:
task = asyncio.create_task(
self.scrape_page_async(url, semaphore, client)
@@ -2039,7 +2095,6 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
"description": args.description or f"Use when working with {args.name}",
"base_url": effective_url,
"selectors": {
"main_content": "div[role='main']",
"title": "title",
"code_blocks": "pre code",
},