fix(#300): centralize selector fallback, fix dry-run link discovery, and smart --config routing
- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page) instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at JSON to determine unified (sources array → unified_scraper) vs simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -603,9 +603,30 @@ Common Workflows:
|
||||
log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO)
|
||||
logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
|
||||
|
||||
# Validate source provided
|
||||
if not args.source:
|
||||
parser.error("source is required")
|
||||
# Validate source provided (config file can serve as source)
|
||||
if not args.source and not args.config:
|
||||
parser.error("source is required (or use --config to specify a config file)")
|
||||
|
||||
# If config is provided but no source, peek at the JSON to route correctly
|
||||
if not args.source and args.config:
|
||||
import json
|
||||
|
||||
try:
|
||||
with open(args.config) as f:
|
||||
config_peek = json.load(f)
|
||||
if "sources" in config_peek:
|
||||
# Unified format → route to unified_scraper via config type detection
|
||||
args.source = args.config
|
||||
elif "base_url" in config_peek:
|
||||
# Simple web config → route to doc_scraper by using the base_url
|
||||
args.source = config_peek["base_url"]
|
||||
# source will be detected as web URL; --config is already set
|
||||
else:
|
||||
parser.error("Config file must contain 'sources' (unified) or 'base_url' (web)")
|
||||
except json.JSONDecodeError as e:
|
||||
parser.error(f"Cannot parse config file as JSON: {e}")
|
||||
except FileNotFoundError:
|
||||
parser.error(f"Config file not found: {args.config}")
|
||||
|
||||
# Execute create command
|
||||
command = CreateCommand(args)
|
||||
|
||||
@@ -52,6 +52,18 @@ from skill_seekers.cli.utils import setup_logging
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Ordered fallback CSS selectors tried when the configured main-content
# selector is missing or matches nothing. 'body' is deliberately excluded:
# it matches every page and would mask genuinely broken selectors.
FALLBACK_MAIN_SELECTORS = [
    "main", 'div[role="main"]', "article", '[role="main"]',
    ".content", ".doc-content", "#main-content",
]
|
||||
|
||||
|
||||
def infer_description_from_docs(
|
||||
base_url: str, first_page_content: str | None = None, name: str = ""
|
||||
@@ -275,6 +287,35 @@ class DocToSkillConverter:
|
||||
except Exception as e:
|
||||
logger.warning("⚠️ Failed to clear checkpoint: %s", e)
|
||||
|
||||
def _find_main_content(self, soup: Any) -> tuple[Any, str | None]:
    """Locate the page's main content element.

    The selector configured under ``selectors.main_content`` is tried
    first; if it is absent or matches nothing, each entry of
    FALLBACK_MAIN_SELECTORS is tried in order. ``<body>`` is never used
    as a last resort, since it always matches and would hide selector
    misconfiguration.

    Args:
        soup: BeautifulSoup-parsed page.

    Returns:
        ``(element, selector)`` for the first selector that matched, or
        ``(None, None)`` when nothing matched at all.
    """
    configured = self.config.get("selectors", {}).get("main_content")
    # Configured selector first (when present), then the shared fallbacks.
    candidates = ([configured] if configured else []) + FALLBACK_MAIN_SELECTORS
    for candidate in candidates:
        element = soup.select_one(candidate)
        if element:
            return element, candidate
    return None, None
|
||||
|
||||
def extract_content(self, soup: Any, url: str) -> dict[str, Any]:
|
||||
"""Extract content with improved code and pattern detection"""
|
||||
page = {
|
||||
@@ -294,9 +335,17 @@ class DocToSkillConverter:
|
||||
if title_elem:
|
||||
page["title"] = self.clean_text(title_elem.get_text())
|
||||
|
||||
# Find main content
|
||||
main_selector = selectors.get("main_content", 'div[role="main"]')
|
||||
main = soup.select_one(main_selector)
|
||||
# Extract links from entire page (always, even if main content not found).
|
||||
# This allows discovery of navigation links outside the main content area.
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
# Strip anchor fragments to avoid treating #anchors as separate pages
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in page["links"]:
|
||||
page["links"].append(href)
|
||||
|
||||
# Find main content using shared fallback logic
|
||||
main, _selector_used = self._find_main_content(soup)
|
||||
|
||||
if not main:
|
||||
logger.warning("⚠ No content: %s", url)
|
||||
@@ -329,15 +378,6 @@ class DocToSkillConverter:
|
||||
|
||||
page["content"] = "\n\n".join(paragraphs)
|
||||
|
||||
# Extract links from entire page (not just main content)
|
||||
# This allows discovery of navigation links outside the main content area
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
# Strip anchor fragments to avoid treating #anchors as separate pages
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in page["links"]:
|
||||
page["links"].append(href)
|
||||
|
||||
return page
|
||||
|
||||
def _extract_markdown_content(self, content: str, url: str) -> dict[str, Any]:
|
||||
@@ -1070,16 +1110,13 @@ class DocToSkillConverter:
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
|
||||
main_selector = self.config.get("selectors", {}).get(
|
||||
"main_content", 'div[role="main"]'
|
||||
)
|
||||
main = soup.select_one(main_selector)
|
||||
|
||||
if main:
|
||||
for link in main.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
if self.is_valid_url(href) and href not in self.visited_urls:
|
||||
self.pending_urls.append(href)
|
||||
# Discover links from full page (not just main content)
|
||||
# to match real scrape path behaviour in extract_content()
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in self.visited_urls:
|
||||
self.pending_urls.append(href)
|
||||
except Exception as e:
|
||||
# Failed to extract links in fast mode, continue anyway
|
||||
logger.warning("⚠️ Warning: Could not extract links from %s: %s", url, e)
|
||||
@@ -1249,6 +1286,25 @@ class DocToSkillConverter:
|
||||
if unlimited or len(self.visited_urls) <= preview_limit:
|
||||
if self.dry_run:
|
||||
logger.info(" [Preview] %s", url)
|
||||
# Discover links from full page (async dry-run)
|
||||
try:
|
||||
response = await client.get(
|
||||
url,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"
|
||||
},
|
||||
timeout=10,
|
||||
)
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = urljoin(url, link["href"])
|
||||
href = href.split("#")[0]
|
||||
if self.is_valid_url(href) and href not in self.visited_urls:
|
||||
self.pending_urls.append(href)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"⚠️ Warning: Could not extract links from %s: %s", url, e
|
||||
)
|
||||
else:
|
||||
task = asyncio.create_task(
|
||||
self.scrape_page_async(url, semaphore, client)
|
||||
@@ -2039,7 +2095,6 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"description": args.description or f"Use when working with {args.name}",
|
||||
"base_url": effective_url,
|
||||
"selectors": {
|
||||
"main_content": "div[role='main']",
|
||||
"title": "title",
|
||||
"code_blocks": "pre code",
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user