fix(#300): centralize selector fallback, fix dry-run link discovery, and smart --config routing
- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page) instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at JSON to determine unified (sources array → unified_scraper) vs simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -603,9 +603,30 @@ Common Workflows:
|
||||
log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO)
|
||||
logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")
|
||||
|
||||
# Validate source provided
|
||||
if not args.source:
|
||||
parser.error("source is required")
|
||||
# Validate source provided (config file can serve as source)
|
||||
if not args.source and not args.config:
|
||||
parser.error("source is required (or use --config to specify a config file)")
|
||||
|
||||
# If config is provided but no source, peek at the JSON to route correctly
|
||||
if not args.source and args.config:
|
||||
import json
|
||||
|
||||
try:
|
||||
with open(args.config) as f:
|
||||
config_peek = json.load(f)
|
||||
if "sources" in config_peek:
|
||||
# Unified format → route to unified_scraper via config type detection
|
||||
args.source = args.config
|
||||
elif "base_url" in config_peek:
|
||||
# Simple web config → route to doc_scraper by using the base_url
|
||||
args.source = config_peek["base_url"]
|
||||
# source will be detected as web URL; --config is already set
|
||||
else:
|
||||
parser.error("Config file must contain 'sources' (unified) or 'base_url' (web)")
|
||||
except json.JSONDecodeError as e:
|
||||
parser.error(f"Cannot parse config file as JSON: {e}")
|
||||
except FileNotFoundError:
|
||||
parser.error(f"Config file not found: {args.config}")
|
||||
|
||||
# Execute create command
|
||||
command = CreateCommand(args)
|
||||
|
||||
Reference in New Issue
Block a user