fix(#300): centralize selector fallback, fix dry-run link discovery, and add smart --config routing

- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to
  eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links
  are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page)
  instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic
  kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at the JSON to decide whether the config
  is unified (sources array → unified_scraper) or simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md
  to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-26 22:25:59 +03:00
parent b6d4dd8423
commit 4c8e16c8b1
9 changed files with 426 additions and 194 deletions

View File

@@ -265,16 +265,16 @@ class TestResolveConfigPath:
@patch("skill_seekers.cli.config_fetcher.fetch_config_from_api")
def test_auto_fetch_enabled(self, mock_fetch, tmp_path):
"""Test that auto-fetch runs when enabled."""
# Mock fetch to return a path
mock_config = tmp_path / "configs" / "react.json"
# Use a name that does NOT exist locally (react.json exists in configs/)
mock_config = tmp_path / "configs" / "obscure_framework.json"
mock_config.parent.mkdir(exist_ok=True)
mock_config.write_text('{"name": "react"}')
mock_config.write_text('{"name": "obscure_framework"}')
mock_fetch.return_value = mock_config
result = resolve_config_path("react.json", auto_fetch=True)
result = resolve_config_path("obscure_framework.json", auto_fetch=True)
# Verify fetch was called
mock_fetch.assert_called_once_with("react", destination="configs")
mock_fetch.assert_called_once_with("obscure_framework", destination="configs")
assert result is not None
assert result.exists()

View File

@@ -67,22 +67,30 @@ async def test_mcp_validate_legacy_config():
"""Test that MCP can validate legacy configs"""
print("\n✓ Testing MCP validate_config_tool with legacy config...")
# Use existing legacy config
config_path = "configs/react.json"
# Create a truly legacy config (no "sources" key — just base_url + selectors)
legacy_config = {
"name": "test-legacy",
"base_url": "https://example.com/",
"selectors": {"main_content": "main", "title": "h1", "code_blocks": "pre code"},
"url_patterns": {"include": [], "exclude": []},
"rate_limit": 0.5,
}
if not Path(config_path).exists():
print(f" ⚠️ Skipping: {config_path} not found")
return
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
json.dump(legacy_config, f)
config_path = f.name
args = {"config_path": config_path}
result = await validate_config_tool(args)
try:
args = {"config_path": config_path}
result = await validate_config_tool(args)
# Check result
text = result[0].text
assert "✅" in text, f"Expected success, got: {text}"
assert "Legacy" in text, f"Expected legacy format detected, got: {text}"
# Legacy configs are rejected since v2.11.0 — validator should detect the format
text = result[0].text
assert "LEGACY" in text.upper(), f"Expected legacy format detected, got: {text}"
print(" ✅ MCP correctly validates legacy config")
print(" ✅ MCP correctly detects legacy config format")
finally:
os.unlink(config_path)
@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")