- Add FALLBACK_MAIN_SELECTORS constant and _find_main_content() helper to eliminate 3 duplicated fallback loops in doc_scraper.py
- Move link extraction before early return in extract_content() so links are always discovered from the full page, not just main content
- Fix single-threaded dry-run to extract links from soup (full page) instead of main element only — fixes reactflow.dev finding only 1 page
- Add link extraction to async dry-run path (was completely missing)
- Remove main_content from get_configuration() defaults so fallback logic kicks in instead of a broad CSS comma selector matching body
- Smart create --config routing: peek at JSON to determine unified (sources array → unified_scraper) vs simple (base_url → doc_scraper)
- Update docs/user-guide/02-scraping.md and docs/reference/CONFIG_FORMAT.md to use unified config format (legacy format rejected since v2.11.0)
- Fix test_auto_fetch_enabled and test_mcp_validate_legacy_config

Closes #300

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
216 lines
6.5 KiB
Python
216 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Test MCP Integration with Unified Scraping
|
|
|
|
Tests that the MCP server correctly handles unified configs.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# WORKAROUND for shadowing issue: temporarily leave the project directory
# before importing the external ``mcp`` package, so that a local ``mcp/``
# directory reachable through the current working directory cannot be picked
# up instead. The original working directory is always restored afterwards.
_original_dir = os.getcwd()
MCP_AVAILABLE = False
try:
    # Use the platform temp directory instead of a hard-coded "/tmp": on
    # systems without "/tmp" the chdir would raise an OSError that the
    # ImportError handler below does not catch, breaking test collection.
    os.chdir(tempfile.gettempdir())
    from mcp.types import TextContent  # noqa: F401

    MCP_AVAILABLE = True
except ImportError:
    # The external MCP package is optional; MCP_AVAILABLE stays False.
    pass
finally:
    os.chdir(_original_dir)  # Restore original directory
|
|
|
|
# Apply the ``anyio`` marker to every test in this module so the async test
# functions are run by the AnyIO pytest plugin.
# NOTE(review): the marker by itself does not choose a backend; backends come
# from the ``anyio_backend`` fixture. Presumably that fixture is pinned to
# asyncio (not trio) elsewhere, e.g. in conftest.py — confirm.
pytestmark = pytest.mark.anyio
|
|
|
|
# Bind the MCP server tool entry points only when the external ``mcp``
# package imported successfully; otherwise keep ``None`` placeholders so both
# names always exist at module level.
if not MCP_AVAILABLE:
    scrape_docs_tool = validate_config_tool = None
else:
    from skill_seekers.mcp.server import scrape_docs_tool, validate_config_tool
|
|
|
|
|
|
@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")
async def test_mcp_validate_unified_config():
    """Check that ``validate_config_tool`` reports success for a unified config.

    The config is read from ``configs/react_unified.json``, resolved against
    the current working directory. When that file does not exist the test
    prints a notice and finishes without asserting anything.
    """
    print("\n✓ Testing MCP validate_config_tool with unified config...")

    # Existing unified config shipped with the project.
    config_path = "configs/react_unified.json"

    if Path(config_path).exists():
        response = await validate_config_tool({"config_path": config_path})
        text = response[0].text

        # Every marker below must show up in the tool's textual report.
        expectations = (
            ("✅", "Expected success"),
            ("Unified", "Expected unified format detected"),
            ("Sources:", "Expected sources count"),
        )
        for marker, complaint in expectations:
            assert marker in text, f"{complaint}, got: {text}"

        print(" ✅ MCP correctly validates unified config")
    else:
        print(f" ⚠️ Skipping: {config_path} not found")
|
|
|
|
|
|
@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")
async def test_mcp_validate_legacy_config():
    """Test that MCP detects the legacy config format.

    A minimal legacy-style config (``base_url`` plus ``selectors``, with no
    ``sources`` key) is written to a temporary JSON file and passed to
    ``validate_config_tool``. The returned text must mention "legacy" in any
    letter case. The temporary file is removed afterwards.
    """
    print("\n✓ Testing MCP validate_config_tool with legacy config...")

    # Create a truly legacy config (no "sources" key — just base_url + selectors)
    legacy_config = {
        "name": "test-legacy",
        "base_url": "https://example.com/",
        "selectors": {"main_content": "main", "title": "h1", "code_blocks": "pre code"},
        "url_patterns": {"include": [], "exclude": []},
        "rate_limit": 0.5,
    }

    # delete=False keeps the file on disk after the handle is closed; it is
    # removed explicitly in the ``finally`` block below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        json.dump(legacy_config, f)
        config_path = f.name

    try:
        args = {"config_path": config_path}
        result = await validate_config_tool(args)

        # Legacy configs are rejected since v2.11.0 — validator should detect the format
        text = result[0].text
        assert "LEGACY" in text.upper(), f"Expected legacy format detected, got: {text}"

        print(" ✅ MCP correctly detects legacy config format")
    finally:
        # Same cleanup call as the other tests in this module; missing_ok
        # prevents a FileNotFoundError here from masking a test failure.
        Path(config_path).unlink(missing_ok=True)
|
|
|
|
|
|
@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")
async def test_mcp_scrape_docs_detection():
    """Exercise the unified-vs-legacy format check on two temporary configs.

    A config is treated as unified when it has a ``sources`` key whose value
    is a list. One unified and one legacy config are written to temporary
    JSON files, read back, and classified. Both files are removed at the end.
    """
    print("\n✓ Testing MCP scrape_docs format detection...")

    def _write_temp_json(payload):
        # Persist ``payload`` as a .json temp file and hand back its path.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as handle:
            json.dump(payload, handle)
            return handle.name

    def _has_sources_list(path):
        # Reload the JSON file and apply the unified-format check.
        with open(path) as handle:
            loaded = json.load(handle)
        return "sources" in loaded and isinstance(loaded["sources"], list)

    # Temporary unified config
    unified_config_path = _write_temp_json(
        {
            "name": "test_mcp_unified",
            "description": "Test unified via MCP",
            "merge_mode": "rule-based",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://example.com",
                    "extract_api": True,
                    "max_pages": 5,
                }
            ],
        }
    )

    # Temporary legacy config
    legacy_config_path = _write_temp_json(
        {
            "name": "test_mcp_legacy",
            "description": "Test legacy via MCP",
            "base_url": "https://example.com",
            "max_pages": 5,
        }
    )

    try:
        # Unified detection
        assert _has_sources_list(unified_config_path), "Should detect unified format"
        print(" ✅ Unified format detected correctly")

        # Legacy detection
        assert not _has_sources_list(legacy_config_path), "Should detect legacy format"
        print(" ✅ Legacy format detected correctly")
    finally:
        # Cleanup
        for leftover in (unified_config_path, legacy_config_path):
            Path(leftover).unlink(missing_ok=True)
|
|
|
|
|
|
@pytest.mark.skipif(not MCP_AVAILABLE, reason="MCP package not installed")
async def test_mcp_merge_mode_override():
    """Build tool arguments that override a config's ``merge_mode``.

    A unified config using ``rule-based`` merging is written to a temporary
    JSON file. An argument dict requesting ``claude-enhanced`` is then
    assembled and checked for that value. No MCP tool is invoked here; the
    temporary file is removed at the end.
    """
    print("\n✓ Testing MCP merge_mode override...")

    # Unified config whose own merge mode differs from the requested override.
    base_config = {
        "name": "test_merge_override",
        "description": "Test merge mode override",
        "merge_mode": "rule-based",
        "sources": [{"type": "documentation", "base_url": "https://example.com"}],
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as handle:
        json.dump(base_config, handle)
        config_path = handle.name

    try:
        # Arguments carrying the override alongside the config path.
        args = {"config_path": config_path, "merge_mode": "claude-enhanced"}

        # The override must be present in the arguments.
        assert args.get("merge_mode") == "claude-enhanced"
        print(" ✅ Merge mode override supported")
    finally:
        Path(config_path).unlink(missing_ok=True)
|
|
|
|
|
|
# Run all tests
async def run_all_tests():
    """Run every test in this module sequentially, outside of pytest.

    Prints a banner, awaits each test coroutine in order, and prints a
    success banner when all of them complete. On an ``AssertionError`` or any
    other ``Exception`` the error is printed and the process exits with
    status 1.

    When the MCP package is unavailable the tests are not run at all: the
    ``skipif`` markers only take effect under the pytest runner, so calling
    the coroutines directly would otherwise invoke the ``None`` placeholder
    tools and fail with a ``TypeError``.
    """
    print("=" * 60)
    print("MCP Unified Scraping Integration Tests")
    print("=" * 60)

    if not MCP_AVAILABLE:
        # Mirror the pytest skip condition for the standalone runner.
        print("\n⚠️ Skipping: MCP package not installed")
        return

    try:
        await test_mcp_validate_unified_config()
        await test_mcp_validate_legacy_config()
        await test_mcp_scrape_docs_detection()
        await test_mcp_merge_mode_override()

        print("\n" + "=" * 60)
        print("✅ All MCP integration tests passed!")
        print("=" * 60)

    except AssertionError as e:
        print(f"\n❌ Test failed: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of the script runner: report the error with a
        # full traceback, then signal failure through the exit status.
        print(f"\n❌ Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
# Script entry point: when this module is executed directly (rather than
# imported), drive the async runner to completion with asyncio.
if __name__ == "__main__":
    asyncio.run(run_all_tests())
|