Files
skill-seekers-reference/tests/test_sync_config_e2e.py
yusyus 83b9a695ba feat: add sync-config command to detect and update config start_urls (#306)
## Summary

Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation,
diffs discovered URLs against a config's start_urls, and optionally writes the
updated list back with --apply.

- BFS link discovery with configurable depth (default 2), max-pages, rate-limit
- Respects url_patterns.include/exclude from config
- Supports optional nav_seed_urls config field
- Handles both unified (sources array) and legacy flat config formats
- MCP tool sync_config included
- 57 tests (39 unit + 18 E2E with local HTTP server)
- Fixed CI: renamed summary job to "Tests" to match branch protection rule

Closes #306
2026-03-15 02:16:32 +03:00

627 lines
21 KiB
Python

#!/usr/bin/env python3
"""End-to-end tests for the sync-config command.
Uses a local HTTP server with realistic multi-page HTML navigation to test
the full pipeline: BFS crawl -> link discovery -> diff -> config update.
Also includes an integration test against a real public docs site.
"""
import json
import os
import subprocess
import sys
import tempfile
import threading
import unittest
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path

import pytest

from skill_seekers.cli.sync_config import discover_urls, sync_config
# ---------------------------------------------------------------------------
# Local test HTTP server
# ---------------------------------------------------------------------------
# Simulates a docs site with this navigation structure:
#
# /docs/ (index — links to guide, api, faq)
# /docs/guide (links to guide/install, guide/usage)
# /docs/guide/install (leaf page)
# /docs/guide/usage (leaf page, links back to guide)
# /docs/api (links to api/auth, api/users)
# /docs/api/auth (leaf page)
# /docs/api/users (leaf page)
# /docs/faq (leaf page)
# /blog/post-1 (outside /docs/ — should be excluded)
# In-memory site fixture: maps a URL path to the full HTML document that the
# test server returns verbatim for that path.
_SITE_PAGES = {
    "/docs/": """<!DOCTYPE html><html><head><title>Docs Home</title></head><body>
<h1>Documentation</h1>
<nav>
<a href="/docs/guide">Guide</a>
<a href="/docs/api">API Reference</a>
<a href="/docs/faq">FAQ</a>
<a href="/blog/post-1">Blog</a>
<a href="https://github.com/example/repo">GitHub</a>
</nav>
</body></html>""",
    "/docs/guide": """<!DOCTYPE html><html><body>
<h1>Guide</h1>
<a href="/docs/guide/install">Installation</a>
<a href="/docs/guide/usage">Usage</a>
<a href="/docs/">Back to docs</a>
</body></html>""",
    "/docs/guide/install": """<!DOCTYPE html><html><body>
<h1>Installation</h1><p>pip install example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/guide/usage": """<!DOCTYPE html><html><body>
<h1>Usage</h1><p>import example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/api": """<!DOCTYPE html><html><body>
<h1>API Reference</h1>
<a href="/docs/api/auth">Authentication</a>
<a href="/docs/api/users">Users</a>
</body></html>""",
    "/docs/api/auth": """<!DOCTYPE html><html><body>
<h1>Authentication</h1><p>Use tokens.</p>
</body></html>""",
    "/docs/api/users": """<!DOCTYPE html><html><body>
<h1>Users API</h1><p>CRUD operations.</p>
</body></html>""",
    "/docs/faq": """<!DOCTYPE html><html><body>
<h1>FAQ</h1><p>Common questions.</p>
</body></html>""",
    "/blog/post-1": """<!DOCTYPE html><html><body>
<h1>Blog Post</h1><p>This is a blog post outside /docs/.</p>
</body></html>""",
}
# All docs pages that should be discovered (excluding /blog/)
# Paths only — tests prepend the server's scheme/host/port at runtime.
_ALL_DOC_URLS_PATHS = {
    "/docs/",
    "/docs/guide",
    "/docs/guide/install",
    "/docs/guide/usage",
    "/docs/api",
    "/docs/api/auth",
    "/docs/api/users",
    "/docs/faq",
}
class _TestHandler(SimpleHTTPRequestHandler):
    """HTTP handler that serves the in-memory ``_SITE_PAGES`` fixture."""

    def do_GET(self):
        # Drop query string and fragment before looking the path up.
        clean_path = self.path.split("?")[0].split("#")[0]
        page = _SITE_PAGES.get(clean_path)
        if page is None:
            self.send_error(404)
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(page.encode("utf-8"))

    def log_message(self, format, *args):  # noqa: ARG002
        # Keep test output quiet: drop the default per-request logging.
        pass
def _start_server() -> tuple[HTTPServer, int]:
    """Spin up the fixture HTTP server on an OS-assigned free port.

    Returns ``(server, port)``.  The serving thread is a daemon, so it
    terminates automatically with the test process.
    """
    httpd = HTTPServer(("127.0.0.1", 0), _TestHandler)
    threading.Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd, httpd.server_address[1]
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _write_config(config: dict) -> Path:
"""Write a config dict to a temp JSON file and return its path."""
tmp = tempfile.mktemp(suffix=".json")
with open(tmp, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
return Path(tmp)
# ---------------------------------------------------------------------------
# E2E tests using local HTTP server
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigE2E(unittest.TestCase):
    """End-to-end tests using a local HTTP server with realistic HTML."""

    @classmethod
    def setUpClass(cls):
        # One shared server for the whole class; port is OS-assigned.
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"

    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()

    def _tmp_config(self, config: dict) -> Path:
        """Write *config* to a temp file that is removed even on failure.

        The previous pattern called ``path.unlink()`` at the end of each
        test, which leaked the temp file whenever an assertion failed
        first; ``addCleanup`` runs regardless of the test outcome.
        """
        path = _write_config(config)
        self.addCleanup(path.unlink)
        return path

    # -- discover_urls --

    def test_discover_finds_all_doc_pages(self):
        """BFS should discover all 8 /docs/ pages from the root."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        self.assertEqual(discovered, expected)

    def test_discover_excludes_blog(self):
        """Pages outside /docs/ base_url should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        blog_url = f"http://127.0.0.1:{self.port}/blog/post-1"
        self.assertNotIn(blog_url, discovered)

    def test_discover_excludes_external(self):
        """External URLs (github.com) should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        self.assertFalse(
            any("github.com" in u for u in discovered),
            "External URLs should not be discovered",
        )

    def test_discover_depth_1_finds_direct_links_only(self):
        """Depth 1 from root should find guide, api, faq but NOT nested pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=1,
            rate_limit=0,
        )
        # Direct children of /docs/
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered)
        # Nested pages should NOT be present (they're at depth 2)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered)

    def test_discover_with_include_pattern(self):
        """Include pattern should filter results."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            include_patterns=["/api"],
            depth=3,
            rate_limit=0,
        )
        # Only /api/ pages should be discovered
        for url in discovered:
            self.assertIn("/api", url, f"URL {url} does not match include pattern /api")

    def test_discover_with_exclude_pattern(self):
        """Exclude pattern should remove matching pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            exclude_patterns=["/faq"],
            depth=3,
            rate_limit=0,
        )
        faq_url = f"http://127.0.0.1:{self.port}/docs/faq"
        self.assertNotIn(faq_url, discovered)
        # Other pages should still be found
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)

    def test_discover_max_pages_limit(self):
        """max_pages should cap discovery."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            max_pages=3,
            rate_limit=0,
        )
        self.assertLessEqual(len(discovered), 3)

    # -- sync_config (full pipeline with file I/O) --

    def test_sync_config_dry_run_detects_new_pages(self):
        """Dry-run should detect pages missing from the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/faq",
                    ],
                }
            ],
        }
        path = self._tmp_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertFalse(result["applied"])
        self.assertGreater(len(result["added"]), 0, "Should detect new pages")
        # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself
        # should all be in added
        self.assertGreaterEqual(result["total_discovered"], 6)
        # File should NOT be modified
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertEqual(len(saved["sources"][0]["start_urls"]), 2)

    def test_sync_config_apply_updates_config(self):
        """--apply should write all discovered URLs to the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
                }
            ],
        }
        path = self._tmp_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        # Verify the file was updated
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        saved_urls = saved["sources"][0]["start_urls"]
        self.assertEqual(len(saved_urls), result["total_discovered"])
        # All expected URLs should be present
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        for url in expected:
            self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}")

    def test_sync_config_idempotent(self):
        """Running sync twice with --apply should be a no-op the second time."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                }
            ],
        }
        path = self._tmp_config(config)
        # First run: should apply changes
        result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result1["applied"])
        self.assertGreater(len(result1["added"]), 0)
        # Second run: should detect no changes
        result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertFalse(result2["applied"])
        self.assertEqual(result2["added"], [])
        self.assertEqual(result2["removed"], [])

    def test_sync_config_detects_removed_pages(self):
        """Pages in config but not discovered should show as removed."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
                    ],
                }
            ],
        }
        path = self._tmp_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertIn(
            f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
            result["removed"],
        )

    def test_sync_config_preserves_other_config_fields(self):
        """--apply should only modify start_urls, preserving all other fields."""
        config = {
            "name": "my-skill",
            "description": "Important skill description",
            "version": "1.0.0",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    "selectors": {"main_content": "article", "title": "h1"},
                    "url_patterns": {"include": [], "exclude": []},
                    "rate_limit": 0.5,
                    "max_pages": 100,
                },
                {
                    "type": "github",
                    "repo": "owner/repo",
                },
            ],
        }
        path = self._tmp_config(config)
        sync_config(str(path), apply=True, depth=3, rate_limit=0)
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        # Non-start_urls fields should be untouched
        self.assertEqual(saved["name"], "my-skill")
        self.assertEqual(saved["description"], "Important skill description")
        self.assertEqual(saved["version"], "1.0.0")
        self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article")
        self.assertEqual(saved["sources"][0]["rate_limit"], 0.5)
        self.assertEqual(saved["sources"][1]["type"], "github")
        self.assertEqual(saved["sources"][1]["repo"], "owner/repo")
        # start_urls should be updated
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)

    def test_sync_config_with_nav_seed_urls(self):
        """nav_seed_urls should be used as BFS seeds instead of start_urls."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    # Only seed from /docs/api — should only discover API pages
                    "nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"],
                }
            ],
        }
        path = self._tmp_config(config)
        result = sync_config(str(path), apply=False, depth=1, rate_limit=0)
        # Should discover at least the API seed page
        self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages")
        # All added URLs should be under /docs/
        for url in result["added"]:
            self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}")

    def test_sync_config_legacy_format(self):
        """Legacy flat config format should work end-to-end."""
        config = {
            "name": "test-site",
            "base_url": self.base_url,
            "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
        }
        path = self._tmp_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["start_urls"]), 1)
# ---------------------------------------------------------------------------
# CLI subprocess tests
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigCLIE2E(unittest.TestCase):
    """Test the CLI entry point via subprocess."""

    @classmethod
    def setUpClass(cls):
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"

    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()

    def _tmp_config(self, config: dict) -> Path:
        """Write *config* to a temp file that is removed even on failure.

        Fixes a leak in the original tests: ``path.unlink()`` at the end
        of a test body never runs when an assertion fails first, so each
        failing run left a stray temp file behind.
        """
        path = _write_config(config)
        self.addCleanup(path.unlink)
        return path

    def test_cli_dry_run(self):
        """CLI dry-run should print diff and exit 0."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    # Only one URL configured — the rest should show as "new"
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    # Seed from root to discover all pages
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = self._tmp_config(config)
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # Should mention new pages in the output (logged to stderr)
        combined = result.stderr.lower() + result.stdout.lower()
        self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}")

    def test_cli_apply(self):
        """CLI --apply should update the config file."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = self._tmp_config(config)
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--apply",
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)

    def test_cli_help(self):
        """CLI --help should print usage and exit 0."""
        result = subprocess.run(
            [sys.executable, "-m", "skill_seekers.cli.sync_config", "--help"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertEqual(result.returncode, 0)
        self.assertIn("sync", result.stdout.lower())
        self.assertIn("--config", result.stdout)
        self.assertIn("--apply", result.stdout)
        self.assertIn("--depth", result.stdout)

    def test_cli_missing_config_exits_nonzero(self):
        """CLI with a non-existent config should fail."""
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                "/nonexistent/path/config.json",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertNotEqual(result.returncode, 0)
# ---------------------------------------------------------------------------
# Integration test against real public site
# ---------------------------------------------------------------------------
@pytest.mark.integration
class TestSyncConfigRealSite(unittest.TestCase):
    """Integration test against a real public docs site.

    Skipped by default (use ``-m integration`` to run).
    Crawls docs.python.org — small, stable, well-structured — and
    therefore requires network access.  (The docstring previously said
    httpbin.org, which the test does not actually use.)
    """

    def test_discover_urls_real_http(self):
        """discover_urls should work against a real HTTP server."""
        # Use Python docs — small, stable, well-structured
        discovered = discover_urls(
            base_url="https://docs.python.org/3/library/",
            seed_urls=["https://docs.python.org/3/library/functions.html"],
            depth=1,
            max_pages=10,  # keep the crawl tiny
            rate_limit=0.5,  # be polite to the public server
        )
        # Should find at least the seed page itself
        self.assertGreater(len(discovered), 0)
        # All discovered URLs should be under the base
        for url in discovered:
            self.assertTrue(
                url.startswith("https://docs.python.org/3/library/"),
                f"Discovered URL outside base: {url}",
            )
if __name__ == "__main__":
    # Allow running this file directly; pytest markers are ignored here,
    # so all tests (including integration) will run.
    unittest.main()