#!/usr/bin/env python3
"""End-to-end tests for the sync-config command.

Uses a local HTTP server with realistic multi-page HTML navigation to test
the full pipeline: BFS crawl -> link discovery -> diff -> config update.

Also includes an integration test against a real public docs site.
"""

import json
import subprocess
import sys
import tempfile
import threading
import unittest
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path

import pytest

from skill_seekers.cli.sync_config import discover_urls, sync_config

# ---------------------------------------------------------------------------
# Local test HTTP server
# ---------------------------------------------------------------------------

# Simulates a docs site with this navigation structure:
#
#   /docs/                (index — links to guide, api, faq)
#   /docs/guide           (links to guide/install, guide/usage)
#   /docs/guide/install   (leaf page)
#   /docs/guide/usage     (leaf page, links back to guide)
#   /docs/api             (links to api/auth, api/users)
#   /docs/api/auth        (leaf page)
#   /docs/api/users       (leaf page)
#   /docs/faq             (leaf page)
#   /blog/post-1          (outside /docs/ — should be excluded)
#
# NOTE(review): the anchor markup in these pages was restored from the
# navigation structure documented above and the visible link text; the exact
# hrefs (root-relative here) and the blog/GitHub links on the index page
# should be confirmed against the upstream fixture.
_SITE_PAGES = {
    "/docs/": """<!DOCTYPE html>
<html><head><title>Docs Home</title></head>
<body>
<h1>Documentation</h1>
<nav>
  <a href="/docs/guide">Guide</a>
  <a href="/docs/api">API Reference</a>
  <a href="/docs/faq">FAQ</a>
  <a href="/blog/post-1">Blog</a>
  <a href="https://github.com/example/repo">GitHub</a>
</nav>
</body></html>""",
    "/docs/guide": """<!DOCTYPE html>
<html><body>
<h1>Guide</h1>
<a href="/docs/guide/install">Installation</a>
<a href="/docs/guide/usage">Usage</a>
<a href="/docs/">Back to docs</a>
</body></html>""",
    "/docs/guide/install": """<!DOCTYPE html>
<html><body>
<h1>Installation</h1>
<p>pip install example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/guide/usage": """<!DOCTYPE html>
<html><body>
<h1>Usage</h1>
<p>import example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/api": """<!DOCTYPE html>
<html><body>
<h1>API Reference</h1>
<a href="/docs/api/auth">Authentication</a>
<a href="/docs/api/users">Users</a>
</body></html>""",
    "/docs/api/auth": """<!DOCTYPE html>
<html><body>
<h1>Authentication</h1>
<p>Use tokens.</p>
</body></html>""",
    "/docs/api/users": """<!DOCTYPE html>
<html><body>
<h1>Users API</h1>
<p>CRUD operations.</p>
</body></html>""",
    "/docs/faq": """<!DOCTYPE html>
<html><body>
<h1>FAQ</h1>
<p>Common questions.</p>
</body></html>""",
    "/blog/post-1": """<!DOCTYPE html>
<html><body>
<h1>Blog Post</h1>
<p>This is a blog post outside /docs/.</p>
</body></html>""",
}

# All docs pages that should be discovered (excluding /blog/)
_ALL_DOC_URLS_PATHS = {
    "/docs/",
    "/docs/guide",
    "/docs/guide/install",
    "/docs/guide/usage",
    "/docs/api",
    "/docs/api/auth",
    "/docs/api/users",
    "/docs/faq",
}

# Module executed by the CLI subprocess tests via ``python -m``.
_CLI_MODULE = "skill_seekers.cli.sync_config"


class _TestHandler(SimpleHTTPRequestHandler):
    """Serve pages from the in-memory _SITE_PAGES dict."""

    def do_GET(self):
        """Answer GET with the matching in-memory page, or 404 if unknown."""
        # Ignore query string and fragment: pages are keyed by path only.
        path = self.path.split("?")[0].split("#")[0]
        content = _SITE_PAGES.get(path)
        if content is None:
            self.send_error(404)
            return
        body = content.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):  # noqa: ARG002
        """Suppress request logging during tests."""


def _start_server() -> tuple[HTTPServer, int]:
    """Start a local HTTP server on a random port. Returns (server, port).

    The server runs in a daemon thread; callers are responsible for calling
    ``shutdown()`` and ``server_close()`` on the returned server.
    """
    # Port 0 asks the OS for any free port, avoiding clashes between runs.
    server = HTTPServer(("127.0.0.1", 0), _TestHandler)
    port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    return server, port


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _write_config(config: dict) -> Path:
    """Write a config dict to a temp JSON file and return its path.

    The file is created atomically (unlike the deprecated, race-prone
    ``tempfile.mktemp``) and is *not* deleted automatically; the caller owns
    its cleanup.
    """
    with tempfile.NamedTemporaryFile(
        "w", suffix=".json", delete=False, encoding="utf-8"
    ) as f:
        json.dump(config, f, indent=2)
    return Path(f.name)


def _read_config(path: Path) -> dict:
    """Load and return the JSON config stored at *path*."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _run_cli(*args: str, timeout: int = 30) -> subprocess.CompletedProcess:
    """Run the sync-config CLI module in a subprocess, capturing text output."""
    return subprocess.run(
        [sys.executable, "-m", _CLI_MODULE, *args],
        capture_output=True,
        text=True,
        timeout=timeout,
    )


class _LocalServerTestCase(unittest.TestCase):
    """Shared fixture: one local HTTP server per test class.

    Defines no tests itself; concrete test classes inherit the server
    lifecycle and the small URL / temp-config helpers.
    """

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"

    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()
        # shutdown() only stops serve_forever(); the listening socket must be
        # closed explicitly or it leaks until interpreter exit.
        cls.server.server_close()
        super().tearDownClass()

    def _url(self, path: str) -> str:
        """Return the absolute URL of *path* on the local test server."""
        return f"http://127.0.0.1:{self.port}{path}"

    def _temp_config(self, config: dict) -> Path:
        """Write *config* to a temp file that is removed even if the test fails."""
        path = _write_config(config)
        self.addCleanup(path.unlink, missing_ok=True)
        return path


# ---------------------------------------------------------------------------
# E2E tests using local HTTP server
# ---------------------------------------------------------------------------


@pytest.mark.e2e
class TestSyncConfigE2E(_LocalServerTestCase):
    """End-to-end tests using a local HTTP server with realistic HTML."""

    def _discover(self, **overrides):
        """Call discover_urls from the docs root (depth 3, no rate limit).

        Keyword *overrides* replace or extend the default arguments.
        """
        kwargs = {
            "base_url": self.base_url,
            "seed_urls": [self.base_url],
            "depth": 3,
            "rate_limit": 0,
        }
        kwargs.update(overrides)
        return discover_urls(**kwargs)

    # -- discover_urls --

    def test_discover_finds_all_doc_pages(self):
        """BFS should discover all 8 /docs/ pages from the root."""
        discovered = self._discover()
        expected = {self._url(p) for p in _ALL_DOC_URLS_PATHS}
        self.assertEqual(discovered, expected)

    def test_discover_excludes_blog(self):
        """Pages outside /docs/ base_url should be excluded."""
        discovered = self._discover()
        self.assertNotIn(self._url("/blog/post-1"), discovered)

    def test_discover_excludes_external(self):
        """External URLs (github.com) should be excluded."""
        discovered = self._discover()
        self.assertFalse(
            any("github.com" in u for u in discovered),
            "External URLs should not be discovered",
        )

    def test_discover_depth_1_finds_direct_links_only(self):
        """Depth 1 from root should find guide, api, faq but NOT nested pages."""
        discovered = self._discover(depth=1)
        # Direct children of /docs/
        self.assertIn(self._url("/docs/guide"), discovered)
        self.assertIn(self._url("/docs/api"), discovered)
        self.assertIn(self._url("/docs/faq"), discovered)
        # Nested pages should NOT be present (they're at depth 2)
        self.assertNotIn(self._url("/docs/guide/install"), discovered)
        self.assertNotIn(self._url("/docs/api/auth"), discovered)

    def test_discover_with_include_pattern(self):
        """Include pattern should filter results."""
        discovered = self._discover(include_patterns=["/api"])
        # Only /api/ pages should be discovered
        # NOTE(review): this loop passes vacuously if nothing is discovered;
        # consider also asserting a non-empty result once the seed-page
        # filtering semantics of discover_urls are confirmed.
        for url in discovered:
            self.assertIn("/api", url, f"URL {url} does not match include pattern /api")

    def test_discover_with_exclude_pattern(self):
        """Exclude pattern should remove matching pages."""
        discovered = self._discover(exclude_patterns=["/faq"])
        self.assertNotIn(self._url("/docs/faq"), discovered)
        # Other pages should still be found
        self.assertIn(self._url("/docs/guide"), discovered)

    def test_discover_max_pages_limit(self):
        """max_pages should cap discovery."""
        discovered = self._discover(max_pages=3)
        self.assertLessEqual(len(discovered), 3)

    # -- sync_config (full pipeline with file I/O) --

    def test_sync_config_dry_run_detects_new_pages(self):
        """Dry-run should detect pages missing from the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        self._url("/docs/guide"),
                        self._url("/docs/faq"),
                    ],
                }
            ],
        }
        path = self._temp_config(config)

        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)

        self.assertFalse(result["applied"])
        self.assertGreater(len(result["added"]), 0, "Should detect new pages")
        # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself
        # should all be in added
        self.assertGreaterEqual(result["total_discovered"], 6)

        # File should NOT be modified
        saved = _read_config(path)
        self.assertEqual(len(saved["sources"][0]["start_urls"]), 2)

    def test_sync_config_apply_updates_config(self):
        """--apply should write all discovered URLs to the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [self._url("/docs/guide")],
                }
            ],
        }
        path = self._temp_config(config)

        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])

        # Verify the file was updated
        saved_urls = _read_config(path)["sources"][0]["start_urls"]
        self.assertEqual(len(saved_urls), result["total_discovered"])

        # All expected URLs should be present
        expected = {self._url(p) for p in _ALL_DOC_URLS_PATHS}
        for url in expected:
            self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}")

    def test_sync_config_idempotent(self):
        """Running sync twice with --apply should be a no-op the second time."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                }
            ],
        }
        path = self._temp_config(config)

        # First run: should apply changes
        result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result1["applied"])
        self.assertGreater(len(result1["added"]), 0)

        # Second run: should detect no changes
        result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertFalse(result2["applied"])
        self.assertEqual(result2["added"], [])
        self.assertEqual(result2["removed"], [])

    def test_sync_config_detects_removed_pages(self):
        """Pages in config but not discovered should show as removed."""
        stale_url = self._url("/docs/old-page-that-no-longer-exists")
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [self._url("/docs/guide"), stale_url],
                }
            ],
        }
        path = self._temp_config(config)

        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)

        self.assertIn(stale_url, result["removed"])

    def test_sync_config_preserves_other_config_fields(self):
        """--apply should only modify start_urls, preserving all other fields."""
        config = {
            "name": "my-skill",
            "description": "Important skill description",
            "version": "1.0.0",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    "selectors": {"main_content": "article", "title": "h1"},
                    "url_patterns": {"include": [], "exclude": []},
                    "rate_limit": 0.5,
                    "max_pages": 100,
                },
                {
                    "type": "github",
                    "repo": "owner/repo",
                },
            ],
        }
        path = self._temp_config(config)

        sync_config(str(path), apply=True, depth=3, rate_limit=0)

        saved = _read_config(path)
        docs_source, github_source = saved["sources"][0], saved["sources"][1]

        # Non-start_urls fields should be untouched
        self.assertEqual(saved["name"], "my-skill")
        self.assertEqual(saved["description"], "Important skill description")
        self.assertEqual(saved["version"], "1.0.0")
        self.assertEqual(docs_source["selectors"]["main_content"], "article")
        self.assertEqual(docs_source["rate_limit"], 0.5)
        self.assertEqual(github_source["type"], "github")
        self.assertEqual(github_source["repo"], "owner/repo")

        # start_urls should be updated
        self.assertGreater(len(docs_source["start_urls"]), 0)

    def test_sync_config_with_nav_seed_urls(self):
        """nav_seed_urls should be used as BFS seeds instead of start_urls."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    # Only seed from /docs/api — should only discover API pages
                    "nav_seed_urls": [self._url("/docs/api")],
                }
            ],
        }
        path = self._temp_config(config)

        result = sync_config(str(path), apply=False, depth=1, rate_limit=0)

        # Should discover at least the API seed page
        self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages")
        # All added URLs should be under /docs/
        for url in result["added"]:
            self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}")

    def test_sync_config_legacy_format(self):
        """Legacy flat config format should work end-to-end."""
        config = {
            "name": "test-site",
            "base_url": self.base_url,
            "start_urls": [self._url("/docs/guide")],
        }
        path = self._temp_config(config)

        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])

        saved = _read_config(path)
        self.assertGreater(len(saved["start_urls"]), 1)


# ---------------------------------------------------------------------------
# CLI subprocess tests
# ---------------------------------------------------------------------------


@pytest.mark.e2e
class TestSyncConfigCLIE2E(_LocalServerTestCase):
    """Test the CLI entry point via subprocess."""

    def _faq_only_config(self, name: str = "test") -> dict:
        """Return a config listing only /docs/faq but seeding the crawl at the root."""
        return {
            "name": name,
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    # Only one URL configured — the rest should show as "new"
                    "start_urls": [self._url("/docs/faq")],
                    # Seed from root to discover all pages
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }

    def test_cli_dry_run(self):
        """CLI dry-run should print diff and exit 0."""
        path = self._temp_config(self._faq_only_config())

        result = _run_cli(
            "--config", str(path), "--depth", "3", "--rate-limit", "0"
        )

        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # Should mention new pages in the output (logged to stderr)
        combined = result.stderr.lower() + result.stdout.lower()
        self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}")

    def test_cli_apply(self):
        """CLI --apply should update the config file."""
        path = self._temp_config(self._faq_only_config())

        result = _run_cli(
            "--config", str(path), "--apply", "--depth", "3", "--rate-limit", "0"
        )

        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        saved = _read_config(path)
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)

    def test_cli_help(self):
        """CLI --help should print usage and exit 0."""
        result = _run_cli("--help", timeout=10)

        self.assertEqual(result.returncode, 0)
        self.assertIn("sync", result.stdout.lower())
        self.assertIn("--config", result.stdout)
        self.assertIn("--apply", result.stdout)
        self.assertIn("--depth", result.stdout)

    def test_cli_missing_config_exits_nonzero(self):
        """CLI with a non-existent config should fail."""
        result = _run_cli("--config", "/nonexistent/path/config.json", timeout=10)

        self.assertNotEqual(result.returncode, 0)


# ---------------------------------------------------------------------------
# Integration test against real public site
# ---------------------------------------------------------------------------


@pytest.mark.integration
class TestSyncConfigRealSite(unittest.TestCase):
    """Integration test against a real public docs site.

    Carries the ``integration`` marker so it can be selected with
    ``-m integration``; whether it is skipped by default depends on the
    project's pytest configuration. Crawls a small slice of
    docs.python.org and therefore needs network access.
    """

    def test_discover_urls_real_http(self):
        """discover_urls should work against a real HTTP server."""
        library_base = "https://docs.python.org/3/library/"
        # Use Python docs — small, stable, well-structured
        discovered = discover_urls(
            base_url=library_base,
            seed_urls=["https://docs.python.org/3/library/functions.html"],
            depth=1,
            max_pages=10,
            rate_limit=0.5,
        )

        # Should find at least the seed page itself
        self.assertGreater(len(discovered), 0)
        # All discovered URLs should be under the base
        for url in discovered:
            self.assertTrue(
                url.startswith(library_base),
                f"Discovered URL outside base: {url}",
            )


if __name__ == "__main__":
    unittest.main()