From 83b9a695ba817f4011a1959c86163c895081b4d2 Mon Sep 17 00:00:00 2001 From: yusyus Date: Sun, 15 Mar 2026 02:16:32 +0300 Subject: [PATCH] feat: add sync-config command to detect and update config start_urls (#306) ## Summary Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation, diffs discovered URLs against a config's start_urls, and optionally writes the updated list back with --apply. - BFS link discovery with configurable depth (default 2), max-pages, rate-limit - Respects url_patterns.include/exclude from config - Supports optional nav_seed_urls config field - Handles both unified (sources array) and legacy flat config formats - MCP tool sync_config included - 57 tests (39 unit + 18 E2E with local HTTP server) - Fixed CI: renamed summary job to "Tests" to match branch protection rule Closes #306 --- pyproject.toml | 1 + .../cli/arguments/sync_config.py | 64 ++ src/skill_seekers/cli/main.py | 1 + src/skill_seekers/cli/parsers/__init__.py | 2 + .../cli/parsers/sync_config_parser.py | 30 + src/skill_seekers/cli/sync_config.py | 325 +++++++++ src/skill_seekers/mcp/server_fastmcp.py | 49 ++ src/skill_seekers/mcp/tools/__init__.py | 5 + .../mcp/tools/sync_config_tools.py | 85 +++ tests/test_cli_parsers.py | 10 +- tests/test_sync_config.py | 590 +++++++++++++++++ tests/test_sync_config_e2e.py | 626 ++++++++++++++++++ 12 files changed, 1783 insertions(+), 5 deletions(-) create mode 100644 src/skill_seekers/cli/arguments/sync_config.py create mode 100644 src/skill_seekers/cli/parsers/sync_config_parser.py create mode 100644 src/skill_seekers/cli/sync_config.py create mode 100644 src/skill_seekers/mcp/tools/sync_config_tools.py create mode 100644 tests/test_sync_config.py create mode 100644 tests/test_sync_config_e2e.py diff --git a/pyproject.toml b/pyproject.toml index b2f8fae..6e4eac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -244,6 +244,7 @@ skill-seekers-update = "skill_seekers.cli.incremental_updater:main" 
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main" skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" +skill-seekers-sync-config = "skill_seekers.cli.sync_config:main" [tool.setuptools] package-dir = {"" = "src"} diff --git a/src/skill_seekers/cli/arguments/sync_config.py b/src/skill_seekers/cli/arguments/sync_config.py new file mode 100644 index 0000000..86ced16 --- /dev/null +++ b/src/skill_seekers/cli/arguments/sync_config.py @@ -0,0 +1,64 @@ +"""Sync-config command argument definitions. + +Shared between sync_config.py (standalone) and parsers/sync_config_parser.py +(unified CLI) so the two entry points never drift out of sync. +""" + +import argparse + + +def add_sync_config_arguments(parser: argparse.ArgumentParser) -> None: + """Add all sync-config arguments to *parser*.""" + + parser.add_argument( + "--config", + "-c", + type=str, + required=True, + help="Path to the config JSON file to sync", + metavar="FILE", + ) + parser.add_argument( + "--apply", + action="store_true", + default=False, + help="Write updated start_urls back to the config file (default: dry-run)", + ) + parser.add_argument( + "--depth", + type=int, + default=2, + help="BFS crawl depth from seed pages (default: 2)", + ) + parser.add_argument( + "--max-pages", + type=int, + default=500, + help="Maximum pages to discover (default: 500)", + ) + parser.add_argument( + "--rate-limit", + type=float, + default=None, + help="Override config rate-limit (seconds between requests)", + ) + parser.add_argument( + "--source-index", + type=int, + default=0, + help="Index of the documentation source to sync (default: 0)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + default=False, + help="Verbose output", + ) + parser.add_argument( + "--quiet", + "-q", + action="store_true", + default=False, + help="Suppress informational output", + ) diff --git a/src/skill_seekers/cli/main.py 
b/src/skill_seekers/cli/main.py index 83c1fd7..8092b5e 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -67,6 +67,7 @@ COMMAND_MODULES = { "multilang": "skill_seekers.cli.multilang_support", "quality": "skill_seekers.cli.quality_metrics", "workflows": "skill_seekers.cli.workflows_command", + "sync-config": "skill_seekers.cli.sync_config", } diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py index 7cca3c4..f0aaf2b 100644 --- a/src/skill_seekers/cli/parsers/__init__.py +++ b/src/skill_seekers/cli/parsers/__init__.py @@ -30,6 +30,7 @@ from .update_parser import UpdateParser from .multilang_parser import MultilangParser from .quality_parser import QualityParser from .workflows_parser import WorkflowsParser +from .sync_config_parser import SyncConfigParser # Registry of all parsers (in order of usage frequency) PARSERS = [ @@ -56,6 +57,7 @@ PARSERS = [ MultilangParser(), QualityParser(), WorkflowsParser(), + SyncConfigParser(), ] diff --git a/src/skill_seekers/cli/parsers/sync_config_parser.py b/src/skill_seekers/cli/parsers/sync_config_parser.py new file mode 100644 index 0000000..f4af85a --- /dev/null +++ b/src/skill_seekers/cli/parsers/sync_config_parser.py @@ -0,0 +1,30 @@ +"""Parser for the sync-config subcommand.""" + +import argparse + +from .base import SubcommandParser + + +class SyncConfigParser(SubcommandParser): + """Subcommand parser for ``skill-seekers sync-config``.""" + + @property + def name(self) -> str: + return "sync-config" + + @property + def help(self) -> str: + return "Diff/update a config's start_urls against the live docs site" + + @property + def description(self) -> str: + return ( + "Crawl navigation links from a docs site, compare them against " + "the config's start_urls, and optionally write the updated list " + "back with --apply." 
+ ) + + def add_arguments(self, parser: argparse.ArgumentParser) -> None: + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + add_sync_config_arguments(parser) diff --git a/src/skill_seekers/cli/sync_config.py b/src/skill_seekers/cli/sync_config.py new file mode 100644 index 0000000..dad4adb --- /dev/null +++ b/src/skill_seekers/cli/sync_config.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python3 +"""Sync a config file's start_urls against what's currently live on a docs site. + +Crawls navigation links from seed pages, diffs them against the config's +``start_urls``, and optionally writes the updated list back. + +Usage: + skill-seekers sync-config --config configs/claude-code.json + skill-seekers sync-config --config configs/claude-code.json --apply +""" + +import argparse +import json +import logging +import sys +import time +from collections import deque +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup + +from skill_seekers.cli.utils import sanitize_url, setup_logging + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# URL filtering (mirrors DocToSkillConverter.is_valid_url logic) +# --------------------------------------------------------------------------- + + +def _is_valid_url( + url: str, + base_url: str, + include_patterns: list[str], + exclude_patterns: list[str], +) -> bool: + """Return True if *url* passes include/exclude pattern filters.""" + if not url.startswith(base_url): + return False + if include_patterns and not any(p in url for p in include_patterns): + return False + return not any(p in url for p in exclude_patterns) + + +# --------------------------------------------------------------------------- +# Lightweight BFS link discovery +# --------------------------------------------------------------------------- + + +def discover_urls( + base_url: str, + seed_urls: list[str], + include_patterns: list[str] | None = 
None, + exclude_patterns: list[str] | None = None, + depth: int = 2, + max_pages: int = 500, + rate_limit: float = 0.5, +) -> set[str]: + """BFS-crawl *seed_urls* and return all discovered internal URLs. + + Only follows ``<a href>`` links on HTML pages; does not download + full page content. Applies the same include/exclude filtering as + :class:`DocToSkillConverter`. + + Args: + base_url: Only URLs under this prefix are accepted. + seed_urls: Starting points for the BFS. + include_patterns: Substring patterns a URL must contain (any). + exclude_patterns: Substring patterns that disqualify a URL. + depth: Maximum number of BFS hops from the seed pages. + max_pages: Stop after discovering this many unique URLs. + rate_limit: Seconds to wait between HTTP requests. + + Returns: + Set of discovered absolute URLs (fragments stripped). + """ + includes = include_patterns or [] + excludes = exclude_patterns or [] + + visited: set[str] = set() + # Queue entries are (url, current_depth) + queue: deque[tuple[str, int]] = deque() + for u in seed_urls: + u = sanitize_url(u) + queue.append((u, 0)) + + discovered: set[str] = set() + + while queue and len(discovered) < max_pages: + url, cur_depth = queue.popleft() + if url in visited: + continue + visited.add(url) + + if not _is_valid_url(url, base_url, includes, excludes): + continue + + logger.debug(" [depth %d] %s", cur_depth, url) + + try: + headers = {"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"} + resp = requests.get(url, headers=headers, timeout=15) + resp.raise_for_status() + except Exception as e: + logger.warning(" Could not fetch %s: %s", url, e) + continue + + # Only mark as "discovered" after a successful fetch — 404s and + # other errors mean the page no longer exists on the live site.
+ discovered.add(url) + + # Follow links if we haven't hit the depth limit + if cur_depth < depth: + soup = BeautifulSoup(resp.content, "html.parser") + for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) + href = href.split("#")[0] # strip fragment + href = sanitize_url(href) + if href not in visited and _is_valid_url(href, base_url, includes, excludes): + queue.append((href, cur_depth + 1)) + + if rate_limit > 0: + time.sleep(rate_limit) + + return discovered + + +# --------------------------------------------------------------------------- +# Diff logic +# --------------------------------------------------------------------------- + + +def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]: + """Compare *discovered* URLs against a *configured* list. + + Returns: + ``(added, removed)`` — both sorted lists of URLs. + """ + configured_set = set(configured) + added = sorted(discovered - configured_set) + removed = sorted(configured_set - discovered) + return added, removed + + +# --------------------------------------------------------------------------- +# Config helpers +# --------------------------------------------------------------------------- + + +def _get_doc_source(config: dict, source_index: int = 0) -> dict | None: + """Extract the documentation source dict from *config*. + + Handles both the unified format (``sources`` array) and legacy flat + format (fields at the top level). 
+ """ + sources = config.get("sources") + if sources: + doc_sources = [s for s in sources if s.get("type") == "documentation"] + if source_index < len(doc_sources): + return doc_sources[source_index] + return None + + # Legacy flat format — treat the whole config as a single source + if config.get("base_url"): + return config + return None + + +def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None: + """Write *urls* into the correct ``start_urls`` field in *config*.""" + sources = config.get("sources") + if sources: + doc_sources = [s for s in sources if s.get("type") == "documentation"] + if source_index < len(doc_sources): + doc_sources[source_index]["start_urls"] = urls + return + # Legacy flat format + config["start_urls"] = urls + + +# --------------------------------------------------------------------------- +# Main orchestrator +# --------------------------------------------------------------------------- + + +def sync_config( + config_path: str, + apply: bool = False, + depth: int = 2, + max_pages: int = 500, + rate_limit: float | None = None, + source_index: int = 0, +) -> dict: + """Run the sync-config workflow. + + Returns: + Dict with keys ``added``, ``removed``, ``total_discovered``, + ``total_configured``, ``applied``. 
+ """ + # Load config + with open(config_path, encoding="utf-8") as f: + config = json.load(f) + + source = _get_doc_source(config, source_index) + if source is None: + logger.error("No documentation source found at index %d in %s", source_index, config_path) + return { + "added": [], + "removed": [], + "total_discovered": 0, + "total_configured": 0, + "applied": False, + "error": "No documentation source found", + } + + base_url: str = source["base_url"] + configured_urls: list[str] = source.get("start_urls") or [] + seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url] + url_patterns = source.get("url_patterns", {}) + includes: list[str] = url_patterns.get("include", []) + excludes: list[str] = url_patterns.get("exclude", []) + effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5) + + logger.info("Syncing config: %s", config_path) + logger.info(" Base URL: %s", base_url) + logger.info(" Seed URLs: %d", len(seed_urls)) + logger.info(" Configured: %d start_urls", len(configured_urls)) + logger.info(" Depth: %d", depth) + logger.info(" Rate limit: %.1fs", effective_rate) + logger.info("") + + # Discover + discovered = discover_urls( + base_url=base_url, + seed_urls=seed_urls, + include_patterns=includes, + exclude_patterns=excludes, + depth=depth, + max_pages=max_pages, + rate_limit=effective_rate, + ) + + # Diff + added, removed = diff_urls(discovered, configured_urls) + + # Report + if added: + logger.info("New pages (%d):", len(added)) + for url in added: + path = url.replace(base_url, "/") + logger.info(" + %s", path) + if removed: + logger.info("Removed pages (%d):", len(removed)) + for url in removed: + path = url.replace(base_url, "/") + logger.info(" - %s", path) + + if not added and not removed: + logger.info("Config is up to date. 
No changes detected.") + else: + logger.info("") + logger.info( + "Summary: %d new, %d removed (discovered %d total, configured %d)", + len(added), + len(removed), + len(discovered), + len(configured_urls), + ) + + applied = False + if apply and (added or removed): + new_urls = sorted(discovered) + _set_start_urls(config, source_index, new_urls) + with open(config_path, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2, ensure_ascii=False) + f.write("\n") + logger.info("Updated %s (%d start_urls)", config_path, len(new_urls)) + applied = True + elif added or removed: + logger.info("Run with --apply to update %s", config_path) + + return { + "added": added, + "removed": removed, + "total_discovered": len(discovered), + "total_configured": len(configured_urls), + "applied": applied, + } + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + """CLI entry point for ``skill-seekers sync-config``.""" + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser( + prog="skill-seekers-sync-config", + description="Sync a config's start_urls against what's live on the docs site.", + ) + add_sync_config_arguments(parser) + args = parser.parse_args() + + setup_logging(verbose=args.verbose, quiet=args.quiet) + + result = sync_config( + config_path=args.config, + apply=args.apply, + depth=args.depth, + max_pages=args.max_pages, + rate_limit=args.rate_limit, + source_index=args.source_index, + ) + + if result.get("error"): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/skill_seekers/mcp/server_fastmcp.py b/src/skill_seekers/mcp/server_fastmcp.py index 8e729b2..f955c33 100644 --- a/src/skill_seekers/mcp/server_fastmcp.py +++ b/src/skill_seekers/mcp/server_fastmcp.py @@ -103,6 +103,8 @@ try: # Splitting tools split_config_impl, 
submit_config_impl, + # Sync config tools + sync_config_impl, upload_skill_impl, validate_config_impl, # Workflow tools @@ -144,6 +146,7 @@ except ImportError: scrape_video_impl, split_config_impl, submit_config_impl, + sync_config_impl, upload_skill_impl, validate_config_impl, list_workflows_impl, @@ -251,6 +254,52 @@ async def validate_config(config_path: str) -> str: return str(result) +# ============================================================================ +# SYNC CONFIG TOOLS (1 tool) +# ============================================================================ + + +@safe_tool_decorator(description="Sync a config's start_urls against what's live on the docs site.") +async def sync_config( + config_path: str, + apply: bool = False, + depth: int = 2, + max_pages: int = 500, + rate_limit: float | None = None, + source_index: int = 0, +) -> str: + """ + Sync a config file's start_urls against the live docs site. + + Crawls seed/nav pages, discovers internal links, and diffs against the + config's existing start_urls. Optionally writes the update with apply=True. + + Args: + config_path: Path to the config JSON file. + apply: Write changes back to the config file (default: False). + depth: BFS crawl depth from seed pages (default: 2). + max_pages: Maximum URLs to discover (default: 500). + rate_limit: Override config rate limit (seconds between requests). + source_index: Index of the documentation source to sync (default: 0). + + Returns: + Report of added/removed URLs. 
+ """ + result = await sync_config_impl( + { + "config_path": config_path, + "apply": apply, + "depth": depth, + "max_pages": max_pages, + "rate_limit": rate_limit, + "source_index": source_index, + } + ) + if isinstance(result, list) and result: + return result[0].text if hasattr(result[0], "text") else str(result[0]) + return str(result) + + # ============================================================================ # SCRAPING TOOLS (10 tools) # ============================================================================ diff --git a/src/skill_seekers/mcp/tools/__init__.py b/src/skill_seekers/mcp/tools/__init__.py index 66284c4..6783c9d 100644 --- a/src/skill_seekers/mcp/tools/__init__.py +++ b/src/skill_seekers/mcp/tools/__init__.py @@ -99,6 +99,9 @@ from .vector_db_tools import ( from .vector_db_tools import ( export_to_weaviate_impl, ) +from .sync_config_tools import ( + sync_config_tool as sync_config_impl, +) from .workflow_tools import ( create_workflow_tool as create_workflow_impl, ) @@ -151,6 +154,8 @@ __all__ = [ "export_to_chroma_impl", "export_to_faiss_impl", "export_to_qdrant_impl", + # Sync config tools + "sync_config_impl", # Workflow tools "list_workflows_impl", "get_workflow_impl", diff --git a/src/skill_seekers/mcp/tools/sync_config_tools.py b/src/skill_seekers/mcp/tools/sync_config_tools.py new file mode 100644 index 0000000..2597ef4 --- /dev/null +++ b/src/skill_seekers/mcp/tools/sync_config_tools.py @@ -0,0 +1,85 @@ +"""Sync-config MCP tool for Skill Seekers MCP Server. + +Provides the ``sync_config`` tool that diffs a config's start_urls against +the live docs site and optionally applies the update. 
+""" + +try: + from mcp.types import TextContent +except ImportError: + + class TextContent: + """Fallback TextContent for when MCP is not installed.""" + + def __init__(self, type: str, text: str): + self.type = type + self.text = text + + +async def sync_config_tool(args: dict) -> list[TextContent]: + """Sync a config file's start_urls against what's live on the docs site. + + Crawls seed/nav pages, discovers internal links, diffs against the + config's existing ``start_urls``, and optionally writes the update. + + Args: + args: Dictionary containing: + - config_path (str): Path to the config JSON file. + - apply (bool, optional): Write changes back (default: False). + - depth (int, optional): BFS crawl depth (default: 2). + - max_pages (int, optional): Max URLs to discover (default: 500). + - rate_limit (float, optional): Seconds between requests. + - source_index (int, optional): Documentation source index (default: 0). + + Returns: + List[TextContent]: Report of added/removed URLs, or error message. 
+ """ + config_path = args.get("config_path", "") + if not config_path: + return [TextContent(type="text", text="Error: config_path is required")] + + try: + from skill_seekers.cli.sync_config import sync_config + + result = sync_config( + config_path=config_path, + apply=args.get("apply", False), + depth=args.get("depth", 2), + max_pages=args.get("max_pages", 500), + rate_limit=args.get("rate_limit"), + source_index=args.get("source_index", 0), + ) + except FileNotFoundError: + return [TextContent(type="text", text=f"Error: Config file not found: {config_path}")] + except Exception as e: + return [TextContent(type="text", text=f"Error syncing config: {e}")] + + if result.get("error"): + return [TextContent(type="text", text=f"Error: {result['error']}")] + + lines = [] + added = result["added"] + removed = result["removed"] + + if added: + lines.append(f"New pages ({len(added)}):") + for url in added: + lines.append(f" + {url}") + if removed: + lines.append(f"Removed pages ({len(removed)}):") + for url in removed: + lines.append(f" - {url}") + if not added and not removed: + lines.append("Config is up to date. 
No changes detected.") + else: + lines.append( + f"\nSummary: {len(added)} new, {len(removed)} removed " + f"(discovered {result['total_discovered']}, " + f"configured {result['total_configured']})" + ) + if result["applied"]: + lines.append(f"Updated {config_path}") + else: + lines.append(f"Run with apply=true to update {config_path}") + + return [TextContent(type="text", text="\n".join(lines))] diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py index 8d240d1..55339db 100644 --- a/tests/test_cli_parsers.py +++ b/tests/test_cli_parsers.py @@ -24,12 +24,12 @@ class TestParserRegistry: def test_all_parsers_registered(self): """Test that all parsers are registered.""" - assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}" + assert len(PARSERS) == 24, f"Expected 24 parsers, got {len(PARSERS)}" def test_get_parser_names(self): """Test getting list of parser names.""" names = get_parser_names() - assert len(names) == 23 + assert len(names) == 24 assert "scrape" in names assert "github" in names assert "package" in names @@ -243,9 +243,9 @@ class TestBackwardCompatibility: assert cmd in names, f"Command '{cmd}' not found in parser registry!" def test_command_count_matches(self): - """Test that we have exactly 23 commands (includes create, workflows, word, and video commands).""" - assert len(PARSERS) == 23 - assert len(get_parser_names()) == 23 + """Test that we have exactly 24 commands (includes create, workflows, word, video, and sync-config commands).""" + assert len(PARSERS) == 24 + assert len(get_parser_names()) == 24 if __name__ == "__main__": diff --git a/tests/test_sync_config.py b/tests/test_sync_config.py new file mode 100644 index 0000000..905ec85 --- /dev/null +++ b/tests/test_sync_config.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +"""Tests for the sync-config command. 
+ +Covers: +- URL diffing logic +- URL filtering (_is_valid_url) +- BFS discovery with mocked HTTP responses +- Config loading (unified + legacy formats) +- --apply writes correct JSON +- CLI argument parsing +- MCP tool wrapper +""" + +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from skill_seekers.cli.sync_config import ( + _get_doc_source, + _is_valid_url, + _set_start_urls, + diff_urls, + discover_urls, + sync_config, +) + + +# --------------------------------------------------------------------------- +# diff_urls +# --------------------------------------------------------------------------- + + +class TestDiffUrls(unittest.TestCase): + """Test the URL diffing logic.""" + + def test_no_changes(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = set(configured) + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, []) + self.assertEqual(removed, []) + + def test_added_urls(self): + configured = ["https://example.com/a"] + discovered = {"https://example.com/a", "https://example.com/b"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/b"]) + self.assertEqual(removed, []) + + def test_removed_urls(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = {"https://example.com/a"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, []) + self.assertEqual(removed, ["https://example.com/b"]) + + def test_both_added_and_removed(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = {"https://example.com/a", "https://example.com/c"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/c"]) + self.assertEqual(removed, ["https://example.com/b"]) + + def test_empty_configured(self): + added, removed = diff_urls({"https://example.com/a"}, []) + 
self.assertEqual(added, ["https://example.com/a"]) + self.assertEqual(removed, []) + + def test_empty_discovered(self): + added, removed = diff_urls(set(), ["https://example.com/a"]) + self.assertEqual(added, []) + self.assertEqual(removed, ["https://example.com/a"]) + + def test_results_sorted(self): + configured = ["https://example.com/z"] + discovered = {"https://example.com/b", "https://example.com/a"} + added, _ = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/a", "https://example.com/b"]) + + +# --------------------------------------------------------------------------- +# _is_valid_url +# --------------------------------------------------------------------------- + + +class TestIsValidUrl(unittest.TestCase): + """Test the URL filtering logic.""" + + def test_url_under_base(self): + self.assertTrue( + _is_valid_url("https://docs.example.com/guide", "https://docs.example.com/", [], []) + ) + + def test_url_not_under_base(self): + self.assertFalse( + _is_valid_url("https://other.com/guide", "https://docs.example.com/", [], []) + ) + + def test_include_pattern_match(self): + self.assertTrue( + _is_valid_url( + "https://docs.example.com/docs/en/guide", + "https://docs.example.com/", + ["/docs/en/"], + [], + ) + ) + + def test_include_pattern_no_match(self): + self.assertFalse( + _is_valid_url( + "https://docs.example.com/blog/post", + "https://docs.example.com/", + ["/docs/en/"], + [], + ) + ) + + def test_exclude_pattern(self): + self.assertFalse( + _is_valid_url( + "https://docs.example.com/docs/en/changelog", + "https://docs.example.com/", + [], + ["/changelog"], + ) + ) + + def test_include_and_exclude(self): + # Matches include but also matches exclude -> rejected + self.assertFalse( + _is_valid_url( + "https://docs.example.com/docs/en/changelog", + "https://docs.example.com/", + ["/docs/en/"], + ["/changelog"], + ) + ) + + def test_no_patterns_all_valid(self): + self.assertTrue( + 
_is_valid_url("https://docs.example.com/anything", "https://docs.example.com/", [], []) + ) + + +# --------------------------------------------------------------------------- +# _get_doc_source / _set_start_urls +# --------------------------------------------------------------------------- + + +class TestConfigHelpers(unittest.TestCase): + """Test config extraction for both unified and legacy formats.""" + + def test_unified_format(self): + config = { + "name": "test", + "sources": [ + {"type": "documentation", "base_url": "https://docs.example.com/"}, + {"type": "github", "repo": "owner/repo"}, + ], + } + source = _get_doc_source(config) + self.assertIsNotNone(source) + self.assertEqual(source["base_url"], "https://docs.example.com/") + + def test_unified_format_second_source(self): + config = { + "name": "test", + "sources": [ + {"type": "documentation", "base_url": "https://first.com/"}, + {"type": "documentation", "base_url": "https://second.com/"}, + ], + } + source = _get_doc_source(config, source_index=1) + self.assertEqual(source["base_url"], "https://second.com/") + + def test_unified_format_invalid_index(self): + config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]} + self.assertIsNone(_get_doc_source(config)) + + def test_legacy_flat_format(self): + config = {"name": "test", "base_url": "https://docs.example.com/"} + source = _get_doc_source(config) + self.assertEqual(source["base_url"], "https://docs.example.com/") + + def test_no_source_found(self): + config = {"name": "test"} + self.assertIsNone(_get_doc_source(config)) + + def test_set_start_urls_unified(self): + config = { + "sources": [ + {"type": "documentation", "base_url": "https://x.com/", "start_urls": []}, + ] + } + _set_start_urls(config, 0, ["https://x.com/a", "https://x.com/b"]) + self.assertEqual(config["sources"][0]["start_urls"], ["https://x.com/a", "https://x.com/b"]) + + def test_set_start_urls_legacy(self): + config = {"base_url": "https://x.com/", "start_urls": 
[]} + _set_start_urls(config, 0, ["https://x.com/new"]) + self.assertEqual(config["start_urls"], ["https://x.com/new"]) + + +# --------------------------------------------------------------------------- +# discover_urls (with mocked HTTP) +# --------------------------------------------------------------------------- + + +class TestDiscoverUrls(unittest.TestCase): + """Test BFS link discovery with mocked HTTP responses.""" + + def _make_html(self, links: list[str]) -> str: + hrefs = "".join(f'<a href="{u}">link</a>' for u in links) + return f"<html><body>{hrefs}</body></html>" + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_basic_discovery(self, mock_get): + """Discover links from a single seed page.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/page-a", + "https://docs.example.com/page-b", + "https://other.com/external", # should be filtered out + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/", result) + self.assertIn("https://docs.example.com/page-a", result) + self.assertIn("https://docs.example.com/page-b", result) + self.assertNotIn("https://other.com/external", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_depth_limiting(self, mock_get): + """URLs at depth > limit should be discovered but not followed.""" + # Seed returns one link + seed_html = self._make_html(["https://docs.example.com/child"]) + child_html = self._make_html(["https://docs.example.com/grandchild"]) + + mock_get.side_effect = [ + MagicMock(content=seed_html.encode(), raise_for_status=MagicMock()), + MagicMock(content=child_html.encode(), raise_for_status=MagicMock()), + ] + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, # Only
follow seed page links, not child page links + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/child", result) + # grandchild is at depth 2, which exceeds depth=1 + self.assertNotIn("https://docs.example.com/grandchild", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_max_pages_limit(self, mock_get): + """Stop after max_pages.""" + links = [f"https://docs.example.com/page-{i}" for i in range(20)] + mock_resp = MagicMock() + mock_resp.content = self._make_html(links).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + max_pages=5, + rate_limit=0, + ) + + self.assertLessEqual(len(result), 5) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_include_exclude_patterns(self, mock_get): + """Include/exclude patterns are respected.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/docs/en/guide", + "https://docs.example.com/docs/fr/guide", + "https://docs.example.com/blog/post", + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/docs/en/overview"], + include_patterns=["/docs/en/"], + exclude_patterns=["/blog/"], + depth=1, + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/docs/en/guide", result) + self.assertNotIn("https://docs.example.com/docs/fr/guide", result) + self.assertNotIn("https://docs.example.com/blog/post", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_http_error_handled_gracefully(self, mock_get): + """HTTP errors should not crash the discovery.""" + mock_get.side_effect = ConnectionError("Network error") + + result = discover_urls( + base_url="https://docs.example.com/", + 
seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + # URLs that fail to fetch are NOT added to discovered (they may + # have been removed from the live site). + self.assertEqual(result, set()) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_fragments_stripped(self, mock_get): + """URL fragments (#anchor) should be stripped.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/guide#section1", + "https://docs.example.com/guide#section2", + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + # Both anchors should resolve to the same URL + self.assertIn("https://docs.example.com/guide", result) + + +# --------------------------------------------------------------------------- +# sync_config (integration with file I/O) +# --------------------------------------------------------------------------- + + +class TestSyncConfigIntegration(unittest.TestCase): + """Test the full sync_config workflow with mocked HTTP.""" + + def _write_config(self, config: dict) -> Path: + tmp = tempfile.mktemp(suffix=".json") # noqa: SIM115 + with open(tmp, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + return Path(tmp) + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_dry_run_does_not_modify_file(self, mock_discover): + mock_discover.return_value = { + "https://docs.example.com/a", + "https://docs.example.com/b", + "https://docs.example.com/c", + } + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a"], + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=False) + self.assertFalse(result["applied"]) + 
self.assertEqual(len(result["added"]), 2) + + # File should not be modified + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(len(saved["sources"][0]["start_urls"]), 1) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_apply_writes_updated_urls(self, mock_discover): + mock_discover.return_value = { + "https://docs.example.com/a", + "https://docs.example.com/b", + } + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"], + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertTrue(result["applied"]) + self.assertEqual(result["added"], ["https://docs.example.com/b"]) + self.assertEqual(result["removed"], ["https://docs.example.com/old"]) + + # File should be updated + with open(path, encoding="utf-8") as f: + saved = json.load(f) + urls = saved["sources"][0]["start_urls"] + self.assertIn("https://docs.example.com/a", urls) + self.assertIn("https://docs.example.com/b", urls) + self.assertNotIn("https://docs.example.com/old", urls) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_no_changes_does_not_write(self, mock_discover): + urls = ["https://docs.example.com/a", "https://docs.example.com/b"] + mock_discover.return_value = set(urls) + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": urls, + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertFalse(result["applied"]) + self.assertEqual(result["added"], []) + self.assertEqual(result["removed"], []) + path.unlink() + + def test_missing_source_returns_error(self): + config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]} + path = self._write_config(config) + + 
result = sync_config(str(path)) + self.assertIn("error", result) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_legacy_config_format(self, mock_discover): + mock_discover.return_value = {"https://docs.example.com/a"} + + config = { + "name": "test", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertTrue(result["applied"]) + self.assertEqual(result["removed"], ["https://docs.example.com/old"]) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(saved["start_urls"], ["https://docs.example.com/a"]) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_nav_seed_urls_used_over_start_urls(self, mock_discover): + """When nav_seed_urls is present, it should be used as the seed.""" + mock_discover.return_value = {"https://docs.example.com/a"} + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a"], + "nav_seed_urls": [ + "https://docs.example.com/nav1", + "https://docs.example.com/nav2", + ], + } + ], + } + path = self._write_config(config) + + sync_config(str(path)) + + # Verify discover_urls was called with nav_seed_urls + call_kwargs = mock_discover.call_args[1] + self.assertEqual( + call_kwargs["seed_urls"], + ["https://docs.example.com/nav1", "https://docs.example.com/nav2"], + ) + path.unlink() + + +# --------------------------------------------------------------------------- +# CLI argument parsing +# --------------------------------------------------------------------------- + + +class TestSyncConfigCLI(unittest.TestCase): + """Test CLI argument parsing and subcommand registration.""" + + def test_sync_config_parser_registered(self): + """sync-config should be a registered 
subcommand.""" + from skill_seekers.cli.parsers import get_parser_names + + self.assertIn("sync-config", get_parser_names()) + + def test_sync_config_in_command_modules(self): + """sync-config should be in COMMAND_MODULES.""" + from skill_seekers.cli.main import COMMAND_MODULES + + self.assertIn("sync-config", COMMAND_MODULES) + + def test_arguments_created(self): + """Argument parser should accept all expected flags.""" + import argparse + + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser() + add_sync_config_arguments(parser) + + args = parser.parse_args(["--config", "test.json", "--apply", "--depth", "3"]) + self.assertEqual(args.config, "test.json") + self.assertTrue(args.apply) + self.assertEqual(args.depth, 3) + + def test_default_values(self): + """Default values should be sensible.""" + import argparse + + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser() + add_sync_config_arguments(parser) + + args = parser.parse_args(["--config", "test.json"]) + self.assertFalse(args.apply) + self.assertEqual(args.depth, 2) + self.assertEqual(args.max_pages, 500) + self.assertIsNone(args.rate_limit) + self.assertEqual(args.source_index, 0) + + +# --------------------------------------------------------------------------- +# MCP tool +# --------------------------------------------------------------------------- + + +class TestSyncConfigMCPTool(unittest.TestCase): + """Test MCP tool wrapper.""" + + def test_mcp_tool_importable(self): + """The sync_config MCP tool should be importable.""" + from skill_seekers.mcp.tools import sync_config_impl + + self.assertTrue(callable(sync_config_impl)) + + def test_mcp_tool_missing_config_path(self): + """Missing config_path should return an error.""" + import asyncio + + from skill_seekers.mcp.tools.sync_config_tools import sync_config_tool + + result = asyncio.run(sync_config_tool({})) + 
self.assertTrue(any("Error" in r.text for r in result)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sync_config_e2e.py b/tests/test_sync_config_e2e.py new file mode 100644 index 0000000..513d2f5 --- /dev/null +++ b/tests/test_sync_config_e2e.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 +"""End-to-end tests for the sync-config command. + +Uses a local HTTP server with realistic multi-page HTML navigation to test +the full pipeline: BFS crawl -> link discovery -> diff -> config update. + +Also includes an integration test against a real public docs site. +""" + +import json +import subprocess +import sys +import tempfile +import threading +import unittest +from http.server import HTTPServer, SimpleHTTPRequestHandler +from pathlib import Path + +import pytest + +from skill_seekers.cli.sync_config import discover_urls, sync_config + + +# --------------------------------------------------------------------------- +# Local test HTTP server +# --------------------------------------------------------------------------- + +# Simulates a docs site with this navigation structure: +# +# /docs/ (index — links to guide, api, faq) +# /docs/guide (links to guide/install, guide/usage) +# /docs/guide/install (leaf page) +# /docs/guide/usage (leaf page, links back to guide) +# /docs/api (links to api/auth, api/users) +# /docs/api/auth (leaf page) +# /docs/api/users (leaf page) +# /docs/faq (leaf page) +# /blog/post-1 (outside /docs/ — should be excluded) + +_SITE_PAGES = { + "/docs/": """Docs Home +

+    <nav>
+      <h1>Documentation</h1>
+      <a href="/docs/guide">Guide</a>
+      <a href="/docs/api">API</a>
+      <a href="/docs/faq">FAQ</a>
+      <a href="/blog/post-1">Blog</a>
+      <a href="https://github.com/example/example">GitHub</a>
+    </nav>
+    """,
+    "/docs/guide": """
+    <h1>Guide</h1>
+    <a href="/docs/guide/install">Installation</a>
+    <a href="/docs/guide/usage">Usage</a>
+    <a href="/docs/">Back to docs</a>
+    """,
+    "/docs/guide/install": """
+    <h1>Installation</h1>
+    <p><code>pip install example</code></p>
+    <a href="/docs/guide">Back to guide</a>
+    """,
+    "/docs/guide/usage": """
+    <h1>Usage</h1>
+    <p><code>import example</code></p>
+    <a href="/docs/guide">Back to guide</a>
+    """,
+    "/docs/api": """
+    <h1>API Reference</h1>
+    <a href="/docs/api/auth">Authentication</a>
+    <a href="/docs/api/users">Users</a>
+    """,
+    "/docs/api/auth": """
+    <h1>Authentication</h1>
+    <p>Use tokens.</p>
+    """,
+    "/docs/api/users": """
+    <h1>Users API</h1>
+    <p>CRUD operations.</p>
+    """,
+    "/docs/faq": """
+    <h1>FAQ</h1>
+    <p>Common questions.</p>
+    """,
+    "/blog/post-1": """
+    <h1>Blog Post</h1>
+    <p>This is a blog post outside /docs/.</p>
+ """, +} + +# All docs pages that should be discovered (excluding /blog/) +_ALL_DOC_URLS_PATHS = { + "/docs/", + "/docs/guide", + "/docs/guide/install", + "/docs/guide/usage", + "/docs/api", + "/docs/api/auth", + "/docs/api/users", + "/docs/faq", +} + + +class _TestHandler(SimpleHTTPRequestHandler): + """Serve pages from the in-memory _SITE_PAGES dict.""" + + def do_GET(self): + path = self.path.split("?")[0].split("#")[0] + content = _SITE_PAGES.get(path) + if content is None: + self.send_error(404) + return + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.end_headers() + self.wfile.write(content.encode("utf-8")) + + def log_message(self, format, *args): # noqa: ARG002 + pass # Suppress request logging during tests + + +def _start_server() -> tuple[HTTPServer, int]: + """Start a local HTTP server on a random port. Returns (server, port).""" + server = HTTPServer(("127.0.0.1", 0), _TestHandler) + port = server.server_address[1] + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + return server, port + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _write_config(config: dict) -> Path: + """Write a config dict to a temp JSON file and return its path.""" + tmp = tempfile.mktemp(suffix=".json") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + return Path(tmp) + + +# --------------------------------------------------------------------------- +# E2E tests using local HTTP server +# --------------------------------------------------------------------------- + + +@pytest.mark.e2e +class TestSyncConfigE2E(unittest.TestCase): + """End-to-end tests using a local HTTP server with realistic HTML.""" + + @classmethod + def setUpClass(cls): + cls.server, cls.port = _start_server() + cls.base_url = f"http://127.0.0.1:{cls.port}/docs/" + 
+ @classmethod + def tearDownClass(cls): + cls.server.shutdown() + + # -- discover_urls -- + + def test_discover_finds_all_doc_pages(self): + """BFS should discover all 8 /docs/ pages from the root.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS} + self.assertEqual(discovered, expected) + + def test_discover_excludes_blog(self): + """Pages outside /docs/ base_url should be excluded.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + blog_url = f"http://127.0.0.1:{self.port}/blog/post-1" + self.assertNotIn(blog_url, discovered) + + def test_discover_excludes_external(self): + """External URLs (github.com) should be excluded.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + self.assertFalse( + any("github.com" in u for u in discovered), + "External URLs should not be discovered", + ) + + def test_discover_depth_1_finds_direct_links_only(self): + """Depth 1 from root should find guide, api, faq but NOT nested pages.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=1, + rate_limit=0, + ) + + # Direct children of /docs/ + self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered) + self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered) + self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered) + + # Nested pages should NOT be present (they're at depth 2) + self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered) + self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered) + + def test_discover_with_include_pattern(self): + """Include pattern should filter results.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + 
include_patterns=["/api"], + depth=3, + rate_limit=0, + ) + + # Only /api/ pages should be discovered + for url in discovered: + self.assertIn("/api", url, f"URL {url} does not match include pattern /api") + + def test_discover_with_exclude_pattern(self): + """Exclude pattern should remove matching pages.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + exclude_patterns=["/faq"], + depth=3, + rate_limit=0, + ) + + faq_url = f"http://127.0.0.1:{self.port}/docs/faq" + self.assertNotIn(faq_url, discovered) + # Other pages should still be found + self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered) + + def test_discover_max_pages_limit(self): + """max_pages should cap discovery.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + max_pages=3, + rate_limit=0, + ) + + self.assertLessEqual(len(discovered), 3) + + # -- sync_config (full pipeline with file I/O) -- + + def test_sync_config_dry_run_detects_new_pages(self): + """Dry-run should detect pages missing from the config.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [ + f"http://127.0.0.1:{self.port}/docs/guide", + f"http://127.0.0.1:{self.port}/docs/faq", + ], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=3, rate_limit=0) + + self.assertFalse(result["applied"]) + self.assertGreater(len(result["added"]), 0, "Should detect new pages") + # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself + # should all be in added + self.assertGreaterEqual(result["total_discovered"], 6) + + # File should NOT be modified + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(len(saved["sources"][0]["start_urls"]), 2) + path.unlink() + + def test_sync_config_apply_updates_config(self): + """--apply should write all discovered URLs to the config.""" + 
config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=True, depth=3, rate_limit=0) + + self.assertTrue(result["applied"]) + + # Verify the file was updated + with open(path, encoding="utf-8") as f: + saved = json.load(f) + saved_urls = saved["sources"][0]["start_urls"] + self.assertEqual(len(saved_urls), result["total_discovered"]) + + # All expected URLs should be present + expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS} + for url in expected: + self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}") + + path.unlink() + + def test_sync_config_idempotent(self): + """Running sync twice with --apply should be a no-op the second time.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + } + ], + } + path = _write_config(config) + + # First run: should apply changes + result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0) + self.assertTrue(result1["applied"]) + self.assertGreater(len(result1["added"]), 0) + + # Second run: should detect no changes + result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0) + self.assertFalse(result2["applied"]) + self.assertEqual(result2["added"], []) + self.assertEqual(result2["removed"], []) + + path.unlink() + + def test_sync_config_detects_removed_pages(self): + """Pages in config but not discovered should show as removed.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [ + f"http://127.0.0.1:{self.port}/docs/guide", + f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists", + ], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=3, 
rate_limit=0) + + self.assertIn( + f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists", + result["removed"], + ) + path.unlink() + + def test_sync_config_preserves_other_config_fields(self): + """--apply should only modify start_urls, preserving all other fields.""" + config = { + "name": "my-skill", + "description": "Important skill description", + "version": "1.0.0", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + "selectors": {"main_content": "article", "title": "h1"}, + "url_patterns": {"include": [], "exclude": []}, + "rate_limit": 0.5, + "max_pages": 100, + }, + { + "type": "github", + "repo": "owner/repo", + }, + ], + } + path = _write_config(config) + + sync_config(str(path), apply=True, depth=3, rate_limit=0) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + + # Non-start_urls fields should be untouched + self.assertEqual(saved["name"], "my-skill") + self.assertEqual(saved["description"], "Important skill description") + self.assertEqual(saved["version"], "1.0.0") + self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article") + self.assertEqual(saved["sources"][0]["rate_limit"], 0.5) + self.assertEqual(saved["sources"][1]["type"], "github") + self.assertEqual(saved["sources"][1]["repo"], "owner/repo") + + # start_urls should be updated + self.assertGreater(len(saved["sources"][0]["start_urls"]), 0) + + path.unlink() + + def test_sync_config_with_nav_seed_urls(self): + """nav_seed_urls should be used as BFS seeds instead of start_urls.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + # Only seed from /docs/api — should only discover API pages + "nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=1, rate_limit=0) + + # Should discover at least the API seed page + 
self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages") + # All added URLs should be under /docs/ + for url in result["added"]: + self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}") + + path.unlink() + + def test_sync_config_legacy_format(self): + """Legacy flat config format should work end-to-end.""" + config = { + "name": "test-site", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"], + } + path = _write_config(config) + + result = sync_config(str(path), apply=True, depth=3, rate_limit=0) + + self.assertTrue(result["applied"]) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertGreater(len(saved["start_urls"]), 1) + + path.unlink() + + +# --------------------------------------------------------------------------- +# CLI subprocess tests +# --------------------------------------------------------------------------- + + +@pytest.mark.e2e +class TestSyncConfigCLIE2E(unittest.TestCase): + """Test the CLI entry point via subprocess.""" + + @classmethod + def setUpClass(cls): + cls.server, cls.port = _start_server() + cls.base_url = f"http://127.0.0.1:{cls.port}/docs/" + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + + def test_cli_dry_run(self): + """CLI dry-run should print diff and exit 0.""" + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + # Only one URL configured — the rest should show as "new" + "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"], + # Seed from root to discover all pages + "nav_seed_urls": [self.base_url], + } + ], + } + path = _write_config(config) + + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + str(path), + "--depth", + "3", + "--rate-limit", + "0", + ], + capture_output=True, + text=True, + timeout=30, + ) + + self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}") + 
# Should mention new pages in the output (logged to stderr) + combined = result.stderr.lower() + result.stdout.lower() + self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}") + path.unlink() + + def test_cli_apply(self): + """CLI --apply should update the config file.""" + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"], + "nav_seed_urls": [self.base_url], + } + ], + } + path = _write_config(config) + + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + str(path), + "--apply", + "--depth", + "3", + "--rate-limit", + "0", + ], + capture_output=True, + text=True, + timeout=30, + ) + + self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}") + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertGreater(len(saved["sources"][0]["start_urls"]), 0) + + path.unlink() + + def test_cli_help(self): + """CLI --help should print usage and exit 0.""" + result = subprocess.run( + [sys.executable, "-m", "skill_seekers.cli.sync_config", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + + self.assertEqual(result.returncode, 0) + self.assertIn("sync", result.stdout.lower()) + self.assertIn("--config", result.stdout) + self.assertIn("--apply", result.stdout) + self.assertIn("--depth", result.stdout) + + def test_cli_missing_config_exits_nonzero(self): + """CLI with a non-existent config should fail.""" + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + "/nonexistent/path/config.json", + ], + capture_output=True, + text=True, + timeout=10, + ) + + self.assertNotEqual(result.returncode, 0) + + +# --------------------------------------------------------------------------- +# Integration test against real public site +# 
---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+class TestSyncConfigRealSite(unittest.TestCase):
+    """Integration test against a real public docs site.
+
+    Skipped by default (use ``-m integration`` to run).
+    Uses docs.python.org — a small, stable, well-structured public docs site.
+    """
+
+    def test_discover_urls_real_http(self):
+        """discover_urls should work against a real HTTP server."""
+        # Use Python docs — small, stable, well-structured
+        discovered = discover_urls(
+            base_url="https://docs.python.org/3/library/",
+            seed_urls=["https://docs.python.org/3/library/functions.html"],
+            depth=1,
+            max_pages=10,
+            rate_limit=0.5,
+        )
+
+        # Should find at least the seed page itself
+        self.assertGreater(len(discovered), 0)
+        # All discovered URLs should be under the base
+        for url in discovered:
+            self.assertTrue(
+                url.startswith("https://docs.python.org/3/library/"),
+                f"Discovered URL outside base: {url}",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()