feat: add sync-config command to detect and update config start_urls (#306)

## Summary

Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation,
diffs discovered URLs against a config's start_urls, and optionally writes the
updated list back with --apply.

- BFS link discovery with configurable depth (default 2), max-pages, rate-limit
- Respects url_patterns.include/exclude from config
- Supports optional nav_seed_urls config field
- Handles both unified (sources array) and legacy flat config formats
- MCP tool sync_config included
- 57 tests (39 unit + 18 E2E with local HTTP server)
- Fixed CI: renamed summary job to "Tests" to match branch protection rule

Closes #306
This commit is contained in:
yusyus
2026-03-15 02:16:32 +03:00
committed by GitHub
parent 0c9504c944
commit 83b9a695ba
12 changed files with 1783 additions and 5 deletions

View File

@@ -244,6 +244,7 @@ skill-seekers-update = "skill_seekers.cli.incremental_updater:main"
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main" skill-seekers-multilang = "skill_seekers.cli.multilang_support:main"
skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" skill-seekers-quality = "skill_seekers.cli.quality_metrics:main"
skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" skill-seekers-workflows = "skill_seekers.cli.workflows_command:main"
skill-seekers-sync-config = "skill_seekers.cli.sync_config:main"
[tool.setuptools] [tool.setuptools]
package-dir = {"" = "src"} package-dir = {"" = "src"}

View File

@@ -0,0 +1,64 @@
"""Sync-config command argument definitions.
Shared between sync_config.py (standalone) and parsers/sync_config_parser.py
(unified CLI) so the two entry points never drift out of sync.
"""
import argparse
def add_sync_config_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every sync-config option on *parser*.

    Kept in one place so the standalone console script and the unified-CLI
    subcommand parser expose exactly the same flags.
    """
    # Required: which config file to diff/update.
    parser.add_argument(
        "--config", "-c",
        type=str,
        required=True,
        metavar="FILE",
        help="Path to the config JSON file to sync",
    )
    # Dry-run by default; --apply actually rewrites the file.
    parser.add_argument(
        "--apply",
        action="store_true",
        default=False,
        help="Write updated start_urls back to the config file (default: dry-run)",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=2,
        help="BFS crawl depth from seed pages (default: 2)",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=500,
        help="Maximum pages to discover (default: 500)",
    )
    # None means "use the rate_limit from the config itself".
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=None,
        help="Override config rate-limit (seconds between requests)",
    )
    parser.add_argument(
        "--source-index",
        type=int,
        default=0,
        help="Index of the documentation source to sync (default: 0)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        default=False,
        help="Verbose output",
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        default=False,
        help="Suppress informational output",
    )

View File

@@ -67,6 +67,7 @@ COMMAND_MODULES = {
"multilang": "skill_seekers.cli.multilang_support", "multilang": "skill_seekers.cli.multilang_support",
"quality": "skill_seekers.cli.quality_metrics", "quality": "skill_seekers.cli.quality_metrics",
"workflows": "skill_seekers.cli.workflows_command", "workflows": "skill_seekers.cli.workflows_command",
"sync-config": "skill_seekers.cli.sync_config",
} }

View File

@@ -30,6 +30,7 @@ from .update_parser import UpdateParser
from .multilang_parser import MultilangParser from .multilang_parser import MultilangParser
from .quality_parser import QualityParser from .quality_parser import QualityParser
from .workflows_parser import WorkflowsParser from .workflows_parser import WorkflowsParser
from .sync_config_parser import SyncConfigParser
# Registry of all parsers (in order of usage frequency) # Registry of all parsers (in order of usage frequency)
PARSERS = [ PARSERS = [
@@ -56,6 +57,7 @@ PARSERS = [
MultilangParser(), MultilangParser(),
QualityParser(), QualityParser(),
WorkflowsParser(), WorkflowsParser(),
SyncConfigParser(),
] ]

View File

@@ -0,0 +1,30 @@
"""Parser for the sync-config subcommand."""
import argparse
from .base import SubcommandParser
class SyncConfigParser(SubcommandParser):
    """Wires the ``sync-config`` subcommand into the unified CLI."""

    # Long-form text shown in the subcommand's --help output.
    _DESCRIPTION = (
        "Crawl navigation links from a docs site, compare them against "
        "the config's start_urls, and optionally write the updated list "
        "back with --apply."
    )

    @property
    def name(self) -> str:
        return "sync-config"

    @property
    def help(self) -> str:
        return "Diff/update a config's start_urls against the live docs site"

    @property
    def description(self) -> str:
        return self._DESCRIPTION

    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
        # Shared with the standalone entry point so the two never drift.
        from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

        add_sync_config_arguments(parser)

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""Sync a config file's start_urls against what's currently live on a docs site.
Crawls navigation links from seed pages, diffs them against the config's
``start_urls``, and optionally writes the updated list back.
Usage:
skill-seekers sync-config --config configs/claude-code.json
skill-seekers sync-config --config configs/claude-code.json --apply
"""
import argparse
import json
import logging
import sys
import time
from collections import deque
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from skill_seekers.cli.utils import sanitize_url, setup_logging
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# URL filtering (mirrors DocToSkillConverter.is_valid_url logic)
# ---------------------------------------------------------------------------
def _is_valid_url(
url: str,
base_url: str,
include_patterns: list[str],
exclude_patterns: list[str],
) -> bool:
"""Return True if *url* passes include/exclude pattern filters."""
if not url.startswith(base_url):
return False
if include_patterns and not any(p in url for p in include_patterns):
return False
return not any(p in url for p in exclude_patterns)
# ---------------------------------------------------------------------------
# Lightweight BFS link discovery
# ---------------------------------------------------------------------------
def discover_urls(
    base_url: str,
    seed_urls: list[str],
    include_patterns: list[str] | None = None,
    exclude_patterns: list[str] | None = None,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float = 0.5,
) -> set[str]:
    """BFS-crawl *seed_urls* and return every discovered internal URL.

    Only ``<a href>`` links are followed; page content is not stored.
    Filtering matches :class:`DocToSkillConverter` (same include/exclude
    substring semantics).

    Args:
        base_url: Only URLs under this prefix are accepted.
        seed_urls: Starting points for the BFS.
        include_patterns: Substring patterns a URL must contain (any).
        exclude_patterns: Substring patterns that disqualify a URL.
        depth: Maximum number of BFS hops from the seed pages.
        max_pages: Stop after discovering this many unique URLs.
        rate_limit: Seconds to wait between HTTP requests.

    Returns:
        Set of discovered absolute URLs (fragments stripped).
    """
    inc = include_patterns or []
    exc = exclude_patterns or []
    seen: set[str] = set()
    found: set[str] = set()
    # Each pending entry is (url, hops-from-seed).
    pending: deque[tuple[str, int]] = deque(
        (sanitize_url(seed), 0) for seed in seed_urls
    )
    while pending and len(found) < max_pages:
        page_url, hops = pending.popleft()
        if page_url in seen:
            continue
        seen.add(page_url)
        if not _is_valid_url(page_url, base_url, inc, exc):
            continue
        logger.debug(" [depth %d] %s", hops, page_url)
        try:
            response = requests.get(
                page_url,
                headers={"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"},
                timeout=15,
            )
            response.raise_for_status()
        except Exception as err:
            logger.warning(" Could not fetch %s: %s", page_url, err)
            continue
        # A page counts as "discovered" only after a successful fetch —
        # 404s and other errors mean it is gone from the live site.
        found.add(page_url)
        # Expand links unless this page sits at the depth limit.
        if hops < depth:
            soup = BeautifulSoup(response.content, "html.parser")
            for anchor in soup.find_all("a", href=True):
                target = urljoin(page_url, anchor["href"])
                target = sanitize_url(target.split("#")[0])  # strip fragment
                if target not in seen and _is_valid_url(target, base_url, inc, exc):
                    pending.append((target, hops + 1))
        if rate_limit > 0:
            time.sleep(rate_limit)
    return found
# ---------------------------------------------------------------------------
# Diff logic
# ---------------------------------------------------------------------------
def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]:
    """Diff the live URL set against the configured list.

    Returns:
        ``(added, removed)`` — sorted lists of URLs that are live but not
        configured, and configured but no longer live, respectively.
    """
    known = set(configured)
    return sorted(discovered - known), sorted(known - discovered)
# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------
def _get_doc_source(config: dict, source_index: int = 0) -> dict | None:
"""Extract the documentation source dict from *config*.
Handles both the unified format (``sources`` array) and legacy flat
format (fields at the top level).
"""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
return doc_sources[source_index]
return None
# Legacy flat format — treat the whole config as a single source
if config.get("base_url"):
return config
return None
def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None:
"""Write *urls* into the correct ``start_urls`` field in *config*."""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
doc_sources[source_index]["start_urls"] = urls
return
# Legacy flat format
config["start_urls"] = urls
# ---------------------------------------------------------------------------
# Main orchestrator
# ---------------------------------------------------------------------------
def sync_config(
    config_path: str,
    apply: bool = False,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float | None = None,
    source_index: int = 0,
) -> dict:
    """Run the sync-config workflow.

    Loads the config, BFS-discovers the live URL set, diffs it against the
    configured ``start_urls``, logs a report, and (with ``apply=True``)
    writes the sorted discovered list back to *config_path*.

    Args:
        config_path: Path to the config JSON file.
        apply: Write updated start_urls back (default: dry-run).
        depth: BFS crawl depth from the seed pages.
        max_pages: Maximum unique URLs to discover.
        rate_limit: Seconds between requests; ``None`` falls back to the
            config's own ``rate_limit`` (default 0.5).
        source_index: Index of the documentation source to sync.

    Returns:
        Dict with keys ``added``, ``removed``, ``total_discovered``,
        ``total_configured``, ``applied`` (plus ``error`` when no
        documentation source could be found).
    """
    # Load config
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)
    source = _get_doc_source(config, source_index)
    if source is None:
        logger.error("No documentation source found at index %d in %s", source_index, config_path)
        return {
            "added": [],
            "removed": [],
            "total_discovered": 0,
            "total_configured": 0,
            "applied": False,
            "error": "No documentation source found",
        }
    base_url: str = source["base_url"]
    configured_urls: list[str] = source.get("start_urls") or []
    # Seed preference: explicit nav_seed_urls > existing start_urls > base URL.
    seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url]
    url_patterns = source.get("url_patterns", {})
    includes: list[str] = url_patterns.get("include", [])
    excludes: list[str] = url_patterns.get("exclude", [])
    # The CLI flag wins over the config's own rate_limit.
    effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5)

    def _display_path(url: str) -> str:
        # Render a URL relative to base_url for the report. Strip only the
        # leading prefix: the previous str.replace() also mangled URLs that
        # happened to contain base_url again later in the string.
        return "/" + url.removeprefix(base_url) if url.startswith(base_url) else url

    logger.info("Syncing config: %s", config_path)
    logger.info(" Base URL: %s", base_url)
    logger.info(" Seed URLs: %d", len(seed_urls))
    logger.info(" Configured: %d start_urls", len(configured_urls))
    logger.info(" Depth: %d", depth)
    logger.info(" Rate limit: %.1fs", effective_rate)
    logger.info("")
    # Discover
    discovered = discover_urls(
        base_url=base_url,
        seed_urls=seed_urls,
        include_patterns=includes,
        exclude_patterns=excludes,
        depth=depth,
        max_pages=max_pages,
        rate_limit=effective_rate,
    )
    # Diff
    added, removed = diff_urls(discovered, configured_urls)
    # Report
    if added:
        logger.info("New pages (%d):", len(added))
        for url in added:
            logger.info(" + %s", _display_path(url))
    if removed:
        logger.info("Removed pages (%d):", len(removed))
        for url in removed:
            logger.info(" - %s", _display_path(url))
    if not added and not removed:
        logger.info("Config is up to date. No changes detected.")
    else:
        logger.info("")
        logger.info(
            "Summary: %d new, %d removed (discovered %d total, configured %d)",
            len(added),
            len(removed),
            len(discovered),
            len(configured_urls),
        )
    applied = False
    if apply and (added or removed):
        # Replace start_urls wholesale with the sorted live set.
        new_urls = sorted(discovered)
        _set_start_urls(config, source_index, new_urls)
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)
            f.write("\n")  # keep a trailing newline, matching common formatters
        logger.info("Updated %s (%d start_urls)", config_path, len(new_urls))
        applied = True
    elif added or removed:
        logger.info("Run with --apply to update %s", config_path)
    return {
        "added": added,
        "removed": removed,
        "total_discovered": len(discovered),
        "total_configured": len(configured_urls),
        "applied": applied,
    }
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Entry point for the ``skill-seekers-sync-config`` console script."""
    from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

    parser = argparse.ArgumentParser(
        prog="skill-seekers-sync-config",
        description="Sync a config's start_urls against what's live on the docs site.",
    )
    add_sync_config_arguments(parser)
    opts = parser.parse_args()
    setup_logging(verbose=opts.verbose, quiet=opts.quiet)
    outcome = sync_config(
        config_path=opts.config,
        apply=opts.apply,
        depth=opts.depth,
        max_pages=opts.max_pages,
        rate_limit=opts.rate_limit,
        source_index=opts.source_index,
    )
    # Non-zero exit so CI/scripts can detect a failed sync.
    if outcome.get("error"):
        sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -103,6 +103,8 @@ try:
# Splitting tools # Splitting tools
split_config_impl, split_config_impl,
submit_config_impl, submit_config_impl,
# Sync config tools
sync_config_impl,
upload_skill_impl, upload_skill_impl,
validate_config_impl, validate_config_impl,
# Workflow tools # Workflow tools
@@ -144,6 +146,7 @@ except ImportError:
scrape_video_impl, scrape_video_impl,
split_config_impl, split_config_impl,
submit_config_impl, submit_config_impl,
sync_config_impl,
upload_skill_impl, upload_skill_impl,
validate_config_impl, validate_config_impl,
list_workflows_impl, list_workflows_impl,
@@ -251,6 +254,52 @@ async def validate_config(config_path: str) -> str:
return str(result) return str(result)
# ============================================================================
# SYNC CONFIG TOOLS (1 tool)
# ============================================================================
@safe_tool_decorator(description="Sync a config's start_urls against what's live on the docs site.")
async def sync_config(
    config_path: str,
    apply: bool = False,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float | None = None,
    source_index: int = 0,
) -> str:
    """
    Sync a config file's start_urls against the live docs site.

    Crawls seed/nav pages, discovers internal links, and diffs against the
    config's existing start_urls. Optionally writes the update with apply=True.

    Args:
        config_path: Path to the config JSON file.
        apply: Write changes back to the config file (default: False).
        depth: BFS crawl depth from seed pages (default: 2).
        max_pages: Maximum URLs to discover (default: 500).
        rate_limit: Override config rate limit (seconds between requests).
        source_index: Index of the documentation source to sync (default: 0).

    Returns:
        Report of added/removed URLs.
    """
    # Delegate to the shared implementation; keys mirror the CLI flags.
    payload = {
        "config_path": config_path,
        "apply": apply,
        "depth": depth,
        "max_pages": max_pages,
        "rate_limit": rate_limit,
        "source_index": source_index,
    }
    result = await sync_config_impl(payload)
    # The impl returns a list of TextContent; unwrap the first entry.
    if isinstance(result, list) and result:
        first = result[0]
        return first.text if hasattr(first, "text") else str(first)
    return str(result)
# ============================================================================ # ============================================================================
# SCRAPING TOOLS (10 tools) # SCRAPING TOOLS (10 tools)
# ============================================================================ # ============================================================================

View File

@@ -99,6 +99,9 @@ from .vector_db_tools import (
from .vector_db_tools import ( from .vector_db_tools import (
export_to_weaviate_impl, export_to_weaviate_impl,
) )
from .sync_config_tools import (
sync_config_tool as sync_config_impl,
)
from .workflow_tools import ( from .workflow_tools import (
create_workflow_tool as create_workflow_impl, create_workflow_tool as create_workflow_impl,
) )
@@ -151,6 +154,8 @@ __all__ = [
"export_to_chroma_impl", "export_to_chroma_impl",
"export_to_faiss_impl", "export_to_faiss_impl",
"export_to_qdrant_impl", "export_to_qdrant_impl",
# Sync config tools
"sync_config_impl",
# Workflow tools # Workflow tools
"list_workflows_impl", "list_workflows_impl",
"get_workflow_impl", "get_workflow_impl",

View File

@@ -0,0 +1,85 @@
"""Sync-config MCP tool for Skill Seekers MCP Server.
Provides the ``sync_config`` tool that diffs a config's start_urls against
the live docs site and optionally applies the update.
"""
# MCP is an optional dependency: fall back to a minimal stand-in so this
# module stays importable (and unit-testable) without it installed.
try:
    from mcp.types import TextContent
except ImportError:

    class TextContent:
        """Fallback TextContent for when MCP is not installed."""

        def __init__(self, type: str, text: str):
            # *type* deliberately shadows the builtin to match the
            # keyword name used by mcp.types.TextContent.
            self.type = type
            self.text = text
async def sync_config_tool(args: dict) -> list[TextContent]:
    """Sync a config file's start_urls against what's live on the docs site.

    Crawls seed/nav pages, discovers internal links, diffs against the
    config's existing ``start_urls``, and optionally writes the update.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to the config JSON file.
            - apply (bool, optional): Write changes back (default: False).
            - depth (int, optional): BFS crawl depth (default: 2).
            - max_pages (int, optional): Max URLs to discover (default: 500).
            - rate_limit (float, optional): Seconds between requests.
            - source_index (int, optional): Documentation source index (default: 0).

    Returns:
        List[TextContent]: Report of added/removed URLs, or error message.
    """
    config_path = args.get("config_path", "")
    if not config_path:
        return [TextContent(type="text", text="Error: config_path is required")]
    try:
        # Imported lazily so the MCP server can start without the CLI deps.
        from skill_seekers.cli.sync_config import sync_config

        result = sync_config(
            config_path=config_path,
            apply=args.get("apply", False),
            depth=args.get("depth", 2),
            max_pages=args.get("max_pages", 500),
            rate_limit=args.get("rate_limit"),
            source_index=args.get("source_index", 0),
        )
    except FileNotFoundError:
        return [TextContent(type="text", text=f"Error: Config file not found: {config_path}")]
    except Exception as e:
        return [TextContent(type="text", text=f"Error syncing config: {e}")]
    if result.get("error"):
        return [TextContent(type="text", text=f"Error: {result['error']}")]
    lines = []
    added = result["added"]
    removed = result["removed"]
    if added:
        lines.append(f"New pages ({len(added)}):")
        for url in added:
            lines.append(f" + {url}")
    if removed:
        lines.append(f"Removed pages ({len(removed)}):")
        for url in removed:
            lines.append(f" - {url}")
    if not added and not removed:
        lines.append("Config is up to date. No changes detected.")
    else:
        lines.append(
            f"\nSummary: {len(added)} new, {len(removed)} removed "
            f"(discovered {result['total_discovered']}, "
            f"configured {result['total_configured']})"
        )
    if result["applied"]:
        lines.append(f"Updated {config_path}")
    elif added or removed:
        # Only suggest apply=true when there is actually something to change;
        # previously an up-to-date config still got this hint (the CLI
        # version already guards this with `elif added or removed`).
        lines.append(f"Run with apply=true to update {config_path}")
    return [TextContent(type="text", text="\n".join(lines))]

View File

@@ -24,12 +24,12 @@ class TestParserRegistry:
def test_all_parsers_registered(self): def test_all_parsers_registered(self):
"""Test that all parsers are registered.""" """Test that all parsers are registered."""
assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}" assert len(PARSERS) == 24, f"Expected 24 parsers, got {len(PARSERS)}"
def test_get_parser_names(self): def test_get_parser_names(self):
"""Test getting list of parser names.""" """Test getting list of parser names."""
names = get_parser_names() names = get_parser_names()
assert len(names) == 23 assert len(names) == 24
assert "scrape" in names assert "scrape" in names
assert "github" in names assert "github" in names
assert "package" in names assert "package" in names
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
assert cmd in names, f"Command '{cmd}' not found in parser registry!" assert cmd in names, f"Command '{cmd}' not found in parser registry!"
def test_command_count_matches(self): def test_command_count_matches(self):
"""Test that we have exactly 23 commands (includes create, workflows, word, and video commands).""" """Test that we have exactly 24 commands (includes create, workflows, word, video, and sync-config commands)."""
assert len(PARSERS) == 23 assert len(PARSERS) == 24
assert len(get_parser_names()) == 23 assert len(get_parser_names()) == 24
if __name__ == "__main__": if __name__ == "__main__":

590
tests/test_sync_config.py Normal file
View File

@@ -0,0 +1,590 @@
#!/usr/bin/env python3
"""Tests for the sync-config command.
Covers:
- URL diffing logic
- URL filtering (_is_valid_url)
- BFS discovery with mocked HTTP responses
- Config loading (unified + legacy formats)
- --apply writes correct JSON
- CLI argument parsing
- MCP tool wrapper
"""
import json
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from skill_seekers.cli.sync_config import (
_get_doc_source,
_is_valid_url,
_set_start_urls,
diff_urls,
discover_urls,
sync_config,
)
# ---------------------------------------------------------------------------
# diff_urls
# ---------------------------------------------------------------------------
class TestDiffUrls(unittest.TestCase):
    """Exercise the URL diffing logic."""

    def test_no_changes(self):
        existing = ["https://example.com/a", "https://example.com/b"]
        added, removed = diff_urls(set(existing), existing)
        self.assertEqual((added, removed), ([], []))

    def test_added_urls(self):
        added, removed = diff_urls(
            {"https://example.com/a", "https://example.com/b"},
            ["https://example.com/a"],
        )
        self.assertEqual(added, ["https://example.com/b"])
        self.assertEqual(removed, [])

    def test_removed_urls(self):
        added, removed = diff_urls(
            {"https://example.com/a"},
            ["https://example.com/a", "https://example.com/b"],
        )
        self.assertEqual(added, [])
        self.assertEqual(removed, ["https://example.com/b"])

    def test_both_added_and_removed(self):
        added, removed = diff_urls(
            {"https://example.com/a", "https://example.com/c"},
            ["https://example.com/a", "https://example.com/b"],
        )
        self.assertEqual(added, ["https://example.com/c"])
        self.assertEqual(removed, ["https://example.com/b"])

    def test_empty_configured(self):
        self.assertEqual(
            diff_urls({"https://example.com/a"}, []),
            (["https://example.com/a"], []),
        )

    def test_empty_discovered(self):
        self.assertEqual(
            diff_urls(set(), ["https://example.com/a"]),
            ([], ["https://example.com/a"]),
        )

    def test_results_sorted(self):
        # Sets are unordered — the diff must come back sorted.
        added, _ = diff_urls(
            {"https://example.com/b", "https://example.com/a"},
            ["https://example.com/z"],
        )
        self.assertEqual(added, ["https://example.com/a", "https://example.com/b"])
# ---------------------------------------------------------------------------
# _is_valid_url
# ---------------------------------------------------------------------------
class TestIsValidUrl(unittest.TestCase):
    """Exercise the URL filtering logic."""

    BASE = "https://docs.example.com/"

    def _valid(self, url, include=(), exclude=()):
        # Thin wrapper: all cases share the same base URL.
        return _is_valid_url(url, self.BASE, list(include), list(exclude))

    def test_url_under_base(self):
        self.assertTrue(self._valid("https://docs.example.com/guide"))

    def test_url_not_under_base(self):
        self.assertFalse(self._valid("https://other.com/guide"))

    def test_include_pattern_match(self):
        self.assertTrue(
            self._valid("https://docs.example.com/docs/en/guide", include=["/docs/en/"])
        )

    def test_include_pattern_no_match(self):
        self.assertFalse(
            self._valid("https://docs.example.com/blog/post", include=["/docs/en/"])
        )

    def test_exclude_pattern(self):
        self.assertFalse(
            self._valid("https://docs.example.com/docs/en/changelog", exclude=["/changelog"])
        )

    def test_include_and_exclude(self):
        # Matches include but also matches exclude -> rejected
        self.assertFalse(
            self._valid(
                "https://docs.example.com/docs/en/changelog",
                include=["/docs/en/"],
                exclude=["/changelog"],
            )
        )

    def test_no_patterns_all_valid(self):
        self.assertTrue(self._valid("https://docs.example.com/anything"))
# ---------------------------------------------------------------------------
# _get_doc_source / _set_start_urls
# ---------------------------------------------------------------------------
class TestConfigHelpers(unittest.TestCase):
    """Config extraction/update for both unified and legacy formats."""

    def test_unified_format(self):
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://docs.example.com/"},
                {"type": "github", "repo": "owner/repo"},
            ],
        }
        found = _get_doc_source(cfg)
        self.assertIsNotNone(found)
        self.assertEqual(found["base_url"], "https://docs.example.com/")

    def test_unified_format_second_source(self):
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://first.com/"},
                {"type": "documentation", "base_url": "https://second.com/"},
            ],
        }
        found = _get_doc_source(cfg, source_index=1)
        self.assertEqual(found["base_url"], "https://second.com/")

    def test_unified_format_invalid_index(self):
        # Only a github source present: no documentation source to return.
        cfg = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
        self.assertIsNone(_get_doc_source(cfg))

    def test_legacy_flat_format(self):
        found = _get_doc_source({"name": "test", "base_url": "https://docs.example.com/"})
        self.assertEqual(found["base_url"], "https://docs.example.com/")

    def test_no_source_found(self):
        self.assertIsNone(_get_doc_source({"name": "test"}))

    def test_set_start_urls_unified(self):
        cfg = {
            "sources": [
                {"type": "documentation", "base_url": "https://x.com/", "start_urls": []},
            ]
        }
        _set_start_urls(cfg, 0, ["https://x.com/a", "https://x.com/b"])
        self.assertEqual(
            cfg["sources"][0]["start_urls"],
            ["https://x.com/a", "https://x.com/b"],
        )

    def test_set_start_urls_legacy(self):
        cfg = {"base_url": "https://x.com/", "start_urls": []}
        _set_start_urls(cfg, 0, ["https://x.com/new"])
        self.assertEqual(cfg["start_urls"], ["https://x.com/new"])
# ---------------------------------------------------------------------------
# discover_urls (with mocked HTTP)
# ---------------------------------------------------------------------------
class TestDiscoverUrls(unittest.TestCase):
    """Test BFS link discovery with mocked HTTP responses."""

    def _make_html(self, links: list[str]) -> str:
        # Minimal HTML page whose only content is one anchor per link.
        hrefs = "".join(f'<a href="{u}">link</a>' for u in links)
        return f"<html><body>{hrefs}</body></html>"

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_basic_discovery(self, mock_get):
        """Discover links from a single seed page."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/page-a",
                "https://docs.example.com/page-b",
                "https://other.com/external",  # should be filtered out
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/", result)
        self.assertIn("https://docs.example.com/page-a", result)
        self.assertIn("https://docs.example.com/page-b", result)
        self.assertNotIn("https://other.com/external", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_depth_limiting(self, mock_get):
        """URLs at depth > limit should be discovered but not followed."""
        # Seed returns one link; the side_effect order matters: first call
        # fetches the seed, the second fetches the child page.
        seed_html = self._make_html(["https://docs.example.com/child"])
        child_html = self._make_html(["https://docs.example.com/grandchild"])
        mock_get.side_effect = [
            MagicMock(content=seed_html.encode(), raise_for_status=MagicMock()),
            MagicMock(content=child_html.encode(), raise_for_status=MagicMock()),
        ]
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,  # Only follow seed page links, not child page links
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/child", result)
        # grandchild is at depth 2, which exceeds depth=1
        self.assertNotIn("https://docs.example.com/grandchild", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_max_pages_limit(self, mock_get):
        """Stop after max_pages."""
        links = [f"https://docs.example.com/page-{i}" for i in range(20)]
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(links).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            max_pages=5,
            rate_limit=0,
        )
        self.assertLessEqual(len(result), 5)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_include_exclude_patterns(self, mock_get):
        """Include/exclude patterns are respected."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/docs/en/guide",
                "https://docs.example.com/docs/fr/guide",
                "https://docs.example.com/blog/post",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/docs/en/overview"],
            include_patterns=["/docs/en/"],
            exclude_patterns=["/blog/"],
            depth=1,
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/docs/en/guide", result)
        self.assertNotIn("https://docs.example.com/docs/fr/guide", result)
        self.assertNotIn("https://docs.example.com/blog/post", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_http_error_handled_gracefully(self, mock_get):
        """HTTP errors should not crash the discovery."""
        mock_get.side_effect = ConnectionError("Network error")
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        # URLs that fail to fetch are NOT added to discovered (they may
        # have been removed from the live site).
        self.assertEqual(result, set())

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_fragments_stripped(self, mock_get):
        """URL fragments (#anchor) should be stripped."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/guide#section1",
                "https://docs.example.com/guide#section2",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        # Both anchors should resolve to the same URL
        self.assertIn("https://docs.example.com/guide", result)
# ---------------------------------------------------------------------------
# sync_config (integration with file I/O)
# ---------------------------------------------------------------------------
class TestSyncConfigIntegration(unittest.TestCase):
"""Test the full sync_config workflow with mocked HTTP."""
def _write_config(self, config: dict) -> Path:
tmp = tempfile.mktemp(suffix=".json") # noqa: SIM115
with open(tmp, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
return Path(tmp)
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_dry_run_does_not_modify_file(self, mock_discover):
mock_discover.return_value = {
"https://docs.example.com/a",
"https://docs.example.com/b",
"https://docs.example.com/c",
}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a"],
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=False)
self.assertFalse(result["applied"])
self.assertEqual(len(result["added"]), 2)
# File should not be modified
with open(path, encoding="utf-8") as f:
saved = json.load(f)
self.assertEqual(len(saved["sources"][0]["start_urls"]), 1)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_apply_writes_updated_urls(self, mock_discover):
mock_discover.return_value = {
"https://docs.example.com/a",
"https://docs.example.com/b",
}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertTrue(result["applied"])
self.assertEqual(result["added"], ["https://docs.example.com/b"])
self.assertEqual(result["removed"], ["https://docs.example.com/old"])
# File should be updated
with open(path, encoding="utf-8") as f:
saved = json.load(f)
urls = saved["sources"][0]["start_urls"]
self.assertIn("https://docs.example.com/a", urls)
self.assertIn("https://docs.example.com/b", urls)
self.assertNotIn("https://docs.example.com/old", urls)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_no_changes_does_not_write(self, mock_discover):
urls = ["https://docs.example.com/a", "https://docs.example.com/b"]
mock_discover.return_value = set(urls)
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": urls,
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertFalse(result["applied"])
self.assertEqual(result["added"], [])
self.assertEqual(result["removed"], [])
path.unlink()
def test_missing_source_returns_error(self):
config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
path = self._write_config(config)
result = sync_config(str(path))
self.assertIn("error", result)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_legacy_config_format(self, mock_discover):
mock_discover.return_value = {"https://docs.example.com/a"}
config = {
"name": "test",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertTrue(result["applied"])
self.assertEqual(result["removed"], ["https://docs.example.com/old"])
with open(path, encoding="utf-8") as f:
saved = json.load(f)
self.assertEqual(saved["start_urls"], ["https://docs.example.com/a"])
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_nav_seed_urls_used_over_start_urls(self, mock_discover):
"""When nav_seed_urls is present, it should be used as the seed."""
mock_discover.return_value = {"https://docs.example.com/a"}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a"],
"nav_seed_urls": [
"https://docs.example.com/nav1",
"https://docs.example.com/nav2",
],
}
],
}
path = self._write_config(config)
sync_config(str(path))
# Verify discover_urls was called with nav_seed_urls
call_kwargs = mock_discover.call_args[1]
self.assertEqual(
call_kwargs["seed_urls"],
["https://docs.example.com/nav1", "https://docs.example.com/nav2"],
)
path.unlink()
# ---------------------------------------------------------------------------
# CLI argument parsing
# ---------------------------------------------------------------------------
class TestSyncConfigCLI(unittest.TestCase):
    """Test CLI argument parsing and subcommand registration."""

    @staticmethod
    def _parse_args(argv):
        """Build a fresh parser with the sync-config flags and parse *argv*."""
        import argparse

        from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

        parser = argparse.ArgumentParser()
        add_sync_config_arguments(parser)
        return parser.parse_args(argv)

    def test_sync_config_parser_registered(self):
        """sync-config should be a registered subcommand."""
        from skill_seekers.cli.parsers import get_parser_names

        self.assertIn("sync-config", get_parser_names())

    def test_sync_config_in_command_modules(self):
        """sync-config should be in COMMAND_MODULES."""
        from skill_seekers.cli.main import COMMAND_MODULES

        self.assertIn("sync-config", COMMAND_MODULES)

    def test_arguments_created(self):
        """Argument parser should accept all expected flags."""
        args = self._parse_args(["--config", "test.json", "--apply", "--depth", "3"])
        self.assertEqual(args.config, "test.json")
        self.assertTrue(args.apply)
        self.assertEqual(args.depth, 3)

    def test_default_values(self):
        """Default values should be sensible."""
        args = self._parse_args(["--config", "test.json"])
        self.assertFalse(args.apply)
        self.assertEqual(args.depth, 2)
        self.assertEqual(args.max_pages, 500)
        self.assertIsNone(args.rate_limit)
        self.assertEqual(args.source_index, 0)
# ---------------------------------------------------------------------------
# MCP tool
# ---------------------------------------------------------------------------
class TestSyncConfigMCPTool(unittest.TestCase):
    """Test MCP tool wrapper."""

    def test_mcp_tool_importable(self):
        """The sync_config MCP tool should be importable."""
        from skill_seekers.mcp.tools import sync_config_impl

        self.assertTrue(callable(sync_config_impl))

    def test_mcp_tool_missing_config_path(self):
        """Missing config_path should return an error."""
        import asyncio

        from skill_seekers.mcp.tools.sync_config_tools import sync_config_tool

        responses = asyncio.run(sync_config_tool({}))
        # At least one returned content item must carry an error message.
        self.assertTrue(any("Error" in item.text for item in responses))
# Support running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,626 @@
#!/usr/bin/env python3
"""End-to-end tests for the sync-config command.
Uses a local HTTP server with realistic multi-page HTML navigation to test
the full pipeline: BFS crawl -> link discovery -> diff -> config update.
Also includes an integration test against a real public docs site.
"""
import json
import subprocess
import sys
import tempfile
import threading
import unittest
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
import pytest
from skill_seekers.cli.sync_config import discover_urls, sync_config
# ---------------------------------------------------------------------------
# Local test HTTP server
# ---------------------------------------------------------------------------
# Simulates a docs site with this navigation structure:
#
# /docs/ (index — links to guide, api, faq)
# /docs/guide (links to guide/install, guide/usage)
# /docs/guide/install (leaf page)
# /docs/guide/usage (leaf page, links back to guide)
# /docs/api (links to api/auth, api/users)
# /docs/api/auth (leaf page)
# /docs/api/users (leaf page)
# /docs/faq (leaf page)
# /blog/post-1 (outside /docs/ — should be excluded)
# Path -> HTML body. _TestHandler serves these verbatim; any other path 404s.
_SITE_PAGES = {
    "/docs/": """<!DOCTYPE html><html><head><title>Docs Home</title></head><body>
<h1>Documentation</h1>
<nav>
<a href="/docs/guide">Guide</a>
<a href="/docs/api">API Reference</a>
<a href="/docs/faq">FAQ</a>
<a href="/blog/post-1">Blog</a>
<a href="https://github.com/example/repo">GitHub</a>
</nav>
</body></html>""",
    "/docs/guide": """<!DOCTYPE html><html><body>
<h1>Guide</h1>
<a href="/docs/guide/install">Installation</a>
<a href="/docs/guide/usage">Usage</a>
<a href="/docs/">Back to docs</a>
</body></html>""",
    "/docs/guide/install": """<!DOCTYPE html><html><body>
<h1>Installation</h1><p>pip install example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/guide/usage": """<!DOCTYPE html><html><body>
<h1>Usage</h1><p>import example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/api": """<!DOCTYPE html><html><body>
<h1>API Reference</h1>
<a href="/docs/api/auth">Authentication</a>
<a href="/docs/api/users">Users</a>
</body></html>""",
    "/docs/api/auth": """<!DOCTYPE html><html><body>
<h1>Authentication</h1><p>Use tokens.</p>
</body></html>""",
    "/docs/api/users": """<!DOCTYPE html><html><body>
<h1>Users API</h1><p>CRUD operations.</p>
</body></html>""",
    "/docs/faq": """<!DOCTYPE html><html><body>
<h1>FAQ</h1><p>Common questions.</p>
</body></html>""",
    "/blog/post-1": """<!DOCTYPE html><html><body>
<h1>Blog Post</h1><p>This is a blog post outside /docs/.</p>
</body></html>""",
}
# All docs pages that should be discovered (excluding /blog/)
# (paths only — tests prepend the server's dynamically-assigned host:port).
_ALL_DOC_URLS_PATHS = {
    "/docs/",
    "/docs/guide",
    "/docs/guide/install",
    "/docs/guide/usage",
    "/docs/api",
    "/docs/api/auth",
    "/docs/api/users",
    "/docs/faq",
}
class _TestHandler(SimpleHTTPRequestHandler):
    """Serve pages from the in-memory _SITE_PAGES dict."""

    def do_GET(self):
        # Drop query string and fragment before looking up the page.
        clean_path = self.path.partition("?")[0].partition("#")[0]
        page = _SITE_PAGES.get(clean_path)
        if page is None:
            self.send_error(404)
        else:
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.end_headers()
            self.wfile.write(page.encode("utf-8"))

    def log_message(self, format, *args):  # noqa: ARG002
        # Keep test output quiet.
        pass
def _start_server() -> tuple[HTTPServer, int]:
    """Start a local HTTP server on a random port. Returns (server, port)."""
    # Port 0 asks the OS for any free port; the real port is read back
    # from server_address after binding.
    httpd = HTTPServer(("127.0.0.1", 0), _TestHandler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd, httpd.server_address[1]
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _write_config(config: dict) -> Path:
"""Write a config dict to a temp JSON file and return its path."""
tmp = tempfile.mktemp(suffix=".json")
with open(tmp, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
return Path(tmp)
# ---------------------------------------------------------------------------
# E2E tests using local HTTP server
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigE2E(unittest.TestCase):
    """End-to-end tests using a local HTTP server with realistic HTML.

    One read-only server is started for the whole class, so tests cannot
    interfere with one another. ``rate_limit=0`` throughout keeps the
    crawls instant.
    """
    @classmethod
    def setUpClass(cls):
        # _start_server binds port 0, so the OS assigns a free port.
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"
    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()
    # -- discover_urls --
    def test_discover_finds_all_doc_pages(self):
        """BFS should discover all 8 /docs/ pages from the root."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        # Exact-set comparison: nothing missing, nothing extra.
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        self.assertEqual(discovered, expected)
    def test_discover_excludes_blog(self):
        """Pages outside /docs/ base_url should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        # /blog/post-1 is linked from /docs/ but lies outside base_url.
        blog_url = f"http://127.0.0.1:{self.port}/blog/post-1"
        self.assertNotIn(blog_url, discovered)
    def test_discover_excludes_external(self):
        """External URLs (github.com) should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        self.assertFalse(
            any("github.com" in u for u in discovered),
            "External URLs should not be discovered",
        )
    def test_discover_depth_1_finds_direct_links_only(self):
        """Depth 1 from root should find guide, api, faq but NOT nested pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=1,
            rate_limit=0,
        )
        # Direct children of /docs/
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered)
        # Nested pages should NOT be present (they're at depth 2)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered)
    def test_discover_with_include_pattern(self):
        """Include pattern should filter results."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            include_patterns=["/api"],
            depth=3,
            rate_limit=0,
        )
        # Only /api/ pages should be discovered
        for url in discovered:
            self.assertIn("/api", url, f"URL {url} does not match include pattern /api")
    def test_discover_with_exclude_pattern(self):
        """Exclude pattern should remove matching pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            exclude_patterns=["/faq"],
            depth=3,
            rate_limit=0,
        )
        faq_url = f"http://127.0.0.1:{self.port}/docs/faq"
        self.assertNotIn(faq_url, discovered)
        # Other pages should still be found
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
    def test_discover_max_pages_limit(self):
        """max_pages should cap discovery."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            max_pages=3,
            rate_limit=0,
        )
        # <= rather than ==: BFS order determines which 3 pages are kept.
        self.assertLessEqual(len(discovered), 3)
    # -- sync_config (full pipeline with file I/O) --
    def test_sync_config_dry_run_detects_new_pages(self):
        """Dry-run should detect pages missing from the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/faq",
                    ],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertFalse(result["applied"])
        self.assertGreater(len(result["added"]), 0, "Should detect new pages")
        # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself
        # should all be in added
        self.assertGreaterEqual(result["total_discovered"], 6)
        # File should NOT be modified
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertEqual(len(saved["sources"][0]["start_urls"]), 2)
        path.unlink()
    def test_sync_config_apply_updates_config(self):
        """--apply should write all discovered URLs to the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        # Verify the file was updated
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        saved_urls = saved["sources"][0]["start_urls"]
        self.assertEqual(len(saved_urls), result["total_discovered"])
        # All expected URLs should be present
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        for url in expected:
            self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}")
        path.unlink()
    def test_sync_config_idempotent(self):
        """Running sync twice with --apply should be a no-op the second time."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                }
            ],
        }
        path = _write_config(config)
        # First run: should apply changes
        result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result1["applied"])
        self.assertGreater(len(result1["added"]), 0)
        # Second run: should detect no changes
        result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertFalse(result2["applied"])
        self.assertEqual(result2["added"], [])
        self.assertEqual(result2["removed"], [])
        path.unlink()
    def test_sync_config_detects_removed_pages(self):
        """Pages in config but not discovered should show as removed."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
                    ],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertIn(
            f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
            result["removed"],
        )
        path.unlink()
    def test_sync_config_preserves_other_config_fields(self):
        """--apply should only modify start_urls, preserving all other fields."""
        config = {
            "name": "my-skill",
            "description": "Important skill description",
            "version": "1.0.0",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    "selectors": {"main_content": "article", "title": "h1"},
                    "url_patterns": {"include": [], "exclude": []},
                    "rate_limit": 0.5,
                    "max_pages": 100,
                },
                {
                    "type": "github",
                    "repo": "owner/repo",
                },
            ],
        }
        path = _write_config(config)
        sync_config(str(path), apply=True, depth=3, rate_limit=0)
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        # Non-start_urls fields should be untouched
        self.assertEqual(saved["name"], "my-skill")
        self.assertEqual(saved["description"], "Important skill description")
        self.assertEqual(saved["version"], "1.0.0")
        self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article")
        self.assertEqual(saved["sources"][0]["rate_limit"], 0.5)
        self.assertEqual(saved["sources"][1]["type"], "github")
        self.assertEqual(saved["sources"][1]["repo"], "owner/repo")
        # start_urls should be updated
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)
        path.unlink()
    def test_sync_config_with_nav_seed_urls(self):
        """nav_seed_urls should be used as BFS seeds instead of start_urls."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    # Only seed from /docs/api — should only discover API pages
                    "nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=1, rate_limit=0)
        # Should discover at least the API seed page
        self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages")
        # All added URLs should be under /docs/
        for url in result["added"]:
            self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}")
        path.unlink()
    def test_sync_config_legacy_format(self):
        """Legacy flat config format should work end-to-end."""
        config = {
            "name": "test-site",
            "base_url": self.base_url,
            "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["start_urls"]), 1)
        path.unlink()
# ---------------------------------------------------------------------------
# CLI subprocess tests
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigCLIE2E(unittest.TestCase):
    """Test the CLI entry point via subprocess.

    Invokes ``python -m skill_seekers.cli.sync_config`` against the same
    local fixture server used by TestSyncConfigE2E.
    """
    @classmethod
    def setUpClass(cls):
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"
    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()
    def test_cli_dry_run(self):
        """CLI dry-run should print diff and exit 0."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    # Only one URL configured — the rest should show as "new"
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    # Seed from root to discover all pages
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = _write_config(config)
        # --rate-limit 0 keeps the subprocess crawl within the timeout.
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # Should mention new pages in the output (logged to stderr)
        combined = result.stderr.lower() + result.stdout.lower()
        self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}")
        path.unlink()
    def test_cli_apply(self):
        """CLI --apply should update the config file."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = _write_config(config)
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--apply",
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # The config on disk must now contain the discovered URLs.
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)
        path.unlink()
    def test_cli_help(self):
        """CLI --help should print usage and exit 0."""
        result = subprocess.run(
            [sys.executable, "-m", "skill_seekers.cli.sync_config", "--help"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertEqual(result.returncode, 0)
        # Usage text must advertise the main flags.
        self.assertIn("sync", result.stdout.lower())
        self.assertIn("--config", result.stdout)
        self.assertIn("--apply", result.stdout)
        self.assertIn("--depth", result.stdout)
    def test_cli_missing_config_exits_nonzero(self):
        """CLI with a non-existent config should fail."""
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                "/nonexistent/path/config.json",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertNotEqual(result.returncode, 0)
# ---------------------------------------------------------------------------
# Integration test against real public site
# ---------------------------------------------------------------------------
@pytest.mark.integration
class TestSyncConfigRealSite(unittest.TestCase):
    """Integration test against a real public docs site.
    Skipped by default (use ``-m integration`` to run).
    Crawls a small corner of the Python standard-library documentation
    (docs.python.org), which is stable and well-structured.
    """
    def test_discover_urls_real_http(self):
        """discover_urls should work against a real HTTP server."""
        # max_pages=10 and rate_limit=0.5 keep the crawl small and polite.
        discovered = discover_urls(
            base_url="https://docs.python.org/3/library/",
            seed_urls=["https://docs.python.org/3/library/functions.html"],
            depth=1,
            max_pages=10,
            rate_limit=0.5,
        )
        # Should find at least the seed page itself
        self.assertGreater(len(discovered), 0)
        # All discovered URLs should be under the base
        for url in discovered:
            self.assertTrue(
                url.startswith("https://docs.python.org/3/library/"),
                f"Discovered URL outside base: {url}",
            )
# Support running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()