From 83b9a695ba817f4011a1959c86163c895081b4d2 Mon Sep 17 00:00:00 2001 From: yusyus Date: Sun, 15 Mar 2026 02:16:32 +0300 Subject: [PATCH] feat: add sync-config command to detect and update config start_urls (#306) ## Summary Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation, diffs discovered URLs against a config's start_urls, and optionally writes the updated list back with --apply. - BFS link discovery with configurable depth (default 2), max-pages, rate-limit - Respects url_patterns.include/exclude from config - Supports optional nav_seed_urls config field - Handles both unified (sources array) and legacy flat config formats - MCP tool sync_config included - 57 tests (39 unit + 18 E2E with local HTTP server) - Fixed CI: renamed summary job to "Tests" to match branch protection rule Closes #306 --- pyproject.toml | 1 + .../cli/arguments/sync_config.py | 64 ++ src/skill_seekers/cli/main.py | 1 + src/skill_seekers/cli/parsers/__init__.py | 2 + .../cli/parsers/sync_config_parser.py | 30 + src/skill_seekers/cli/sync_config.py | 325 +++++++++ src/skill_seekers/mcp/server_fastmcp.py | 49 ++ src/skill_seekers/mcp/tools/__init__.py | 5 + .../mcp/tools/sync_config_tools.py | 85 +++ tests/test_cli_parsers.py | 10 +- tests/test_sync_config.py | 590 +++++++++++++++++ tests/test_sync_config_e2e.py | 626 ++++++++++++++++++ 12 files changed, 1783 insertions(+), 5 deletions(-) create mode 100644 src/skill_seekers/cli/arguments/sync_config.py create mode 100644 src/skill_seekers/cli/parsers/sync_config_parser.py create mode 100644 src/skill_seekers/cli/sync_config.py create mode 100644 src/skill_seekers/mcp/tools/sync_config_tools.py create mode 100644 tests/test_sync_config.py create mode 100644 tests/test_sync_config_e2e.py diff --git a/pyproject.toml b/pyproject.toml index b2f8fae..6e4eac3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -244,6 +244,7 @@ skill-seekers-update = "skill_seekers.cli.incremental_updater:main" 
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main" skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" +skill-seekers-sync-config = "skill_seekers.cli.sync_config:main" [tool.setuptools] package-dir = {"" = "src"} diff --git a/src/skill_seekers/cli/arguments/sync_config.py b/src/skill_seekers/cli/arguments/sync_config.py new file mode 100644 index 0000000..86ced16 --- /dev/null +++ b/src/skill_seekers/cli/arguments/sync_config.py @@ -0,0 +1,64 @@ +"""Sync-config command argument definitions. + +Shared between sync_config.py (standalone) and parsers/sync_config_parser.py +(unified CLI) so the two entry points never drift out of sync. +""" + +import argparse + + +def add_sync_config_arguments(parser: argparse.ArgumentParser) -> None: + """Add all sync-config arguments to *parser*.""" + + parser.add_argument( + "--config", + "-c", + type=str, + required=True, + help="Path to the config JSON file to sync", + metavar="FILE", + ) + parser.add_argument( + "--apply", + action="store_true", + default=False, + help="Write updated start_urls back to the config file (default: dry-run)", + ) + parser.add_argument( + "--depth", + type=int, + default=2, + help="BFS crawl depth from seed pages (default: 2)", + ) + parser.add_argument( + "--max-pages", + type=int, + default=500, + help="Maximum pages to discover (default: 500)", + ) + parser.add_argument( + "--rate-limit", + type=float, + default=None, + help="Override config rate-limit (seconds between requests)", + ) + parser.add_argument( + "--source-index", + type=int, + default=0, + help="Index of the documentation source to sync (default: 0)", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + default=False, + help="Verbose output", + ) + parser.add_argument( + "--quiet", + "-q", + action="store_true", + default=False, + help="Suppress informational output", + ) diff --git a/src/skill_seekers/cli/main.py 
b/src/skill_seekers/cli/main.py index 83c1fd7..8092b5e 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -67,6 +67,7 @@ COMMAND_MODULES = { "multilang": "skill_seekers.cli.multilang_support", "quality": "skill_seekers.cli.quality_metrics", "workflows": "skill_seekers.cli.workflows_command", + "sync-config": "skill_seekers.cli.sync_config", } diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py index 7cca3c4..f0aaf2b 100644 --- a/src/skill_seekers/cli/parsers/__init__.py +++ b/src/skill_seekers/cli/parsers/__init__.py @@ -30,6 +30,7 @@ from .update_parser import UpdateParser from .multilang_parser import MultilangParser from .quality_parser import QualityParser from .workflows_parser import WorkflowsParser +from .sync_config_parser import SyncConfigParser # Registry of all parsers (in order of usage frequency) PARSERS = [ @@ -56,6 +57,7 @@ PARSERS = [ MultilangParser(), QualityParser(), WorkflowsParser(), + SyncConfigParser(), ] diff --git a/src/skill_seekers/cli/parsers/sync_config_parser.py b/src/skill_seekers/cli/parsers/sync_config_parser.py new file mode 100644 index 0000000..f4af85a --- /dev/null +++ b/src/skill_seekers/cli/parsers/sync_config_parser.py @@ -0,0 +1,30 @@ +"""Parser for the sync-config subcommand.""" + +import argparse + +from .base import SubcommandParser + + +class SyncConfigParser(SubcommandParser): + """Subcommand parser for ``skill-seekers sync-config``.""" + + @property + def name(self) -> str: + return "sync-config" + + @property + def help(self) -> str: + return "Diff/update a config's start_urls against the live docs site" + + @property + def description(self) -> str: + return ( + "Crawl navigation links from a docs site, compare them against " + "the config's start_urls, and optionally write the updated list " + "back with --apply." 
+ ) + + def add_arguments(self, parser: argparse.ArgumentParser) -> None: + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + add_sync_config_arguments(parser) diff --git a/src/skill_seekers/cli/sync_config.py b/src/skill_seekers/cli/sync_config.py new file mode 100644 index 0000000..dad4adb --- /dev/null +++ b/src/skill_seekers/cli/sync_config.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python3 +"""Sync a config file's start_urls against what's currently live on a docs site. + +Crawls navigation links from seed pages, diffs them against the config's +``start_urls``, and optionally writes the updated list back. + +Usage: + skill-seekers sync-config --config configs/claude-code.json + skill-seekers sync-config --config configs/claude-code.json --apply +""" + +import argparse +import json +import logging +import sys +import time +from collections import deque +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup + +from skill_seekers.cli.utils import sanitize_url, setup_logging + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# URL filtering (mirrors DocToSkillConverter.is_valid_url logic) +# --------------------------------------------------------------------------- + + +def _is_valid_url( + url: str, + base_url: str, + include_patterns: list[str], + exclude_patterns: list[str], +) -> bool: + """Return True if *url* passes include/exclude pattern filters.""" + if not url.startswith(base_url): + return False + if include_patterns and not any(p in url for p in include_patterns): + return False + return not any(p in url for p in exclude_patterns) + + +# --------------------------------------------------------------------------- +# Lightweight BFS link discovery +# --------------------------------------------------------------------------- + + +def discover_urls( + base_url: str, + seed_urls: list[str], + include_patterns: list[str] | None = 
None, + exclude_patterns: list[str] | None = None, + depth: int = 2, + max_pages: int = 500, + rate_limit: float = 0.5, +) -> set[str]: + """BFS-crawl *seed_urls* and return all discovered internal URLs. + + Only follows ``<a href>`` links on HTML pages; does not download + full page content. Applies the same include/exclude filtering as + :class:`DocToSkillConverter`. + + Args: + base_url: Only URLs under this prefix are accepted. + seed_urls: Starting points for the BFS. + include_patterns: Substring patterns a URL must contain (any). + exclude_patterns: Substring patterns that disqualify a URL. + depth: Maximum number of BFS hops from the seed pages. + max_pages: Stop after discovering this many unique URLs. + rate_limit: Seconds to wait between HTTP requests. + + Returns: + Set of discovered absolute URLs (fragments stripped). + """ + includes = include_patterns or [] + excludes = exclude_patterns or [] + + visited: set[str] = set() + # Queue entries are (url, current_depth) + queue: deque[tuple[str, int]] = deque() + for u in seed_urls: + u = sanitize_url(u) + queue.append((u, 0)) + + discovered: set[str] = set() + + while queue and len(discovered) < max_pages: + url, cur_depth = queue.popleft() + if url in visited: + continue + visited.add(url) + + if not _is_valid_url(url, base_url, includes, excludes): + continue + + logger.debug(" [depth %d] %s", cur_depth, url) + + try: + headers = {"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"} + resp = requests.get(url, headers=headers, timeout=15) + resp.raise_for_status() + except Exception as e: + logger.warning(" Could not fetch %s: %s", url, e) + continue + + # Only mark as "discovered" after a successful fetch — 404s and + # other errors mean the page no longer exists on the live site.
+ discovered.add(url) + + # Follow links if we haven't hit the depth limit + if cur_depth < depth: + soup = BeautifulSoup(resp.content, "html.parser") + for link in soup.find_all("a", href=True): + href = urljoin(url, link["href"]) + href = href.split("#")[0] # strip fragment + href = sanitize_url(href) + if href not in visited and _is_valid_url(href, base_url, includes, excludes): + queue.append((href, cur_depth + 1)) + + if rate_limit > 0: + time.sleep(rate_limit) + + return discovered + + +# --------------------------------------------------------------------------- +# Diff logic +# --------------------------------------------------------------------------- + + +def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]: + """Compare *discovered* URLs against a *configured* list. + + Returns: + ``(added, removed)`` — both sorted lists of URLs. + """ + configured_set = set(configured) + added = sorted(discovered - configured_set) + removed = sorted(configured_set - discovered) + return added, removed + + +# --------------------------------------------------------------------------- +# Config helpers +# --------------------------------------------------------------------------- + + +def _get_doc_source(config: dict, source_index: int = 0) -> dict | None: + """Extract the documentation source dict from *config*. + + Handles both the unified format (``sources`` array) and legacy flat + format (fields at the top level). 
+ """ + sources = config.get("sources") + if sources: + doc_sources = [s for s in sources if s.get("type") == "documentation"] + if source_index < len(doc_sources): + return doc_sources[source_index] + return None + + # Legacy flat format — treat the whole config as a single source + if config.get("base_url"): + return config + return None + + +def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None: + """Write *urls* into the correct ``start_urls`` field in *config*.""" + sources = config.get("sources") + if sources: + doc_sources = [s for s in sources if s.get("type") == "documentation"] + if source_index < len(doc_sources): + doc_sources[source_index]["start_urls"] = urls + return + # Legacy flat format + config["start_urls"] = urls + + +# --------------------------------------------------------------------------- +# Main orchestrator +# --------------------------------------------------------------------------- + + +def sync_config( + config_path: str, + apply: bool = False, + depth: int = 2, + max_pages: int = 500, + rate_limit: float | None = None, + source_index: int = 0, +) -> dict: + """Run the sync-config workflow. + + Returns: + Dict with keys ``added``, ``removed``, ``total_discovered``, + ``total_configured``, ``applied``. 
+ """ + # Load config + with open(config_path, encoding="utf-8") as f: + config = json.load(f) + + source = _get_doc_source(config, source_index) + if source is None: + logger.error("No documentation source found at index %d in %s", source_index, config_path) + return { + "added": [], + "removed": [], + "total_discovered": 0, + "total_configured": 0, + "applied": False, + "error": "No documentation source found", + } + + base_url: str = source["base_url"] + configured_urls: list[str] = source.get("start_urls") or [] + seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url] + url_patterns = source.get("url_patterns", {}) + includes: list[str] = url_patterns.get("include", []) + excludes: list[str] = url_patterns.get("exclude", []) + effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5) + + logger.info("Syncing config: %s", config_path) + logger.info(" Base URL: %s", base_url) + logger.info(" Seed URLs: %d", len(seed_urls)) + logger.info(" Configured: %d start_urls", len(configured_urls)) + logger.info(" Depth: %d", depth) + logger.info(" Rate limit: %.1fs", effective_rate) + logger.info("") + + # Discover + discovered = discover_urls( + base_url=base_url, + seed_urls=seed_urls, + include_patterns=includes, + exclude_patterns=excludes, + depth=depth, + max_pages=max_pages, + rate_limit=effective_rate, + ) + + # Diff + added, removed = diff_urls(discovered, configured_urls) + + # Report + if added: + logger.info("New pages (%d):", len(added)) + for url in added: + path = url.replace(base_url, "/") + logger.info(" + %s", path) + if removed: + logger.info("Removed pages (%d):", len(removed)) + for url in removed: + path = url.replace(base_url, "/") + logger.info(" - %s", path) + + if not added and not removed: + logger.info("Config is up to date. 
No changes detected.") + else: + logger.info("") + logger.info( + "Summary: %d new, %d removed (discovered %d total, configured %d)", + len(added), + len(removed), + len(discovered), + len(configured_urls), + ) + + applied = False + if apply and (added or removed): + new_urls = sorted(discovered) + _set_start_urls(config, source_index, new_urls) + with open(config_path, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2, ensure_ascii=False) + f.write("\n") + logger.info("Updated %s (%d start_urls)", config_path, len(new_urls)) + applied = True + elif added or removed: + logger.info("Run with --apply to update %s", config_path) + + return { + "added": added, + "removed": removed, + "total_discovered": len(discovered), + "total_configured": len(configured_urls), + "applied": applied, + } + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + """CLI entry point for ``skill-seekers sync-config``.""" + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser( + prog="skill-seekers-sync-config", + description="Sync a config's start_urls against what's live on the docs site.", + ) + add_sync_config_arguments(parser) + args = parser.parse_args() + + setup_logging(verbose=args.verbose, quiet=args.quiet) + + result = sync_config( + config_path=args.config, + apply=args.apply, + depth=args.depth, + max_pages=args.max_pages, + rate_limit=args.rate_limit, + source_index=args.source_index, + ) + + if result.get("error"): + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/skill_seekers/mcp/server_fastmcp.py b/src/skill_seekers/mcp/server_fastmcp.py index 8e729b2..f955c33 100644 --- a/src/skill_seekers/mcp/server_fastmcp.py +++ b/src/skill_seekers/mcp/server_fastmcp.py @@ -103,6 +103,8 @@ try: # Splitting tools split_config_impl, 
submit_config_impl, + # Sync config tools + sync_config_impl, upload_skill_impl, validate_config_impl, # Workflow tools @@ -144,6 +146,7 @@ except ImportError: scrape_video_impl, split_config_impl, submit_config_impl, + sync_config_impl, upload_skill_impl, validate_config_impl, list_workflows_impl, @@ -251,6 +254,52 @@ async def validate_config(config_path: str) -> str: return str(result) +# ============================================================================ +# SYNC CONFIG TOOLS (1 tool) +# ============================================================================ + + +@safe_tool_decorator(description="Sync a config's start_urls against what's live on the docs site.") +async def sync_config( + config_path: str, + apply: bool = False, + depth: int = 2, + max_pages: int = 500, + rate_limit: float | None = None, + source_index: int = 0, +) -> str: + """ + Sync a config file's start_urls against the live docs site. + + Crawls seed/nav pages, discovers internal links, and diffs against the + config's existing start_urls. Optionally writes the update with apply=True. + + Args: + config_path: Path to the config JSON file. + apply: Write changes back to the config file (default: False). + depth: BFS crawl depth from seed pages (default: 2). + max_pages: Maximum URLs to discover (default: 500). + rate_limit: Override config rate limit (seconds between requests). + source_index: Index of the documentation source to sync (default: 0). + + Returns: + Report of added/removed URLs. 
+ """ + result = await sync_config_impl( + { + "config_path": config_path, + "apply": apply, + "depth": depth, + "max_pages": max_pages, + "rate_limit": rate_limit, + "source_index": source_index, + } + ) + if isinstance(result, list) and result: + return result[0].text if hasattr(result[0], "text") else str(result[0]) + return str(result) + + # ============================================================================ # SCRAPING TOOLS (10 tools) # ============================================================================ diff --git a/src/skill_seekers/mcp/tools/__init__.py b/src/skill_seekers/mcp/tools/__init__.py index 66284c4..6783c9d 100644 --- a/src/skill_seekers/mcp/tools/__init__.py +++ b/src/skill_seekers/mcp/tools/__init__.py @@ -99,6 +99,9 @@ from .vector_db_tools import ( from .vector_db_tools import ( export_to_weaviate_impl, ) +from .sync_config_tools import ( + sync_config_tool as sync_config_impl, +) from .workflow_tools import ( create_workflow_tool as create_workflow_impl, ) @@ -151,6 +154,8 @@ __all__ = [ "export_to_chroma_impl", "export_to_faiss_impl", "export_to_qdrant_impl", + # Sync config tools + "sync_config_impl", # Workflow tools "list_workflows_impl", "get_workflow_impl", diff --git a/src/skill_seekers/mcp/tools/sync_config_tools.py b/src/skill_seekers/mcp/tools/sync_config_tools.py new file mode 100644 index 0000000..2597ef4 --- /dev/null +++ b/src/skill_seekers/mcp/tools/sync_config_tools.py @@ -0,0 +1,85 @@ +"""Sync-config MCP tool for Skill Seekers MCP Server. + +Provides the ``sync_config`` tool that diffs a config's start_urls against +the live docs site and optionally applies the update. 
+""" + +try: + from mcp.types import TextContent +except ImportError: + + class TextContent: + """Fallback TextContent for when MCP is not installed.""" + + def __init__(self, type: str, text: str): + self.type = type + self.text = text + + +async def sync_config_tool(args: dict) -> list[TextContent]: + """Sync a config file's start_urls against what's live on the docs site. + + Crawls seed/nav pages, discovers internal links, diffs against the + config's existing ``start_urls``, and optionally writes the update. + + Args: + args: Dictionary containing: + - config_path (str): Path to the config JSON file. + - apply (bool, optional): Write changes back (default: False). + - depth (int, optional): BFS crawl depth (default: 2). + - max_pages (int, optional): Max URLs to discover (default: 500). + - rate_limit (float, optional): Seconds between requests. + - source_index (int, optional): Documentation source index (default: 0). + + Returns: + List[TextContent]: Report of added/removed URLs, or error message. 
+ """ + config_path = args.get("config_path", "") + if not config_path: + return [TextContent(type="text", text="Error: config_path is required")] + + try: + from skill_seekers.cli.sync_config import sync_config + + result = sync_config( + config_path=config_path, + apply=args.get("apply", False), + depth=args.get("depth", 2), + max_pages=args.get("max_pages", 500), + rate_limit=args.get("rate_limit"), + source_index=args.get("source_index", 0), + ) + except FileNotFoundError: + return [TextContent(type="text", text=f"Error: Config file not found: {config_path}")] + except Exception as e: + return [TextContent(type="text", text=f"Error syncing config: {e}")] + + if result.get("error"): + return [TextContent(type="text", text=f"Error: {result['error']}")] + + lines = [] + added = result["added"] + removed = result["removed"] + + if added: + lines.append(f"New pages ({len(added)}):") + for url in added: + lines.append(f" + {url}") + if removed: + lines.append(f"Removed pages ({len(removed)}):") + for url in removed: + lines.append(f" - {url}") + if not added and not removed: + lines.append("Config is up to date. 
No changes detected.") + else: + lines.append( + f"\nSummary: {len(added)} new, {len(removed)} removed " + f"(discovered {result['total_discovered']}, " + f"configured {result['total_configured']})" + ) + if result["applied"]: + lines.append(f"Updated {config_path}") + else: + lines.append(f"Run with apply=true to update {config_path}") + + return [TextContent(type="text", text="\n".join(lines))] diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py index 8d240d1..55339db 100644 --- a/tests/test_cli_parsers.py +++ b/tests/test_cli_parsers.py @@ -24,12 +24,12 @@ class TestParserRegistry: def test_all_parsers_registered(self): """Test that all parsers are registered.""" - assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}" + assert len(PARSERS) == 24, f"Expected 24 parsers, got {len(PARSERS)}" def test_get_parser_names(self): """Test getting list of parser names.""" names = get_parser_names() - assert len(names) == 23 + assert len(names) == 24 assert "scrape" in names assert "github" in names assert "package" in names @@ -243,9 +243,9 @@ class TestBackwardCompatibility: assert cmd in names, f"Command '{cmd}' not found in parser registry!" def test_command_count_matches(self): - """Test that we have exactly 23 commands (includes create, workflows, word, and video commands).""" - assert len(PARSERS) == 23 - assert len(get_parser_names()) == 23 + """Test that we have exactly 24 commands (includes create, workflows, word, video, and sync-config commands).""" + assert len(PARSERS) == 24 + assert len(get_parser_names()) == 24 if __name__ == "__main__": diff --git a/tests/test_sync_config.py b/tests/test_sync_config.py new file mode 100644 index 0000000..905ec85 --- /dev/null +++ b/tests/test_sync_config.py @@ -0,0 +1,590 @@ +#!/usr/bin/env python3 +"""Tests for the sync-config command. 
+ +Covers: +- URL diffing logic +- URL filtering (_is_valid_url) +- BFS discovery with mocked HTTP responses +- Config loading (unified + legacy formats) +- --apply writes correct JSON +- CLI argument parsing +- MCP tool wrapper +""" + +import json +import tempfile +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from skill_seekers.cli.sync_config import ( + _get_doc_source, + _is_valid_url, + _set_start_urls, + diff_urls, + discover_urls, + sync_config, +) + + +# --------------------------------------------------------------------------- +# diff_urls +# --------------------------------------------------------------------------- + + +class TestDiffUrls(unittest.TestCase): + """Test the URL diffing logic.""" + + def test_no_changes(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = set(configured) + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, []) + self.assertEqual(removed, []) + + def test_added_urls(self): + configured = ["https://example.com/a"] + discovered = {"https://example.com/a", "https://example.com/b"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/b"]) + self.assertEqual(removed, []) + + def test_removed_urls(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = {"https://example.com/a"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, []) + self.assertEqual(removed, ["https://example.com/b"]) + + def test_both_added_and_removed(self): + configured = ["https://example.com/a", "https://example.com/b"] + discovered = {"https://example.com/a", "https://example.com/c"} + added, removed = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/c"]) + self.assertEqual(removed, ["https://example.com/b"]) + + def test_empty_configured(self): + added, removed = diff_urls({"https://example.com/a"}, []) + 
self.assertEqual(added, ["https://example.com/a"]) + self.assertEqual(removed, []) + + def test_empty_discovered(self): + added, removed = diff_urls(set(), ["https://example.com/a"]) + self.assertEqual(added, []) + self.assertEqual(removed, ["https://example.com/a"]) + + def test_results_sorted(self): + configured = ["https://example.com/z"] + discovered = {"https://example.com/b", "https://example.com/a"} + added, _ = diff_urls(discovered, configured) + self.assertEqual(added, ["https://example.com/a", "https://example.com/b"]) + + +# --------------------------------------------------------------------------- +# _is_valid_url +# --------------------------------------------------------------------------- + + +class TestIsValidUrl(unittest.TestCase): + """Test the URL filtering logic.""" + + def test_url_under_base(self): + self.assertTrue( + _is_valid_url("https://docs.example.com/guide", "https://docs.example.com/", [], []) + ) + + def test_url_not_under_base(self): + self.assertFalse( + _is_valid_url("https://other.com/guide", "https://docs.example.com/", [], []) + ) + + def test_include_pattern_match(self): + self.assertTrue( + _is_valid_url( + "https://docs.example.com/docs/en/guide", + "https://docs.example.com/", + ["/docs/en/"], + [], + ) + ) + + def test_include_pattern_no_match(self): + self.assertFalse( + _is_valid_url( + "https://docs.example.com/blog/post", + "https://docs.example.com/", + ["/docs/en/"], + [], + ) + ) + + def test_exclude_pattern(self): + self.assertFalse( + _is_valid_url( + "https://docs.example.com/docs/en/changelog", + "https://docs.example.com/", + [], + ["/changelog"], + ) + ) + + def test_include_and_exclude(self): + # Matches include but also matches exclude -> rejected + self.assertFalse( + _is_valid_url( + "https://docs.example.com/docs/en/changelog", + "https://docs.example.com/", + ["/docs/en/"], + ["/changelog"], + ) + ) + + def test_no_patterns_all_valid(self): + self.assertTrue( + 
_is_valid_url("https://docs.example.com/anything", "https://docs.example.com/", [], []) + ) + + +# --------------------------------------------------------------------------- +# _get_doc_source / _set_start_urls +# --------------------------------------------------------------------------- + + +class TestConfigHelpers(unittest.TestCase): + """Test config extraction for both unified and legacy formats.""" + + def test_unified_format(self): + config = { + "name": "test", + "sources": [ + {"type": "documentation", "base_url": "https://docs.example.com/"}, + {"type": "github", "repo": "owner/repo"}, + ], + } + source = _get_doc_source(config) + self.assertIsNotNone(source) + self.assertEqual(source["base_url"], "https://docs.example.com/") + + def test_unified_format_second_source(self): + config = { + "name": "test", + "sources": [ + {"type": "documentation", "base_url": "https://first.com/"}, + {"type": "documentation", "base_url": "https://second.com/"}, + ], + } + source = _get_doc_source(config, source_index=1) + self.assertEqual(source["base_url"], "https://second.com/") + + def test_unified_format_invalid_index(self): + config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]} + self.assertIsNone(_get_doc_source(config)) + + def test_legacy_flat_format(self): + config = {"name": "test", "base_url": "https://docs.example.com/"} + source = _get_doc_source(config) + self.assertEqual(source["base_url"], "https://docs.example.com/") + + def test_no_source_found(self): + config = {"name": "test"} + self.assertIsNone(_get_doc_source(config)) + + def test_set_start_urls_unified(self): + config = { + "sources": [ + {"type": "documentation", "base_url": "https://x.com/", "start_urls": []}, + ] + } + _set_start_urls(config, 0, ["https://x.com/a", "https://x.com/b"]) + self.assertEqual(config["sources"][0]["start_urls"], ["https://x.com/a", "https://x.com/b"]) + + def test_set_start_urls_legacy(self): + config = {"base_url": "https://x.com/", "start_urls": 
[]} + _set_start_urls(config, 0, ["https://x.com/new"]) + self.assertEqual(config["start_urls"], ["https://x.com/new"]) + + +# --------------------------------------------------------------------------- +# discover_urls (with mocked HTTP) +# --------------------------------------------------------------------------- + + +class TestDiscoverUrls(unittest.TestCase): + """Test BFS link discovery with mocked HTTP responses.""" + + def _make_html(self, links: list[str]) -> str: + hrefs = "".join(f'<a href="{u}">link</a>' for u in links) + return f"<html><body>{hrefs}</body></html>" + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_basic_discovery(self, mock_get): + """Discover links from a single seed page.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/page-a", + "https://docs.example.com/page-b", + "https://other.com/external", # should be filtered out + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/", result) + self.assertIn("https://docs.example.com/page-a", result) + self.assertIn("https://docs.example.com/page-b", result) + self.assertNotIn("https://other.com/external", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_depth_limiting(self, mock_get): + """URLs at depth > limit should be discovered but not followed.""" + # Seed returns one link + seed_html = self._make_html(["https://docs.example.com/child"]) + child_html = self._make_html(["https://docs.example.com/grandchild"]) + + mock_get.side_effect = [ + MagicMock(content=seed_html.encode(), raise_for_status=MagicMock()), + MagicMock(content=child_html.encode(), raise_for_status=MagicMock()), + ] + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, # Only
follow seed page links, not child page links + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/child", result) + # grandchild is at depth 2, which exceeds depth=1 + self.assertNotIn("https://docs.example.com/grandchild", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_max_pages_limit(self, mock_get): + """Stop after max_pages.""" + links = [f"https://docs.example.com/page-{i}" for i in range(20)] + mock_resp = MagicMock() + mock_resp.content = self._make_html(links).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + max_pages=5, + rate_limit=0, + ) + + self.assertLessEqual(len(result), 5) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_include_exclude_patterns(self, mock_get): + """Include/exclude patterns are respected.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/docs/en/guide", + "https://docs.example.com/docs/fr/guide", + "https://docs.example.com/blog/post", + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/docs/en/overview"], + include_patterns=["/docs/en/"], + exclude_patterns=["/blog/"], + depth=1, + rate_limit=0, + ) + + self.assertIn("https://docs.example.com/docs/en/guide", result) + self.assertNotIn("https://docs.example.com/docs/fr/guide", result) + self.assertNotIn("https://docs.example.com/blog/post", result) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_http_error_handled_gracefully(self, mock_get): + """HTTP errors should not crash the discovery.""" + mock_get.side_effect = ConnectionError("Network error") + + result = discover_urls( + base_url="https://docs.example.com/", + 
seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + # URLs that fail to fetch are NOT added to discovered (they may + # have been removed from the live site). + self.assertEqual(result, set()) + + @patch("skill_seekers.cli.sync_config.requests.get") + def test_fragments_stripped(self, mock_get): + """URL fragments (#anchor) should be stripped.""" + mock_resp = MagicMock() + mock_resp.content = self._make_html( + [ + "https://docs.example.com/guide#section1", + "https://docs.example.com/guide#section2", + ] + ).encode() + mock_resp.raise_for_status = MagicMock() + mock_get.return_value = mock_resp + + result = discover_urls( + base_url="https://docs.example.com/", + seed_urls=["https://docs.example.com/"], + depth=1, + rate_limit=0, + ) + + # Both anchors should resolve to the same URL + self.assertIn("https://docs.example.com/guide", result) + + +# --------------------------------------------------------------------------- +# sync_config (integration with file I/O) +# --------------------------------------------------------------------------- + + +class TestSyncConfigIntegration(unittest.TestCase): + """Test the full sync_config workflow with mocked HTTP.""" + + def _write_config(self, config: dict) -> Path: + tmp = tempfile.mktemp(suffix=".json") # noqa: SIM115 + with open(tmp, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + return Path(tmp) + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_dry_run_does_not_modify_file(self, mock_discover): + mock_discover.return_value = { + "https://docs.example.com/a", + "https://docs.example.com/b", + "https://docs.example.com/c", + } + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a"], + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=False) + self.assertFalse(result["applied"]) + 
self.assertEqual(len(result["added"]), 2) + + # File should not be modified + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(len(saved["sources"][0]["start_urls"]), 1) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_apply_writes_updated_urls(self, mock_discover): + mock_discover.return_value = { + "https://docs.example.com/a", + "https://docs.example.com/b", + } + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"], + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertTrue(result["applied"]) + self.assertEqual(result["added"], ["https://docs.example.com/b"]) + self.assertEqual(result["removed"], ["https://docs.example.com/old"]) + + # File should be updated + with open(path, encoding="utf-8") as f: + saved = json.load(f) + urls = saved["sources"][0]["start_urls"] + self.assertIn("https://docs.example.com/a", urls) + self.assertIn("https://docs.example.com/b", urls) + self.assertNotIn("https://docs.example.com/old", urls) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_no_changes_does_not_write(self, mock_discover): + urls = ["https://docs.example.com/a", "https://docs.example.com/b"] + mock_discover.return_value = set(urls) + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": urls, + } + ], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertFalse(result["applied"]) + self.assertEqual(result["added"], []) + self.assertEqual(result["removed"], []) + path.unlink() + + def test_missing_source_returns_error(self): + config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]} + path = self._write_config(config) + + 
result = sync_config(str(path)) + self.assertIn("error", result) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_legacy_config_format(self, mock_discover): + mock_discover.return_value = {"https://docs.example.com/a"} + + config = { + "name": "test", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"], + } + path = self._write_config(config) + + result = sync_config(str(path), apply=True) + self.assertTrue(result["applied"]) + self.assertEqual(result["removed"], ["https://docs.example.com/old"]) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(saved["start_urls"], ["https://docs.example.com/a"]) + path.unlink() + + @patch("skill_seekers.cli.sync_config.discover_urls") + def test_nav_seed_urls_used_over_start_urls(self, mock_discover): + """When nav_seed_urls is present, it should be used as the seed.""" + mock_discover.return_value = {"https://docs.example.com/a"} + + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": "https://docs.example.com/", + "start_urls": ["https://docs.example.com/a"], + "nav_seed_urls": [ + "https://docs.example.com/nav1", + "https://docs.example.com/nav2", + ], + } + ], + } + path = self._write_config(config) + + sync_config(str(path)) + + # Verify discover_urls was called with nav_seed_urls + call_kwargs = mock_discover.call_args[1] + self.assertEqual( + call_kwargs["seed_urls"], + ["https://docs.example.com/nav1", "https://docs.example.com/nav2"], + ) + path.unlink() + + +# --------------------------------------------------------------------------- +# CLI argument parsing +# --------------------------------------------------------------------------- + + +class TestSyncConfigCLI(unittest.TestCase): + """Test CLI argument parsing and subcommand registration.""" + + def test_sync_config_parser_registered(self): + """sync-config should be a registered 
subcommand.""" + from skill_seekers.cli.parsers import get_parser_names + + self.assertIn("sync-config", get_parser_names()) + + def test_sync_config_in_command_modules(self): + """sync-config should be in COMMAND_MODULES.""" + from skill_seekers.cli.main import COMMAND_MODULES + + self.assertIn("sync-config", COMMAND_MODULES) + + def test_arguments_created(self): + """Argument parser should accept all expected flags.""" + import argparse + + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser() + add_sync_config_arguments(parser) + + args = parser.parse_args(["--config", "test.json", "--apply", "--depth", "3"]) + self.assertEqual(args.config, "test.json") + self.assertTrue(args.apply) + self.assertEqual(args.depth, 3) + + def test_default_values(self): + """Default values should be sensible.""" + import argparse + + from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments + + parser = argparse.ArgumentParser() + add_sync_config_arguments(parser) + + args = parser.parse_args(["--config", "test.json"]) + self.assertFalse(args.apply) + self.assertEqual(args.depth, 2) + self.assertEqual(args.max_pages, 500) + self.assertIsNone(args.rate_limit) + self.assertEqual(args.source_index, 0) + + +# --------------------------------------------------------------------------- +# MCP tool +# --------------------------------------------------------------------------- + + +class TestSyncConfigMCPTool(unittest.TestCase): + """Test MCP tool wrapper.""" + + def test_mcp_tool_importable(self): + """The sync_config MCP tool should be importable.""" + from skill_seekers.mcp.tools import sync_config_impl + + self.assertTrue(callable(sync_config_impl)) + + def test_mcp_tool_missing_config_path(self): + """Missing config_path should return an error.""" + import asyncio + + from skill_seekers.mcp.tools.sync_config_tools import sync_config_tool + + result = asyncio.run(sync_config_tool({})) + 
self.assertTrue(any("Error" in r.text for r in result)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sync_config_e2e.py b/tests/test_sync_config_e2e.py new file mode 100644 index 0000000..513d2f5 --- /dev/null +++ b/tests/test_sync_config_e2e.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 +"""End-to-end tests for the sync-config command. + +Uses a local HTTP server with realistic multi-page HTML navigation to test +the full pipeline: BFS crawl -> link discovery -> diff -> config update. + +Also includes an integration test against a real public docs site. +""" + +import json +import subprocess +import sys +import tempfile +import threading +import unittest +from http.server import HTTPServer, SimpleHTTPRequestHandler +from pathlib import Path + +import pytest + +from skill_seekers.cli.sync_config import discover_urls, sync_config + + +# --------------------------------------------------------------------------- +# Local test HTTP server +# --------------------------------------------------------------------------- + +# Simulates a docs site with this navigation structure: +# +# /docs/ (index — links to guide, api, faq) +# /docs/guide (links to guide/install, guide/usage) +# /docs/guide/install (leaf page) +# /docs/guide/usage (leaf page, links back to guide) +# /docs/api (links to api/auth, api/users) +# /docs/api/auth (leaf page) +# /docs/api/users (leaf page) +# /docs/faq (leaf page) +# /blog/post-1 (outside /docs/ — should be excluded) + +_SITE_PAGES = { + "/docs/": """Docs Home +

+    <nav>
+      <h1>Documentation</h1>
+      <a href="/docs/guide">Guide</a>
+      <a href="/docs/api">API</a>
+      <a href="/docs/faq">FAQ</a>
+      <a href="/blog/post-1">Blog</a>
+      <a href="https://github.com/example/example">GitHub</a>
+    </nav>
+    """,
+    "/docs/guide": """
+    <h1>Guide</h1>
+    <a href="/docs/guide/install">Installation</a>
+    <a href="/docs/guide/usage">Usage</a>
+    <a href="/docs/">Back to docs</a>
+    """,
+    "/docs/guide/install": """
+    <h1>Installation</h1>
+    <p><code>pip install example</code></p>
+    <a href="/docs/guide">Back to guide</a>
+    """,
+    "/docs/guide/usage": """
+    <h1>Usage</h1>
+    <p><code>import example</code></p>
+    <a href="/docs/guide">Back to guide</a>
+    """,
+    "/docs/api": """
+    <h1>API Reference</h1>
+    <a href="/docs/api/auth">Authentication</a>
+    <a href="/docs/api/users">Users</a>
+    """,
+    "/docs/api/auth": """
+    <h1>Authentication</h1>
+    <p>Use tokens.</p>
+    """,
+    "/docs/api/users": """
+    <h1>Users API</h1>
+    <p>CRUD operations.</p>
+    """,
+    "/docs/faq": """
+    <h1>FAQ</h1>
+    <p>Common questions.</p>
+    """,
+    "/blog/post-1": """
+    <h1>Blog Post</h1>
+    <p>This is a blog post outside /docs/.</p>
+ """, +} + +# All docs pages that should be discovered (excluding /blog/) +_ALL_DOC_URLS_PATHS = { + "/docs/", + "/docs/guide", + "/docs/guide/install", + "/docs/guide/usage", + "/docs/api", + "/docs/api/auth", + "/docs/api/users", + "/docs/faq", +} + + +class _TestHandler(SimpleHTTPRequestHandler): + """Serve pages from the in-memory _SITE_PAGES dict.""" + + def do_GET(self): + path = self.path.split("?")[0].split("#")[0] + content = _SITE_PAGES.get(path) + if content is None: + self.send_error(404) + return + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.end_headers() + self.wfile.write(content.encode("utf-8")) + + def log_message(self, format, *args): # noqa: ARG002 + pass # Suppress request logging during tests + + +def _start_server() -> tuple[HTTPServer, int]: + """Start a local HTTP server on a random port. Returns (server, port).""" + server = HTTPServer(("127.0.0.1", 0), _TestHandler) + port = server.server_address[1] + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + return server, port + + +# --------------------------------------------------------------------------- +# Helper +# --------------------------------------------------------------------------- + + +def _write_config(config: dict) -> Path: + """Write a config dict to a temp JSON file and return its path.""" + tmp = tempfile.mktemp(suffix=".json") + with open(tmp, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + return Path(tmp) + + +# --------------------------------------------------------------------------- +# E2E tests using local HTTP server +# --------------------------------------------------------------------------- + + +@pytest.mark.e2e +class TestSyncConfigE2E(unittest.TestCase): + """End-to-end tests using a local HTTP server with realistic HTML.""" + + @classmethod + def setUpClass(cls): + cls.server, cls.port = _start_server() + cls.base_url = f"http://127.0.0.1:{cls.port}/docs/" + 
+ @classmethod + def tearDownClass(cls): + cls.server.shutdown() + + # -- discover_urls -- + + def test_discover_finds_all_doc_pages(self): + """BFS should discover all 8 /docs/ pages from the root.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS} + self.assertEqual(discovered, expected) + + def test_discover_excludes_blog(self): + """Pages outside /docs/ base_url should be excluded.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + blog_url = f"http://127.0.0.1:{self.port}/blog/post-1" + self.assertNotIn(blog_url, discovered) + + def test_discover_excludes_external(self): + """External URLs (github.com) should be excluded.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + rate_limit=0, + ) + + self.assertFalse( + any("github.com" in u for u in discovered), + "External URLs should not be discovered", + ) + + def test_discover_depth_1_finds_direct_links_only(self): + """Depth 1 from root should find guide, api, faq but NOT nested pages.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=1, + rate_limit=0, + ) + + # Direct children of /docs/ + self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered) + self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered) + self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered) + + # Nested pages should NOT be present (they're at depth 2) + self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered) + self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered) + + def test_discover_with_include_pattern(self): + """Include pattern should filter results.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + 
include_patterns=["/api"], + depth=3, + rate_limit=0, + ) + + # Only /api/ pages should be discovered + for url in discovered: + self.assertIn("/api", url, f"URL {url} does not match include pattern /api") + + def test_discover_with_exclude_pattern(self): + """Exclude pattern should remove matching pages.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + exclude_patterns=["/faq"], + depth=3, + rate_limit=0, + ) + + faq_url = f"http://127.0.0.1:{self.port}/docs/faq" + self.assertNotIn(faq_url, discovered) + # Other pages should still be found + self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered) + + def test_discover_max_pages_limit(self): + """max_pages should cap discovery.""" + discovered = discover_urls( + base_url=self.base_url, + seed_urls=[self.base_url], + depth=3, + max_pages=3, + rate_limit=0, + ) + + self.assertLessEqual(len(discovered), 3) + + # -- sync_config (full pipeline with file I/O) -- + + def test_sync_config_dry_run_detects_new_pages(self): + """Dry-run should detect pages missing from the config.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [ + f"http://127.0.0.1:{self.port}/docs/guide", + f"http://127.0.0.1:{self.port}/docs/faq", + ], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=3, rate_limit=0) + + self.assertFalse(result["applied"]) + self.assertGreater(len(result["added"]), 0, "Should detect new pages") + # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself + # should all be in added + self.assertGreaterEqual(result["total_discovered"], 6) + + # File should NOT be modified + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertEqual(len(saved["sources"][0]["start_urls"]), 2) + path.unlink() + + def test_sync_config_apply_updates_config(self): + """--apply should write all discovered URLs to the config.""" + 
config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=True, depth=3, rate_limit=0) + + self.assertTrue(result["applied"]) + + # Verify the file was updated + with open(path, encoding="utf-8") as f: + saved = json.load(f) + saved_urls = saved["sources"][0]["start_urls"] + self.assertEqual(len(saved_urls), result["total_discovered"]) + + # All expected URLs should be present + expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS} + for url in expected: + self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}") + + path.unlink() + + def test_sync_config_idempotent(self): + """Running sync twice with --apply should be a no-op the second time.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + } + ], + } + path = _write_config(config) + + # First run: should apply changes + result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0) + self.assertTrue(result1["applied"]) + self.assertGreater(len(result1["added"]), 0) + + # Second run: should detect no changes + result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0) + self.assertFalse(result2["applied"]) + self.assertEqual(result2["added"], []) + self.assertEqual(result2["removed"], []) + + path.unlink() + + def test_sync_config_detects_removed_pages(self): + """Pages in config but not discovered should show as removed.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [ + f"http://127.0.0.1:{self.port}/docs/guide", + f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists", + ], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=3, 
rate_limit=0) + + self.assertIn( + f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists", + result["removed"], + ) + path.unlink() + + def test_sync_config_preserves_other_config_fields(self): + """--apply should only modify start_urls, preserving all other fields.""" + config = { + "name": "my-skill", + "description": "Important skill description", + "version": "1.0.0", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + "selectors": {"main_content": "article", "title": "h1"}, + "url_patterns": {"include": [], "exclude": []}, + "rate_limit": 0.5, + "max_pages": 100, + }, + { + "type": "github", + "repo": "owner/repo", + }, + ], + } + path = _write_config(config) + + sync_config(str(path), apply=True, depth=3, rate_limit=0) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + + # Non-start_urls fields should be untouched + self.assertEqual(saved["name"], "my-skill") + self.assertEqual(saved["description"], "Important skill description") + self.assertEqual(saved["version"], "1.0.0") + self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article") + self.assertEqual(saved["sources"][0]["rate_limit"], 0.5) + self.assertEqual(saved["sources"][1]["type"], "github") + self.assertEqual(saved["sources"][1]["repo"], "owner/repo") + + # start_urls should be updated + self.assertGreater(len(saved["sources"][0]["start_urls"]), 0) + + path.unlink() + + def test_sync_config_with_nav_seed_urls(self): + """nav_seed_urls should be used as BFS seeds instead of start_urls.""" + config = { + "name": "test-site", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [], + # Only seed from /docs/api — should only discover API pages + "nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"], + } + ], + } + path = _write_config(config) + + result = sync_config(str(path), apply=False, depth=1, rate_limit=0) + + # Should discover at least the API seed page + 
self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages") + # All added URLs should be under /docs/ + for url in result["added"]: + self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}") + + path.unlink() + + def test_sync_config_legacy_format(self): + """Legacy flat config format should work end-to-end.""" + config = { + "name": "test-site", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"], + } + path = _write_config(config) + + result = sync_config(str(path), apply=True, depth=3, rate_limit=0) + + self.assertTrue(result["applied"]) + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertGreater(len(saved["start_urls"]), 1) + + path.unlink() + + +# --------------------------------------------------------------------------- +# CLI subprocess tests +# --------------------------------------------------------------------------- + + +@pytest.mark.e2e +class TestSyncConfigCLIE2E(unittest.TestCase): + """Test the CLI entry point via subprocess.""" + + @classmethod + def setUpClass(cls): + cls.server, cls.port = _start_server() + cls.base_url = f"http://127.0.0.1:{cls.port}/docs/" + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + + def test_cli_dry_run(self): + """CLI dry-run should print diff and exit 0.""" + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + # Only one URL configured — the rest should show as "new" + "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"], + # Seed from root to discover all pages + "nav_seed_urls": [self.base_url], + } + ], + } + path = _write_config(config) + + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + str(path), + "--depth", + "3", + "--rate-limit", + "0", + ], + capture_output=True, + text=True, + timeout=30, + ) + + self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}") + 
# Should mention new pages in the output (logged to stderr) + combined = result.stderr.lower() + result.stdout.lower() + self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}") + path.unlink() + + def test_cli_apply(self): + """CLI --apply should update the config file.""" + config = { + "name": "test", + "sources": [ + { + "type": "documentation", + "base_url": self.base_url, + "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"], + "nav_seed_urls": [self.base_url], + } + ], + } + path = _write_config(config) + + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + str(path), + "--apply", + "--depth", + "3", + "--rate-limit", + "0", + ], + capture_output=True, + text=True, + timeout=30, + ) + + self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}") + + with open(path, encoding="utf-8") as f: + saved = json.load(f) + self.assertGreater(len(saved["sources"][0]["start_urls"]), 0) + + path.unlink() + + def test_cli_help(self): + """CLI --help should print usage and exit 0.""" + result = subprocess.run( + [sys.executable, "-m", "skill_seekers.cli.sync_config", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + + self.assertEqual(result.returncode, 0) + self.assertIn("sync", result.stdout.lower()) + self.assertIn("--config", result.stdout) + self.assertIn("--apply", result.stdout) + self.assertIn("--depth", result.stdout) + + def test_cli_missing_config_exits_nonzero(self): + """CLI with a non-existent config should fail.""" + result = subprocess.run( + [ + sys.executable, + "-m", + "skill_seekers.cli.sync_config", + "--config", + "/nonexistent/path/config.json", + ], + capture_output=True, + text=True, + timeout=10, + ) + + self.assertNotEqual(result.returncode, 0) + + +# --------------------------------------------------------------------------- +# Integration test against real public site +# 
---------------------------------------------------------------------------
+
+
+@pytest.mark.integration
+class TestSyncConfigRealSite(unittest.TestCase):
+    """Integration test against a real public docs site.
+
+    Skipped by default (use ``-m integration`` to run).
+    Uses docs.python.org — a small, stable, well-structured public docs site.
+    """
+
+    def test_discover_urls_real_http(self):
+        """discover_urls should work against a real HTTP server."""
+        # Use Python docs — small, stable, well-structured
+        discovered = discover_urls(
+            base_url="https://docs.python.org/3/library/",
+            seed_urls=["https://docs.python.org/3/library/functions.html"],
+            depth=1,
+            max_pages=10,
+            rate_limit=0.5,
+        )
+
+        # Should find at least the seed page itself
+        self.assertGreater(len(discovered), 0)
+        # All discovered URLs should be under the base
+        for url in discovered:
+            self.assertTrue(
+                url.startswith("https://docs.python.org/3/library/"),
+                f"Discovered URL outside base: {url}",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()