feat: add sync-config command to detect and update config start_urls (#306)

## Summary

Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation,
diffs discovered URLs against a config's start_urls, and optionally writes the
updated list back with --apply.

- BFS link discovery with configurable depth (default 2), max-pages, rate-limit
- Respects url_patterns.include/exclude from config
- Supports optional nav_seed_urls config field
- Handles both unified (sources array) and legacy flat config formats
- MCP tool sync_config included
- 57 tests (39 unit + 18 E2E with local HTTP server)
- Fixed CI: renamed summary job to "Tests" to match branch protection rule

Closes #306
This commit is contained in:
yusyus
2026-03-15 02:16:32 +03:00
committed by GitHub
parent 0c9504c944
commit 83b9a695ba
12 changed files with 1783 additions and 5 deletions

View File

@@ -244,6 +244,7 @@ skill-seekers-update = "skill_seekers.cli.incremental_updater:main"
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main" skill-seekers-multilang = "skill_seekers.cli.multilang_support:main"
skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" skill-seekers-quality = "skill_seekers.cli.quality_metrics:main"
skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" skill-seekers-workflows = "skill_seekers.cli.workflows_command:main"
skill-seekers-sync-config = "skill_seekers.cli.sync_config:main"
[tool.setuptools] [tool.setuptools]
package-dir = {"" = "src"} package-dir = {"" = "src"}

View File

@@ -0,0 +1,64 @@
"""Sync-config command argument definitions.
Shared between sync_config.py (standalone) and parsers/sync_config_parser.py
(unified CLI) so the two entry points never drift out of sync.
"""
import argparse
def add_sync_config_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every sync-config option on *parser*.

    Kept in one place so the standalone console script and the unified-CLI
    subcommand parser expose exactly the same flags.
    """
    # Required: which config file to diff/update.
    parser.add_argument(
        "--config", "-c",
        type=str,
        required=True,
        metavar="FILE",
        help="Path to the config JSON file to sync",
    )
    # Dry-run by default; --apply actually rewrites the file.
    parser.add_argument(
        "--apply",
        action="store_true",
        default=False,
        help="Write updated start_urls back to the config file (default: dry-run)",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=2,
        help="BFS crawl depth from seed pages (default: 2)",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=500,
        help="Maximum pages to discover (default: 500)",
    )
    # None means "use the rate_limit from the config itself".
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=None,
        help="Override config rate-limit (seconds between requests)",
    )
    parser.add_argument(
        "--source-index",
        type=int,
        default=0,
        help="Index of the documentation source to sync (default: 0)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        default=False,
        help="Verbose output",
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        default=False,
        help="Suppress informational output",
    )

View File

@@ -67,6 +67,7 @@ COMMAND_MODULES = {
"multilang": "skill_seekers.cli.multilang_support", "multilang": "skill_seekers.cli.multilang_support",
"quality": "skill_seekers.cli.quality_metrics", "quality": "skill_seekers.cli.quality_metrics",
"workflows": "skill_seekers.cli.workflows_command", "workflows": "skill_seekers.cli.workflows_command",
"sync-config": "skill_seekers.cli.sync_config",
} }

View File

@@ -30,6 +30,7 @@ from .update_parser import UpdateParser
from .multilang_parser import MultilangParser from .multilang_parser import MultilangParser
from .quality_parser import QualityParser from .quality_parser import QualityParser
from .workflows_parser import WorkflowsParser from .workflows_parser import WorkflowsParser
from .sync_config_parser import SyncConfigParser
# Registry of all parsers (in order of usage frequency) # Registry of all parsers (in order of usage frequency)
PARSERS = [ PARSERS = [
@@ -56,6 +57,7 @@ PARSERS = [
MultilangParser(), MultilangParser(),
QualityParser(), QualityParser(),
WorkflowsParser(), WorkflowsParser(),
SyncConfigParser(),
] ]

View File

@@ -0,0 +1,30 @@
"""Parser for the sync-config subcommand."""
import argparse
from .base import SubcommandParser
class SyncConfigParser(SubcommandParser):
    """Wires the ``sync-config`` subcommand into the unified CLI."""

    # Long-form text shown in the subcommand's --help output.
    _DESCRIPTION = (
        "Crawl navigation links from a docs site, compare them against "
        "the config's start_urls, and optionally write the updated list "
        "back with --apply."
    )

    @property
    def name(self) -> str:
        return "sync-config"

    @property
    def help(self) -> str:
        return "Diff/update a config's start_urls against the live docs site"

    @property
    def description(self) -> str:
        return self._DESCRIPTION

    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
        # Shared with the standalone entry point so the two never drift.
        from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

        add_sync_config_arguments(parser)

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""Sync a config file's start_urls against what's currently live on a docs site.
Crawls navigation links from seed pages, diffs them against the config's
``start_urls``, and optionally writes the updated list back.
Usage:
skill-seekers sync-config --config configs/claude-code.json
skill-seekers sync-config --config configs/claude-code.json --apply
"""
import argparse
import json
import logging
import sys
import time
from collections import deque
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from skill_seekers.cli.utils import sanitize_url, setup_logging
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# URL filtering (mirrors DocToSkillConverter.is_valid_url logic)
# ---------------------------------------------------------------------------
def _is_valid_url(
url: str,
base_url: str,
include_patterns: list[str],
exclude_patterns: list[str],
) -> bool:
"""Return True if *url* passes include/exclude pattern filters."""
if not url.startswith(base_url):
return False
if include_patterns and not any(p in url for p in include_patterns):
return False
return not any(p in url for p in exclude_patterns)
# ---------------------------------------------------------------------------
# Lightweight BFS link discovery
# ---------------------------------------------------------------------------
def discover_urls(
    base_url: str,
    seed_urls: list[str],
    include_patterns: list[str] | None = None,
    exclude_patterns: list[str] | None = None,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float = 0.5,
) -> set[str]:
    """BFS-crawl *seed_urls* and return every discovered internal URL.

    Only ``<a href>`` links are followed; page content is not stored.
    Filtering matches :class:`DocToSkillConverter` (same include/exclude
    substring semantics).

    Args:
        base_url: Only URLs under this prefix are accepted.
        seed_urls: Starting points for the BFS.
        include_patterns: Substring patterns a URL must contain (any).
        exclude_patterns: Substring patterns that disqualify a URL.
        depth: Maximum number of BFS hops from the seed pages.
        max_pages: Stop after discovering this many unique URLs.
        rate_limit: Seconds to wait between HTTP requests.

    Returns:
        Set of discovered absolute URLs (fragments stripped).
    """
    inc = include_patterns or []
    exc = exclude_patterns or []
    seen: set[str] = set()
    found: set[str] = set()
    # Each pending entry is (url, hops-from-seed).
    pending: deque[tuple[str, int]] = deque(
        (sanitize_url(seed), 0) for seed in seed_urls
    )
    while pending and len(found) < max_pages:
        page_url, hops = pending.popleft()
        if page_url in seen:
            continue
        seen.add(page_url)
        if not _is_valid_url(page_url, base_url, inc, exc):
            continue
        logger.debug(" [depth %d] %s", hops, page_url)
        try:
            response = requests.get(
                page_url,
                headers={"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"},
                timeout=15,
            )
            response.raise_for_status()
        except Exception as err:
            logger.warning(" Could not fetch %s: %s", page_url, err)
            continue
        # A page counts as "discovered" only after a successful fetch —
        # 404s and other errors mean it is gone from the live site.
        found.add(page_url)
        # Expand links unless this page sits at the depth limit.
        if hops < depth:
            soup = BeautifulSoup(response.content, "html.parser")
            for anchor in soup.find_all("a", href=True):
                target = urljoin(page_url, anchor["href"])
                target = sanitize_url(target.split("#")[0])  # strip fragment
                if target not in seen and _is_valid_url(target, base_url, inc, exc):
                    pending.append((target, hops + 1))
        if rate_limit > 0:
            time.sleep(rate_limit)
    return found
# ---------------------------------------------------------------------------
# Diff logic
# ---------------------------------------------------------------------------
def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]:
    """Diff the live URL set against the configured list.

    Returns:
        ``(added, removed)`` — sorted lists of URLs that are live but not
        configured, and configured but no longer live, respectively.
    """
    known = set(configured)
    return sorted(discovered - known), sorted(known - discovered)
# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------
def _get_doc_source(config: dict, source_index: int = 0) -> dict | None:
"""Extract the documentation source dict from *config*.
Handles both the unified format (``sources`` array) and legacy flat
format (fields at the top level).
"""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
return doc_sources[source_index]
return None
# Legacy flat format — treat the whole config as a single source
if config.get("base_url"):
return config
return None
def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None:
"""Write *urls* into the correct ``start_urls`` field in *config*."""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
doc_sources[source_index]["start_urls"] = urls
return
# Legacy flat format
config["start_urls"] = urls
# ---------------------------------------------------------------------------
# Main orchestrator
# ---------------------------------------------------------------------------
def sync_config(
    config_path: str,
    apply: bool = False,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float | None = None,
    source_index: int = 0,
) -> dict:
    """Run the sync-config workflow.

    Loads the config, BFS-discovers the live URL set, diffs it against the
    configured ``start_urls``, logs a report, and (with ``apply=True``)
    writes the sorted discovered list back to *config_path*.

    Args:
        config_path: Path to the config JSON file.
        apply: Write updated start_urls back (default: dry-run).
        depth: BFS crawl depth from the seed pages.
        max_pages: Maximum unique URLs to discover.
        rate_limit: Seconds between requests; ``None`` falls back to the
            config's own ``rate_limit`` (default 0.5).
        source_index: Index of the documentation source to sync.

    Returns:
        Dict with keys ``added``, ``removed``, ``total_discovered``,
        ``total_configured``, ``applied`` (plus ``error`` when no
        documentation source could be found).
    """
    # Load config
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)
    source = _get_doc_source(config, source_index)
    if source is None:
        logger.error("No documentation source found at index %d in %s", source_index, config_path)
        return {
            "added": [],
            "removed": [],
            "total_discovered": 0,
            "total_configured": 0,
            "applied": False,
            "error": "No documentation source found",
        }
    base_url: str = source["base_url"]
    configured_urls: list[str] = source.get("start_urls") or []
    # Seed preference: explicit nav_seed_urls > existing start_urls > base URL.
    seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url]
    url_patterns = source.get("url_patterns", {})
    includes: list[str] = url_patterns.get("include", [])
    excludes: list[str] = url_patterns.get("exclude", [])
    # The CLI flag wins over the config's own rate_limit.
    effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5)

    def _display_path(url: str) -> str:
        # Render a URL relative to base_url for the report. Strip only the
        # leading prefix: the previous str.replace() also mangled URLs that
        # happened to contain base_url again later in the string.
        return "/" + url.removeprefix(base_url) if url.startswith(base_url) else url

    logger.info("Syncing config: %s", config_path)
    logger.info(" Base URL: %s", base_url)
    logger.info(" Seed URLs: %d", len(seed_urls))
    logger.info(" Configured: %d start_urls", len(configured_urls))
    logger.info(" Depth: %d", depth)
    logger.info(" Rate limit: %.1fs", effective_rate)
    logger.info("")
    # Discover
    discovered = discover_urls(
        base_url=base_url,
        seed_urls=seed_urls,
        include_patterns=includes,
        exclude_patterns=excludes,
        depth=depth,
        max_pages=max_pages,
        rate_limit=effective_rate,
    )
    # Diff
    added, removed = diff_urls(discovered, configured_urls)
    # Report
    if added:
        logger.info("New pages (%d):", len(added))
        for url in added:
            logger.info(" + %s", _display_path(url))
    if removed:
        logger.info("Removed pages (%d):", len(removed))
        for url in removed:
            logger.info(" - %s", _display_path(url))
    if not added and not removed:
        logger.info("Config is up to date. No changes detected.")
    else:
        logger.info("")
        logger.info(
            "Summary: %d new, %d removed (discovered %d total, configured %d)",
            len(added),
            len(removed),
            len(discovered),
            len(configured_urls),
        )
    applied = False
    if apply and (added or removed):
        # Replace start_urls wholesale with the sorted live set.
        new_urls = sorted(discovered)
        _set_start_urls(config, source_index, new_urls)
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)
            f.write("\n")  # keep a trailing newline, matching common formatters
        logger.info("Updated %s (%d start_urls)", config_path, len(new_urls))
        applied = True
    elif added or removed:
        logger.info("Run with --apply to update %s", config_path)
    return {
        "added": added,
        "removed": removed,
        "total_discovered": len(discovered),
        "total_configured": len(configured_urls),
        "applied": applied,
    }
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Entry point for the ``skill-seekers-sync-config`` console script."""
    from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

    parser = argparse.ArgumentParser(
        prog="skill-seekers-sync-config",
        description="Sync a config's start_urls against what's live on the docs site.",
    )
    add_sync_config_arguments(parser)
    opts = parser.parse_args()
    setup_logging(verbose=opts.verbose, quiet=opts.quiet)
    outcome = sync_config(
        config_path=opts.config,
        apply=opts.apply,
        depth=opts.depth,
        max_pages=opts.max_pages,
        rate_limit=opts.rate_limit,
        source_index=opts.source_index,
    )
    # Non-zero exit so CI/scripts can detect a failed sync.
    if outcome.get("error"):
        sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -103,6 +103,8 @@ try:
# Splitting tools # Splitting tools
split_config_impl, split_config_impl,
submit_config_impl, submit_config_impl,
# Sync config tools
sync_config_impl,
upload_skill_impl, upload_skill_impl,
validate_config_impl, validate_config_impl,
# Workflow tools # Workflow tools
@@ -144,6 +146,7 @@ except ImportError:
scrape_video_impl, scrape_video_impl,
split_config_impl, split_config_impl,
submit_config_impl, submit_config_impl,
sync_config_impl,
upload_skill_impl, upload_skill_impl,
validate_config_impl, validate_config_impl,
list_workflows_impl, list_workflows_impl,
@@ -251,6 +254,52 @@ async def validate_config(config_path: str) -> str:
return str(result) return str(result)
# ============================================================================
# SYNC CONFIG TOOLS (1 tool)
# ============================================================================
@safe_tool_decorator(description="Sync a config's start_urls against what's live on the docs site.")
async def sync_config(
    config_path: str,
    apply: bool = False,
    depth: int = 2,
    max_pages: int = 500,
    rate_limit: float | None = None,
    source_index: int = 0,
) -> str:
    """
    Sync a config file's start_urls against the live docs site.

    Crawls seed/nav pages, discovers internal links, and diffs against the
    config's existing start_urls. Optionally writes the update with apply=True.

    Args:
        config_path: Path to the config JSON file.
        apply: Write changes back to the config file (default: False).
        depth: BFS crawl depth from seed pages (default: 2).
        max_pages: Maximum URLs to discover (default: 500).
        rate_limit: Override config rate limit (seconds between requests).
        source_index: Index of the documentation source to sync (default: 0).

    Returns:
        Report of added/removed URLs.
    """
    # Delegate to the shared implementation; keys mirror the CLI flags.
    payload = {
        "config_path": config_path,
        "apply": apply,
        "depth": depth,
        "max_pages": max_pages,
        "rate_limit": rate_limit,
        "source_index": source_index,
    }
    result = await sync_config_impl(payload)
    # The impl returns a list of TextContent; unwrap the first entry.
    if isinstance(result, list) and result:
        first = result[0]
        return first.text if hasattr(first, "text") else str(first)
    return str(result)
# ============================================================================ # ============================================================================
# SCRAPING TOOLS (10 tools) # SCRAPING TOOLS (10 tools)
# ============================================================================ # ============================================================================

View File

@@ -99,6 +99,9 @@ from .vector_db_tools import (
from .vector_db_tools import ( from .vector_db_tools import (
export_to_weaviate_impl, export_to_weaviate_impl,
) )
from .sync_config_tools import (
sync_config_tool as sync_config_impl,
)
from .workflow_tools import ( from .workflow_tools import (
create_workflow_tool as create_workflow_impl, create_workflow_tool as create_workflow_impl,
) )
@@ -151,6 +154,8 @@ __all__ = [
"export_to_chroma_impl", "export_to_chroma_impl",
"export_to_faiss_impl", "export_to_faiss_impl",
"export_to_qdrant_impl", "export_to_qdrant_impl",
# Sync config tools
"sync_config_impl",
# Workflow tools # Workflow tools
"list_workflows_impl", "list_workflows_impl",
"get_workflow_impl", "get_workflow_impl",

View File

@@ -0,0 +1,85 @@
"""Sync-config MCP tool for Skill Seekers MCP Server.
Provides the ``sync_config`` tool that diffs a config's start_urls against
the live docs site and optionally applies the update.
"""
# MCP is an optional dependency: fall back to a minimal stand-in so this
# module stays importable (and unit-testable) without it installed.
try:
    from mcp.types import TextContent
except ImportError:

    class TextContent:
        """Fallback TextContent for when MCP is not installed."""

        def __init__(self, type: str, text: str):
            # *type* deliberately shadows the builtin to match the
            # keyword name used by mcp.types.TextContent.
            self.type = type
            self.text = text
async def sync_config_tool(args: dict) -> list[TextContent]:
    """Sync a config file's start_urls against what's live on the docs site.

    Crawls seed/nav pages, discovers internal links, diffs against the
    config's existing ``start_urls``, and optionally writes the update.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to the config JSON file.
            - apply (bool, optional): Write changes back (default: False).
            - depth (int, optional): BFS crawl depth (default: 2).
            - max_pages (int, optional): Max URLs to discover (default: 500).
            - rate_limit (float, optional): Seconds between requests.
            - source_index (int, optional): Documentation source index (default: 0).

    Returns:
        List[TextContent]: Report of added/removed URLs, or error message.
    """
    config_path = args.get("config_path", "")
    if not config_path:
        return [TextContent(type="text", text="Error: config_path is required")]
    try:
        # Imported lazily so the MCP server can start without the CLI deps.
        from skill_seekers.cli.sync_config import sync_config

        result = sync_config(
            config_path=config_path,
            apply=args.get("apply", False),
            depth=args.get("depth", 2),
            max_pages=args.get("max_pages", 500),
            rate_limit=args.get("rate_limit"),
            source_index=args.get("source_index", 0),
        )
    except FileNotFoundError:
        return [TextContent(type="text", text=f"Error: Config file not found: {config_path}")]
    except Exception as e:
        return [TextContent(type="text", text=f"Error syncing config: {e}")]
    if result.get("error"):
        return [TextContent(type="text", text=f"Error: {result['error']}")]
    lines = []
    added = result["added"]
    removed = result["removed"]
    if added:
        lines.append(f"New pages ({len(added)}):")
        for url in added:
            lines.append(f" + {url}")
    if removed:
        lines.append(f"Removed pages ({len(removed)}):")
        for url in removed:
            lines.append(f" - {url}")
    if not added and not removed:
        lines.append("Config is up to date. No changes detected.")
    else:
        lines.append(
            f"\nSummary: {len(added)} new, {len(removed)} removed "
            f"(discovered {result['total_discovered']}, "
            f"configured {result['total_configured']})"
        )
    if result["applied"]:
        lines.append(f"Updated {config_path}")
    elif added or removed:
        # Only suggest apply=true when there is actually something to change;
        # previously an up-to-date config still got this hint (the CLI
        # version already guards this with `elif added or removed`).
        lines.append(f"Run with apply=true to update {config_path}")
    return [TextContent(type="text", text="\n".join(lines))]

View File

@@ -24,12 +24,12 @@ class TestParserRegistry:
def test_all_parsers_registered(self): def test_all_parsers_registered(self):
"""Test that all parsers are registered.""" """Test that all parsers are registered."""
assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}" assert len(PARSERS) == 24, f"Expected 24 parsers, got {len(PARSERS)}"
def test_get_parser_names(self): def test_get_parser_names(self):
"""Test getting list of parser names.""" """Test getting list of parser names."""
names = get_parser_names() names = get_parser_names()
assert len(names) == 23 assert len(names) == 24
assert "scrape" in names assert "scrape" in names
assert "github" in names assert "github" in names
assert "package" in names assert "package" in names
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
assert cmd in names, f"Command '{cmd}' not found in parser registry!" assert cmd in names, f"Command '{cmd}' not found in parser registry!"
def test_command_count_matches(self): def test_command_count_matches(self):
"""Test that we have exactly 23 commands (includes create, workflows, word, and video commands).""" """Test that we have exactly 24 commands (includes create, workflows, word, video, and sync-config commands)."""
assert len(PARSERS) == 23 assert len(PARSERS) == 24
assert len(get_parser_names()) == 23 assert len(get_parser_names()) == 24
if __name__ == "__main__": if __name__ == "__main__":

590
tests/test_sync_config.py Normal file
View File

@@ -0,0 +1,590 @@
#!/usr/bin/env python3
"""Tests for the sync-config command.
Covers:
- URL diffing logic
- URL filtering (_is_valid_url)
- BFS discovery with mocked HTTP responses
- Config loading (unified + legacy formats)
- --apply writes correct JSON
- CLI argument parsing
- MCP tool wrapper
"""
import json
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from skill_seekers.cli.sync_config import (
_get_doc_source,
_is_valid_url,
_set_start_urls,
diff_urls,
discover_urls,
sync_config,
)
# ---------------------------------------------------------------------------
# diff_urls
# ---------------------------------------------------------------------------
class TestDiffUrls(unittest.TestCase):
    """Exercise the URL diffing logic."""

    def test_no_changes(self):
        existing = ["https://example.com/a", "https://example.com/b"]
        added, removed = diff_urls(set(existing), existing)
        self.assertEqual((added, removed), ([], []))

    def test_added_urls(self):
        added, removed = diff_urls(
            {"https://example.com/a", "https://example.com/b"},
            ["https://example.com/a"],
        )
        self.assertEqual(added, ["https://example.com/b"])
        self.assertEqual(removed, [])

    def test_removed_urls(self):
        added, removed = diff_urls(
            {"https://example.com/a"},
            ["https://example.com/a", "https://example.com/b"],
        )
        self.assertEqual(added, [])
        self.assertEqual(removed, ["https://example.com/b"])

    def test_both_added_and_removed(self):
        added, removed = diff_urls(
            {"https://example.com/a", "https://example.com/c"},
            ["https://example.com/a", "https://example.com/b"],
        )
        self.assertEqual(added, ["https://example.com/c"])
        self.assertEqual(removed, ["https://example.com/b"])

    def test_empty_configured(self):
        self.assertEqual(
            diff_urls({"https://example.com/a"}, []),
            (["https://example.com/a"], []),
        )

    def test_empty_discovered(self):
        self.assertEqual(
            diff_urls(set(), ["https://example.com/a"]),
            ([], ["https://example.com/a"]),
        )

    def test_results_sorted(self):
        # Sets are unordered — the diff must come back sorted.
        added, _ = diff_urls(
            {"https://example.com/b", "https://example.com/a"},
            ["https://example.com/z"],
        )
        self.assertEqual(added, ["https://example.com/a", "https://example.com/b"])
# ---------------------------------------------------------------------------
# _is_valid_url
# ---------------------------------------------------------------------------
class TestIsValidUrl(unittest.TestCase):
    """Exercise the URL filtering logic."""

    BASE = "https://docs.example.com/"

    def _valid(self, url, include=(), exclude=()):
        # Thin wrapper: all cases share the same base URL.
        return _is_valid_url(url, self.BASE, list(include), list(exclude))

    def test_url_under_base(self):
        self.assertTrue(self._valid("https://docs.example.com/guide"))

    def test_url_not_under_base(self):
        self.assertFalse(self._valid("https://other.com/guide"))

    def test_include_pattern_match(self):
        self.assertTrue(
            self._valid("https://docs.example.com/docs/en/guide", include=["/docs/en/"])
        )

    def test_include_pattern_no_match(self):
        self.assertFalse(
            self._valid("https://docs.example.com/blog/post", include=["/docs/en/"])
        )

    def test_exclude_pattern(self):
        self.assertFalse(
            self._valid("https://docs.example.com/docs/en/changelog", exclude=["/changelog"])
        )

    def test_include_and_exclude(self):
        # Matches include but also matches exclude -> rejected
        self.assertFalse(
            self._valid(
                "https://docs.example.com/docs/en/changelog",
                include=["/docs/en/"],
                exclude=["/changelog"],
            )
        )

    def test_no_patterns_all_valid(self):
        self.assertTrue(self._valid("https://docs.example.com/anything"))
# ---------------------------------------------------------------------------
# _get_doc_source / _set_start_urls
# ---------------------------------------------------------------------------
class TestConfigHelpers(unittest.TestCase):
    """Config extraction/update for both unified and legacy formats."""

    def test_unified_format(self):
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://docs.example.com/"},
                {"type": "github", "repo": "owner/repo"},
            ],
        }
        found = _get_doc_source(cfg)
        self.assertIsNotNone(found)
        self.assertEqual(found["base_url"], "https://docs.example.com/")

    def test_unified_format_second_source(self):
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://first.com/"},
                {"type": "documentation", "base_url": "https://second.com/"},
            ],
        }
        found = _get_doc_source(cfg, source_index=1)
        self.assertEqual(found["base_url"], "https://second.com/")

    def test_unified_format_invalid_index(self):
        # Only a github source present: no documentation source to return.
        cfg = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
        self.assertIsNone(_get_doc_source(cfg))

    def test_legacy_flat_format(self):
        found = _get_doc_source({"name": "test", "base_url": "https://docs.example.com/"})
        self.assertEqual(found["base_url"], "https://docs.example.com/")

    def test_no_source_found(self):
        self.assertIsNone(_get_doc_source({"name": "test"}))

    def test_set_start_urls_unified(self):
        cfg = {
            "sources": [
                {"type": "documentation", "base_url": "https://x.com/", "start_urls": []},
            ]
        }
        _set_start_urls(cfg, 0, ["https://x.com/a", "https://x.com/b"])
        self.assertEqual(
            cfg["sources"][0]["start_urls"],
            ["https://x.com/a", "https://x.com/b"],
        )

    def test_set_start_urls_legacy(self):
        cfg = {"base_url": "https://x.com/", "start_urls": []}
        _set_start_urls(cfg, 0, ["https://x.com/new"])
        self.assertEqual(cfg["start_urls"], ["https://x.com/new"])
# ---------------------------------------------------------------------------
# discover_urls (with mocked HTTP)
# ---------------------------------------------------------------------------
class TestDiscoverUrls(unittest.TestCase):
    """Test BFS link discovery with mocked HTTP responses."""

    def _make_html(self, links: list[str]) -> str:
        # Minimal HTML page whose only content is one anchor per link.
        hrefs = "".join(f'<a href="{u}">link</a>' for u in links)
        return f"<html><body>{hrefs}</body></html>"

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_basic_discovery(self, mock_get):
        """Discover links from a single seed page."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/page-a",
                "https://docs.example.com/page-b",
                "https://other.com/external",  # should be filtered out
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/", result)
        self.assertIn("https://docs.example.com/page-a", result)
        self.assertIn("https://docs.example.com/page-b", result)
        self.assertNotIn("https://other.com/external", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_depth_limiting(self, mock_get):
        """URLs at depth > limit should be discovered but not followed."""
        # Seed returns one link; the side_effect order matters: first call
        # fetches the seed, the second fetches the child page.
        seed_html = self._make_html(["https://docs.example.com/child"])
        child_html = self._make_html(["https://docs.example.com/grandchild"])
        mock_get.side_effect = [
            MagicMock(content=seed_html.encode(), raise_for_status=MagicMock()),
            MagicMock(content=child_html.encode(), raise_for_status=MagicMock()),
        ]
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,  # Only follow seed page links, not child page links
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/child", result)
        # grandchild is at depth 2, which exceeds depth=1
        self.assertNotIn("https://docs.example.com/grandchild", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_max_pages_limit(self, mock_get):
        """Stop after max_pages."""
        links = [f"https://docs.example.com/page-{i}" for i in range(20)]
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(links).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            max_pages=5,
            rate_limit=0,
        )
        self.assertLessEqual(len(result), 5)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_include_exclude_patterns(self, mock_get):
        """Include/exclude patterns are respected."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/docs/en/guide",
                "https://docs.example.com/docs/fr/guide",
                "https://docs.example.com/blog/post",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/docs/en/overview"],
            include_patterns=["/docs/en/"],
            exclude_patterns=["/blog/"],
            depth=1,
            rate_limit=0,
        )
        self.assertIn("https://docs.example.com/docs/en/guide", result)
        self.assertNotIn("https://docs.example.com/docs/fr/guide", result)
        self.assertNotIn("https://docs.example.com/blog/post", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_http_error_handled_gracefully(self, mock_get):
        """HTTP errors should not crash the discovery."""
        mock_get.side_effect = ConnectionError("Network error")
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        # URLs that fail to fetch are NOT added to discovered (they may
        # have been removed from the live site).
        self.assertEqual(result, set())

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_fragments_stripped(self, mock_get):
        """URL fragments (#anchor) should be stripped."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/guide#section1",
                "https://docs.example.com/guide#section2",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp
        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )
        # Both anchors should resolve to the same URL
        self.assertIn("https://docs.example.com/guide", result)
# ---------------------------------------------------------------------------
# sync_config (integration with file I/O)
# ---------------------------------------------------------------------------
class TestSyncConfigIntegration(unittest.TestCase):
"""Test the full sync_config workflow with mocked HTTP."""
def _write_config(self, config: dict) -> Path:
tmp = tempfile.mktemp(suffix=".json") # noqa: SIM115
with open(tmp, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
return Path(tmp)
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_dry_run_does_not_modify_file(self, mock_discover):
mock_discover.return_value = {
"https://docs.example.com/a",
"https://docs.example.com/b",
"https://docs.example.com/c",
}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a"],
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=False)
self.assertFalse(result["applied"])
self.assertEqual(len(result["added"]), 2)
# File should not be modified
with open(path, encoding="utf-8") as f:
saved = json.load(f)
self.assertEqual(len(saved["sources"][0]["start_urls"]), 1)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_apply_writes_updated_urls(self, mock_discover):
mock_discover.return_value = {
"https://docs.example.com/a",
"https://docs.example.com/b",
}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertTrue(result["applied"])
self.assertEqual(result["added"], ["https://docs.example.com/b"])
self.assertEqual(result["removed"], ["https://docs.example.com/old"])
# File should be updated
with open(path, encoding="utf-8") as f:
saved = json.load(f)
urls = saved["sources"][0]["start_urls"]
self.assertIn("https://docs.example.com/a", urls)
self.assertIn("https://docs.example.com/b", urls)
self.assertNotIn("https://docs.example.com/old", urls)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_no_changes_does_not_write(self, mock_discover):
urls = ["https://docs.example.com/a", "https://docs.example.com/b"]
mock_discover.return_value = set(urls)
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": urls,
}
],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertFalse(result["applied"])
self.assertEqual(result["added"], [])
self.assertEqual(result["removed"], [])
path.unlink()
def test_missing_source_returns_error(self):
config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
path = self._write_config(config)
result = sync_config(str(path))
self.assertIn("error", result)
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_legacy_config_format(self, mock_discover):
mock_discover.return_value = {"https://docs.example.com/a"}
config = {
"name": "test",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
}
path = self._write_config(config)
result = sync_config(str(path), apply=True)
self.assertTrue(result["applied"])
self.assertEqual(result["removed"], ["https://docs.example.com/old"])
with open(path, encoding="utf-8") as f:
saved = json.load(f)
self.assertEqual(saved["start_urls"], ["https://docs.example.com/a"])
path.unlink()
@patch("skill_seekers.cli.sync_config.discover_urls")
def test_nav_seed_urls_used_over_start_urls(self, mock_discover):
"""When nav_seed_urls is present, it should be used as the seed."""
mock_discover.return_value = {"https://docs.example.com/a"}
config = {
"name": "test",
"sources": [
{
"type": "documentation",
"base_url": "https://docs.example.com/",
"start_urls": ["https://docs.example.com/a"],
"nav_seed_urls": [
"https://docs.example.com/nav1",
"https://docs.example.com/nav2",
],
}
],
}
path = self._write_config(config)
sync_config(str(path))
# Verify discover_urls was called with nav_seed_urls
call_kwargs = mock_discover.call_args[1]
self.assertEqual(
call_kwargs["seed_urls"],
["https://docs.example.com/nav1", "https://docs.example.com/nav2"],
)
path.unlink()
# ---------------------------------------------------------------------------
# CLI argument parsing
# ---------------------------------------------------------------------------
class TestSyncConfigCLI(unittest.TestCase):
    """Test CLI argument parsing and subcommand registration."""

    @staticmethod
    def _parse_args(argv):
        """Build a fresh parser with the sync-config flags and parse *argv*."""
        import argparse

        from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

        parser = argparse.ArgumentParser()
        add_sync_config_arguments(parser)
        return parser.parse_args(argv)

    def test_sync_config_parser_registered(self):
        """sync-config should be a registered subcommand."""
        from skill_seekers.cli.parsers import get_parser_names

        self.assertIn("sync-config", get_parser_names())

    def test_sync_config_in_command_modules(self):
        """sync-config should be in COMMAND_MODULES."""
        from skill_seekers.cli.main import COMMAND_MODULES

        self.assertIn("sync-config", COMMAND_MODULES)

    def test_arguments_created(self):
        """Argument parser should accept all expected flags."""
        args = self._parse_args(["--config", "test.json", "--apply", "--depth", "3"])
        self.assertEqual(args.config, "test.json")
        self.assertTrue(args.apply)
        self.assertEqual(args.depth, 3)

    def test_default_values(self):
        """Default values should be sensible."""
        args = self._parse_args(["--config", "test.json"])
        self.assertFalse(args.apply)
        self.assertEqual(args.depth, 2)
        self.assertEqual(args.max_pages, 500)
        self.assertIsNone(args.rate_limit)
        self.assertEqual(args.source_index, 0)
# ---------------------------------------------------------------------------
# MCP tool
# ---------------------------------------------------------------------------
class TestSyncConfigMCPTool(unittest.TestCase):
    """Test MCP tool wrapper."""

    def test_mcp_tool_importable(self):
        """The sync_config MCP tool should be importable."""
        from skill_seekers.mcp.tools import sync_config_impl

        self.assertTrue(callable(sync_config_impl))

    def test_mcp_tool_missing_config_path(self):
        """Missing config_path should return an error."""
        import asyncio

        from skill_seekers.mcp.tools.sync_config_tools import sync_config_tool

        responses = asyncio.run(sync_config_tool({}))
        # At least one returned content item must carry an error message.
        self.assertTrue(any("Error" in item.text for item in responses))
# Support running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,626 @@
#!/usr/bin/env python3
"""End-to-end tests for the sync-config command.
Uses a local HTTP server with realistic multi-page HTML navigation to test
the full pipeline: BFS crawl -> link discovery -> diff -> config update.
Also includes an integration test against a real public docs site.
"""
import json
import subprocess
import sys
import tempfile
import threading
import unittest
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
import pytest
from skill_seekers.cli.sync_config import discover_urls, sync_config
# ---------------------------------------------------------------------------
# Local test HTTP server
# ---------------------------------------------------------------------------
# Simulates a docs site with this navigation structure:
#
# /docs/ (index — links to guide, api, faq)
# /docs/guide (links to guide/install, guide/usage)
# /docs/guide/install (leaf page)
# /docs/guide/usage (leaf page, links back to guide)
# /docs/api (links to api/auth, api/users)
# /docs/api/auth (leaf page)
# /docs/api/users (leaf page)
# /docs/faq (leaf page)
# /blog/post-1 (outside /docs/ — should be excluded)
# Path -> HTML body. _TestHandler serves these verbatim; any other path 404s.
_SITE_PAGES = {
    "/docs/": """<!DOCTYPE html><html><head><title>Docs Home</title></head><body>
<h1>Documentation</h1>
<nav>
<a href="/docs/guide">Guide</a>
<a href="/docs/api">API Reference</a>
<a href="/docs/faq">FAQ</a>
<a href="/blog/post-1">Blog</a>
<a href="https://github.com/example/repo">GitHub</a>
</nav>
</body></html>""",
    "/docs/guide": """<!DOCTYPE html><html><body>
<h1>Guide</h1>
<a href="/docs/guide/install">Installation</a>
<a href="/docs/guide/usage">Usage</a>
<a href="/docs/">Back to docs</a>
</body></html>""",
    "/docs/guide/install": """<!DOCTYPE html><html><body>
<h1>Installation</h1><p>pip install example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/guide/usage": """<!DOCTYPE html><html><body>
<h1>Usage</h1><p>import example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/api": """<!DOCTYPE html><html><body>
<h1>API Reference</h1>
<a href="/docs/api/auth">Authentication</a>
<a href="/docs/api/users">Users</a>
</body></html>""",
    "/docs/api/auth": """<!DOCTYPE html><html><body>
<h1>Authentication</h1><p>Use tokens.</p>
</body></html>""",
    "/docs/api/users": """<!DOCTYPE html><html><body>
<h1>Users API</h1><p>CRUD operations.</p>
</body></html>""",
    "/docs/faq": """<!DOCTYPE html><html><body>
<h1>FAQ</h1><p>Common questions.</p>
</body></html>""",
    "/blog/post-1": """<!DOCTYPE html><html><body>
<h1>Blog Post</h1><p>This is a blog post outside /docs/.</p>
</body></html>""",
}
# All docs pages that should be discovered (excluding /blog/)
# (paths only — tests prepend the server's dynamically-assigned host:port).
_ALL_DOC_URLS_PATHS = {
    "/docs/",
    "/docs/guide",
    "/docs/guide/install",
    "/docs/guide/usage",
    "/docs/api",
    "/docs/api/auth",
    "/docs/api/users",
    "/docs/faq",
}
class _TestHandler(SimpleHTTPRequestHandler):
    """Serve pages from the in-memory _SITE_PAGES dict."""

    def do_GET(self):
        # Drop query string and fragment before looking up the page.
        clean_path = self.path.partition("?")[0].partition("#")[0]
        page = _SITE_PAGES.get(clean_path)
        if page is None:
            self.send_error(404)
        else:
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.end_headers()
            self.wfile.write(page.encode("utf-8"))

    def log_message(self, format, *args):  # noqa: ARG002
        # Keep test output quiet.
        pass
def _start_server() -> tuple[HTTPServer, int]:
    """Start a local HTTP server on a random port. Returns (server, port)."""
    # Port 0 asks the OS for any free port; the real port is read back
    # from server_address after binding.
    httpd = HTTPServer(("127.0.0.1", 0), _TestHandler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd, httpd.server_address[1]
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _write_config(config: dict) -> Path:
"""Write a config dict to a temp JSON file and return its path."""
tmp = tempfile.mktemp(suffix=".json")
with open(tmp, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2)
return Path(tmp)
# ---------------------------------------------------------------------------
# E2E tests using local HTTP server
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigE2E(unittest.TestCase):
    """End-to-end tests using a local HTTP server with realistic HTML.

    One read-only server is started for the whole class, so tests cannot
    interfere with one another. ``rate_limit=0`` throughout keeps the
    crawls instant.
    """
    @classmethod
    def setUpClass(cls):
        # _start_server binds port 0, so the OS assigns a free port.
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"
    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()
    # -- discover_urls --
    def test_discover_finds_all_doc_pages(self):
        """BFS should discover all 8 /docs/ pages from the root."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        # Exact-set comparison: nothing missing, nothing extra.
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        self.assertEqual(discovered, expected)
    def test_discover_excludes_blog(self):
        """Pages outside /docs/ base_url should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        # /blog/post-1 is linked from /docs/ but lies outside base_url.
        blog_url = f"http://127.0.0.1:{self.port}/blog/post-1"
        self.assertNotIn(blog_url, discovered)
    def test_discover_excludes_external(self):
        """External URLs (github.com) should be excluded."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            rate_limit=0,
        )
        self.assertFalse(
            any("github.com" in u for u in discovered),
            "External URLs should not be discovered",
        )
    def test_discover_depth_1_finds_direct_links_only(self):
        """Depth 1 from root should find guide, api, faq but NOT nested pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=1,
            rate_limit=0,
        )
        # Direct children of /docs/
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered)
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered)
        # Nested pages should NOT be present (they're at depth 2)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered)
        self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered)
    def test_discover_with_include_pattern(self):
        """Include pattern should filter results."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            include_patterns=["/api"],
            depth=3,
            rate_limit=0,
        )
        # Only /api/ pages should be discovered
        for url in discovered:
            self.assertIn("/api", url, f"URL {url} does not match include pattern /api")
    def test_discover_with_exclude_pattern(self):
        """Exclude pattern should remove matching pages."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            exclude_patterns=["/faq"],
            depth=3,
            rate_limit=0,
        )
        faq_url = f"http://127.0.0.1:{self.port}/docs/faq"
        self.assertNotIn(faq_url, discovered)
        # Other pages should still be found
        self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
    def test_discover_max_pages_limit(self):
        """max_pages should cap discovery."""
        discovered = discover_urls(
            base_url=self.base_url,
            seed_urls=[self.base_url],
            depth=3,
            max_pages=3,
            rate_limit=0,
        )
        # <= rather than ==: BFS order determines which 3 pages are kept.
        self.assertLessEqual(len(discovered), 3)
    # -- sync_config (full pipeline with file I/O) --
    def test_sync_config_dry_run_detects_new_pages(self):
        """Dry-run should detect pages missing from the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/faq",
                    ],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertFalse(result["applied"])
        self.assertGreater(len(result["added"]), 0, "Should detect new pages")
        # api, api/auth, api/users, guide/install, guide/usage, /docs/ itself
        # should all be in added
        self.assertGreaterEqual(result["total_discovered"], 6)
        # File should NOT be modified
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertEqual(len(saved["sources"][0]["start_urls"]), 2)
        path.unlink()
    def test_sync_config_apply_updates_config(self):
        """--apply should write all discovered URLs to the config."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        # Verify the file was updated
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        saved_urls = saved["sources"][0]["start_urls"]
        self.assertEqual(len(saved_urls), result["total_discovered"])
        # All expected URLs should be present
        expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
        for url in expected:
            self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}")
        path.unlink()
    def test_sync_config_idempotent(self):
        """Running sync twice with --apply should be a no-op the second time."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                }
            ],
        }
        path = _write_config(config)
        # First run: should apply changes
        result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result1["applied"])
        self.assertGreater(len(result1["added"]), 0)
        # Second run: should detect no changes
        result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertFalse(result2["applied"])
        self.assertEqual(result2["added"], [])
        self.assertEqual(result2["removed"], [])
        path.unlink()
    def test_sync_config_detects_removed_pages(self):
        """Pages in config but not discovered should show as removed."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [
                        f"http://127.0.0.1:{self.port}/docs/guide",
                        f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
                    ],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
        self.assertIn(
            f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
            result["removed"],
        )
        path.unlink()
    def test_sync_config_preserves_other_config_fields(self):
        """--apply should only modify start_urls, preserving all other fields."""
        config = {
            "name": "my-skill",
            "description": "Important skill description",
            "version": "1.0.0",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    "selectors": {"main_content": "article", "title": "h1"},
                    "url_patterns": {"include": [], "exclude": []},
                    "rate_limit": 0.5,
                    "max_pages": 100,
                },
                {
                    "type": "github",
                    "repo": "owner/repo",
                },
            ],
        }
        path = _write_config(config)
        sync_config(str(path), apply=True, depth=3, rate_limit=0)
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        # Non-start_urls fields should be untouched
        self.assertEqual(saved["name"], "my-skill")
        self.assertEqual(saved["description"], "Important skill description")
        self.assertEqual(saved["version"], "1.0.0")
        self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article")
        self.assertEqual(saved["sources"][0]["rate_limit"], 0.5)
        self.assertEqual(saved["sources"][1]["type"], "github")
        self.assertEqual(saved["sources"][1]["repo"], "owner/repo")
        # start_urls should be updated
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)
        path.unlink()
    def test_sync_config_with_nav_seed_urls(self):
        """nav_seed_urls should be used as BFS seeds instead of start_urls."""
        config = {
            "name": "test-site",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [],
                    # Only seed from /docs/api — should only discover API pages
                    "nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"],
                }
            ],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=False, depth=1, rate_limit=0)
        # Should discover at least the API seed page
        self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages")
        # All added URLs should be under /docs/
        for url in result["added"]:
            self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}")
        path.unlink()
    def test_sync_config_legacy_format(self):
        """Legacy flat config format should work end-to-end."""
        config = {
            "name": "test-site",
            "base_url": self.base_url,
            "start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
        }
        path = _write_config(config)
        result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
        self.assertTrue(result["applied"])
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["start_urls"]), 1)
        path.unlink()
# ---------------------------------------------------------------------------
# CLI subprocess tests
# ---------------------------------------------------------------------------
@pytest.mark.e2e
class TestSyncConfigCLIE2E(unittest.TestCase):
    """Test the CLI entry point via subprocess.

    Invokes ``python -m skill_seekers.cli.sync_config`` against the same
    local fixture server used by TestSyncConfigE2E.
    """
    @classmethod
    def setUpClass(cls):
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"
    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()
    def test_cli_dry_run(self):
        """CLI dry-run should print diff and exit 0."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    # Only one URL configured — the rest should show as "new"
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    # Seed from root to discover all pages
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = _write_config(config)
        # --rate-limit 0 keeps the subprocess crawl within the timeout.
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # Should mention new pages in the output (logged to stderr)
        combined = result.stderr.lower() + result.stdout.lower()
        self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}")
        path.unlink()
    def test_cli_apply(self):
        """CLI --apply should update the config file."""
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": [f"http://127.0.0.1:{self.port}/docs/faq"],
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = _write_config(config)
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                str(path),
                "--apply",
                "--depth",
                "3",
                "--rate-limit",
                "0",
            ],
            capture_output=True,
            text=True,
            timeout=30,
        )
        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # The config on disk must now contain the discovered URLs.
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)
        path.unlink()
    def test_cli_help(self):
        """CLI --help should print usage and exit 0."""
        result = subprocess.run(
            [sys.executable, "-m", "skill_seekers.cli.sync_config", "--help"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertEqual(result.returncode, 0)
        # Usage text must advertise the main flags.
        self.assertIn("sync", result.stdout.lower())
        self.assertIn("--config", result.stdout)
        self.assertIn("--apply", result.stdout)
        self.assertIn("--depth", result.stdout)
    def test_cli_missing_config_exits_nonzero(self):
        """CLI with a non-existent config should fail."""
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "skill_seekers.cli.sync_config",
                "--config",
                "/nonexistent/path/config.json",
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        self.assertNotEqual(result.returncode, 0)
# ---------------------------------------------------------------------------
# Integration test against real public site
# ---------------------------------------------------------------------------
@pytest.mark.integration
class TestSyncConfigRealSite(unittest.TestCase):
    """Integration test against a real public docs site.
    Skipped by default (use ``-m integration`` to run).
    Crawls a small corner of the Python standard-library documentation
    (docs.python.org), which is stable and well-structured.
    """
    def test_discover_urls_real_http(self):
        """discover_urls should work against a real HTTP server."""
        # max_pages=10 and rate_limit=0.5 keep the crawl small and polite.
        discovered = discover_urls(
            base_url="https://docs.python.org/3/library/",
            seed_urls=["https://docs.python.org/3/library/functions.html"],
            depth=1,
            max_pages=10,
            rate_limit=0.5,
        )
        # Should find at least the seed page itself
        self.assertGreater(len(discovered), 0)
        # All discovered URLs should be under the base
        for url in discovered:
            self.assertTrue(
                url.startswith("https://docs.python.org/3/library/"),
                f"Discovered URL outside base: {url}",
            )
# Support running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()