feat: add sync-config command to detect and update config start_urls (#306)
## Summary Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation, diffs discovered URLs against a config's start_urls, and optionally writes the updated list back with --apply. - BFS link discovery with configurable depth (default 2), max-pages, rate-limit - Respects url_patterns.include/exclude from config - Supports optional nav_seed_urls config field - Handles both unified (sources array) and legacy flat config formats - MCP tool sync_config included - 57 tests (39 unit + 18 E2E with local HTTP server) - Fixed CI: renamed summary job to "Tests" to match branch protection rule Closes #306
This commit is contained in:
@@ -244,6 +244,7 @@ skill-seekers-update = "skill_seekers.cli.incremental_updater:main"
|
|||||||
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main"
|
skill-seekers-multilang = "skill_seekers.cli.multilang_support:main"
|
||||||
skill-seekers-quality = "skill_seekers.cli.quality_metrics:main"
|
skill-seekers-quality = "skill_seekers.cli.quality_metrics:main"
|
||||||
skill-seekers-workflows = "skill_seekers.cli.workflows_command:main"
|
skill-seekers-workflows = "skill_seekers.cli.workflows_command:main"
|
||||||
|
skill-seekers-sync-config = "skill_seekers.cli.sync_config:main"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
package-dir = {"" = "src"}
|
package-dir = {"" = "src"}
|
||||||
|
|||||||
64
src/skill_seekers/cli/arguments/sync_config.py
Normal file
64
src/skill_seekers/cli/arguments/sync_config.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Sync-config command argument definitions.
|
||||||
|
|
||||||
|
Shared between sync_config.py (standalone) and parsers/sync_config_parser.py
|
||||||
|
(unified CLI) so the two entry points never drift out of sync.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def add_sync_config_arguments(parser: argparse.ArgumentParser) -> None:
|
||||||
|
"""Add all sync-config arguments to *parser*."""
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--config",
|
||||||
|
"-c",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Path to the config JSON file to sync",
|
||||||
|
metavar="FILE",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--apply",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Write updated start_urls back to the config file (default: dry-run)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--depth",
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help="BFS crawl depth from seed pages (default: 2)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-pages",
|
||||||
|
type=int,
|
||||||
|
default=500,
|
||||||
|
help="Maximum pages to discover (default: 500)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--rate-limit",
|
||||||
|
type=float,
|
||||||
|
default=None,
|
||||||
|
help="Override config rate-limit (seconds between requests)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--source-index",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Index of the documentation source to sync (default: 0)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--verbose",
|
||||||
|
"-v",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Verbose output",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--quiet",
|
||||||
|
"-q",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Suppress informational output",
|
||||||
|
)
|
||||||
@@ -67,6 +67,7 @@ COMMAND_MODULES = {
|
|||||||
"multilang": "skill_seekers.cli.multilang_support",
|
"multilang": "skill_seekers.cli.multilang_support",
|
||||||
"quality": "skill_seekers.cli.quality_metrics",
|
"quality": "skill_seekers.cli.quality_metrics",
|
||||||
"workflows": "skill_seekers.cli.workflows_command",
|
"workflows": "skill_seekers.cli.workflows_command",
|
||||||
|
"sync-config": "skill_seekers.cli.sync_config",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from .update_parser import UpdateParser
|
|||||||
from .multilang_parser import MultilangParser
|
from .multilang_parser import MultilangParser
|
||||||
from .quality_parser import QualityParser
|
from .quality_parser import QualityParser
|
||||||
from .workflows_parser import WorkflowsParser
|
from .workflows_parser import WorkflowsParser
|
||||||
|
from .sync_config_parser import SyncConfigParser
|
||||||
|
|
||||||
# Registry of all parsers (in order of usage frequency)
|
# Registry of all parsers (in order of usage frequency)
|
||||||
PARSERS = [
|
PARSERS = [
|
||||||
@@ -56,6 +57,7 @@ PARSERS = [
|
|||||||
MultilangParser(),
|
MultilangParser(),
|
||||||
QualityParser(),
|
QualityParser(),
|
||||||
WorkflowsParser(),
|
WorkflowsParser(),
|
||||||
|
SyncConfigParser(),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
30
src/skill_seekers/cli/parsers/sync_config_parser.py
Normal file
30
src/skill_seekers/cli/parsers/sync_config_parser.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
"""Parser for the sync-config subcommand."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from .base import SubcommandParser
|
||||||
|
|
||||||
|
|
||||||
|
class SyncConfigParser(SubcommandParser):
|
||||||
|
"""Subcommand parser for ``skill-seekers sync-config``."""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def name(self) -> str:
|
||||||
|
return "sync-config"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def help(self) -> str:
|
||||||
|
return "Diff/update a config's start_urls against the live docs site"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def description(self) -> str:
|
||||||
|
return (
|
||||||
|
"Crawl navigation links from a docs site, compare them against "
|
||||||
|
"the config's start_urls, and optionally write the updated list "
|
||||||
|
"back with --apply."
|
||||||
|
)
|
||||||
|
|
||||||
|
def add_arguments(self, parser: argparse.ArgumentParser) -> None:
|
||||||
|
from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments
|
||||||
|
|
||||||
|
add_sync_config_arguments(parser)
|
||||||
325
src/skill_seekers/cli/sync_config.py
Normal file
325
src/skill_seekers/cli/sync_config.py
Normal file
@@ -0,0 +1,325 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Sync a config file's start_urls against what's currently live on a docs site.
|
||||||
|
|
||||||
|
Crawls navigation links from seed pages, diffs them against the config's
|
||||||
|
``start_urls``, and optionally writes the updated list back.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
skill-seekers sync-config --config configs/claude-code.json
|
||||||
|
skill-seekers sync-config --config configs/claude-code.json --apply
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from skill_seekers.cli.utils import sanitize_url, setup_logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# URL filtering (mirrors DocToSkillConverter.is_valid_url logic)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _is_valid_url(
|
||||||
|
url: str,
|
||||||
|
base_url: str,
|
||||||
|
include_patterns: list[str],
|
||||||
|
exclude_patterns: list[str],
|
||||||
|
) -> bool:
|
||||||
|
"""Return True if *url* passes include/exclude pattern filters."""
|
||||||
|
if not url.startswith(base_url):
|
||||||
|
return False
|
||||||
|
if include_patterns and not any(p in url for p in include_patterns):
|
||||||
|
return False
|
||||||
|
return not any(p in url for p in exclude_patterns)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Lightweight BFS link discovery
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def discover_urls(
|
||||||
|
base_url: str,
|
||||||
|
seed_urls: list[str],
|
||||||
|
include_patterns: list[str] | None = None,
|
||||||
|
exclude_patterns: list[str] | None = None,
|
||||||
|
depth: int = 2,
|
||||||
|
max_pages: int = 500,
|
||||||
|
rate_limit: float = 0.5,
|
||||||
|
) -> set[str]:
|
||||||
|
"""BFS-crawl *seed_urls* and return all discovered internal URLs.
|
||||||
|
|
||||||
|
Only follows ``<a href>`` links on HTML pages; does not download
|
||||||
|
full page content. Applies the same include/exclude filtering as
|
||||||
|
:class:`DocToSkillConverter`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_url: Only URLs under this prefix are accepted.
|
||||||
|
seed_urls: Starting points for the BFS.
|
||||||
|
include_patterns: Substring patterns a URL must contain (any).
|
||||||
|
exclude_patterns: Substring patterns that disqualify a URL.
|
||||||
|
depth: Maximum number of BFS hops from the seed pages.
|
||||||
|
max_pages: Stop after discovering this many unique URLs.
|
||||||
|
rate_limit: Seconds to wait between HTTP requests.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Set of discovered absolute URLs (fragments stripped).
|
||||||
|
"""
|
||||||
|
includes = include_patterns or []
|
||||||
|
excludes = exclude_patterns or []
|
||||||
|
|
||||||
|
visited: set[str] = set()
|
||||||
|
# Queue entries are (url, current_depth)
|
||||||
|
queue: deque[tuple[str, int]] = deque()
|
||||||
|
for u in seed_urls:
|
||||||
|
u = sanitize_url(u)
|
||||||
|
queue.append((u, 0))
|
||||||
|
|
||||||
|
discovered: set[str] = set()
|
||||||
|
|
||||||
|
while queue and len(discovered) < max_pages:
|
||||||
|
url, cur_depth = queue.popleft()
|
||||||
|
if url in visited:
|
||||||
|
continue
|
||||||
|
visited.add(url)
|
||||||
|
|
||||||
|
if not _is_valid_url(url, base_url, includes, excludes):
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.debug(" [depth %d] %s", cur_depth, url)
|
||||||
|
|
||||||
|
try:
|
||||||
|
headers = {"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"}
|
||||||
|
resp = requests.get(url, headers=headers, timeout=15)
|
||||||
|
resp.raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(" Could not fetch %s: %s", url, e)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Only mark as "discovered" after a successful fetch — 404s and
|
||||||
|
# other errors mean the page no longer exists on the live site.
|
||||||
|
discovered.add(url)
|
||||||
|
|
||||||
|
# Follow links if we haven't hit the depth limit
|
||||||
|
if cur_depth < depth:
|
||||||
|
soup = BeautifulSoup(resp.content, "html.parser")
|
||||||
|
for link in soup.find_all("a", href=True):
|
||||||
|
href = urljoin(url, link["href"])
|
||||||
|
href = href.split("#")[0] # strip fragment
|
||||||
|
href = sanitize_url(href)
|
||||||
|
if href not in visited and _is_valid_url(href, base_url, includes, excludes):
|
||||||
|
queue.append((href, cur_depth + 1))
|
||||||
|
|
||||||
|
if rate_limit > 0:
|
||||||
|
time.sleep(rate_limit)
|
||||||
|
|
||||||
|
return discovered
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Diff logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]:
|
||||||
|
"""Compare *discovered* URLs against a *configured* list.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
``(added, removed)`` — both sorted lists of URLs.
|
||||||
|
"""
|
||||||
|
configured_set = set(configured)
|
||||||
|
added = sorted(discovered - configured_set)
|
||||||
|
removed = sorted(configured_set - discovered)
|
||||||
|
return added, removed
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Config helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _get_doc_source(config: dict, source_index: int = 0) -> dict | None:
|
||||||
|
"""Extract the documentation source dict from *config*.
|
||||||
|
|
||||||
|
Handles both the unified format (``sources`` array) and legacy flat
|
||||||
|
format (fields at the top level).
|
||||||
|
"""
|
||||||
|
sources = config.get("sources")
|
||||||
|
if sources:
|
||||||
|
doc_sources = [s for s in sources if s.get("type") == "documentation"]
|
||||||
|
if source_index < len(doc_sources):
|
||||||
|
return doc_sources[source_index]
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Legacy flat format — treat the whole config as a single source
|
||||||
|
if config.get("base_url"):
|
||||||
|
return config
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None:
|
||||||
|
"""Write *urls* into the correct ``start_urls`` field in *config*."""
|
||||||
|
sources = config.get("sources")
|
||||||
|
if sources:
|
||||||
|
doc_sources = [s for s in sources if s.get("type") == "documentation"]
|
||||||
|
if source_index < len(doc_sources):
|
||||||
|
doc_sources[source_index]["start_urls"] = urls
|
||||||
|
return
|
||||||
|
# Legacy flat format
|
||||||
|
config["start_urls"] = urls
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main orchestrator
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def sync_config(
|
||||||
|
config_path: str,
|
||||||
|
apply: bool = False,
|
||||||
|
depth: int = 2,
|
||||||
|
max_pages: int = 500,
|
||||||
|
rate_limit: float | None = None,
|
||||||
|
source_index: int = 0,
|
||||||
|
) -> dict:
|
||||||
|
"""Run the sync-config workflow.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with keys ``added``, ``removed``, ``total_discovered``,
|
||||||
|
``total_configured``, ``applied``.
|
||||||
|
"""
|
||||||
|
# Load config
|
||||||
|
with open(config_path, encoding="utf-8") as f:
|
||||||
|
config = json.load(f)
|
||||||
|
|
||||||
|
source = _get_doc_source(config, source_index)
|
||||||
|
if source is None:
|
||||||
|
logger.error("No documentation source found at index %d in %s", source_index, config_path)
|
||||||
|
return {
|
||||||
|
"added": [],
|
||||||
|
"removed": [],
|
||||||
|
"total_discovered": 0,
|
||||||
|
"total_configured": 0,
|
||||||
|
"applied": False,
|
||||||
|
"error": "No documentation source found",
|
||||||
|
}
|
||||||
|
|
||||||
|
base_url: str = source["base_url"]
|
||||||
|
configured_urls: list[str] = source.get("start_urls") or []
|
||||||
|
seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url]
|
||||||
|
url_patterns = source.get("url_patterns", {})
|
||||||
|
includes: list[str] = url_patterns.get("include", [])
|
||||||
|
excludes: list[str] = url_patterns.get("exclude", [])
|
||||||
|
effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5)
|
||||||
|
|
||||||
|
logger.info("Syncing config: %s", config_path)
|
||||||
|
logger.info(" Base URL: %s", base_url)
|
||||||
|
logger.info(" Seed URLs: %d", len(seed_urls))
|
||||||
|
logger.info(" Configured: %d start_urls", len(configured_urls))
|
||||||
|
logger.info(" Depth: %d", depth)
|
||||||
|
logger.info(" Rate limit: %.1fs", effective_rate)
|
||||||
|
logger.info("")
|
||||||
|
|
||||||
|
# Discover
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=base_url,
|
||||||
|
seed_urls=seed_urls,
|
||||||
|
include_patterns=includes,
|
||||||
|
exclude_patterns=excludes,
|
||||||
|
depth=depth,
|
||||||
|
max_pages=max_pages,
|
||||||
|
rate_limit=effective_rate,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Diff
|
||||||
|
added, removed = diff_urls(discovered, configured_urls)
|
||||||
|
|
||||||
|
# Report
|
||||||
|
if added:
|
||||||
|
logger.info("New pages (%d):", len(added))
|
||||||
|
for url in added:
|
||||||
|
path = url.replace(base_url, "/")
|
||||||
|
logger.info(" + %s", path)
|
||||||
|
if removed:
|
||||||
|
logger.info("Removed pages (%d):", len(removed))
|
||||||
|
for url in removed:
|
||||||
|
path = url.replace(base_url, "/")
|
||||||
|
logger.info(" - %s", path)
|
||||||
|
|
||||||
|
if not added and not removed:
|
||||||
|
logger.info("Config is up to date. No changes detected.")
|
||||||
|
else:
|
||||||
|
logger.info("")
|
||||||
|
logger.info(
|
||||||
|
"Summary: %d new, %d removed (discovered %d total, configured %d)",
|
||||||
|
len(added),
|
||||||
|
len(removed),
|
||||||
|
len(discovered),
|
||||||
|
len(configured_urls),
|
||||||
|
)
|
||||||
|
|
||||||
|
applied = False
|
||||||
|
if apply and (added or removed):
|
||||||
|
new_urls = sorted(discovered)
|
||||||
|
_set_start_urls(config, source_index, new_urls)
|
||||||
|
with open(config_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(config, f, indent=2, ensure_ascii=False)
|
||||||
|
f.write("\n")
|
||||||
|
logger.info("Updated %s (%d start_urls)", config_path, len(new_urls))
|
||||||
|
applied = True
|
||||||
|
elif added or removed:
|
||||||
|
logger.info("Run with --apply to update %s", config_path)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"added": added,
|
||||||
|
"removed": removed,
|
||||||
|
"total_discovered": len(discovered),
|
||||||
|
"total_configured": len(configured_urls),
|
||||||
|
"applied": applied,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
"""CLI entry point for ``skill-seekers sync-config``."""
|
||||||
|
from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
prog="skill-seekers-sync-config",
|
||||||
|
description="Sync a config's start_urls against what's live on the docs site.",
|
||||||
|
)
|
||||||
|
add_sync_config_arguments(parser)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
setup_logging(verbose=args.verbose, quiet=args.quiet)
|
||||||
|
|
||||||
|
result = sync_config(
|
||||||
|
config_path=args.config,
|
||||||
|
apply=args.apply,
|
||||||
|
depth=args.depth,
|
||||||
|
max_pages=args.max_pages,
|
||||||
|
rate_limit=args.rate_limit,
|
||||||
|
source_index=args.source_index,
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.get("error"):
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -103,6 +103,8 @@ try:
|
|||||||
# Splitting tools
|
# Splitting tools
|
||||||
split_config_impl,
|
split_config_impl,
|
||||||
submit_config_impl,
|
submit_config_impl,
|
||||||
|
# Sync config tools
|
||||||
|
sync_config_impl,
|
||||||
upload_skill_impl,
|
upload_skill_impl,
|
||||||
validate_config_impl,
|
validate_config_impl,
|
||||||
# Workflow tools
|
# Workflow tools
|
||||||
@@ -144,6 +146,7 @@ except ImportError:
|
|||||||
scrape_video_impl,
|
scrape_video_impl,
|
||||||
split_config_impl,
|
split_config_impl,
|
||||||
submit_config_impl,
|
submit_config_impl,
|
||||||
|
sync_config_impl,
|
||||||
upload_skill_impl,
|
upload_skill_impl,
|
||||||
validate_config_impl,
|
validate_config_impl,
|
||||||
list_workflows_impl,
|
list_workflows_impl,
|
||||||
@@ -251,6 +254,52 @@ async def validate_config(config_path: str) -> str:
|
|||||||
return str(result)
|
return str(result)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# SYNC CONFIG TOOLS (1 tool)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@safe_tool_decorator(description="Sync a config's start_urls against what's live on the docs site.")
|
||||||
|
async def sync_config(
|
||||||
|
config_path: str,
|
||||||
|
apply: bool = False,
|
||||||
|
depth: int = 2,
|
||||||
|
max_pages: int = 500,
|
||||||
|
rate_limit: float | None = None,
|
||||||
|
source_index: int = 0,
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Sync a config file's start_urls against the live docs site.
|
||||||
|
|
||||||
|
Crawls seed/nav pages, discovers internal links, and diffs against the
|
||||||
|
config's existing start_urls. Optionally writes the update with apply=True.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config_path: Path to the config JSON file.
|
||||||
|
apply: Write changes back to the config file (default: False).
|
||||||
|
depth: BFS crawl depth from seed pages (default: 2).
|
||||||
|
max_pages: Maximum URLs to discover (default: 500).
|
||||||
|
rate_limit: Override config rate limit (seconds between requests).
|
||||||
|
source_index: Index of the documentation source to sync (default: 0).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Report of added/removed URLs.
|
||||||
|
"""
|
||||||
|
result = await sync_config_impl(
|
||||||
|
{
|
||||||
|
"config_path": config_path,
|
||||||
|
"apply": apply,
|
||||||
|
"depth": depth,
|
||||||
|
"max_pages": max_pages,
|
||||||
|
"rate_limit": rate_limit,
|
||||||
|
"source_index": source_index,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if isinstance(result, list) and result:
|
||||||
|
return result[0].text if hasattr(result[0], "text") else str(result[0])
|
||||||
|
return str(result)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# SCRAPING TOOLS (10 tools)
|
# SCRAPING TOOLS (10 tools)
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|||||||
@@ -99,6 +99,9 @@ from .vector_db_tools import (
|
|||||||
from .vector_db_tools import (
|
from .vector_db_tools import (
|
||||||
export_to_weaviate_impl,
|
export_to_weaviate_impl,
|
||||||
)
|
)
|
||||||
|
from .sync_config_tools import (
|
||||||
|
sync_config_tool as sync_config_impl,
|
||||||
|
)
|
||||||
from .workflow_tools import (
|
from .workflow_tools import (
|
||||||
create_workflow_tool as create_workflow_impl,
|
create_workflow_tool as create_workflow_impl,
|
||||||
)
|
)
|
||||||
@@ -151,6 +154,8 @@ __all__ = [
|
|||||||
"export_to_chroma_impl",
|
"export_to_chroma_impl",
|
||||||
"export_to_faiss_impl",
|
"export_to_faiss_impl",
|
||||||
"export_to_qdrant_impl",
|
"export_to_qdrant_impl",
|
||||||
|
# Sync config tools
|
||||||
|
"sync_config_impl",
|
||||||
# Workflow tools
|
# Workflow tools
|
||||||
"list_workflows_impl",
|
"list_workflows_impl",
|
||||||
"get_workflow_impl",
|
"get_workflow_impl",
|
||||||
|
|||||||
85
src/skill_seekers/mcp/tools/sync_config_tools.py
Normal file
85
src/skill_seekers/mcp/tools/sync_config_tools.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""Sync-config MCP tool for Skill Seekers MCP Server.
|
||||||
|
|
||||||
|
Provides the ``sync_config`` tool that diffs a config's start_urls against
|
||||||
|
the live docs site and optionally applies the update.
|
||||||
|
"""
|
||||||
|
|
||||||
|
try:
|
||||||
|
from mcp.types import TextContent
|
||||||
|
except ImportError:
|
||||||
|
|
||||||
|
class TextContent:
|
||||||
|
"""Fallback TextContent for when MCP is not installed."""
|
||||||
|
|
||||||
|
def __init__(self, type: str, text: str):
|
||||||
|
self.type = type
|
||||||
|
self.text = text
|
||||||
|
|
||||||
|
|
||||||
|
async def sync_config_tool(args: dict) -> list[TextContent]:
|
||||||
|
"""Sync a config file's start_urls against what's live on the docs site.
|
||||||
|
|
||||||
|
Crawls seed/nav pages, discovers internal links, diffs against the
|
||||||
|
config's existing ``start_urls``, and optionally writes the update.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
args: Dictionary containing:
|
||||||
|
- config_path (str): Path to the config JSON file.
|
||||||
|
- apply (bool, optional): Write changes back (default: False).
|
||||||
|
- depth (int, optional): BFS crawl depth (default: 2).
|
||||||
|
- max_pages (int, optional): Max URLs to discover (default: 500).
|
||||||
|
- rate_limit (float, optional): Seconds between requests.
|
||||||
|
- source_index (int, optional): Documentation source index (default: 0).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[TextContent]: Report of added/removed URLs, or error message.
|
||||||
|
"""
|
||||||
|
config_path = args.get("config_path", "")
|
||||||
|
if not config_path:
|
||||||
|
return [TextContent(type="text", text="Error: config_path is required")]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from skill_seekers.cli.sync_config import sync_config
|
||||||
|
|
||||||
|
result = sync_config(
|
||||||
|
config_path=config_path,
|
||||||
|
apply=args.get("apply", False),
|
||||||
|
depth=args.get("depth", 2),
|
||||||
|
max_pages=args.get("max_pages", 500),
|
||||||
|
rate_limit=args.get("rate_limit"),
|
||||||
|
source_index=args.get("source_index", 0),
|
||||||
|
)
|
||||||
|
except FileNotFoundError:
|
||||||
|
return [TextContent(type="text", text=f"Error: Config file not found: {config_path}")]
|
||||||
|
except Exception as e:
|
||||||
|
return [TextContent(type="text", text=f"Error syncing config: {e}")]
|
||||||
|
|
||||||
|
if result.get("error"):
|
||||||
|
return [TextContent(type="text", text=f"Error: {result['error']}")]
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
added = result["added"]
|
||||||
|
removed = result["removed"]
|
||||||
|
|
||||||
|
if added:
|
||||||
|
lines.append(f"New pages ({len(added)}):")
|
||||||
|
for url in added:
|
||||||
|
lines.append(f" + {url}")
|
||||||
|
if removed:
|
||||||
|
lines.append(f"Removed pages ({len(removed)}):")
|
||||||
|
for url in removed:
|
||||||
|
lines.append(f" - {url}")
|
||||||
|
if not added and not removed:
|
||||||
|
lines.append("Config is up to date. No changes detected.")
|
||||||
|
else:
|
||||||
|
lines.append(
|
||||||
|
f"\nSummary: {len(added)} new, {len(removed)} removed "
|
||||||
|
f"(discovered {result['total_discovered']}, "
|
||||||
|
f"configured {result['total_configured']})"
|
||||||
|
)
|
||||||
|
if result["applied"]:
|
||||||
|
lines.append(f"Updated {config_path}")
|
||||||
|
else:
|
||||||
|
lines.append(f"Run with apply=true to update {config_path}")
|
||||||
|
|
||||||
|
return [TextContent(type="text", text="\n".join(lines))]
|
||||||
@@ -24,12 +24,12 @@ class TestParserRegistry:
|
|||||||
|
|
||||||
def test_all_parsers_registered(self):
|
def test_all_parsers_registered(self):
|
||||||
"""Test that all parsers are registered."""
|
"""Test that all parsers are registered."""
|
||||||
assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}"
|
assert len(PARSERS) == 24, f"Expected 24 parsers, got {len(PARSERS)}"
|
||||||
|
|
||||||
def test_get_parser_names(self):
|
def test_get_parser_names(self):
|
||||||
"""Test getting list of parser names."""
|
"""Test getting list of parser names."""
|
||||||
names = get_parser_names()
|
names = get_parser_names()
|
||||||
assert len(names) == 23
|
assert len(names) == 24
|
||||||
assert "scrape" in names
|
assert "scrape" in names
|
||||||
assert "github" in names
|
assert "github" in names
|
||||||
assert "package" in names
|
assert "package" in names
|
||||||
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
|
|||||||
assert cmd in names, f"Command '{cmd}' not found in parser registry!"
|
assert cmd in names, f"Command '{cmd}' not found in parser registry!"
|
||||||
|
|
||||||
def test_command_count_matches(self):
|
def test_command_count_matches(self):
|
||||||
"""Test that we have exactly 23 commands (includes create, workflows, word, and video commands)."""
|
"""Test that we have exactly 24 commands (includes create, workflows, word, video, and sync-config commands)."""
|
||||||
assert len(PARSERS) == 23
|
assert len(PARSERS) == 24
|
||||||
assert len(get_parser_names()) == 23
|
assert len(get_parser_names()) == 24
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
590
tests/test_sync_config.py
Normal file
590
tests/test_sync_config.py
Normal file
@@ -0,0 +1,590 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Tests for the sync-config command.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- URL diffing logic
|
||||||
|
- URL filtering (_is_valid_url)
|
||||||
|
- BFS discovery with mocked HTTP responses
|
||||||
|
- Config loading (unified + legacy formats)
|
||||||
|
- --apply writes correct JSON
|
||||||
|
- CLI argument parsing
|
||||||
|
- MCP tool wrapper
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from skill_seekers.cli.sync_config import (
|
||||||
|
_get_doc_source,
|
||||||
|
_is_valid_url,
|
||||||
|
_set_start_urls,
|
||||||
|
diff_urls,
|
||||||
|
discover_urls,
|
||||||
|
sync_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# diff_urls
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestDiffUrls(unittest.TestCase):
|
||||||
|
"""Test the URL diffing logic."""
|
||||||
|
|
||||||
|
def test_no_changes(self):
|
||||||
|
configured = ["https://example.com/a", "https://example.com/b"]
|
||||||
|
discovered = set(configured)
|
||||||
|
added, removed = diff_urls(discovered, configured)
|
||||||
|
self.assertEqual(added, [])
|
||||||
|
self.assertEqual(removed, [])
|
||||||
|
|
||||||
|
def test_added_urls(self):
|
||||||
|
configured = ["https://example.com/a"]
|
||||||
|
discovered = {"https://example.com/a", "https://example.com/b"}
|
||||||
|
added, removed = diff_urls(discovered, configured)
|
||||||
|
self.assertEqual(added, ["https://example.com/b"])
|
||||||
|
self.assertEqual(removed, [])
|
||||||
|
|
||||||
|
def test_removed_urls(self):
|
||||||
|
configured = ["https://example.com/a", "https://example.com/b"]
|
||||||
|
discovered = {"https://example.com/a"}
|
||||||
|
added, removed = diff_urls(discovered, configured)
|
||||||
|
self.assertEqual(added, [])
|
||||||
|
self.assertEqual(removed, ["https://example.com/b"])
|
||||||
|
|
||||||
|
def test_both_added_and_removed(self):
|
||||||
|
configured = ["https://example.com/a", "https://example.com/b"]
|
||||||
|
discovered = {"https://example.com/a", "https://example.com/c"}
|
||||||
|
added, removed = diff_urls(discovered, configured)
|
||||||
|
self.assertEqual(added, ["https://example.com/c"])
|
||||||
|
self.assertEqual(removed, ["https://example.com/b"])
|
||||||
|
|
||||||
|
def test_empty_configured(self):
|
||||||
|
added, removed = diff_urls({"https://example.com/a"}, [])
|
||||||
|
self.assertEqual(added, ["https://example.com/a"])
|
||||||
|
self.assertEqual(removed, [])
|
||||||
|
|
||||||
|
def test_empty_discovered(self):
|
||||||
|
added, removed = diff_urls(set(), ["https://example.com/a"])
|
||||||
|
self.assertEqual(added, [])
|
||||||
|
self.assertEqual(removed, ["https://example.com/a"])
|
||||||
|
|
||||||
|
def test_results_sorted(self):
|
||||||
|
configured = ["https://example.com/z"]
|
||||||
|
discovered = {"https://example.com/b", "https://example.com/a"}
|
||||||
|
added, _ = diff_urls(discovered, configured)
|
||||||
|
self.assertEqual(added, ["https://example.com/a", "https://example.com/b"])
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _is_valid_url
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsValidUrl(unittest.TestCase):
    """Test the URL filtering logic."""

    # Shared base URL used by every case in this class.
    BASE = "https://docs.example.com/"

    def _valid(self, url, include=None, exclude=None):
        """Shorthand: run _is_valid_url against the shared base URL."""
        return _is_valid_url(url, self.BASE, include or [], exclude or [])

    def test_url_under_base(self):
        self.assertTrue(self._valid("https://docs.example.com/guide"))

    def test_url_not_under_base(self):
        self.assertFalse(self._valid("https://other.com/guide"))

    def test_include_pattern_match(self):
        self.assertTrue(
            self._valid("https://docs.example.com/docs/en/guide", include=["/docs/en/"])
        )

    def test_include_pattern_no_match(self):
        self.assertFalse(
            self._valid("https://docs.example.com/blog/post", include=["/docs/en/"])
        )

    def test_exclude_pattern(self):
        self.assertFalse(
            self._valid(
                "https://docs.example.com/docs/en/changelog", exclude=["/changelog"]
            )
        )

    def test_include_and_exclude(self):
        # Matches include but also matches exclude -> rejected
        self.assertFalse(
            self._valid(
                "https://docs.example.com/docs/en/changelog",
                include=["/docs/en/"],
                exclude=["/changelog"],
            )
        )

    def test_no_patterns_all_valid(self):
        self.assertTrue(self._valid("https://docs.example.com/anything"))
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _get_doc_source / _set_start_urls
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestConfigHelpers(unittest.TestCase):
    """Test config extraction for both unified and legacy formats."""

    def test_unified_format(self):
        """The first documentation source is picked from a sources array."""
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://docs.example.com/"},
                {"type": "github", "repo": "owner/repo"},
            ],
        }
        doc_source = _get_doc_source(cfg)
        self.assertIsNotNone(doc_source)
        self.assertEqual(doc_source["base_url"], "https://docs.example.com/")

    def test_unified_format_second_source(self):
        """source_index selects among multiple documentation sources."""
        cfg = {
            "name": "test",
            "sources": [
                {"type": "documentation", "base_url": "https://first.com/"},
                {"type": "documentation", "base_url": "https://second.com/"},
            ],
        }
        picked = _get_doc_source(cfg, source_index=1)
        self.assertEqual(picked["base_url"], "https://second.com/")

    def test_unified_format_invalid_index(self):
        """No documentation source at all -> None."""
        cfg = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
        self.assertIsNone(_get_doc_source(cfg))

    def test_legacy_flat_format(self):
        """A flat config with a top-level base_url is treated as the source."""
        cfg = {"name": "test", "base_url": "https://docs.example.com/"}
        picked = _get_doc_source(cfg)
        self.assertEqual(picked["base_url"], "https://docs.example.com/")

    def test_no_source_found(self):
        self.assertIsNone(_get_doc_source({"name": "test"}))

    def test_set_start_urls_unified(self):
        """_set_start_urls writes into the indexed source of a sources array."""
        cfg = {
            "sources": [
                {"type": "documentation", "base_url": "https://x.com/", "start_urls": []},
            ]
        }
        _set_start_urls(cfg, 0, ["https://x.com/a", "https://x.com/b"])
        self.assertEqual(
            cfg["sources"][0]["start_urls"], ["https://x.com/a", "https://x.com/b"]
        )

    def test_set_start_urls_legacy(self):
        """_set_start_urls writes the top-level list in a legacy flat config."""
        cfg = {"base_url": "https://x.com/", "start_urls": []}
        _set_start_urls(cfg, 0, ["https://x.com/new"])
        self.assertEqual(cfg["start_urls"], ["https://x.com/new"])
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# discover_urls (with mocked HTTP)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestDiscoverUrls(unittest.TestCase):
    """Test BFS link discovery with mocked HTTP responses.

    Every test patches requests.get inside the sync_config module, so no
    network traffic occurs; pages are tiny in-memory HTML documents.
    """

    def _make_html(self, links: list[str]) -> str:
        # Build a minimal page containing one <a> tag per URL.
        hrefs = "".join(f'<a href="{u}">link</a>' for u in links)
        return f"<html><body>{hrefs}</body></html>"

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_basic_discovery(self, mock_get):
        """Discover links from a single seed page."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/page-a",
                "https://docs.example.com/page-b",
                "https://other.com/external",  # should be filtered out
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )

        # Seed itself and both same-host links are kept; the foreign host is
        # rejected by the base-URL filter.
        self.assertIn("https://docs.example.com/", result)
        self.assertIn("https://docs.example.com/page-a", result)
        self.assertIn("https://docs.example.com/page-b", result)
        self.assertNotIn("https://other.com/external", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_depth_limiting(self, mock_get):
        """URLs at depth > limit should be discovered but not followed."""
        # Seed returns one link
        seed_html = self._make_html(["https://docs.example.com/child"])
        child_html = self._make_html(["https://docs.example.com/grandchild"])

        # side_effect order matters: the first fetch serves the seed page,
        # the second (if it happens) serves the child page.
        mock_get.side_effect = [
            MagicMock(content=seed_html.encode(), raise_for_status=MagicMock()),
            MagicMock(content=child_html.encode(), raise_for_status=MagicMock()),
        ]

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,  # Only follow seed page links, not child page links
            rate_limit=0,
        )

        self.assertIn("https://docs.example.com/child", result)
        # grandchild is at depth 2, which exceeds depth=1
        self.assertNotIn("https://docs.example.com/grandchild", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_max_pages_limit(self, mock_get):
        """Stop after max_pages."""
        links = [f"https://docs.example.com/page-{i}" for i in range(20)]
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(links).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            max_pages=5,
            rate_limit=0,
        )

        # 20 links were offered but the crawl must cap at 5 pages total.
        self.assertLessEqual(len(result), 5)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_include_exclude_patterns(self, mock_get):
        """Include/exclude patterns are respected."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/docs/en/guide",
                "https://docs.example.com/docs/fr/guide",
                "https://docs.example.com/blog/post",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/docs/en/overview"],
            include_patterns=["/docs/en/"],
            exclude_patterns=["/blog/"],
            depth=1,
            rate_limit=0,
        )

        # Only the English docs link survives both filters: /docs/fr/ fails
        # the include pattern, /blog/ matches the exclude pattern.
        self.assertIn("https://docs.example.com/docs/en/guide", result)
        self.assertNotIn("https://docs.example.com/docs/fr/guide", result)
        self.assertNotIn("https://docs.example.com/blog/post", result)

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_http_error_handled_gracefully(self, mock_get):
        """HTTP errors should not crash the discovery."""
        mock_get.side_effect = ConnectionError("Network error")

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )

        # URLs that fail to fetch are NOT added to discovered (they may
        # have been removed from the live site).
        self.assertEqual(result, set())

    @patch("skill_seekers.cli.sync_config.requests.get")
    def test_fragments_stripped(self, mock_get):
        """URL fragments (#anchor) should be stripped."""
        mock_resp = MagicMock()
        mock_resp.content = self._make_html(
            [
                "https://docs.example.com/guide#section1",
                "https://docs.example.com/guide#section2",
            ]
        ).encode()
        mock_resp.raise_for_status = MagicMock()
        mock_get.return_value = mock_resp

        result = discover_urls(
            base_url="https://docs.example.com/",
            seed_urls=["https://docs.example.com/"],
            depth=1,
            rate_limit=0,
        )

        # Both anchors should resolve to the same URL
        self.assertIn("https://docs.example.com/guide", result)
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sync_config (integration with file I/O)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestSyncConfigIntegration(unittest.TestCase):
    """Test the full sync_config workflow with mocked HTTP.

    discover_urls is patched throughout, so these tests exercise only the
    load-config -> diff -> (optionally) write-back pipeline and file I/O.
    """

    def _write_config(self, config: dict) -> Path:
        """Write *config* to a unique temp JSON file and return its path.

        Fixed: the original used tempfile.mktemp(), which is deprecated and
        race-prone (another process can claim the name between generation
        and open). NamedTemporaryFile(delete=False) creates the file
        atomically with a unique name; callers remain responsible for
        unlinking it, exactly as before.
        """
        with tempfile.NamedTemporaryFile(
            "w", suffix=".json", encoding="utf-8", delete=False
        ) as f:
            json.dump(config, f, indent=2)
        return Path(f.name)

    @patch("skill_seekers.cli.sync_config.discover_urls")
    def test_dry_run_does_not_modify_file(self, mock_discover):
        """Without apply=True the diff is reported but nothing is written."""
        mock_discover.return_value = {
            "https://docs.example.com/a",
            "https://docs.example.com/b",
            "https://docs.example.com/c",
        }

        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://docs.example.com/",
                    "start_urls": ["https://docs.example.com/a"],
                }
            ],
        }
        path = self._write_config(config)

        result = sync_config(str(path), apply=False)
        self.assertFalse(result["applied"])
        # /b and /c are new relative to the single configured URL.
        self.assertEqual(len(result["added"]), 2)

        # File should not be modified
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertEqual(len(saved["sources"][0]["start_urls"]), 1)
        path.unlink()

    @patch("skill_seekers.cli.sync_config.discover_urls")
    def test_apply_writes_updated_urls(self, mock_discover):
        """apply=True persists additions and drops stale URLs on disk."""
        mock_discover.return_value = {
            "https://docs.example.com/a",
            "https://docs.example.com/b",
        }

        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://docs.example.com/",
                    "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
                }
            ],
        }
        path = self._write_config(config)

        result = sync_config(str(path), apply=True)
        self.assertTrue(result["applied"])
        self.assertEqual(result["added"], ["https://docs.example.com/b"])
        self.assertEqual(result["removed"], ["https://docs.example.com/old"])

        # File should be updated
        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        urls = saved["sources"][0]["start_urls"]
        self.assertIn("https://docs.example.com/a", urls)
        self.assertIn("https://docs.example.com/b", urls)
        self.assertNotIn("https://docs.example.com/old", urls)
        path.unlink()

    @patch("skill_seekers.cli.sync_config.discover_urls")
    def test_no_changes_does_not_write(self, mock_discover):
        """When discovery matches the config exactly, nothing is applied."""
        urls = ["https://docs.example.com/a", "https://docs.example.com/b"]
        mock_discover.return_value = set(urls)

        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://docs.example.com/",
                    "start_urls": urls,
                }
            ],
        }
        path = self._write_config(config)

        result = sync_config(str(path), apply=True)
        self.assertFalse(result["applied"])
        self.assertEqual(result["added"], [])
        self.assertEqual(result["removed"], [])
        path.unlink()

    def test_missing_source_returns_error(self):
        """A config with no documentation source yields an error result."""
        config = {"name": "test", "sources": [{"type": "github", "repo": "o/r"}]}
        path = self._write_config(config)

        result = sync_config(str(path))
        self.assertIn("error", result)
        path.unlink()

    @patch("skill_seekers.cli.sync_config.discover_urls")
    def test_legacy_config_format(self, mock_discover):
        """Legacy flat configs (top-level start_urls) are updated in place."""
        mock_discover.return_value = {"https://docs.example.com/a"}

        config = {
            "name": "test",
            "base_url": "https://docs.example.com/",
            "start_urls": ["https://docs.example.com/a", "https://docs.example.com/old"],
        }
        path = self._write_config(config)

        result = sync_config(str(path), apply=True)
        self.assertTrue(result["applied"])
        self.assertEqual(result["removed"], ["https://docs.example.com/old"])

        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertEqual(saved["start_urls"], ["https://docs.example.com/a"])
        path.unlink()

    @patch("skill_seekers.cli.sync_config.discover_urls")
    def test_nav_seed_urls_used_over_start_urls(self, mock_discover):
        """When nav_seed_urls is present, it should be used as the seed."""
        mock_discover.return_value = {"https://docs.example.com/a"}

        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": "https://docs.example.com/",
                    "start_urls": ["https://docs.example.com/a"],
                    "nav_seed_urls": [
                        "https://docs.example.com/nav1",
                        "https://docs.example.com/nav2",
                    ],
                }
            ],
        }
        path = self._write_config(config)

        sync_config(str(path))

        # Verify discover_urls was called with nav_seed_urls, not start_urls.
        call_kwargs = mock_discover.call_args[1]
        self.assertEqual(
            call_kwargs["seed_urls"],
            ["https://docs.example.com/nav1", "https://docs.example.com/nav2"],
        )
        path.unlink()
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI argument parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestSyncConfigCLI(unittest.TestCase):
    """Test CLI argument parsing and subcommand registration."""

    @staticmethod
    def _build_parser():
        """Construct a fresh ArgumentParser with the sync-config arguments."""
        import argparse

        from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments

        parser = argparse.ArgumentParser()
        add_sync_config_arguments(parser)
        return parser

    def test_sync_config_parser_registered(self):
        """sync-config should be a registered subcommand."""
        from skill_seekers.cli.parsers import get_parser_names

        self.assertIn("sync-config", get_parser_names())

    def test_sync_config_in_command_modules(self):
        """sync-config should be in COMMAND_MODULES."""
        from skill_seekers.cli.main import COMMAND_MODULES

        self.assertIn("sync-config", COMMAND_MODULES)

    def test_arguments_created(self):
        """Argument parser should accept all expected flags."""
        ns = self._build_parser().parse_args(
            ["--config", "test.json", "--apply", "--depth", "3"]
        )
        self.assertEqual(ns.config, "test.json")
        self.assertTrue(ns.apply)
        self.assertEqual(ns.depth, 3)

    def test_default_values(self):
        """Default values should be sensible."""
        ns = self._build_parser().parse_args(["--config", "test.json"])
        self.assertFalse(ns.apply)
        self.assertEqual(ns.depth, 2)
        self.assertEqual(ns.max_pages, 500)
        self.assertIsNone(ns.rate_limit)
        self.assertEqual(ns.source_index, 0)
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# MCP tool
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestSyncConfigMCPTool(unittest.TestCase):
    """Test MCP tool wrapper."""

    def test_mcp_tool_importable(self):
        """The sync_config MCP tool should be importable."""
        from skill_seekers.mcp.tools import sync_config_impl

        self.assertTrue(callable(sync_config_impl))

    def test_mcp_tool_missing_config_path(self):
        """Missing config_path should return an error."""
        import asyncio

        from skill_seekers.mcp.tools.sync_config_tools import sync_config_tool

        responses = asyncio.run(sync_config_tool({}))
        error_seen = any("Error" in item.text for item in responses)
        self.assertTrue(error_seen)
|
||||||
|
|
||||||
|
# Allow running this test module directly (outside pytest):
#   python tests/test_sync_config.py
if __name__ == "__main__":
    unittest.main()
|
||||||
626
tests/test_sync_config_e2e.py
Normal file
626
tests/test_sync_config_e2e.py
Normal file
@@ -0,0 +1,626 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""End-to-end tests for the sync-config command.
|
||||||
|
|
||||||
|
Uses a local HTTP server with realistic multi-page HTML navigation to test
|
||||||
|
the full pipeline: BFS crawl -> link discovery -> diff -> config update.
|
||||||
|
|
||||||
|
Also includes an integration test against a real public docs site.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import threading
|
||||||
|
import unittest
|
||||||
|
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from skill_seekers.cli.sync_config import discover_urls, sync_config
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Local test HTTP server
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Simulates a docs site with this navigation structure:
|
||||||
|
#
|
||||||
|
# /docs/ (index — links to guide, api, faq)
|
||||||
|
# /docs/guide (links to guide/install, guide/usage)
|
||||||
|
# /docs/guide/install (leaf page)
|
||||||
|
# /docs/guide/usage (leaf page, links back to guide)
|
||||||
|
# /docs/api (links to api/auth, api/users)
|
||||||
|
# /docs/api/auth (leaf page)
|
||||||
|
# /docs/api/users (leaf page)
|
||||||
|
# /docs/faq (leaf page)
|
||||||
|
# /blog/post-1 (outside /docs/ — should be excluded)
|
||||||
|
|
||||||
|
# NOTE: these HTML bodies are served verbatim by _TestHandler; keep the
# hrefs in sync with the navigation diagram above.
_SITE_PAGES = {
    "/docs/": """<!DOCTYPE html><html><head><title>Docs Home</title></head><body>
<h1>Documentation</h1>
<nav>
<a href="/docs/guide">Guide</a>
<a href="/docs/api">API Reference</a>
<a href="/docs/faq">FAQ</a>
<a href="/blog/post-1">Blog</a>
<a href="https://github.com/example/repo">GitHub</a>
</nav>
</body></html>""",
    "/docs/guide": """<!DOCTYPE html><html><body>
<h1>Guide</h1>
<a href="/docs/guide/install">Installation</a>
<a href="/docs/guide/usage">Usage</a>
<a href="/docs/">Back to docs</a>
</body></html>""",
    "/docs/guide/install": """<!DOCTYPE html><html><body>
<h1>Installation</h1><p>pip install example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/guide/usage": """<!DOCTYPE html><html><body>
<h1>Usage</h1><p>import example</p>
<a href="/docs/guide">Back to guide</a>
</body></html>""",
    "/docs/api": """<!DOCTYPE html><html><body>
<h1>API Reference</h1>
<a href="/docs/api/auth">Authentication</a>
<a href="/docs/api/users">Users</a>
</body></html>""",
    "/docs/api/auth": """<!DOCTYPE html><html><body>
<h1>Authentication</h1><p>Use tokens.</p>
</body></html>""",
    "/docs/api/users": """<!DOCTYPE html><html><body>
<h1>Users API</h1><p>CRUD operations.</p>
</body></html>""",
    "/docs/faq": """<!DOCTYPE html><html><body>
<h1>FAQ</h1><p>Common questions.</p>
</body></html>""",
    "/blog/post-1": """<!DOCTYPE html><html><body>
<h1>Blog Post</h1><p>This is a blog post outside /docs/.</p>
</body></html>""",
}
|
||||||
|
|
||||||
|
# All docs pages that should be discovered (excluding /blog/)
|
||||||
|
_ALL_DOC_URLS_PATHS = {
|
||||||
|
"/docs/",
|
||||||
|
"/docs/guide",
|
||||||
|
"/docs/guide/install",
|
||||||
|
"/docs/guide/usage",
|
||||||
|
"/docs/api",
|
||||||
|
"/docs/api/auth",
|
||||||
|
"/docs/api/users",
|
||||||
|
"/docs/faq",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class _TestHandler(SimpleHTTPRequestHandler):
    """Serve pages from the in-memory _SITE_PAGES dict."""

    def do_GET(self):
        # Resolve the page by path only — drop any query string or fragment.
        clean_path = self.path.split("?", 1)[0].split("#", 1)[0]
        page = _SITE_PAGES.get(clean_path)
        if page is None:
            self.send_error(404)
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(page.encode("utf-8"))

    def log_message(self, format, *args):  # noqa: ARG002
        # Suppress request logging during tests
        pass
|
|
||||||
|
|
||||||
|
def _start_server() -> tuple[HTTPServer, int]:
    """Start a local HTTP server on a random port. Returns (server, port)."""
    # Port 0 asks the OS for any free port; read the real one back afterwards.
    httpd = HTTPServer(("127.0.0.1", 0), _TestHandler)
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    return httpd, httpd.server_address[1]
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _write_config(config: dict) -> Path:
|
||||||
|
"""Write a config dict to a temp JSON file and return its path."""
|
||||||
|
tmp = tempfile.mktemp(suffix=".json")
|
||||||
|
with open(tmp, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(config, f, indent=2)
|
||||||
|
return Path(tmp)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# E2E tests using local HTTP server
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.e2e
|
||||||
|
class TestSyncConfigE2E(unittest.TestCase):
|
||||||
|
"""End-to-end tests using a local HTTP server with realistic HTML."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUpClass(cls):
|
||||||
|
cls.server, cls.port = _start_server()
|
||||||
|
cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def tearDownClass(cls):
|
||||||
|
cls.server.shutdown()
|
||||||
|
|
||||||
|
# -- discover_urls --
|
||||||
|
|
||||||
|
def test_discover_finds_all_doc_pages(self):
|
||||||
|
"""BFS should discover all 8 /docs/ pages from the root."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
depth=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
|
||||||
|
self.assertEqual(discovered, expected)
|
||||||
|
|
||||||
|
def test_discover_excludes_blog(self):
|
||||||
|
"""Pages outside /docs/ base_url should be excluded."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
depth=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
blog_url = f"http://127.0.0.1:{self.port}/blog/post-1"
|
||||||
|
self.assertNotIn(blog_url, discovered)
|
||||||
|
|
||||||
|
def test_discover_excludes_external(self):
|
||||||
|
"""External URLs (github.com) should be excluded."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
depth=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertFalse(
|
||||||
|
any("github.com" in u for u in discovered),
|
||||||
|
"External URLs should not be discovered",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_discover_depth_1_finds_direct_links_only(self):
|
||||||
|
"""Depth 1 from root should find guide, api, faq but NOT nested pages."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
depth=1,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Direct children of /docs/
|
||||||
|
self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
|
||||||
|
self.assertIn(f"http://127.0.0.1:{self.port}/docs/api", discovered)
|
||||||
|
self.assertIn(f"http://127.0.0.1:{self.port}/docs/faq", discovered)
|
||||||
|
|
||||||
|
# Nested pages should NOT be present (they're at depth 2)
|
||||||
|
self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/guide/install", discovered)
|
||||||
|
self.assertNotIn(f"http://127.0.0.1:{self.port}/docs/api/auth", discovered)
|
||||||
|
|
||||||
|
def test_discover_with_include_pattern(self):
|
||||||
|
"""Include pattern should filter results."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
include_patterns=["/api"],
|
||||||
|
depth=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Only /api/ pages should be discovered
|
||||||
|
for url in discovered:
|
||||||
|
self.assertIn("/api", url, f"URL {url} does not match include pattern /api")
|
||||||
|
|
||||||
|
def test_discover_with_exclude_pattern(self):
|
||||||
|
"""Exclude pattern should remove matching pages."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
exclude_patterns=["/faq"],
|
||||||
|
depth=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
faq_url = f"http://127.0.0.1:{self.port}/docs/faq"
|
||||||
|
self.assertNotIn(faq_url, discovered)
|
||||||
|
# Other pages should still be found
|
||||||
|
self.assertIn(f"http://127.0.0.1:{self.port}/docs/guide", discovered)
|
||||||
|
|
||||||
|
def test_discover_max_pages_limit(self):
|
||||||
|
"""max_pages should cap discovery."""
|
||||||
|
discovered = discover_urls(
|
||||||
|
base_url=self.base_url,
|
||||||
|
seed_urls=[self.base_url],
|
||||||
|
depth=3,
|
||||||
|
max_pages=3,
|
||||||
|
rate_limit=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertLessEqual(len(discovered), 3)
|
||||||
|
|
||||||
|
# -- sync_config (full pipeline with file I/O) --
|
||||||
|
|
||||||
|
def test_sync_config_dry_run_detects_new_pages(self):
|
||||||
|
"""Dry-run should detect pages missing from the config."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [
|
||||||
|
f"http://127.0.0.1:{self.port}/docs/guide",
|
||||||
|
f"http://127.0.0.1:{self.port}/docs/faq",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
|
||||||
|
|
||||||
|
self.assertFalse(result["applied"])
|
||||||
|
self.assertGreater(len(result["added"]), 0, "Should detect new pages")
|
||||||
|
# api, api/auth, api/users, guide/install, guide/usage, /docs/ itself
|
||||||
|
# should all be in added
|
||||||
|
self.assertGreaterEqual(result["total_discovered"], 6)
|
||||||
|
|
||||||
|
# File should NOT be modified
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
saved = json.load(f)
|
||||||
|
self.assertEqual(len(saved["sources"][0]["start_urls"]), 2)
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_apply_updates_config(self):
|
||||||
|
"""--apply should write all discovered URLs to the config."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
|
||||||
|
|
||||||
|
self.assertTrue(result["applied"])
|
||||||
|
|
||||||
|
# Verify the file was updated
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
saved = json.load(f)
|
||||||
|
saved_urls = saved["sources"][0]["start_urls"]
|
||||||
|
self.assertEqual(len(saved_urls), result["total_discovered"])
|
||||||
|
|
||||||
|
# All expected URLs should be present
|
||||||
|
expected = {f"http://127.0.0.1:{self.port}{p}" for p in _ALL_DOC_URLS_PATHS}
|
||||||
|
for url in expected:
|
||||||
|
self.assertIn(url, saved_urls, f"Expected URL missing from saved config: {url}")
|
||||||
|
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_idempotent(self):
|
||||||
|
"""Running sync twice with --apply should be a no-op the second time."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
# First run: should apply changes
|
||||||
|
result1 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
|
||||||
|
self.assertTrue(result1["applied"])
|
||||||
|
self.assertGreater(len(result1["added"]), 0)
|
||||||
|
|
||||||
|
# Second run: should detect no changes
|
||||||
|
result2 = sync_config(str(path), apply=True, depth=3, rate_limit=0)
|
||||||
|
self.assertFalse(result2["applied"])
|
||||||
|
self.assertEqual(result2["added"], [])
|
||||||
|
self.assertEqual(result2["removed"], [])
|
||||||
|
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_detects_removed_pages(self):
|
||||||
|
"""Pages in config but not discovered should show as removed."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [
|
||||||
|
f"http://127.0.0.1:{self.port}/docs/guide",
|
||||||
|
f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
result = sync_config(str(path), apply=False, depth=3, rate_limit=0)
|
||||||
|
|
||||||
|
self.assertIn(
|
||||||
|
f"http://127.0.0.1:{self.port}/docs/old-page-that-no-longer-exists",
|
||||||
|
result["removed"],
|
||||||
|
)
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_preserves_other_config_fields(self):
|
||||||
|
"""--apply should only modify start_urls, preserving all other fields."""
|
||||||
|
config = {
|
||||||
|
"name": "my-skill",
|
||||||
|
"description": "Important skill description",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [],
|
||||||
|
"selectors": {"main_content": "article", "title": "h1"},
|
||||||
|
"url_patterns": {"include": [], "exclude": []},
|
||||||
|
"rate_limit": 0.5,
|
||||||
|
"max_pages": 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"repo": "owner/repo",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
sync_config(str(path), apply=True, depth=3, rate_limit=0)
|
||||||
|
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
saved = json.load(f)
|
||||||
|
|
||||||
|
# Non-start_urls fields should be untouched
|
||||||
|
self.assertEqual(saved["name"], "my-skill")
|
||||||
|
self.assertEqual(saved["description"], "Important skill description")
|
||||||
|
self.assertEqual(saved["version"], "1.0.0")
|
||||||
|
self.assertEqual(saved["sources"][0]["selectors"]["main_content"], "article")
|
||||||
|
self.assertEqual(saved["sources"][0]["rate_limit"], 0.5)
|
||||||
|
self.assertEqual(saved["sources"][1]["type"], "github")
|
||||||
|
self.assertEqual(saved["sources"][1]["repo"], "owner/repo")
|
||||||
|
|
||||||
|
# start_urls should be updated
|
||||||
|
self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)
|
||||||
|
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_with_nav_seed_urls(self):
|
||||||
|
"""nav_seed_urls should be used as BFS seeds instead of start_urls."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"type": "documentation",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [],
|
||||||
|
# Only seed from /docs/api — should only discover API pages
|
||||||
|
"nav_seed_urls": [f"http://127.0.0.1:{self.port}/docs/api"],
|
||||||
|
}
|
||||||
|
],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
result = sync_config(str(path), apply=False, depth=1, rate_limit=0)
|
||||||
|
|
||||||
|
# Should discover at least the API seed page
|
||||||
|
self.assertGreater(len(result["added"]), 0, "nav_seed_urls should discover pages")
|
||||||
|
# All added URLs should be under /docs/
|
||||||
|
for url in result["added"]:
|
||||||
|
self.assertTrue(url.startswith(self.base_url), f"URL outside base: {url}")
|
||||||
|
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_sync_config_legacy_format(self):
|
||||||
|
"""Legacy flat config format should work end-to-end."""
|
||||||
|
config = {
|
||||||
|
"name": "test-site",
|
||||||
|
"base_url": self.base_url,
|
||||||
|
"start_urls": [f"http://127.0.0.1:{self.port}/docs/guide"],
|
||||||
|
}
|
||||||
|
path = _write_config(config)
|
||||||
|
|
||||||
|
result = sync_config(str(path), apply=True, depth=3, rate_limit=0)
|
||||||
|
|
||||||
|
self.assertTrue(result["applied"])
|
||||||
|
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
saved = json.load(f)
|
||||||
|
self.assertGreater(len(saved["start_urls"]), 1)
|
||||||
|
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI subprocess tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.e2e
class TestSyncConfigCLIE2E(unittest.TestCase):
    """Test the CLI entry point via subprocess."""

    @classmethod
    def setUpClass(cls):
        # Shared local HTTP server serving the fixture docs site.
        cls.server, cls.port = _start_server()
        cls.base_url = f"http://127.0.0.1:{cls.port}/docs/"

    @classmethod
    def tearDownClass(cls):
        cls.server.shutdown()

    def _run_cli(self, *args, timeout=30):
        """Run ``python -m skill_seekers.cli.sync_config`` with *args*.

        Returns the completed process with captured text stdout/stderr.
        """
        return subprocess.run(
            [sys.executable, "-m", "skill_seekers.cli.sync_config", *args],
            capture_output=True,
            text=True,
            timeout=timeout,
        )

    def _write_source_config(self, start_urls):
        """Write a unified-format config seeded from the docs root.

        The temp file is registered for cleanup so it is removed even if
        the calling test fails.
        """
        config = {
            "name": "test",
            "sources": [
                {
                    "type": "documentation",
                    "base_url": self.base_url,
                    "start_urls": start_urls,
                    # Seed from root to discover all pages
                    "nav_seed_urls": [self.base_url],
                }
            ],
        }
        path = _write_config(config)
        self.addCleanup(path.unlink)
        return path

    def test_cli_dry_run(self):
        """CLI dry-run should print diff and exit 0."""
        # Only one URL configured — the rest should show as "new"
        path = self._write_source_config([f"http://127.0.0.1:{self.port}/docs/faq"])

        result = self._run_cli("--config", str(path), "--depth", "3", "--rate-limit", "0")

        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")
        # Should mention new pages in the output (logged to stderr)
        combined = result.stderr.lower() + result.stdout.lower()
        self.assertIn("new page", combined, f"Expected 'new page' in output: {combined}")

    def test_cli_apply(self):
        """CLI --apply should update the config file."""
        path = self._write_source_config([f"http://127.0.0.1:{self.port}/docs/faq"])

        result = self._run_cli(
            "--config", str(path), "--apply", "--depth", "3", "--rate-limit", "0"
        )

        self.assertEqual(result.returncode, 0, f"CLI failed: {result.stderr}")

        with open(path, encoding="utf-8") as f:
            saved = json.load(f)
        self.assertGreater(len(saved["sources"][0]["start_urls"]), 0)

    def test_cli_help(self):
        """CLI --help should print usage and exit 0."""
        result = self._run_cli("--help", timeout=10)

        self.assertEqual(result.returncode, 0)
        self.assertIn("sync", result.stdout.lower())
        self.assertIn("--config", result.stdout)
        self.assertIn("--apply", result.stdout)
        self.assertIn("--depth", result.stdout)

    def test_cli_missing_config_exits_nonzero(self):
        """CLI with a non-existent config should fail."""
        result = self._run_cli("--config", "/nonexistent/path/config.json", timeout=10)

        self.assertNotEqual(result.returncode, 0)
# ---------------------------------------------------------------------------
|
||||||
|
# Integration test against real public site
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
class TestSyncConfigRealSite(unittest.TestCase):
    """Integration test against a real public docs site.

    Skipped by default (use ``-m integration`` to run).
    Crawls docs.python.org, a stable, well-structured public site.
    """

    def test_discover_urls_real_http(self):
        """discover_urls should work against a real HTTP server."""
        # Use Python docs — small, stable, well-structured
        discovered = discover_urls(
            base_url="https://docs.python.org/3/library/",
            seed_urls=["https://docs.python.org/3/library/functions.html"],
            depth=1,
            max_pages=10,
            rate_limit=0.5,
        )

        # Should find at least the seed page itself
        self.assertGreater(len(discovered), 0)
        # All discovered URLs should be under the base
        for url in discovered:
            self.assertTrue(
                url.startswith("https://docs.python.org/3/library/"),
                f"Discovered URL outside base: {url}",
            )
# Allow running this test module directly (``python <file>.py``);
# pytest discovers the tests without this guard.
if __name__ == "__main__":
    unittest.main()
Reference in New Issue
Block a user