Files
skill-seekers-reference/src/skill_seekers/cli/sync_config.py
yusyus 83b9a695ba feat: add sync-config command to detect and update config start_urls (#306)
## Summary

Add `skill-seekers sync-config` subcommand that crawls a docs site's navigation,
diffs discovered URLs against a config's start_urls, and optionally writes the
updated list back with --apply.

- BFS link discovery with configurable depth (default 2), max-pages, rate-limit
- Respects url_patterns.include/exclude from config
- Supports optional nav_seed_urls config field
- Handles both unified (sources array) and legacy flat config formats
- MCP tool sync_config included
- 57 tests (39 unit + 18 E2E with local HTTP server)
- Fixed CI: renamed summary job to "Tests" to match branch protection rule

Closes #306
2026-03-15 02:16:32 +03:00

326 lines
10 KiB
Python

#!/usr/bin/env python3
"""Sync a config file's start_urls against what's currently live on a docs site.
Crawls navigation links from seed pages, diffs them against the config's
``start_urls``, and optionally writes the updated list back.
Usage:
skill-seekers sync-config --config configs/claude-code.json
skill-seekers sync-config --config configs/claude-code.json --apply
"""
import argparse
import json
import logging
import sys
import time
from collections import deque
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from skill_seekers.cli.utils import sanitize_url, setup_logging
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# URL filtering (mirrors DocToSkillConverter.is_valid_url logic)
# ---------------------------------------------------------------------------
def _is_valid_url(
url: str,
base_url: str,
include_patterns: list[str],
exclude_patterns: list[str],
) -> bool:
"""Return True if *url* passes include/exclude pattern filters."""
if not url.startswith(base_url):
return False
if include_patterns and not any(p in url for p in include_patterns):
return False
return not any(p in url for p in exclude_patterns)
# ---------------------------------------------------------------------------
# Lightweight BFS link discovery
# ---------------------------------------------------------------------------
def discover_urls(
base_url: str,
seed_urls: list[str],
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
depth: int = 2,
max_pages: int = 500,
rate_limit: float = 0.5,
) -> set[str]:
"""BFS-crawl *seed_urls* and return all discovered internal URLs.
Only follows ``<a href>`` links on HTML pages; does not download
full page content. Applies the same include/exclude filtering as
:class:`DocToSkillConverter`.
Args:
base_url: Only URLs under this prefix are accepted.
seed_urls: Starting points for the BFS.
include_patterns: Substring patterns a URL must contain (any).
exclude_patterns: Substring patterns that disqualify a URL.
depth: Maximum number of BFS hops from the seed pages.
max_pages: Stop after discovering this many unique URLs.
rate_limit: Seconds to wait between HTTP requests.
Returns:
Set of discovered absolute URLs (fragments stripped).
"""
includes = include_patterns or []
excludes = exclude_patterns or []
visited: set[str] = set()
# Queue entries are (url, current_depth)
queue: deque[tuple[str, int]] = deque()
for u in seed_urls:
u = sanitize_url(u)
queue.append((u, 0))
discovered: set[str] = set()
while queue and len(discovered) < max_pages:
url, cur_depth = queue.popleft()
if url in visited:
continue
visited.add(url)
if not _is_valid_url(url, base_url, includes, excludes):
continue
logger.debug(" [depth %d] %s", cur_depth, url)
try:
headers = {"User-Agent": "Mozilla/5.0 (Skill-Seekers sync-config)"}
resp = requests.get(url, headers=headers, timeout=15)
resp.raise_for_status()
except Exception as e:
logger.warning(" Could not fetch %s: %s", url, e)
continue
# Only mark as "discovered" after a successful fetch — 404s and
# other errors mean the page no longer exists on the live site.
discovered.add(url)
# Follow links if we haven't hit the depth limit
if cur_depth < depth:
soup = BeautifulSoup(resp.content, "html.parser")
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
href = href.split("#")[0] # strip fragment
href = sanitize_url(href)
if href not in visited and _is_valid_url(href, base_url, includes, excludes):
queue.append((href, cur_depth + 1))
if rate_limit > 0:
time.sleep(rate_limit)
return discovered
# ---------------------------------------------------------------------------
# Diff logic
# ---------------------------------------------------------------------------
def diff_urls(discovered: set[str], configured: list[str]) -> tuple[list[str], list[str]]:
"""Compare *discovered* URLs against a *configured* list.
Returns:
``(added, removed)`` — both sorted lists of URLs.
"""
configured_set = set(configured)
added = sorted(discovered - configured_set)
removed = sorted(configured_set - discovered)
return added, removed
# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------
def _get_doc_source(config: dict, source_index: int = 0) -> dict | None:
"""Extract the documentation source dict from *config*.
Handles both the unified format (``sources`` array) and legacy flat
format (fields at the top level).
"""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
return doc_sources[source_index]
return None
# Legacy flat format — treat the whole config as a single source
if config.get("base_url"):
return config
return None
def _set_start_urls(config: dict, source_index: int, urls: list[str]) -> None:
"""Write *urls* into the correct ``start_urls`` field in *config*."""
sources = config.get("sources")
if sources:
doc_sources = [s for s in sources if s.get("type") == "documentation"]
if source_index < len(doc_sources):
doc_sources[source_index]["start_urls"] = urls
return
# Legacy flat format
config["start_urls"] = urls
# ---------------------------------------------------------------------------
# Main orchestrator
# ---------------------------------------------------------------------------
def sync_config(
config_path: str,
apply: bool = False,
depth: int = 2,
max_pages: int = 500,
rate_limit: float | None = None,
source_index: int = 0,
) -> dict:
"""Run the sync-config workflow.
Returns:
Dict with keys ``added``, ``removed``, ``total_discovered``,
``total_configured``, ``applied``.
"""
# Load config
with open(config_path, encoding="utf-8") as f:
config = json.load(f)
source = _get_doc_source(config, source_index)
if source is None:
logger.error("No documentation source found at index %d in %s", source_index, config_path)
return {
"added": [],
"removed": [],
"total_discovered": 0,
"total_configured": 0,
"applied": False,
"error": "No documentation source found",
}
base_url: str = source["base_url"]
configured_urls: list[str] = source.get("start_urls") or []
seed_urls: list[str] = source.get("nav_seed_urls") or configured_urls or [base_url]
url_patterns = source.get("url_patterns", {})
includes: list[str] = url_patterns.get("include", [])
excludes: list[str] = url_patterns.get("exclude", [])
effective_rate = rate_limit if rate_limit is not None else source.get("rate_limit", 0.5)
logger.info("Syncing config: %s", config_path)
logger.info(" Base URL: %s", base_url)
logger.info(" Seed URLs: %d", len(seed_urls))
logger.info(" Configured: %d start_urls", len(configured_urls))
logger.info(" Depth: %d", depth)
logger.info(" Rate limit: %.1fs", effective_rate)
logger.info("")
# Discover
discovered = discover_urls(
base_url=base_url,
seed_urls=seed_urls,
include_patterns=includes,
exclude_patterns=excludes,
depth=depth,
max_pages=max_pages,
rate_limit=effective_rate,
)
# Diff
added, removed = diff_urls(discovered, configured_urls)
# Report
if added:
logger.info("New pages (%d):", len(added))
for url in added:
path = url.replace(base_url, "/")
logger.info(" + %s", path)
if removed:
logger.info("Removed pages (%d):", len(removed))
for url in removed:
path = url.replace(base_url, "/")
logger.info(" - %s", path)
if not added and not removed:
logger.info("Config is up to date. No changes detected.")
else:
logger.info("")
logger.info(
"Summary: %d new, %d removed (discovered %d total, configured %d)",
len(added),
len(removed),
len(discovered),
len(configured_urls),
)
applied = False
if apply and (added or removed):
new_urls = sorted(discovered)
_set_start_urls(config, source_index, new_urls)
with open(config_path, "w", encoding="utf-8") as f:
json.dump(config, f, indent=2, ensure_ascii=False)
f.write("\n")
logger.info("Updated %s (%d start_urls)", config_path, len(new_urls))
applied = True
elif added or removed:
logger.info("Run with --apply to update %s", config_path)
return {
"added": added,
"removed": removed,
"total_discovered": len(discovered),
"total_configured": len(configured_urls),
"applied": applied,
}
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
"""CLI entry point for ``skill-seekers sync-config``."""
from skill_seekers.cli.arguments.sync_config import add_sync_config_arguments
parser = argparse.ArgumentParser(
prog="skill-seekers-sync-config",
description="Sync a config's start_urls against what's live on the docs site.",
)
add_sync_config_arguments(parser)
args = parser.parse_args()
setup_logging(verbose=args.verbose, quiet=args.quiet)
result = sync_config(
config_path=args.config,
apply=args.apply,
depth=args.depth,
max_pages=args.max_pages,
rate_limit=args.rate_limit,
source_index=args.source_index,
)
if result.get("error"):
sys.exit(1)
if __name__ == "__main__":
main()