Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types. Each type is fully integrated across: - Standalone CLI commands (skill-seekers <type>) - Auto-detection via 'skill-seekers create' (file extension + content sniffing) - Unified multi-source configs (scraped_data, dispatch, config validation) - Unified skill builder (generic merge + source-attributed synthesis) - MCP server (scrape_generic tool with per-type flag mapping) - pyproject.toml (entry points, optional deps, [all] group) Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale docstrings, and adds 77 integration tests + complex-merge workflow. 50 files changed, +20,201 lines
1162 lines
41 KiB
Python
1162 lines
41 KiB
Python
"""
|
|
Scraping Tools Module for MCP Server
|
|
|
|
This module contains all scraping-related MCP tool implementations:
|
|
- estimate_pages_tool: Estimate page count before scraping
|
|
- scrape_docs_tool: Scrape documentation (legacy or unified)
|
|
- scrape_github_tool: Scrape GitHub repositories
|
|
- scrape_pdf_tool: Scrape PDF documentation
|
|
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
|
|
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
|
|
openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
|
|
|
|
Extracted from server.py for better modularity and organization.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# MCP types - with graceful fallback for testing
|
|
try:
    from mcp.types import TextContent
except ImportError:
    # The MCP SDK is optional (e.g. in test environments). Provide a minimal
    # stand-in exposing the two attributes this module sets on every result.

    class TextContent:
        """Minimal replacement for ``mcp.types.TextContent`` used when MCP is not installed."""

        def __init__(self, type: str, text: str):
            # Parameter names mirror the keyword arguments used throughout this
            # module (``TextContent(type="text", text=...)``).
            self.type, self.text = type, text
|
|
|
|
|
|
# Directory holding the CLI scraper scripts: the "cli" folder located three
# levels above this file's own location.
CLI_DIR = Path(__file__).parent.parent.parent.joinpath("cli")
|
|
|
|
|
|
def run_subprocess_with_streaming(cmd: list[str], timeout: int = None) -> tuple:
    """
    Run a subprocess and collect its stdout and stderr while it is running.

    Both pipes are drained concurrently by background reader threads. This
    keeps the child from blocking on a full pipe on any platform, and it
    guarantees that no already-buffered output is lost when the process
    exits or is killed (mixing ``readline()`` with ``communicate()`` can drop
    lines, because ``communicate()`` bypasses the text stream's buffer).

    Args:
        cmd: Command list to execute (run without a shell)
        timeout: Optional timeout in seconds; ``None`` or ``0`` means no timeout

    Returns:
        Tuple of (stdout, stderr, returncode).
        - If the timeout elapses, the process is killed and a notice is
          appended to stderr.
        - If the process cannot be started (or another unexpected error
          occurs), returns ``("", "Error running subprocess: <reason>", 1)``.
    """
    import subprocess
    import threading

    def _drain(stream, sink: list) -> None:
        """Append every line read from *stream* to *sink* until EOF, then close it."""
        try:
            for line in stream:
                sink.append(line)
        except (OSError, ValueError):
            # The pipe broke or was closed underneath us; keep what was read.
            pass
        finally:
            stream.close()

    process = None
    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # Line buffered
        )

        stdout_lines = []
        stderr_lines = []
        readers = [
            threading.Thread(target=_drain, args=(process.stdout, stdout_lines), daemon=True),
            threading.Thread(target=_drain, args=(process.stderr, stderr_lines), daemon=True),
        ]
        for reader in readers:
            reader.start()

        timed_out = False
        try:
            # A falsy timeout (None or 0) means "wait indefinitely".
            process.wait(timeout=timeout or None)
        except subprocess.TimeoutExpired:
            timed_out = True
            process.kill()
            process.wait()

        # Readers finish when the pipes reach EOF. The join is bounded so that
        # a grandchild process still holding a pipe open cannot hang us.
        for reader in readers:
            reader.join(timeout=5)

        if timed_out:
            stderr_lines.append(f"\n⚠️ Process killed after {timeout}s timeout")

        return "".join(stdout_lines), "".join(stderr_lines), process.returncode

    except Exception as e:
        # Never leave a started child running behind an error return.
        if process is not None and process.poll() is None:
            process.kill()
            process.wait()
        return "", f"Error running subprocess: {str(e)}", 1
|
|
|
|
|
|
async def estimate_pages_tool(args: dict) -> list[TextContent]:
    """
    Run the page-count estimator CLI script for a config file.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file (required)
            - max_discovery (int, optional): Maximum pages to discover
              (default: 1000); -1 is treated the same as ``unlimited``
            - unlimited (bool, optional): Remove discovery limit (default: False)

    Returns:
        List[TextContent]: A single item with the progress header followed by
        the script's stdout; on a non-zero exit code the script's stderr is
        appended under an error marker.
    """
    config_path = args["config_path"]
    discovery_cap = args.get("max_discovery", 1000)

    if args.get("unlimited", False) or discovery_cap == -1:
        # Unlimited discovery is signalled to the script as -1 and gets a
        # fixed 30-minute budget.
        discovery_cap, time_budget = -1, 1800
    else:
        # Roughly 0.5s per discovered page, but never less than 5 minutes.
        time_budget = max(300, discovery_cap // 2)

    command = [
        sys.executable,
        str(CLI_DIR / "estimate_pages.py"),
        config_path,
        "--max-discovery",
        str(discovery_cap),
    ]

    header = "".join(
        [
            "🔄 Estimating page count...\n",
            f"⏱️ Maximum time: {time_budget // 60} minutes\n\n",
        ]
    )

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def scrape_docs_tool(args: dict) -> list[TextContent]:
    """
    Scrape documentation and build skill.

    Loads the config, detects whether it is unified (has a ``sources`` list)
    or legacy, and runs the matching scraper script (``unified_scraper.py``
    or ``doc_scraper.py``) as a subprocess.

    In unlimited mode a modified copy of the config (``max_pages`` set to
    null) is written next to the original as ``<stem>_unlimited_temp.json``
    and is always removed afterwards. The original config file is never
    written to or deleted.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file (required)
            - unlimited (bool, optional): Remove page limit (default: False)
            - enhance_local (bool, optional): Pass --enhance-local (default: False)
            - skip_scrape (bool, optional): Pass --skip-scrape; when False,
              --fresh is passed instead to avoid interactive prompts (default: False)
            - dry_run (bool, optional): Pass --dry-run (default: False)
            - merge_mode (str, optional): Override merge mode for unified configs

    Returns:
        List[TextContent]: A single item with the progress header followed by
        the scraper's stdout; on a non-zero exit code the scraper's stderr is
        appended under an error marker.

    Raises:
        KeyError: If ``config_path`` is missing from ``args``.
        OSError / json.JSONDecodeError: If the config cannot be read or parsed.
    """
    config_path = args["config_path"]
    unlimited = args.get("unlimited", False)
    enhance_local = args.get("enhance_local", False)
    skip_scrape = args.get("skip_scrape", False)
    dry_run = args.get("dry_run", False)
    merge_mode = args.get("merge_mode")

    # Load config to detect format
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    # Detect if unified format (has 'sources' array)
    is_unified = "sources" in config and isinstance(config["sources"], list)

    # Handle unlimited mode by writing a temporary, modified copy of the config
    temp_config_path = None
    if unlimited:
        if is_unified:
            # For unified configs, lift the limit on documentation sources only
            for source in config.get("sources", []):
                if source.get("type") == "documentation":
                    source["max_pages"] = None
        else:
            # For legacy configs
            config["max_pages"] = None

        # Derive the temp name from the file stem rather than a substring
        # replace, so the temp path can never coincide with the original
        # config (which would overwrite it and then delete it during cleanup)
        # and directory names containing ".json" are left alone.
        original_path = Path(config_path)
        temp_config_path = original_path.with_name(f"{original_path.stem}_unlimited_temp.json")
        with open(temp_config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        config_to_use = str(temp_config_path)
    else:
        config_to_use = config_path

    try:
        # Choose scraper based on format
        if is_unified:
            scraper_script = "unified_scraper.py"
            progress_msg = "🔄 Starting unified multi-source scraping...\n"
            progress_msg += "📦 Config format: Unified (multiple sources)\n"
        else:
            scraper_script = "doc_scraper.py"
            progress_msg = "🔄 Starting scraping process...\n"
            progress_msg += "📦 Config format: Legacy (single source)\n"

        # Build command
        cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use]

        # Add merge mode for unified configs
        if is_unified and merge_mode:
            cmd.extend(["--merge-mode", merge_mode])

        # Add --fresh to avoid user input prompts when existing data found
        if not skip_scrape:
            cmd.append("--fresh")

        if enhance_local:
            cmd.append("--enhance-local")
        if skip_scrape:
            cmd.append("--skip-scrape")
        if dry_run:
            cmd.append("--dry-run")

        # Determine timeout based on operation type
        if dry_run:
            timeout = 300  # 5 minutes for dry run
        elif skip_scrape:
            timeout = 600  # 10 minutes for building from cache
        elif unlimited:
            timeout = None  # No timeout for unlimited mode (user explicitly requested)
        else:
            # Estimate the timeout from the configured page limits
            try:
                if is_unified:
                    # For unified configs, sum the limits of all documentation sources
                    total_pages = 0
                    for source in config.get("sources", []):
                        if source.get("type") == "documentation":
                            total_pages += source.get("max_pages", 500)
                    max_pages = total_pages or 500
                else:
                    max_pages = config.get("max_pages", 500)

                # Estimate: 30s per page + buffer
                timeout = max(3600, max_pages * 35)  # Minimum 1 hour, or 35s per page
            except (TypeError, AttributeError):
                # Non-numeric / null max_pages or an unexpected config shape:
                # fall back to a generous fixed budget.
                timeout = 14400  # Default: 4 hours

        # Add progress message
        if timeout:
            progress_msg += f"⏱️ Maximum time allowed: {timeout // 60} minutes\n"
        else:
            progress_msg += "⏱️ Unlimited mode - no timeout\n"
        progress_msg += "📝 Progress will be shown below:\n\n"

        # Run scraper with streaming
        stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
    finally:
        # Clean up the temporary config even if something above failed
        if temp_config_path is not None:
            temp_config_path.unlink(missing_ok=True)

    output = progress_msg + stdout

    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        error_output = output + f"\n\n❌ Error:\n{stderr}"
        return [TextContent(type="text", text=error_output)]
|
|
|
|
|
|
async def scrape_pdf_tool(args: dict) -> list[TextContent]:
    """
    Run the PDF scraper CLI script in one of three input modes.

    The first mode whose inputs are present wins, in this order:
    config file, direct PDF (requires both ``pdf_path`` and ``name``),
    previously extracted JSON.

    Args:
        args: Dictionary containing:
            - config_path (str, optional): Path to PDF config JSON file
            - pdf_path (str, optional): Direct PDF path (alternative to config_path)
            - name (str, optional): Skill name (required with pdf_path)
            - description (str, optional): Skill description (only used with pdf_path)
            - from_json (str, optional): Build from extracted JSON file

    Returns:
        List[TextContent]: A single item with the progress header and the
        scraper's stdout (plus stderr under an error marker on failure), or
        an error message when no usable input mode was supplied.
    """
    config_path = args.get("config_path")
    pdf_path = args.get("pdf_path")
    skill_name = args.get("name")
    skill_description = args.get("description")
    from_json = args.get("from_json")

    if config_path:
        # Mode 1: Config file
        mode_args = ["--config", config_path]
    elif pdf_path and skill_name:
        # Mode 2: Direct PDF
        mode_args = ["--pdf", pdf_path, "--name", skill_name]
        if skill_description:
            mode_args += ["--description", skill_description]
    elif from_json:
        # Mode 3: From JSON
        mode_args = ["--from-json", from_json]
    else:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify --config, --pdf + --name, or --from-json"
            )
        ]

    command = [sys.executable, str(CLI_DIR / "pdf_scraper.py"), *mode_args]

    # PDF extraction can take a while: allow 10 minutes.
    time_budget = 600

    header = "".join(
        [
            "📄 Scraping PDF documentation...\n",
            f"⏱️ Maximum time: {time_budget // 60} minutes\n\n",
        ]
    )

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def scrape_video_tool(args: dict) -> list[TextContent]:
    """
    Run the video scraper CLI script, or the visual-extraction setup routine.

    When ``setup`` is truthy, only ``run_setup(interactive=False)`` is invoked
    and its outcome reported. Otherwise exactly one input source is chosen,
    in this priority order: ``from_json``, ``url``, ``video_file``, ``playlist``.

    Args:
        args: Dictionary containing:
            - url (str, optional): Video URL
            - video_file (str, optional): Local video file path
            - playlist (str, optional): Playlist URL
            - name (str, optional): Skill name (used with url, video_file, playlist)
            - description (str, optional): Skill description (used with url, video_file)
            - languages (str, optional): Language preferences (used with url only)
            - from_json (str, optional): Build from extracted JSON file
            - visual (bool, optional): Pass --visual (default: False)
            - whisper_model (str, optional): Passed as --whisper-model when set
            - visual_interval (float, optional): Passed as --visual-interval when not None
            - visual_min_gap (float, optional): Passed as --visual-min-gap when not None
            - visual_similarity (float, optional): Passed as --visual-similarity when not None
            - vision_ocr (bool, optional): Pass --vision-ocr (default: False)
            - start_time (str, optional): Passed as --start-time when set
            - end_time (str, optional): Passed as --end-time when set
            - setup (bool, optional): Run the setup routine and return (default: False)

    Returns:
        List[TextContent]: A single item with the setup result, an error
        message when no source was supplied, or the progress header plus the
        scraper's stdout (with stderr under an error marker on failure).
    """
    # Setup mode short-circuits everything else.
    if args.get("setup", False):
        from skill_seekers.cli.video_setup import run_setup

        exit_status = run_setup(interactive=False)
        if exit_status == 0:
            note = "Setup completed successfully."
        else:
            note = "Setup failed. Check logs."
        return [TextContent(type="text", text=note)]

    def _pair(flag, value):
        """Return ``[flag, value]`` when *value* is truthy, otherwise an empty list."""
        return [flag, value] if value else []

    name_args = _pair("--name", args.get("name"))
    description_args = _pair("--description", args.get("description"))
    language_args = _pair("--languages", args.get("languages"))

    from_json = args.get("from_json")
    url = args.get("url")
    video_file = args.get("video_file")
    playlist = args.get("playlist")

    # Each source mode forwards a different subset of the metadata options.
    if from_json:
        source_args = ["--from-json", from_json]
    elif url:
        source_args = ["--url", url, *name_args, *description_args, *language_args]
    elif video_file:
        source_args = ["--video-file", video_file, *name_args, *description_args]
    elif playlist:
        source_args = ["--playlist", playlist, *name_args]
    else:
        return [
            TextContent(
                type="text",
                text="❌ Error: Must specify --url, --video-file, --playlist, or --from-json",
            )
        ]

    command = [sys.executable, str(CLI_DIR / "video_scraper.py"), *source_args]

    # Visual extraction parameters
    if args.get("visual", False):
        command.append("--visual")
    command += _pair("--whisper-model", args.get("whisper_model"))

    # Numeric tuning knobs are forwarded whenever they are supplied, even if 0.
    numeric_options = (
        ("--visual-interval", args.get("visual_interval")),
        ("--visual-min-gap", args.get("visual_min_gap")),
        ("--visual-similarity", args.get("visual_similarity")),
    )
    for flag, value in numeric_options:
        if value is not None:
            command += [flag, str(value)]

    if args.get("vision_ocr", False):
        command.append("--vision-ocr")

    for flag, value in (("--start-time", args.get("start_time")), ("--end-time", args.get("end_time"))):
        if value:
            command += [flag, str(value)]

    # Allow 10 minutes for video extraction.
    time_budget = 600

    header = "".join(
        [
            "🎬 Scraping video content...\n",
            f"⏱️ Maximum time: {time_budget // 60} minutes\n\n",
        ]
    )

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def scrape_github_tool(args: dict) -> list[TextContent]:
    """
    Run the GitHub scraper CLI script from a config file or a repo name.

    A config file takes precedence over a direct repo; the per-option flags
    below are only forwarded in direct-repo mode.

    Args:
        args: Dictionary containing:
            - repo (str, optional): GitHub repository (owner/repo)
            - config_path (str, optional): Path to GitHub config JSON file
            - name (str, optional): Skill name
            - description (str, optional): Skill description
            - token (str, optional): GitHub personal access token
            - no_issues (bool, optional): Pass --no-issues (default: False)
            - no_changelog (bool, optional): Pass --no-changelog (default: False)
            - no_releases (bool, optional): Pass --no-releases (default: False)
            - max_issues (int, optional): Maximum issues to fetch; only
              forwarded when it differs from 100 (default: 100)
            - scrape_only (bool, optional): Pass --scrape-only (default: False)

    Returns:
        List[TextContent]: A single item with the progress header and the
        scraper's stdout (plus stderr under an error marker on failure), or
        an error message when neither repo nor config was supplied.
    """
    config_path = args.get("config_path")
    repo = args.get("repo")

    if config_path:
        # Mode 1: Config file
        mode_args = ["--config", config_path]
    elif repo:
        # Mode 2: Direct repo
        mode_args = ["--repo", repo]

        # Options that carry a value, forwarded only when truthy.
        valued_options = (
            ("--name", args.get("name")),
            ("--description", args.get("description")),
            ("--token", args.get("token")),
        )
        for flag, value in valued_options:
            if value:
                mode_args += [flag, value]

        # Plain on/off switches.
        switches = (
            ("--no-issues", args.get("no_issues", False)),
            ("--no-changelog", args.get("no_changelog", False)),
            ("--no-releases", args.get("no_releases", False)),
        )
        mode_args += [flag for flag, enabled in switches if enabled]

        issue_cap = args.get("max_issues", 100)
        if issue_cap != 100:
            mode_args += ["--max-issues", str(issue_cap)]

        if args.get("scrape_only", False):
            mode_args.append("--scrape-only")
    else:
        return [TextContent(type="text", text="❌ Error: Must specify --repo or --config")]

    command = [sys.executable, str(CLI_DIR / "github_scraper.py"), *mode_args]

    # GitHub scraping can take a while: allow 10 minutes.
    time_budget = 600

    header = "".join(
        [
            "🐙 Scraping GitHub repository...\n",
            f"⏱️ Maximum time: {time_budget // 60} minutes\n\n",
        ]
    )

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    """
    Run the ``skill_seekers.cli.codebase_scraper`` module on a local directory.

    All analysis features are left enabled unless the matching ``skip_*``
    argument is truthy, in which case the corresponding ``--skip-*`` flag is
    forwarded to the scraper.

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze (required)
            - output (str, optional): Output directory (default: output/codebase/)
            - depth (str, optional): Analysis depth (default: deep)
            - languages (str, optional): Comma-separated languages
            - file_patterns (str, optional): Comma-separated file patterns
            - enhance_level (int, optional): AI enhancement level (default: 0);
              labelled 0 "off", 1 "SKILL.md only", 2 "standard", 3 "full".
              Only forwarded when greater than 0.
            - skip_api_reference (bool, optional): default False
            - skip_dependency_graph (bool, optional): default False
            - skip_patterns (bool, optional): default False
            - skip_test_examples (bool, optional): default False
            - skip_how_to_guides (bool, optional): default False
            - skip_config_patterns (bool, optional): default False
            - skip_docs (bool, optional): default False

    Returns:
        List[TextContent]: A single item with the progress header and the
        scraper's stdout (plus stderr under an error marker on failure), or
        an error message when ``directory`` is missing.

    Example:
        scrape_codebase(directory="/path/to/repo", depth="deep", enhance_level=1)
        scrape_codebase(directory="/path/to/repo", enhance_level=2, skip_patterns=True)
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    depth = args.get("depth", "deep")
    enhance_level = args.get("enhance_level", 0)

    command = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper", "--directory", directory]

    # Valued options, forwarded only when truthy.
    valued_options = (
        ("--output", args.get("output", "output/codebase/")),
        ("--depth", depth),
        ("--languages", args.get("languages", "")),
        ("--file-patterns", args.get("file_patterns", "")),
    )
    for flag, value in valued_options:
        if value:
            command += [flag, value]
    if enhance_level > 0:
        command += ["--enhance-level", str(enhance_level)]

    # Skip flags (features are ON by default)
    skip_switches = (
        ("skip_api_reference", "--skip-api-reference"),
        ("skip_dependency_graph", "--skip-dependency-graph"),
        ("skip_patterns", "--skip-patterns"),
        ("skip_test_examples", "--skip-test-examples"),
        ("skip_how_to_guides", "--skip-how-to-guides"),
        ("skip_config_patterns", "--skip-config-patterns"),
        ("skip_docs", "--skip-docs"),
    )
    command += [flag for key, flag in skip_switches if args.get(key, False)]

    # Higher enhancement levels get a larger time budget.
    if enhance_level >= 3:
        time_budget = 3600  # 60 minutes for full enhancement
    elif enhance_level >= 2:
        time_budget = 1200  # 20 minutes with AI enhancement
    else:
        time_budget = 600  # 10 minutes base

    level_labels = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"}

    header_parts = [
        "🔍 Analyzing local codebase...\n",
        f"📁 Directory: {directory}\n",
        f"📊 Depth: {depth}\n",
    ]
    if enhance_level > 0:
        header_parts.append(
            f"🤖 AI Enhancement: Level {enhance_level} ({level_labels.get(enhance_level, 'unknown')})\n"
        )
    header_parts.append(f"⏱️ Maximum time: {time_budget // 60} minutes\n\n")
    header = "".join(header_parts)

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def detect_patterns_tool(args: dict) -> list[TextContent]:
    """
    Run the ``skill_seekers.cli.pattern_recognizer`` module on a file and/or directory.

    The recognizer looks for common design patterns (Singleton, Factory,
    Observer, Strategy, Decorator, Builder, Adapter, Command, Template Method,
    Chain of Responsibility). Languages listed as supported: Python,
    JavaScript, TypeScript, C++, C, C#, Go, Rust, Java, Ruby, PHP.

    Args:
        args: Dictionary containing:
            - file (str, optional): Single file to analyze
            - directory (str, optional): Directory to analyze
            - output (str, optional): Output directory for JSON results
            - depth (str, optional): Detection depth (default: deep)
            - json (bool, optional): Pass --json (default: False)

        At least one of ``file`` or ``directory`` must be given; if both are
        given, both are forwarded.

    Returns:
        List[TextContent]: A single item with the progress header and the
        recognizer's stdout (plus stderr under an error marker on failure),
        or an error message when neither input was supplied.

    Example:
        detect_patterns(file="src/database.py", depth="deep")
        detect_patterns(directory="src/", output="patterns/", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")

    if not (file_path or directory):
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    depth = args.get("depth", "deep")

    command = [sys.executable, "-m", "skill_seekers.cli.pattern_recognizer"]

    # Valued options, forwarded only when truthy.
    valued_options = (
        ("--file", file_path),
        ("--directory", directory),
        ("--output", args.get("output", "")),
        ("--depth", depth),
    )
    for flag, value in valued_options:
        if value:
            command += [flag, value]
    if args.get("json", False):
        command.append("--json")

    # Allow 5 minutes for pattern detection.
    time_budget = 300

    header_parts = ["🔍 Detecting design patterns...\n"]
    if file_path:
        header_parts.append(f"📄 File: {file_path}\n")
    if directory:
        header_parts.append(f"📁 Directory: {directory}\n")
    header_parts.append(f"🎯 Detection depth: {depth}\n")
    header_parts.append(f"⏱️ Maximum time: {time_budget // 60} minutes\n\n")
    header = "".join(header_parts)

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def extract_test_examples_tool(args: dict) -> list[TextContent]:
    """
    Extract usage examples from test files.

    Runs the ``skill_seekers.cli.test_example_extractor`` module on a test
    file and/or a directory of test files.

    Args:
        args: Dictionary containing:
            - file (str, optional): Single test file to analyze
            - directory (str, optional): Directory containing test files
              (passed to the extractor as a positional argument)
            - language (str, optional): Filter by language (python, javascript, etc.)
            - min_confidence (float, optional): Minimum confidence threshold
              0.0-1.0 (default: 0.5). An explicit 0.0 is forwarded as given.
            - max_per_file (int, optional): Maximum examples per file
              (default: 10). An explicit 0 is forwarded as given.
            - json (bool, optional): Output JSON format (default: False)
            - markdown (bool, optional): Output Markdown format (default: False)

        At least one of ``file`` or ``directory`` must be given.

    Returns:
        List[TextContent]: A single item with the progress header and the
        extractor's stdout (plus stderr under an error marker on failure),
        or an error message when neither input was supplied.

    Example:
        extract_test_examples(directory="tests/", language="python")
        extract_test_examples(file="tests/test_scraper.py", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")

    if not file_path and not directory:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    language = args.get("language", "")
    min_confidence = args.get("min_confidence", 0.5)
    max_per_file = args.get("max_per_file", 10)
    json_output = args.get("json", False)
    markdown_output = args.get("markdown", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.test_example_extractor"]

    if directory:
        cmd.append(directory)
    if file_path:
        cmd.extend(["--file", file_path])
    if language:
        cmd.extend(["--language", language])
    # Compare against None rather than truthiness: 0.0 / 0 are legitimate
    # values and must reach the extractor instead of being silently replaced
    # by its defaults (the progress message below reports the value given).
    if min_confidence is not None:
        cmd.extend(["--min-confidence", str(min_confidence)])
    if max_per_file is not None:
        cmd.extend(["--max-per-file", str(max_per_file)])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    timeout = 180  # 3 minutes for test example extraction

    progress_msg = "🧪 Extracting usage examples from test files...\n"
    if file_path:
        progress_msg += f"📄 File: {file_path}\n"
    if directory:
        progress_msg += f"📁 Directory: {directory}\n"
    if language:
        progress_msg += f"🔤 Language: {language}\n"
    progress_msg += f"🎯 Min confidence: {min_confidence}\n"
    progress_msg += f"📊 Max per file: {max_per_file}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout

    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
|
|
|
|
|
|
async def build_how_to_guides_tool(args: dict) -> list[TextContent]:
    """
    Run the ``skill_seekers.cli.how_to_guide_builder`` module on extracted test examples.

    Args:
        args: Dictionary containing:
            - input (str): Path to test_examples.json from extract_test_examples (required)
            - output (str, optional): Output directory for guides
              (default: output/codebase/tutorials)
            - group_by (str, optional): Grouping strategy (default: ai-tutorial-group)
            - no_ai (bool, optional): Pass --no-ai (default: False)
            - json_output (bool, optional): Pass --json-output (default: False)

    Returns:
        List[TextContent]: A single item with the progress header and the
        builder's stdout (plus stderr under an error marker on failure), or
        an error message when ``input`` is missing.

    Example:
        build_how_to_guides(
            input="output/codebase/test_examples/test_examples.json",
            group_by="ai-tutorial-group",
            output="output/codebase/tutorials"
        )
    """
    input_file = args.get("input")
    if not input_file:
        return [
            TextContent(
                type="text",
                text="❌ Error: input parameter is required (path to test_examples.json)",
            )
        ]

    output_dir = args.get("output", "output/codebase/tutorials")
    grouping = args.get("group_by", "ai-tutorial-group")
    ai_disabled = args.get("no_ai", False)

    # The input file is a positional argument; everything else is a flag.
    command = [sys.executable, "-m", "skill_seekers.cli.how_to_guide_builder", input_file]
    for flag, value in (("--output", output_dir), ("--group-by", grouping)):
        if value:
            command += [flag, value]
    if ai_disabled:
        command.append("--no-ai")
    if args.get("json_output", False):
        command.append("--json-output")

    # Allow 3 minutes for guide building.
    time_budget = 180

    header_parts = [
        "📚 Building how-to guides from workflow examples...\n",
        f"📄 Input: {input_file}\n",
        f"📁 Output: {output_dir}\n",
        f"🔀 Grouping: {grouping}\n",
    ]
    if ai_disabled:
        header_parts.append("🚫 AI enhancement disabled\n")
    header_parts.append(f"⏱️ Maximum time: {time_budget // 60} minutes\n\n")
    header = "".join(header_parts)

    out, err, exit_code = run_subprocess_with_streaming(command, timeout=time_budget)

    report = header + out
    if exit_code != 0:
        report = f"{report}\n\n❌ Error:\n{err}"
    return [TextContent(type="text", text=report)]
|
|
|
|
|
|
async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
    """
    Extract configuration patterns from config files (C3.4).

    Analyzes configuration files in the codebase to extract settings,
    detect common patterns (database, API, logging, cache, etc.), and
    generate comprehensive documentation.

    Supports 9 config formats: JSON, YAML, TOML, ENV, INI, Python modules,
    JavaScript/TypeScript configs, Dockerfile, Docker Compose.

    Detects 7 common patterns:
    - Database configuration (host, port, credentials)
    - API configuration (endpoints, keys, timeouts)
    - Logging configuration (level, format, handlers)
    - Cache configuration (backend, TTL, keys)
    - Email configuration (SMTP, credentials)
    - Authentication configuration (providers, secrets)
    - Server configuration (host, port, workers)

    The work is delegated to the ``skill_seekers.cli.config_extractor`` module,
    run as a subprocess with a 3 minute timeout (6 minutes when any form of
    AI enhancement is requested).

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory (default: output/codebase/config_patterns)
            - max_files (int, optional): Maximum config files to process (default: 100)
            - enhance (bool, optional): Enable AI enhancement - API mode (default: False, requires ANTHROPIC_API_KEY)
            - enhance_local (bool, optional): Enable AI enhancement - LOCAL mode (default: False, uses Claude Code CLI)
            - ai_mode (str, optional): AI mode - auto, api, local, none (default: none)
            - json (bool, optional): Output JSON format (default: True)
            - markdown (bool, optional): Output Markdown format (default: True)

    Returns:
        List[TextContent]: Config extraction results with optional AI enhancements.
        On a non-zero exit code the subprocess stderr is appended after an
        error marker; a missing ``directory`` yields a single error message.

    Example:
        extract_config_patterns(directory=".", output="output/configs")
        extract_config_patterns(directory="/path/to/repo", max_files=50, enhance_local=True)
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    output = args.get("output", "output/codebase/config_patterns")
    max_files = args.get("max_files", 100)
    enhance = args.get("enhance", False)
    enhance_local = args.get("enhance_local", False)
    ai_mode = args.get("ai_mode", "none")
    json_output = args.get("json", True)
    markdown_output = args.get("markdown", True)

    # Decide once whether AI enhancement is in play. A falsy ai_mode (None or
    # "", e.g. a JSON null from the client) means "not selected", exactly like
    # the literal "none"; the flag, the timeout and the progress line must all
    # agree on that.
    ai_mode_selected = bool(ai_mode) and ai_mode != "none"
    ai_enabled = bool(enhance or enhance_local or ai_mode_selected)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.config_extractor"]
    cmd.extend(["--directory", directory])

    if output:
        cmd.extend(["--output", output])
    if max_files:
        cmd.extend(["--max-files", str(max_files)])
    if enhance:
        cmd.append("--enhance")
    if enhance_local:
        cmd.append("--enhance-local")
    if ai_mode_selected:
        cmd.extend(["--ai-mode", ai_mode])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    # 3 minutes base, 6 minutes with AI enhancement
    timeout = 360 if ai_enabled else 180

    progress_msg = "⚙️ Extracting configuration patterns...\n"
    progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"📄 Max files: {max_files}\n"
    if ai_enabled:
        # An explicit ai_mode wins for the label; otherwise infer it from the
        # boolean switches.
        if ai_mode_selected:
            ai_label = ai_mode
        else:
            ai_label = "api" if enhance else "local"
        progress_msg += f"🤖 AI enhancement: {ai_label}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout

    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
|
|
|
|
|
|
# Source types accepted by scrape_generic_tool. The order is significant only
# for display: it is the order in which the names are listed in the
# "Unknown source_type" error message.
GENERIC_SOURCE_TYPES = (
    "jupyter", "html", "openapi", "asciidoc", "pptx",
    "confluence", "notion", "rss", "manpage", "chat",
)

# Source types for which scrape_generic_tool prefers the `url` argument over
# `path` when both are supplied. The concrete CLI flag is chosen per type
# inside scrape_generic_tool, not here.
_URL_BASED_TYPES = {
    "confluence",
    "notion",
    "rss",
}

# Emoji prefix shown at the start of the progress message, keyed by source type.
_SOURCE_EMOJIS: dict[str, str] = {
    "jupyter": "📓", "html": "🌐",
    "openapi": "📡", "asciidoc": "📄",
    "pptx": "📊", "confluence": "🏢",
    "notion": "📝", "rss": "📰",
    "manpage": "📖", "chat": "💬",
}
|
|
|
|
|
|
async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Run the CLI scraper that matches a generic source type.

    Validates the arguments, assembles a ``python -m skill_seekers.cli.<module>``
    command with the input flag specific to the source type, runs it as a
    subprocess (10 minute timeout) and returns its output prefixed by a short
    progress summary.

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Input selection: for confluence, notion and rss a supplied ``url`` takes
    precedence; otherwise ``path`` is used when present; a ``url`` is used as
    a last resort for the remaining types.

    Args:
        args: Dictionary containing:
            - source_type (str): One of the supported source types
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        List[TextContent]: A single text item. It holds a validation error,
        the progress summary plus subprocess stdout on success, or the same
        followed by stderr when the subprocess exits non-zero.
    """

    def _text(message: str) -> list[TextContent]:
        # Every exit path returns exactly one text item.
        return [TextContent(type="text", text=message)]

    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    # --- Argument validation (guard clauses) ---
    if source_type not in GENERIC_SOURCE_TYPES:
        return _text(
            f"❌ Error: Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )
    if not (path or url):
        return _text("❌ Error: Must specify either 'path' (file/directory) or 'url'")
    if not name:
        return _text("❌ Error: 'name' parameter is required")

    # Modules follow the <type>_scraper naming scheme, except for man pages.
    module_name = "man_scraper" if source_type == "manpage" else f"{source_type}_scraper"

    # Per-type flag for a local file/directory input.
    path_flags: dict[str, str] = {
        "jupyter": "--notebook",
        "html": "--html-path",
        "openapi": "--spec",
        "asciidoc": "--asciidoc-path",
        "pptx": "--pptx",
        "manpage": "--man-path",
        "confluence": "--export-path",
        "notion": "--export-path",
        "rss": "--feed-path",
        "chat": "--export-path",
    }
    # Per-type flag for a URL (or remote identifier) input.
    url_flags: dict[str, str] = {
        "confluence": "--base-url",
        "notion": "--page-id",
        "rss": "--feed-url",
        "openapi": "--spec-url",
    }

    # A URL is used when one was given and either the type is URL-based or
    # there is no path to fall back on; in every other case the path is used
    # (validation above guarantees at least one of the two is present).
    use_url = bool(url) and (source_type in _URL_BASED_TYPES or not path)
    if use_url:
        input_args = [url_flags.get(source_type, "--url"), url]
    else:
        input_args = [path_flags.get(source_type, "--path"), path]

    cmd = [
        sys.executable,
        "-m",
        f"skill_seekers.cli.{module_name}",
        *input_args,
        "--name",
        name,
    ]

    timeout = 600  # 10 minutes

    # Progress summary shown ahead of the subprocess output.
    emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
    summary = [f"{emoji} Scraping {source_type} source...\n"]
    if path:
        summary.append(f"📁 Path: {path}\n")
    if url:
        summary.append(f"🔗 URL: {url}\n")
    summary.append(f"📛 Name: {name}\n")
    summary.append(f"⏱️ Maximum time: {timeout // 60} minutes\n\n")
    progress_msg = "".join(summary)

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode != 0:
        output = f"{output}\n\n❌ Error:\n{stderr}"
    return _text(output)
|