skill-seekers-reference/src/skill_seekers/mcp/tools/scraping_tools.py
commit 53b911b697 (yusyus)
feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

"""
Scraping Tools Module for MCP Server
This module contains all scraping-related MCP tool implementations:
- estimate_pages_tool: Estimate page count before scraping
- scrape_docs_tool: Scrape documentation (legacy or unified)
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_video_tool: Scrape video content (YouTube, local files, playlists)
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
- detect_patterns_tool: Detect design patterns in source code
- extract_test_examples_tool: Extract usage examples from test files
- build_how_to_guides_tool: Build how-to guides from workflow test examples
- extract_config_patterns_tool: Extract configuration patterns from config files
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
  openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
Extracted from server.py for better modularity and organization.
"""
import json
import sys
from pathlib import Path
# MCP types - with graceful fallback for testing
try:
from mcp.types import TextContent
except ImportError:
# Graceful degradation: Create a simple fallback class for testing
class TextContent:
"""Fallback TextContent for when MCP is not installed"""
def __init__(self, type: str, text: str):
self.type = type
self.text = text
# Path to CLI tools
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
def run_subprocess_with_streaming(cmd: list[str], timeout: int | None = None) -> tuple[str, str, int]:
"""
Run subprocess with real-time output streaming.
This solves the blocking issue where long-running processes (like scraping)
would cause MCP to appear frozen. Now we stream output as it comes.
Args:
cmd: Command list to execute
timeout: Optional timeout in seconds
Returns:
Tuple of (stdout, stderr, returncode)
"""
import subprocess
import time
try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,  # universal_newlines is a legacy alias for text
            bufsize=1,  # Line buffered
        )
stdout_lines = []
stderr_lines = []
start_time = time.time()
# Read output line by line as it comes
while True:
# Check timeout
if timeout and (time.time() - start_time) > timeout:
process.kill()
stderr_lines.append(f"\n⚠️ Process killed after {timeout}s timeout")
break
# Check if process finished
if process.poll() is not None:
break
# Read available output (non-blocking)
try:
import select
readable, _, _ = select.select([process.stdout, process.stderr], [], [], 0.1)
if process.stdout in readable:
line = process.stdout.readline()
if line:
stdout_lines.append(line)
if process.stderr in readable:
line = process.stderr.readline()
if line:
stderr_lines.append(line)
            except Exception:
                # select() only supports sockets on Windows, so polling pipes
                # raises OSError there; sleep briefly and let communicate()
                # below collect any remaining output.
                time.sleep(0.1)
# Get any remaining output
remaining_stdout, remaining_stderr = process.communicate()
if remaining_stdout:
stdout_lines.append(remaining_stdout)
if remaining_stderr:
stderr_lines.append(remaining_stderr)
stdout = "".join(stdout_lines)
stderr = "".join(stderr_lines)
returncode = process.returncode
return stdout, stderr, returncode
except Exception as e:
return "", f"Error running subprocess: {str(e)}", 1
async def estimate_pages_tool(args: dict) -> list[TextContent]:
"""
Estimate page count from a config file.
Performs fast preview without downloading content to estimate
how many pages will be scraped.
Args:
args: Dictionary containing:
- config_path (str): Path to config JSON file
- max_discovery (int, optional): Maximum pages to discover (default: 1000)
- unlimited (bool, optional): Remove discovery limit (default: False)
Returns:
List[TextContent]: Tool execution results
"""
config_path = args["config_path"]
max_discovery = args.get("max_discovery", 1000)
unlimited = args.get("unlimited", False)
# Handle unlimited mode
if unlimited or max_discovery == -1:
max_discovery = -1
timeout = 1800 # 30 minutes for unlimited discovery
else:
# Estimate: 0.5s per page discovered
timeout = max(300, max_discovery // 2) # Minimum 5 minutes
# Run estimate_pages.py
cmd = [
sys.executable,
str(CLI_DIR / "estimate_pages.py"),
config_path,
"--max-discovery",
str(max_discovery),
]
progress_msg = "🔄 Estimating page count...\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
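# Example invocation from an async context (keys mirror this tool's schema in
# server.py; the config path is hypothetical):
#
#   results = await estimate_pages_tool({
#       "config_path": "configs/react_docs.json",
#       "max_discovery": 2000,
#   })
#   print(results[0].text)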
async def scrape_docs_tool(args: dict) -> list[TextContent]:
"""
Scrape documentation and build skill.
Auto-detects unified vs legacy format and routes to appropriate scraper.
Supports both single-source (legacy) and unified multi-source configs.
Creates SKILL.md and reference files.
Args:
args: Dictionary containing:
- config_path (str): Path to config JSON file
- unlimited (bool, optional): Remove page limit (default: False)
- enhance_local (bool, optional): Open terminal for local enhancement (default: False)
- skip_scrape (bool, optional): Skip scraping, use cached data (default: False)
- dry_run (bool, optional): Preview without saving (default: False)
- merge_mode (str, optional): Override merge mode for unified configs
Returns:
List[TextContent]: Tool execution results
"""
config_path = args["config_path"]
unlimited = args.get("unlimited", False)
enhance_local = args.get("enhance_local", False)
skip_scrape = args.get("skip_scrape", False)
dry_run = args.get("dry_run", False)
merge_mode = args.get("merge_mode")
# Load config to detect format
with open(config_path) as f:
config = json.load(f)
# Detect if unified format (has 'sources' array)
is_unified = "sources" in config and isinstance(config["sources"], list)
# Handle unlimited mode by modifying config temporarily
if unlimited:
# Set max_pages to None (unlimited)
if is_unified:
# For unified configs, set max_pages on documentation sources
for source in config.get("sources", []):
if source.get("type") == "documentation":
source["max_pages"] = None
else:
# For legacy configs
config["max_pages"] = None
        # Create a temporary config file (never clobber the original, even if
        # its path does not end in .json)
        base = config_path[:-5] if config_path.endswith(".json") else config_path
        temp_config_path = base + "_unlimited_temp.json"
with open(temp_config_path, "w") as f:
json.dump(config, f, indent=2)
config_to_use = temp_config_path
else:
config_to_use = config_path
# Choose scraper based on format
if is_unified:
scraper_script = "unified_scraper.py"
progress_msg = "🔄 Starting unified multi-source scraping...\n"
progress_msg += "📦 Config format: Unified (multiple sources)\n"
else:
scraper_script = "doc_scraper.py"
progress_msg = "🔄 Starting scraping process...\n"
progress_msg += "📦 Config format: Legacy (single source)\n"
# Build command
cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use]
# Add merge mode for unified configs
if is_unified and merge_mode:
cmd.extend(["--merge-mode", merge_mode])
# Add --fresh to avoid user input prompts when existing data found
if not skip_scrape:
cmd.append("--fresh")
if enhance_local:
cmd.append("--enhance-local")
if skip_scrape:
cmd.append("--skip-scrape")
if dry_run:
cmd.append("--dry-run")
# Determine timeout based on operation type
if dry_run:
timeout = 300 # 5 minutes for dry run
elif skip_scrape:
timeout = 600 # 10 minutes for building from cache
elif unlimited:
timeout = None # No timeout for unlimited mode (user explicitly requested)
else:
# Read config to estimate timeout
try:
if is_unified:
# For unified configs, estimate based on all sources
total_pages = 0
for source in config.get("sources", []):
if source.get("type") == "documentation":
total_pages += source.get("max_pages", 500)
max_pages = total_pages or 500
else:
max_pages = config.get("max_pages", 500)
            # Estimate ~35s per page (30s scrape plus buffer), minimum 1 hour
            timeout = max(3600, max_pages * 35)
except Exception:
timeout = 14400 # Default: 4 hours
# Add progress message
if timeout:
progress_msg += f"⏱️ Maximum time allowed: {timeout // 60} minutes\n"
else:
progress_msg += "⏱️ Unlimited mode - no timeout\n"
progress_msg += "📝 Progress will be shown below:\n\n"
# Run scraper with streaming
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
# Clean up temporary config
if unlimited and Path(config_to_use).exists():
Path(config_to_use).unlink()
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
error_output = output + f"\n\n❌ Error:\n{stderr}"
return [TextContent(type="text", text=error_output)]
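# Example invocation from an async context (keys mirror the schema in
# server.py; the config path is hypothetical):
#
#   results = await scrape_docs_tool({
#       "config_path": "configs/fastapi_unified.json",
#       "merge_mode": "separate",  # assumed mode name; only used for unified configs
#       "dry_run": True,
#   })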
async def scrape_pdf_tool(args: dict) -> list[TextContent]:
"""
Scrape PDF documentation and build Claude skill.
Extracts text, code, and images from PDF files and builds
a skill package with organized references.
Args:
args: Dictionary containing:
- config_path (str, optional): Path to PDF config JSON file
- pdf_path (str, optional): Direct PDF path (alternative to config_path)
- name (str, optional): Skill name (required with pdf_path)
- description (str, optional): Skill description
- from_json (str, optional): Build from extracted JSON file
Returns:
List[TextContent]: Tool execution results
"""
config_path = args.get("config_path")
pdf_path = args.get("pdf_path")
name = args.get("name")
description = args.get("description")
from_json = args.get("from_json")
# Build command
cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")]
# Mode 1: Config file
if config_path:
cmd.extend(["--config", config_path])
# Mode 2: Direct PDF
elif pdf_path and name:
cmd.extend(["--pdf", pdf_path, "--name", name])
if description:
cmd.extend(["--description", description])
# Mode 3: From JSON
elif from_json:
cmd.extend(["--from-json", from_json])
else:
return [
TextContent(
type="text", text="❌ Error: Must specify --config, --pdf + --name, or --from-json"
)
]
# Run pdf_scraper.py with streaming (can take a while)
timeout = 600 # 10 minutes for PDF extraction
progress_msg = "📄 Scraping PDF documentation...\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
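# The three input modes and the pdf_scraper.py flags they map to (paths are
# hypothetical):
#
#   await scrape_pdf_tool({"config_path": "configs/manual_pdf.json"})    # --config
#   await scrape_pdf_tool({"pdf_path": "manual.pdf", "name": "manual"})  # --pdf --name
#   await scrape_pdf_tool({"from_json": "output/manual_data.json"})      # --from-json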
async def scrape_video_tool(args: dict) -> list[TextContent]:
"""
Scrape video content (YouTube, local files) and build Claude skill.
Extracts transcripts, metadata, and optionally visual content from videos
to create skills.
Args:
args: Dictionary containing:
- url (str, optional): Video URL (YouTube, Vimeo)
- video_file (str, optional): Local video file path
- playlist (str, optional): Playlist URL
- name (str, optional): Skill name
- description (str, optional): Skill description
- languages (str, optional): Language preferences (comma-separated)
- from_json (str, optional): Build from extracted JSON file
- visual (bool, optional): Enable visual frame extraction (default: False)
- whisper_model (str, optional): Whisper model size (default: base)
- visual_interval (float, optional): Seconds between frame captures (default: 5.0)
- visual_min_gap (float, optional): Minimum seconds between kept frames (default: 2.0)
- visual_similarity (float, optional): Similarity threshold to skip duplicate frames (default: 0.95)
- vision_ocr (bool, optional): Use vision model for OCR on frames (default: False)
- start_time (str, optional): Start time for extraction (seconds, MM:SS, or HH:MM:SS)
- end_time (str, optional): End time for extraction (seconds, MM:SS, or HH:MM:SS)
- setup (bool, optional): Auto-detect GPU and install visual extraction deps
Returns:
List[TextContent]: Tool execution results
"""
# Handle --setup early exit
if args.get("setup", False):
from skill_seekers.cli.video_setup import run_setup
rc = run_setup(interactive=False)
msg = "Setup completed successfully." if rc == 0 else "Setup failed. Check logs."
return [TextContent(type="text", text=msg)]
url = args.get("url")
video_file = args.get("video_file")
playlist = args.get("playlist")
name = args.get("name")
description = args.get("description")
languages = args.get("languages")
from_json = args.get("from_json")
visual = args.get("visual", False)
whisper_model = args.get("whisper_model")
visual_interval = args.get("visual_interval")
visual_min_gap = args.get("visual_min_gap")
visual_similarity = args.get("visual_similarity")
vision_ocr = args.get("vision_ocr", False)
start_time = args.get("start_time")
end_time = args.get("end_time")
# Build command
cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")]
if from_json:
cmd.extend(["--from-json", from_json])
elif url:
cmd.extend(["--url", url])
if name:
cmd.extend(["--name", name])
if description:
cmd.extend(["--description", description])
if languages:
cmd.extend(["--languages", languages])
elif video_file:
cmd.extend(["--video-file", video_file])
if name:
cmd.extend(["--name", name])
if description:
cmd.extend(["--description", description])
elif playlist:
cmd.extend(["--playlist", playlist])
if name:
cmd.extend(["--name", name])
else:
return [
TextContent(
type="text",
text="❌ Error: Must specify --url, --video-file, --playlist, or --from-json",
)
]
# Visual extraction parameters
if visual:
cmd.append("--visual")
if whisper_model:
cmd.extend(["--whisper-model", whisper_model])
if visual_interval is not None:
cmd.extend(["--visual-interval", str(visual_interval)])
if visual_min_gap is not None:
cmd.extend(["--visual-min-gap", str(visual_min_gap)])
if visual_similarity is not None:
cmd.extend(["--visual-similarity", str(visual_similarity)])
if vision_ocr:
cmd.append("--vision-ocr")
if start_time:
cmd.extend(["--start-time", str(start_time)])
if end_time:
cmd.extend(["--end-time", str(end_time)])
# Run video_scraper.py with streaming
timeout = 600 # 10 minutes for video extraction
progress_msg = "🎬 Scraping video content...\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
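# Example invocation (URL and values hypothetical; optional flags are only
# forwarded to video_scraper.py when provided):
#
#   await scrape_video_tool({
#       "url": "https://www.youtube.com/watch?v=...",
#       "name": "pytest-tutorial",
#       "visual": True,
#       "visual_interval": 10.0,
#       "start_time": "01:30",
#   })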
async def scrape_github_tool(args: dict) -> list[TextContent]:
"""
Scrape GitHub repository and build Claude skill.
Extracts README, Issues, Changelog, Releases, and code structure
from GitHub repositories to create comprehensive skills.
Args:
args: Dictionary containing:
- repo (str, optional): GitHub repository (owner/repo)
- config_path (str, optional): Path to GitHub config JSON file
- name (str, optional): Skill name (default: repo name)
- description (str, optional): Skill description
- token (str, optional): GitHub personal access token
- no_issues (bool, optional): Skip GitHub issues extraction (default: False)
- no_changelog (bool, optional): Skip CHANGELOG extraction (default: False)
- no_releases (bool, optional): Skip releases extraction (default: False)
- max_issues (int, optional): Maximum issues to fetch (default: 100)
- scrape_only (bool, optional): Only scrape, don't build skill (default: False)
Returns:
List[TextContent]: Tool execution results
"""
repo = args.get("repo")
config_path = args.get("config_path")
name = args.get("name")
description = args.get("description")
token = args.get("token")
no_issues = args.get("no_issues", False)
no_changelog = args.get("no_changelog", False)
no_releases = args.get("no_releases", False)
max_issues = args.get("max_issues", 100)
scrape_only = args.get("scrape_only", False)
# Build command
cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")]
# Mode 1: Config file
if config_path:
cmd.extend(["--config", config_path])
# Mode 2: Direct repo
elif repo:
cmd.extend(["--repo", repo])
if name:
cmd.extend(["--name", name])
if description:
cmd.extend(["--description", description])
if token:
cmd.extend(["--token", token])
if no_issues:
cmd.append("--no-issues")
if no_changelog:
cmd.append("--no-changelog")
if no_releases:
cmd.append("--no-releases")
if max_issues != 100:
cmd.extend(["--max-issues", str(max_issues)])
if scrape_only:
cmd.append("--scrape-only")
else:
return [TextContent(type="text", text="❌ Error: Must specify --repo or --config")]
# Run github_scraper.py with streaming (can take a while)
timeout = 600 # 10 minutes for GitHub scraping
progress_msg = "🐙 Scraping GitHub repository...\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
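# Example invocation (repo hypothetical; note that max_issues is only passed
# through when it differs from the CLI default of 100):
#
#   await scrape_github_tool({"repo": "owner/project", "max_issues": 50, "no_changelog": True})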
async def scrape_codebase_tool(args: dict) -> list[TextContent]:
"""
Analyze local codebase and extract code knowledge.
Walks directory tree, analyzes code files, extracts signatures,
docstrings, and generates API reference documentation, dependency graphs,
design patterns, test examples, and how-to guides.
All features are ON by default. Use skip_* parameters to disable specific features.
Args:
args: Dictionary containing:
- directory (str): Directory to analyze
- output (str, optional): Output directory for results (default: output/codebase/)
- depth (str, optional): Analysis depth - surface, deep, full (default: deep)
- languages (str, optional): Comma-separated languages (e.g., "Python,JavaScript,C++")
- file_patterns (str, optional): Comma-separated file patterns (e.g., "*.py,src/**/*.js")
- enhance_level (int, optional): AI enhancement level 0-3 (default: 0)
- 0: No AI enhancement
- 1: SKILL.md enhancement only
- 2: SKILL.md + Architecture + Config enhancement
- 3: Full enhancement (patterns, tests, config, architecture, SKILL.md)
- skip_api_reference (bool, optional): Skip API reference generation (default: False)
- skip_dependency_graph (bool, optional): Skip dependency graph (default: False)
- skip_patterns (bool, optional): Skip design pattern detection (default: False)
- skip_test_examples (bool, optional): Skip test example extraction (default: False)
- skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
- skip_config_patterns (bool, optional): Skip config pattern extraction (default: False)
- skip_docs (bool, optional): Skip project documentation extraction (default: False)
Returns:
List[TextContent]: Tool execution results
Example:
scrape_codebase(
directory="/path/to/repo",
depth="deep",
enhance_level=1
)
scrape_codebase(
directory="/path/to/repo",
enhance_level=2,
skip_patterns=True
)
"""
directory = args.get("directory")
if not directory:
return [TextContent(type="text", text="❌ Error: directory parameter is required")]
output = args.get("output", "output/codebase/")
depth = args.get("depth", "deep")
languages = args.get("languages", "")
file_patterns = args.get("file_patterns", "")
enhance_level = args.get("enhance_level", 0)
# Skip flags (features are ON by default)
skip_api_reference = args.get("skip_api_reference", False)
skip_dependency_graph = args.get("skip_dependency_graph", False)
skip_patterns = args.get("skip_patterns", False)
skip_test_examples = args.get("skip_test_examples", False)
skip_how_to_guides = args.get("skip_how_to_guides", False)
skip_config_patterns = args.get("skip_config_patterns", False)
skip_docs = args.get("skip_docs", False)
# Build command
cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]
cmd.extend(["--directory", directory])
if output:
cmd.extend(["--output", output])
if depth:
cmd.extend(["--depth", depth])
if languages:
cmd.extend(["--languages", languages])
if file_patterns:
cmd.extend(["--file-patterns", file_patterns])
if enhance_level > 0:
cmd.extend(["--enhance-level", str(enhance_level)])
# Skip flags
if skip_api_reference:
cmd.append("--skip-api-reference")
if skip_dependency_graph:
cmd.append("--skip-dependency-graph")
if skip_patterns:
cmd.append("--skip-patterns")
if skip_test_examples:
cmd.append("--skip-test-examples")
if skip_how_to_guides:
cmd.append("--skip-how-to-guides")
if skip_config_patterns:
cmd.append("--skip-config-patterns")
if skip_docs:
cmd.append("--skip-docs")
# Adjust timeout based on enhance_level
timeout = 600 # 10 minutes base
if enhance_level >= 2:
timeout = 1200 # 20 minutes with AI enhancement
if enhance_level >= 3:
timeout = 3600 # 60 minutes for full enhancement
level_names = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"}
progress_msg = "🔍 Analyzing local codebase...\n"
progress_msg += f"📁 Directory: {directory}\n"
progress_msg += f"📊 Depth: {depth}\n"
if enhance_level > 0:
progress_msg += f"🤖 AI Enhancement: Level {enhance_level} ({level_names.get(enhance_level, 'unknown')})\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output_text = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
async def detect_patterns_tool(args: dict) -> list[TextContent]:
"""
Detect design patterns in source code.
Analyzes source files or directories to detect common design patterns
(Singleton, Factory, Observer, Strategy, Decorator, Builder, Adapter,
Command, Template Method, Chain of Responsibility).
    Supports 11 languages: Python, JavaScript, TypeScript, C++, C, C#,
    Go, Rust, Java, Ruby, PHP.
Args:
args: Dictionary containing:
- file (str, optional): Single file to analyze
- directory (str, optional): Directory to analyze (analyzes all source files)
- output (str, optional): Output directory for JSON results
- depth (str, optional): Detection depth - surface, deep, full (default: deep)
- json (bool, optional): Output JSON format (default: False)
Returns:
List[TextContent]: Pattern detection results
Example:
detect_patterns(file="src/database.py", depth="deep")
detect_patterns(directory="src/", output="patterns/", json=True)
"""
file_path = args.get("file")
directory = args.get("directory")
if not file_path and not directory:
return [
TextContent(
type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
)
]
output = args.get("output", "")
depth = args.get("depth", "deep")
json_output = args.get("json", False)
# Build command
cmd = [sys.executable, "-m", "skill_seekers.cli.pattern_recognizer"]
if file_path:
cmd.extend(["--file", file_path])
if directory:
cmd.extend(["--directory", directory])
if output:
cmd.extend(["--output", output])
if depth:
cmd.extend(["--depth", depth])
if json_output:
cmd.append("--json")
timeout = 300 # 5 minutes for pattern detection
progress_msg = "🔍 Detecting design patterns...\n"
if file_path:
progress_msg += f"📄 File: {file_path}\n"
if directory:
progress_msg += f"📁 Directory: {directory}\n"
progress_msg += f"🎯 Detection depth: {depth}\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output_text = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
async def extract_test_examples_tool(args: dict) -> list[TextContent]:
"""
Extract usage examples from test files.
Analyzes test files to extract real API usage patterns including:
- Object instantiation with real parameters
- Method calls with expected behaviors
- Configuration examples
- Setup patterns from fixtures/setUp()
- Multi-step workflows from integration tests
Supports 9 languages: Python (AST-based deep analysis), JavaScript,
TypeScript, Go, Rust, Java, C#, PHP, Ruby (regex-based).
Args:
args: Dictionary containing:
- file (str, optional): Single test file to analyze
- directory (str, optional): Directory containing test files
- language (str, optional): Filter by language (python, javascript, etc.)
- min_confidence (float, optional): Minimum confidence threshold 0.0-1.0 (default: 0.5)
- max_per_file (int, optional): Maximum examples per file (default: 10)
- json (bool, optional): Output JSON format (default: False)
- markdown (bool, optional): Output Markdown format (default: False)
Returns:
List[TextContent]: Extracted test examples
Example:
extract_test_examples(directory="tests/", language="python")
extract_test_examples(file="tests/test_scraper.py", json=True)
"""
file_path = args.get("file")
directory = args.get("directory")
if not file_path and not directory:
return [
TextContent(
type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
)
]
language = args.get("language", "")
min_confidence = args.get("min_confidence", 0.5)
max_per_file = args.get("max_per_file", 10)
json_output = args.get("json", False)
markdown_output = args.get("markdown", False)
# Build command
cmd = [sys.executable, "-m", "skill_seekers.cli.test_example_extractor"]
if directory:
cmd.append(directory)
if file_path:
cmd.extend(["--file", file_path])
if language:
cmd.extend(["--language", language])
    # Explicit None checks so that 0 / 0.0 values are still forwarded
    if min_confidence is not None:
        cmd.extend(["--min-confidence", str(min_confidence)])
    if max_per_file is not None:
        cmd.extend(["--max-per-file", str(max_per_file)])
if json_output:
cmd.append("--json")
if markdown_output:
cmd.append("--markdown")
timeout = 180 # 3 minutes for test example extraction
progress_msg = "🧪 Extracting usage examples from test files...\n"
if file_path:
progress_msg += f"📄 File: {file_path}\n"
if directory:
progress_msg += f"📁 Directory: {directory}\n"
if language:
progress_msg += f"🔤 Language: {language}\n"
progress_msg += f"🎯 Min confidence: {min_confidence}\n"
progress_msg += f"📊 Max per file: {max_per_file}\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output_text = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
async def build_how_to_guides_tool(args: dict) -> list[TextContent]:
"""
Build how-to guides from workflow test examples.
Transforms workflow examples extracted from test files into step-by-step
educational guides. Automatically groups related workflows, extracts steps,
and generates comprehensive markdown guides.
Features:
- Python AST-based step extraction (heuristic for other languages)
- 4 grouping strategies: ai-tutorial-group, file-path, test-name, complexity
- Detects prerequisites, setup code, and verification points
- Generates troubleshooting tips and next steps
- Creates index with difficulty levels
Args:
args: Dictionary containing:
- input (str): Path to test_examples.json from extract_test_examples
- output (str, optional): Output directory for guides (default: output/codebase/tutorials)
- group_by (str, optional): Grouping strategy - ai-tutorial-group, file-path, test-name, complexity
- no_ai (bool, optional): Disable AI enhancement for grouping (default: False)
- json_output (bool, optional): Output JSON format alongside markdown (default: False)
Returns:
List[TextContent]: Guide building results
Example:
build_how_to_guides(
input="output/codebase/test_examples/test_examples.json",
group_by="ai-tutorial-group",
output="output/codebase/tutorials"
)
"""
input_file = args.get("input")
if not input_file:
return [
TextContent(
type="text",
text="❌ Error: input parameter is required (path to test_examples.json)",
)
]
output = args.get("output", "output/codebase/tutorials")
group_by = args.get("group_by", "ai-tutorial-group")
no_ai = args.get("no_ai", False)
json_output = args.get("json_output", False)
# Build command
cmd = [sys.executable, "-m", "skill_seekers.cli.how_to_guide_builder"]
cmd.append(input_file)
if output:
cmd.extend(["--output", output])
if group_by:
cmd.extend(["--group-by", group_by])
if no_ai:
cmd.append("--no-ai")
if json_output:
cmd.append("--json-output")
timeout = 180 # 3 minutes for guide building
progress_msg = "📚 Building how-to guides from workflow examples...\n"
progress_msg += f"📄 Input: {input_file}\n"
progress_msg += f"📁 Output: {output}\n"
progress_msg += f"🔀 Grouping: {group_by}\n"
if no_ai:
progress_msg += "🚫 AI enhancement disabled\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output_text = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
"""
Extract configuration patterns from config files (C3.4).
Analyzes configuration files in the codebase to extract settings,
detect common patterns (database, API, logging, cache, etc.), and
generate comprehensive documentation.
Supports 9 config formats: JSON, YAML, TOML, ENV, INI, Python modules,
JavaScript/TypeScript configs, Dockerfile, Docker Compose.
Detects 7 common patterns:
- Database configuration (host, port, credentials)
- API configuration (endpoints, keys, timeouts)
- Logging configuration (level, format, handlers)
- Cache configuration (backend, TTL, keys)
- Email configuration (SMTP, credentials)
- Authentication configuration (providers, secrets)
- Server configuration (host, port, workers)
Args:
args: Dictionary containing:
- directory (str): Directory to analyze
- output (str, optional): Output directory (default: output/codebase/config_patterns)
- max_files (int, optional): Maximum config files to process (default: 100)
- enhance (bool, optional): Enable AI enhancement - API mode (default: False, requires ANTHROPIC_API_KEY)
- enhance_local (bool, optional): Enable AI enhancement - LOCAL mode (default: False, uses Claude Code CLI)
- ai_mode (str, optional): AI mode - auto, api, local, none (default: none)
- json (bool, optional): Output JSON format (default: True)
- markdown (bool, optional): Output Markdown format (default: True)
Returns:
List[TextContent]: Config extraction results with optional AI enhancements
Example:
extract_config_patterns(directory=".", output="output/configs")
extract_config_patterns(directory="/path/to/repo", max_files=50, enhance_local=True)
"""
directory = args.get("directory")
if not directory:
return [TextContent(type="text", text="❌ Error: directory parameter is required")]
output = args.get("output", "output/codebase/config_patterns")
max_files = args.get("max_files", 100)
enhance = args.get("enhance", False)
enhance_local = args.get("enhance_local", False)
ai_mode = args.get("ai_mode", "none")
json_output = args.get("json", True)
markdown_output = args.get("markdown", True)
# Build command
cmd = [sys.executable, "-m", "skill_seekers.cli.config_extractor"]
cmd.extend(["--directory", directory])
if output:
cmd.extend(["--output", output])
if max_files:
cmd.extend(["--max-files", str(max_files)])
if enhance:
cmd.append("--enhance")
if enhance_local:
cmd.append("--enhance-local")
if ai_mode and ai_mode != "none":
cmd.extend(["--ai-mode", ai_mode])
if json_output:
cmd.append("--json")
if markdown_output:
cmd.append("--markdown")
# Adjust timeout for AI enhancement
timeout = 180 # 3 minutes base
if enhance or enhance_local or ai_mode != "none":
timeout = 360 # 6 minutes with AI enhancement
progress_msg = "⚙️ Extracting configuration patterns...\n"
progress_msg += f"📁 Directory: {directory}\n"
progress_msg += f"📄 Max files: {max_files}\n"
if enhance or enhance_local or (ai_mode and ai_mode != "none"):
progress_msg += f"🤖 AI enhancement: {ai_mode if ai_mode != 'none' else ('api' if enhance else 'local')}\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output_text = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
# Valid source types for the generic scraper
GENERIC_SOURCE_TYPES = (
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"confluence",
"notion",
"rss",
"manpage",
"chat",
)
# Source types whose primary input is a URL rather than a local file/directory.
# The per-type CLI flags are resolved via the _URL_FLAGS and _PATH_FLAGS maps
# inside scrape_generic_tool below.
_URL_BASED_TYPES = {"confluence", "notion", "rss"}
# Friendly emoji labels per source type
_SOURCE_EMOJIS = {
"jupyter": "📓",
"html": "🌐",
"openapi": "📡",
"asciidoc": "📄",
"pptx": "📊",
"confluence": "🏢",
"notion": "📝",
"rss": "📰",
"manpage": "📖",
"chat": "💬",
}
async def scrape_generic_tool(args: dict) -> list[TextContent]:
"""
Generic scraper for new source types.
Handles all 10 new source types by building the appropriate subprocess
command and delegating to the corresponding CLI scraper module.
Supported source types: jupyter, html, openapi, asciidoc, pptx,
confluence, notion, rss, manpage, chat.
Args:
args: Dictionary containing:
- source_type (str): One of the supported source types
- path (str, optional): File or directory path (for file-based sources)
- url (str, optional): URL (for URL-based sources like confluence, notion, rss)
- name (str): Skill name for the output
Returns:
List[TextContent]: Tool execution results
"""
source_type = args.get("source_type", "")
path = args.get("path")
url = args.get("url")
name = args.get("name")
# Validate source_type
if source_type not in GENERIC_SOURCE_TYPES:
return [
TextContent(
type="text",
text=(
f"❌ Error: Unknown source_type '{source_type}'. "
f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
),
)
]
# Validate that we have either path or url
if not path and not url:
return [
TextContent(
type="text",
text="❌ Error: Must specify either 'path' (file/directory) or 'url'",
)
]
if not name:
return [
TextContent(
type="text",
text="❌ Error: 'name' parameter is required",
)
]
# Build the subprocess command
# Map source type to module name (most are <type>_scraper, but some differ)
_MODULE_NAMES = {
"manpage": "man_scraper",
}
module_name = _MODULE_NAMES.get(source_type, f"{source_type}_scraper")
cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]
# Map source type to the correct CLI flag for file/path input and URL input.
# Each scraper has its own flag name — using a generic --path or --url would fail.
_PATH_FLAGS: dict[str, str] = {
"jupyter": "--notebook",
"html": "--html-path",
"openapi": "--spec",
"asciidoc": "--asciidoc-path",
"pptx": "--pptx",
"manpage": "--man-path",
"confluence": "--export-path",
"notion": "--export-path",
"rss": "--feed-path",
"chat": "--export-path",
}
_URL_FLAGS: dict[str, str] = {
"confluence": "--base-url",
"notion": "--page-id",
"rss": "--feed-url",
"openapi": "--spec-url",
}
# Determine the input flag based on source type
if source_type in _URL_BASED_TYPES and url:
url_flag = _URL_FLAGS.get(source_type, "--url")
cmd.extend([url_flag, url])
elif path:
path_flag = _PATH_FLAGS.get(source_type, "--path")
cmd.extend([path_flag, path])
    elif url:
        # URL fallback for file-based types: only types with an entry in
        # _URL_FLAGS (e.g. openapi) map to a real flag; others receive a
        # generic --url, which the target scraper may reject.
url_flag = _URL_FLAGS.get(source_type, "--url")
cmd.extend([url_flag, url])
cmd.extend(["--name", name])
# Set a reasonable timeout
timeout = 600 # 10 minutes
emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
progress_msg = f"{emoji} Scraping {source_type} source...\n"
if path:
progress_msg += f"📁 Path: {path}\n"
if url:
progress_msg += f"🔗 URL: {url}\n"
progress_msg += f"📛 Name: {name}\n"
progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"
stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
output = progress_msg + stdout
if returncode == 0:
return [TextContent(type="text", text=output)]
else:
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
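# Example invocations from an async context (paths/URLs hypothetical; input
# flags are resolved per the _PATH_FLAGS/_URL_FLAGS maps above):
#
#   await scrape_generic_tool({"source_type": "jupyter",
#                              "path": "notebooks/intro.ipynb", "name": "intro"})
#   await scrape_generic_tool({"source_type": "rss",
#                              "url": "https://example.com/feed.xml", "name": "blog"})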