skill-seekers-reference/src/skill_seekers/mcp/tools/scraping_tools.py
YusufKaraaslanSpyke 170dd0fd75 feat(C3.9): Add project documentation extraction from markdown files
- Scan ALL .md files in project (README, docs/, etc.)
- Smart categorization by folder/filename (overview, architecture, guides, etc.)
- Processing depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- AI enhancement at level 2+ adds topic extraction and cross-references
- New "Project Documentation" section in SKILL.md with summaries
- Output to references/documentation/ organized by category
- Default ON, use --skip-docs to disable
- Add skip_docs parameter to MCP scrape_codebase_tool
- Add 15 new tests for markdown documentation features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 13:54:56 +03:00


"""
Scraping Tools Module for MCP Server
This module contains all scraping-related MCP tool implementations:
- estimate_pages_tool: Estimate page count before scraping
- scrape_docs_tool: Scrape documentation (legacy or unified)
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
Extracted from server.py for better modularity and organization.
"""
import json
import sys
from pathlib import Path
# MCP types - with graceful fallback for testing
try:
from mcp.types import TextContent
except ImportError:
# Graceful degradation: Create a simple fallback class for testing
class TextContent:
"""Fallback TextContent for when MCP is not installed"""
def __init__(self, type: str, text: str):
self.type = type
self.text = text
# Path to CLI tools
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
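# Given this module's location (src/skill_seekers/mcp/tools/), the three
# .parent hops resolve CLI_DIR to src/skill_seekers/cli/, where the CLI
# scripts invoked below live.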


def run_subprocess_with_streaming(cmd: list[str], timeout: int | None = None) -> tuple[str, str, int]:
    """
    Run subprocess with real-time output streaming.

    This solves the blocking issue where long-running processes (like scraping)
    would cause MCP to appear frozen. Now we stream output as it comes.

    Args:
        cmd: Command list to execute
        timeout: Optional timeout in seconds

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    import subprocess
    import time

    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # Line buffered
            universal_newlines=True,
        )

        stdout_lines = []
        stderr_lines = []
        start_time = time.time()

        # Read output line by line as it comes
        while True:
            # Check timeout
            if timeout and (time.time() - start_time) > timeout:
                process.kill()
                stderr_lines.append(f"\n⚠️ Process killed after {timeout}s timeout")
                break

            # Check if process finished
            if process.poll() is not None:
                break

            # Read available output (non-blocking)
            try:
                import select

                readable, _, _ = select.select([process.stdout, process.stderr], [], [], 0.1)
                if process.stdout in readable:
                    line = process.stdout.readline()
                    if line:
                        stdout_lines.append(line)
                if process.stderr in readable:
                    line = process.stderr.readline()
                    if line:
                        stderr_lines.append(line)
            except Exception:
                # Fallback for Windows (select does not work on pipes there)
                time.sleep(0.1)

        # Get any remaining output
        remaining_stdout, remaining_stderr = process.communicate()
        if remaining_stdout:
            stdout_lines.append(remaining_stdout)
        if remaining_stderr:
            stderr_lines.append(remaining_stderr)

        stdout = "".join(stdout_lines)
        stderr = "".join(stderr_lines)
        returncode = process.returncode

        return stdout, stderr, returncode
    except Exception as e:
        return "", f"Error running subprocess: {str(e)}", 1


async def estimate_pages_tool(args: dict) -> list[TextContent]:
    """
    Estimate page count from a config file.

    Performs a fast preview without downloading content to estimate
    how many pages will be scraped.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - max_discovery (int, optional): Maximum pages to discover (default: 1000)
            - unlimited (bool, optional): Remove discovery limit (default: False)

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args["config_path"]
    max_discovery = args.get("max_discovery", 1000)
    unlimited = args.get("unlimited", False)

    # Handle unlimited mode
    if unlimited or max_discovery == -1:
        max_discovery = -1
        timeout = 1800  # 30 minutes for unlimited discovery
    else:
        # Estimate: 0.5s per page discovered
        timeout = max(300, max_discovery // 2)  # Minimum 5 minutes

    # Run estimate_pages.py
    cmd = [
        sys.executable,
        str(CLI_DIR / "estimate_pages.py"),
        config_path,
        "--max-discovery",
        str(max_discovery),
    ]

    progress_msg = "🔄 Estimating page count...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]


async def scrape_docs_tool(args: dict) -> list[TextContent]:
    """
    Scrape documentation and build skill.

    Auto-detects unified vs legacy format and routes to the appropriate scraper.
    Supports both single-source (legacy) and unified multi-source configs.
    Creates SKILL.md and reference files.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - unlimited (bool, optional): Remove page limit (default: False)
            - enhance_local (bool, optional): Open terminal for local enhancement (default: False)
            - skip_scrape (bool, optional): Skip scraping, use cached data (default: False)
            - dry_run (bool, optional): Preview without saving (default: False)
            - merge_mode (str, optional): Override merge mode for unified configs

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args["config_path"]
    unlimited = args.get("unlimited", False)
    enhance_local = args.get("enhance_local", False)
    skip_scrape = args.get("skip_scrape", False)
    dry_run = args.get("dry_run", False)
    merge_mode = args.get("merge_mode")

    # Load config to detect format
    with open(config_path) as f:
        config = json.load(f)

    # Detect if unified format (has 'sources' array)
    is_unified = "sources" in config and isinstance(config["sources"], list)

    # Handle unlimited mode by modifying config temporarily
    if unlimited:
        # Set max_pages to None (unlimited)
        if is_unified:
            # For unified configs, set max_pages on documentation sources
            for source in config.get("sources", []):
                if source.get("type") == "documentation":
                    source["max_pages"] = None
        else:
            # For legacy configs
            config["max_pages"] = None

        # Create temporary config file
        temp_config_path = config_path.replace(".json", "_unlimited_temp.json")
        with open(temp_config_path, "w") as f:
            json.dump(config, f, indent=2)
        config_to_use = temp_config_path
    else:
        config_to_use = config_path

    # Choose scraper based on format
    if is_unified:
        scraper_script = "unified_scraper.py"
        progress_msg = "🔄 Starting unified multi-source scraping...\n"
        progress_msg += "📦 Config format: Unified (multiple sources)\n"
    else:
        scraper_script = "doc_scraper.py"
        progress_msg = "🔄 Starting scraping process...\n"
        progress_msg += "📦 Config format: Legacy (single source)\n"

    # Build command
    cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use]

    # Add merge mode for unified configs
    if is_unified and merge_mode:
        cmd.extend(["--merge-mode", merge_mode])

    # Add --fresh to avoid user input prompts when existing data is found
    if not skip_scrape:
        cmd.append("--fresh")
    if enhance_local:
        cmd.append("--enhance-local")
    if skip_scrape:
        cmd.append("--skip-scrape")
    if dry_run:
        cmd.append("--dry-run")

    # Determine timeout based on operation type
    if dry_run:
        timeout = 300  # 5 minutes for dry run
    elif skip_scrape:
        timeout = 600  # 10 minutes for building from cache
    elif unlimited:
        timeout = None  # No timeout for unlimited mode (user explicitly requested)
    else:
        # Read config to estimate timeout
        try:
            if is_unified:
                # For unified configs, estimate based on all sources
                total_pages = 0
                for source in config.get("sources", []):
                    if source.get("type") == "documentation":
                        total_pages += source.get("max_pages", 500)
                max_pages = total_pages or 500
            else:
                max_pages = config.get("max_pages", 500)
            # Estimate: ~30s per page plus buffer, so 35s per page
            timeout = max(3600, max_pages * 35)  # Minimum 1 hour
        except Exception:
            timeout = 14400  # Default: 4 hours

    # Add progress message
    if timeout:
        progress_msg += f"⏱️ Maximum time allowed: {timeout // 60} minutes\n"
    else:
        progress_msg += "⏱️ Unlimited mode - no timeout\n"
    progress_msg += "📝 Progress will be shown below:\n\n"

    # Run scraper with streaming
    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    # Clean up temporary config
    if unlimited and Path(config_to_use).exists():
        Path(config_to_use).unlink()

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        error_output = output + f"\n\n❌ Error:\n{stderr}"
        return [TextContent(type="text", text=error_output)]


async def scrape_pdf_tool(args: dict) -> list[TextContent]:
    """
    Scrape PDF documentation and build Claude skill.

    Extracts text, code, and images from PDF files and builds
    a skill package with organized references.

    Args:
        args: Dictionary containing:
            - config_path (str, optional): Path to PDF config JSON file
            - pdf_path (str, optional): Direct PDF path (alternative to config_path)
            - name (str, optional): Skill name (required with pdf_path)
            - description (str, optional): Skill description
            - from_json (str, optional): Build from extracted JSON file

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args.get("config_path")
    pdf_path = args.get("pdf_path")
    name = args.get("name")
    description = args.get("description")
    from_json = args.get("from_json")

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")]

    # Mode 1: Config file
    if config_path:
        cmd.extend(["--config", config_path])
    # Mode 2: Direct PDF
    elif pdf_path and name:
        cmd.extend(["--pdf", pdf_path, "--name", name])
        if description:
            cmd.extend(["--description", description])
    # Mode 3: From JSON
    elif from_json:
        cmd.extend(["--from-json", from_json])
    else:
        return [
            TextContent(
                type="text",
                text="❌ Error: Must specify config_path, pdf_path + name, or from_json",
            )
        ]

    # Run pdf_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for PDF extraction
    progress_msg = "📄 Scraping PDF documentation...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]


async def scrape_github_tool(args: dict) -> list[TextContent]:
    """
    Scrape GitHub repository and build Claude skill.

    Extracts README, Issues, Changelog, Releases, and code structure
    from GitHub repositories to create comprehensive skills.

    Args:
        args: Dictionary containing:
            - repo (str, optional): GitHub repository (owner/repo)
            - config_path (str, optional): Path to GitHub config JSON file
            - name (str, optional): Skill name (default: repo name)
            - description (str, optional): Skill description
            - token (str, optional): GitHub personal access token
            - no_issues (bool, optional): Skip GitHub issues extraction (default: False)
            - no_changelog (bool, optional): Skip CHANGELOG extraction (default: False)
            - no_releases (bool, optional): Skip releases extraction (default: False)
            - max_issues (int, optional): Maximum issues to fetch (default: 100)
            - scrape_only (bool, optional): Only scrape, don't build skill (default: False)

    Returns:
        List[TextContent]: Tool execution results
    """
    repo = args.get("repo")
    config_path = args.get("config_path")
    name = args.get("name")
    description = args.get("description")
    token = args.get("token")
    no_issues = args.get("no_issues", False)
    no_changelog = args.get("no_changelog", False)
    no_releases = args.get("no_releases", False)
    max_issues = args.get("max_issues", 100)
    scrape_only = args.get("scrape_only", False)

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")]

    # Mode 1: Config file
    if config_path:
        cmd.extend(["--config", config_path])
    # Mode 2: Direct repo
    elif repo:
        cmd.extend(["--repo", repo])
        if name:
            cmd.extend(["--name", name])
        if description:
            cmd.extend(["--description", description])
        if token:
            cmd.extend(["--token", token])
        if no_issues:
            cmd.append("--no-issues")
        if no_changelog:
            cmd.append("--no-changelog")
        if no_releases:
            cmd.append("--no-releases")
        if max_issues != 100:
            cmd.extend(["--max-issues", str(max_issues)])
        if scrape_only:
            cmd.append("--scrape-only")
    else:
        return [TextContent(type="text", text="❌ Error: Must specify repo or config_path")]

    # Run github_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for GitHub scraping
    progress_msg = "🐙 Scraping GitHub repository...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
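
# Illustrative call ("owner/repo" is a placeholder); unset optional args
# simply fall back to the CLI defaults:
#
#     await scrape_github_tool({"repo": "owner/repo", "max_issues": 50,
#                               "no_releases": True})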


async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    """
    Analyze local codebase and extract code knowledge.

    Walks the directory tree, analyzes code files, extracts signatures and
    docstrings, and generates API reference documentation, dependency graphs,
    design patterns, test examples, and how-to guides.

    All features are ON by default. Use skip_* parameters to disable specific features.

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory for results (default: output/codebase/)
            - depth (str, optional): Analysis depth - surface, deep, full (default: deep)
            - languages (str, optional): Comma-separated languages (e.g., "Python,JavaScript,C++")
            - file_patterns (str, optional): Comma-separated file patterns (e.g., "*.py,src/**/*.js")
            - enhance_level (int, optional): AI enhancement level 0-3 (default: 0)
                - 0: No AI enhancement
                - 1: SKILL.md enhancement only
                - 2: SKILL.md + Architecture + Config enhancement
                - 3: Full enhancement (patterns, tests, config, architecture, SKILL.md)
            - skip_api_reference (bool, optional): Skip API reference generation (default: False)
            - skip_dependency_graph (bool, optional): Skip dependency graph (default: False)
            - skip_patterns (bool, optional): Skip design pattern detection (default: False)
            - skip_test_examples (bool, optional): Skip test example extraction (default: False)
            - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
            - skip_config_patterns (bool, optional): Skip config pattern extraction (default: False)
            - skip_docs (bool, optional): Skip project documentation extraction (default: False)

    Returns:
        List[TextContent]: Tool execution results

    Example:
        scrape_codebase(
            directory="/path/to/repo",
            depth="deep",
            enhance_level=1
        )
        scrape_codebase(
            directory="/path/to/repo",
            enhance_level=2,
            skip_patterns=True
        )
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    output = args.get("output", "output/codebase/")
    depth = args.get("depth", "deep")
    languages = args.get("languages", "")
    file_patterns = args.get("file_patterns", "")
    enhance_level = args.get("enhance_level", 0)

    # Skip flags (features are ON by default)
    skip_api_reference = args.get("skip_api_reference", False)
    skip_dependency_graph = args.get("skip_dependency_graph", False)
    skip_patterns = args.get("skip_patterns", False)
    skip_test_examples = args.get("skip_test_examples", False)
    skip_how_to_guides = args.get("skip_how_to_guides", False)
    skip_config_patterns = args.get("skip_config_patterns", False)
    skip_docs = args.get("skip_docs", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]
    cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if languages:
        cmd.extend(["--languages", languages])
    if file_patterns:
        cmd.extend(["--file-patterns", file_patterns])
    if enhance_level > 0:
        cmd.extend(["--enhance-level", str(enhance_level)])

    # Skip flags
    if skip_api_reference:
        cmd.append("--skip-api-reference")
    if skip_dependency_graph:
        cmd.append("--skip-dependency-graph")
    if skip_patterns:
        cmd.append("--skip-patterns")
    if skip_test_examples:
        cmd.append("--skip-test-examples")
    if skip_how_to_guides:
        cmd.append("--skip-how-to-guides")
    if skip_config_patterns:
        cmd.append("--skip-config-patterns")
    if skip_docs:
        cmd.append("--skip-docs")

    # Adjust timeout based on enhance_level
    timeout = 600  # 10 minutes base
    if enhance_level >= 2:
        timeout = 1200  # 20 minutes with AI enhancement
    if enhance_level >= 3:
        timeout = 3600  # 60 minutes for full enhancement

    level_names = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"}
    progress_msg = "🔍 Analyzing local codebase...\n"
    progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"📊 Depth: {depth}\n"
    if enhance_level > 0:
        progress_msg += (
            f"🤖 AI Enhancement: Level {enhance_level} ({level_names.get(enhance_level, 'unknown')})\n"
        )
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
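
# Project documentation extraction (C3.9) is on by default; like every other
# feature it is opt-out via its skip flag (directory path hypothetical):
#
#     await scrape_codebase_tool({"directory": "/path/to/repo", "skip_docs": True})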


async def detect_patterns_tool(args: dict) -> list[TextContent]:
    """
    Detect design patterns in source code.

    Analyzes source files or directories to detect common design patterns
    (Singleton, Factory, Observer, Strategy, Decorator, Builder, Adapter,
    Command, Template Method, Chain of Responsibility).

    Supported languages: Python, JavaScript, TypeScript, C++, C, C#,
    Go, Rust, Java, Ruby, PHP.

    Args:
        args: Dictionary containing:
            - file (str, optional): Single file to analyze
            - directory (str, optional): Directory to analyze (analyzes all source files)
            - output (str, optional): Output directory for JSON results
            - depth (str, optional): Detection depth - surface, deep, full (default: deep)
            - json (bool, optional): Output JSON format (default: False)

    Returns:
        List[TextContent]: Pattern detection results

    Example:
        detect_patterns(file="src/database.py", depth="deep")
        detect_patterns(directory="src/", output="patterns/", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")
    if not file_path and not directory:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    output = args.get("output", "")
    depth = args.get("depth", "deep")
    json_output = args.get("json", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.pattern_recognizer"]
    if file_path:
        cmd.extend(["--file", file_path])
    if directory:
        cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if json_output:
        cmd.append("--json")

    timeout = 300  # 5 minutes for pattern detection
    progress_msg = "🔍 Detecting design patterns...\n"
    if file_path:
        progress_msg += f"📄 File: {file_path}\n"
    if directory:
        progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"🎯 Detection depth: {depth}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
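
# The args above translate into a CLI invocation of this shape (mirrors the
# docstring's first example):
#
#     python -m skill_seekers.cli.pattern_recognizer \
#         --file src/database.py --depth deep --json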


async def extract_test_examples_tool(args: dict) -> list[TextContent]:
    """
    Extract usage examples from test files.

    Analyzes test files to extract real API usage patterns including:
    - Object instantiation with real parameters
    - Method calls with expected behaviors
    - Configuration examples
    - Setup patterns from fixtures/setUp()
    - Multi-step workflows from integration tests

    Supports 9 languages: Python (AST-based deep analysis), JavaScript,
    TypeScript, Go, Rust, Java, C#, PHP, Ruby (regex-based).

    Args:
        args: Dictionary containing:
            - file (str, optional): Single test file to analyze
            - directory (str, optional): Directory containing test files
            - language (str, optional): Filter by language (python, javascript, etc.)
            - min_confidence (float, optional): Minimum confidence threshold 0.0-1.0 (default: 0.5)
            - max_per_file (int, optional): Maximum examples per file (default: 10)
            - json (bool, optional): Output JSON format (default: False)
            - markdown (bool, optional): Output Markdown format (default: False)

    Returns:
        List[TextContent]: Extracted test examples

    Example:
        extract_test_examples(directory="tests/", language="python")
        extract_test_examples(file="tests/test_scraper.py", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")
    if not file_path and not directory:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    language = args.get("language", "")
    min_confidence = args.get("min_confidence", 0.5)
    max_per_file = args.get("max_per_file", 10)
    json_output = args.get("json", False)
    markdown_output = args.get("markdown", False)

    # Build command (the directory is passed positionally)
    cmd = [sys.executable, "-m", "skill_seekers.cli.test_example_extractor"]
    if directory:
        cmd.append(directory)
    if file_path:
        cmd.extend(["--file", file_path])
    if language:
        cmd.extend(["--language", language])
    # Note: falsy values (0 / 0.0) fall through to the CLI defaults here
    if min_confidence:
        cmd.extend(["--min-confidence", str(min_confidence)])
    if max_per_file:
        cmd.extend(["--max-per-file", str(max_per_file)])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    timeout = 180  # 3 minutes for test example extraction
    progress_msg = "🧪 Extracting usage examples from test files...\n"
    if file_path:
        progress_msg += f"📄 File: {file_path}\n"
    if directory:
        progress_msg += f"📁 Directory: {directory}\n"
    if language:
        progress_msg += f"🔤 Language: {language}\n"
    progress_msg += f"🎯 Min confidence: {min_confidence}\n"
    progress_msg += f"📊 Max per file: {max_per_file}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
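
# Equivalent CLI invocation built from the docstring's first example, with
# the defaults made explicit (directory is positional):
#
#     python -m skill_seekers.cli.test_example_extractor tests/ \
#         --language python --min-confidence 0.5 --max-per-file 10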


async def build_how_to_guides_tool(args: dict) -> list[TextContent]:
    """
    Build how-to guides from workflow test examples.

    Transforms workflow examples extracted from test files into step-by-step
    educational guides. Automatically groups related workflows, extracts steps,
    and generates comprehensive markdown guides.

    Features:
    - Python AST-based step extraction (heuristic for other languages)
    - 4 grouping strategies: ai-tutorial-group, file-path, test-name, complexity
    - Detects prerequisites, setup code, and verification points
    - Generates troubleshooting tips and next steps
    - Creates index with difficulty levels

    Args:
        args: Dictionary containing:
            - input (str): Path to test_examples.json from extract_test_examples
            - output (str, optional): Output directory for guides (default: output/codebase/tutorials)
            - group_by (str, optional): Grouping strategy - ai-tutorial-group, file-path, test-name, complexity
            - no_ai (bool, optional): Disable AI enhancement for grouping (default: False)
            - json_output (bool, optional): Output JSON format alongside markdown (default: False)

    Returns:
        List[TextContent]: Guide building results

    Example:
        build_how_to_guides(
            input="output/codebase/test_examples/test_examples.json",
            group_by="ai-tutorial-group",
            output="output/codebase/tutorials"
        )
    """
    input_file = args.get("input")
    if not input_file:
        return [
            TextContent(
                type="text",
                text="❌ Error: input parameter is required (path to test_examples.json)",
            )
        ]

    output = args.get("output", "output/codebase/tutorials")
    group_by = args.get("group_by", "ai-tutorial-group")
    no_ai = args.get("no_ai", False)
    json_output = args.get("json_output", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.how_to_guide_builder"]
    cmd.append(input_file)
    if output:
        cmd.extend(["--output", output])
    if group_by:
        cmd.extend(["--group-by", group_by])
    if no_ai:
        cmd.append("--no-ai")
    if json_output:
        cmd.append("--json-output")

    timeout = 180  # 3 minutes for guide building
    progress_msg = "📚 Building how-to guides from workflow examples...\n"
    progress_msg += f"📄 Input: {input_file}\n"
    progress_msg += f"📁 Output: {output}\n"
    progress_msg += f"🔀 Grouping: {group_by}\n"
    if no_ai:
        progress_msg += "🚫 AI enhancement disabled\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
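
# This tool chains after extract_test_examples_tool; the input path below is
# the location used in the docstring example above:
#
#     await extract_test_examples_tool({"directory": "tests/", "json": True})
#     await build_how_to_guides_tool(
#         {"input": "output/codebase/test_examples/test_examples.json"})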


async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
    """
    Extract configuration patterns from config files (C3.4).

    Analyzes configuration files in the codebase to extract settings,
    detect common patterns (database, API, logging, cache, etc.), and
    generate comprehensive documentation.

    Supports 9 config formats: JSON, YAML, TOML, ENV, INI, Python modules,
    JavaScript/TypeScript configs, Dockerfile, Docker Compose.

    Detects 7 common patterns:
    - Database configuration (host, port, credentials)
    - API configuration (endpoints, keys, timeouts)
    - Logging configuration (level, format, handlers)
    - Cache configuration (backend, TTL, keys)
    - Email configuration (SMTP, credentials)
    - Authentication configuration (providers, secrets)
    - Server configuration (host, port, workers)

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory (default: output/codebase/config_patterns)
            - max_files (int, optional): Maximum config files to process (default: 100)
            - enhance (bool, optional): Enable AI enhancement - API mode (default: False, requires ANTHROPIC_API_KEY)
            - enhance_local (bool, optional): Enable AI enhancement - LOCAL mode (default: False, uses Claude Code CLI)
            - ai_mode (str, optional): AI mode - auto, api, local, none (default: none)
            - json (bool, optional): Output JSON format (default: True)
            - markdown (bool, optional): Output Markdown format (default: True)

    Returns:
        List[TextContent]: Config extraction results with optional AI enhancements

    Example:
        extract_config_patterns(directory=".", output="output/configs")
        extract_config_patterns(directory="/path/to/repo", max_files=50, enhance_local=True)
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    output = args.get("output", "output/codebase/config_patterns")
    max_files = args.get("max_files", 100)
    enhance = args.get("enhance", False)
    enhance_local = args.get("enhance_local", False)
    ai_mode = args.get("ai_mode", "none")
    json_output = args.get("json", True)
    markdown_output = args.get("markdown", True)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.config_extractor"]
    cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if max_files:
        cmd.extend(["--max-files", str(max_files)])
    if enhance:
        cmd.append("--enhance")
    if enhance_local:
        cmd.append("--enhance-local")
    if ai_mode and ai_mode != "none":
        cmd.extend(["--ai-mode", ai_mode])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    # Adjust timeout for AI enhancement
    timeout = 180  # 3 minutes base
    if enhance or enhance_local or ai_mode != "none":
        timeout = 360  # 6 minutes with AI enhancement

    progress_msg = "⚙️ Extracting configuration patterns...\n"
    progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"📄 Max files: {max_files}\n"
    if enhance or enhance_local or (ai_mode and ai_mode != "none"):
        progress_msg += f"🤖 AI enhancement: {ai_mode if ai_mode != 'none' else ('api' if enhance else 'local')}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
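
# Illustrative end-of-pipeline call; with AI enhancement requested the timeout
# doubles as coded above (enhance=True requires ANTHROPIC_API_KEY):
#
#     await extract_config_patterns_tool({"directory": ".", "enhance": True})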