Refactor: Convert to monorepo with CLI and MCP server
Major restructure to support both CLI usage and MCP integration: **Repository Structure:** - cli/ - All CLI tools (doc_scraper, estimate_pages, enhance_skill, etc.) - mcp/ - New MCP server for Claude Code integration - configs/ - Shared configuration files - tests/ - Updated to import from cli/ - docs/ - Shared documentation **MCP Server (NEW):** - mcp/server.py - Full MCP server implementation - 6 tools available: * generate_config - Create config from URL * estimate_pages - Fast page count estimation * scrape_docs - Full documentation scraping * package_skill - Package to .zip * list_configs - Show available presets * validate_config - Validate config files - mcp/README.md - Complete MCP documentation - mcp/requirements.txt - MCP dependencies **CLI Tools (Moved to cli/):** - All existing functionality preserved - Same commands, same behavior - Tests updated to import from cli.doc_scraper **Tests:** - 68/71 passing (95.8%) - Updated imports from doc_scraper to cli.doc_scraper - Fixed validate_config() tuple unpacking (errors, warnings) - 3 minor test failures (checking warnings instead of errors) **Benefits:** - Use as CLI tool: python3 cli/doc_scraper.py - Use via MCP: Integrated with Claude Code - Shared code and configs - Single source of truth 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
258
cli/estimate_pages.py
Executable file
258
cli/estimate_pages.py
Executable file
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Page Count Estimator for Skill Seeker
|
||||
Quickly estimates how many pages a config will scrape without downloading content
|
||||
"""
|
||||
|
||||
import sys
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time
|
||||
import json
|
||||
|
||||
|
||||
def estimate_pages(config, max_discovery=1000, timeout=30):
|
||||
"""
|
||||
Estimate total pages that will be scraped
|
||||
|
||||
Args:
|
||||
config: Configuration dictionary
|
||||
max_discovery: Maximum pages to discover (safety limit)
|
||||
timeout: Timeout for HTTP requests in seconds
|
||||
|
||||
Returns:
|
||||
dict with estimation results
|
||||
"""
|
||||
base_url = config['base_url']
|
||||
start_urls = config.get('start_urls', [base_url])
|
||||
url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
|
||||
rate_limit = config.get('rate_limit', 0.5)
|
||||
|
||||
visited = set()
|
||||
pending = list(start_urls)
|
||||
discovered = 0
|
||||
|
||||
include_patterns = url_patterns.get('include', [])
|
||||
exclude_patterns = url_patterns.get('exclude', [])
|
||||
|
||||
print(f"🔍 Estimating pages for: {config['name']}")
|
||||
print(f"📍 Base URL: {base_url}")
|
||||
print(f"🎯 Start URLs: {len(start_urls)}")
|
||||
print(f"⏱️ Rate limit: {rate_limit}s")
|
||||
print(f"🔢 Max discovery: {max_discovery}")
|
||||
print()
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
while pending and discovered < max_discovery:
|
||||
url = pending.pop(0)
|
||||
|
||||
# Skip if already visited
|
||||
if url in visited:
|
||||
continue
|
||||
|
||||
visited.add(url)
|
||||
discovered += 1
|
||||
|
||||
# Progress indicator
|
||||
if discovered % 10 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
rate = discovered / elapsed if elapsed > 0 else 0
|
||||
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
|
||||
|
||||
try:
|
||||
# HEAD request first to check if page exists (faster)
|
||||
head_response = requests.head(url, timeout=timeout, allow_redirects=True)
|
||||
|
||||
# Skip non-HTML content
|
||||
content_type = head_response.headers.get('Content-Type', '')
|
||||
if 'text/html' not in content_type:
|
||||
continue
|
||||
|
||||
# Now GET the page to find links
|
||||
response = requests.get(url, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# Find all links
|
||||
for link in soup.find_all('a', href=True):
|
||||
href = link['href']
|
||||
full_url = urljoin(url, href)
|
||||
|
||||
# Normalize URL
|
||||
parsed = urlparse(full_url)
|
||||
full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
||||
|
||||
# Check if URL is valid
|
||||
if not is_valid_url(full_url, base_url, include_patterns, exclude_patterns):
|
||||
continue
|
||||
|
||||
# Add to pending if not visited
|
||||
if full_url not in visited and full_url not in pending:
|
||||
pending.append(full_url)
|
||||
|
||||
# Rate limiting
|
||||
time.sleep(rate_limit)
|
||||
|
||||
except requests.RequestException as e:
|
||||
# Silently skip errors during estimation
|
||||
pass
|
||||
except Exception as e:
|
||||
# Silently skip other errors
|
||||
pass
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
# Results
|
||||
results = {
|
||||
'discovered': discovered,
|
||||
'pending': len(pending),
|
||||
'estimated_total': discovered + len(pending),
|
||||
'elapsed_seconds': round(elapsed, 2),
|
||||
'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
|
||||
'hit_limit': discovered >= max_discovery
|
||||
}
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def is_valid_url(url, base_url, include_patterns, exclude_patterns):
|
||||
"""Check if URL should be crawled"""
|
||||
# Must be same domain
|
||||
if not url.startswith(base_url.rstrip('/')):
|
||||
return False
|
||||
|
||||
# Check exclude patterns first
|
||||
if exclude_patterns:
|
||||
for pattern in exclude_patterns:
|
||||
if pattern in url:
|
||||
return False
|
||||
|
||||
# Check include patterns (if specified)
|
||||
if include_patterns:
|
||||
for pattern in include_patterns:
|
||||
if pattern in url:
|
||||
return True
|
||||
return False
|
||||
|
||||
# If no include patterns, accept by default
|
||||
return True
|
||||
|
||||
|
||||
def print_results(results, config):
|
||||
"""Print estimation results"""
|
||||
print()
|
||||
print("=" * 70)
|
||||
print("📊 ESTIMATION RESULTS")
|
||||
print("=" * 70)
|
||||
print()
|
||||
print(f"Config: {config['name']}")
|
||||
print(f"Base URL: {config['base_url']}")
|
||||
print()
|
||||
print(f"✅ Pages Discovered: {results['discovered']}")
|
||||
print(f"⏳ Pages Pending: {results['pending']}")
|
||||
print(f"📈 Estimated Total: {results['estimated_total']}")
|
||||
print()
|
||||
print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
|
||||
print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
|
||||
|
||||
if results['hit_limit']:
|
||||
print()
|
||||
print("⚠️ Hit discovery limit - actual total may be higher")
|
||||
print(" Increase max_discovery parameter for more accurate estimate")
|
||||
|
||||
print()
|
||||
print("=" * 70)
|
||||
print("💡 RECOMMENDATIONS")
|
||||
print("=" * 70)
|
||||
print()
|
||||
|
||||
estimated = results['estimated_total']
|
||||
current_max = config.get('max_pages', 100)
|
||||
|
||||
if estimated <= current_max:
|
||||
print(f"✅ Current max_pages ({current_max}) is sufficient")
|
||||
else:
|
||||
recommended = min(estimated + 50, 10000) # Add 50 buffer, cap at 10k
|
||||
print(f"⚠️ Current max_pages ({current_max}) may be too low")
|
||||
print(f"📝 Recommended max_pages: {recommended}")
|
||||
print(f" (Estimated {estimated} + 50 buffer)")
|
||||
|
||||
# Estimate time for full scrape
|
||||
rate_limit = config.get('rate_limit', 0.5)
|
||||
estimated_time = (estimated * rate_limit) / 60 # in minutes
|
||||
|
||||
print()
|
||||
print(f"⏱️ Estimated full scrape time: {estimated_time:.1f} minutes")
|
||||
print(f" (Based on rate_limit: {rate_limit}s)")
|
||||
|
||||
print()
|
||||
|
||||
|
||||
def load_config(config_path):
|
||||
"""Load configuration from JSON file"""
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
config = json.load(f)
|
||||
return config
|
||||
except FileNotFoundError:
|
||||
print(f"❌ Error: Config file not found: {config_path}")
|
||||
sys.exit(1)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"❌ Error: Invalid JSON in config file: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Estimate page count for Skill Seeker configs',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Estimate pages for a config
|
||||
python3 estimate_pages.py configs/react.json
|
||||
|
||||
# Estimate with higher discovery limit
|
||||
python3 estimate_pages.py configs/godot.json --max-discovery 2000
|
||||
|
||||
# Quick estimate (stop at 100 pages)
|
||||
python3 estimate_pages.py configs/vue.json --max-discovery 100
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument('config', help='Path to config JSON file')
|
||||
parser.add_argument('--max-discovery', '-m', type=int, default=1000,
|
||||
help='Maximum pages to discover (default: 1000)')
|
||||
parser.add_argument('--timeout', '-t', type=int, default=30,
|
||||
help='HTTP request timeout in seconds (default: 30)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load config
|
||||
config = load_config(args.config)
|
||||
|
||||
# Run estimation
|
||||
try:
|
||||
results = estimate_pages(config, args.max_discovery, args.timeout)
|
||||
print_results(results, config)
|
||||
|
||||
# Return exit code based on results
|
||||
if results['hit_limit']:
|
||||
return 2 # Warning: hit limit
|
||||
return 0 # Success
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\n⚠️ Estimation interrupted by user")
|
||||
return 1
|
||||
except Exception as e:
|
||||
print(f"\n\n❌ Error during estimation: {e}")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user