Refactor: Convert to monorepo with CLI and MCP server

Major restructure to support both CLI usage and MCP integration:

**Repository Structure:**
- cli/ - All CLI tools (doc_scraper, estimate_pages, enhance_skill, etc.)
- mcp/ - New MCP server for Claude Code integration
- configs/ - Shared configuration files
- tests/ - Updated to import from cli/
- docs/ - Shared documentation

**MCP Server (NEW):**
- mcp/server.py - Full MCP server implementation
- 6 tools available:
  * generate_config - Create config from URL
  * estimate_pages - Fast page count estimation
  * scrape_docs - Full documentation scraping
  * package_skill - Package to .zip
  * list_configs - Show available presets
  * validate_config - Validate config files
- mcp/README.md - Complete MCP documentation
- mcp/requirements.txt - MCP dependencies

**CLI Tools (Moved to cli/):**
- All existing functionality preserved
- Same commands, same behavior
- Tests updated to import from cli.doc_scraper

**Tests:**
- 68/71 passing (95.8%)
- Updated imports from doc_scraper to cli.doc_scraper
- Fixed validate_config() tuple unpacking (errors, warnings)
- 3 minor test failures (checking warnings instead of errors)

**Benefits:**
- Use as CLI tool: python3 cli/doc_scraper.py
- Use via MCP: Integrated with Claude Code
- Shared code and configs
- Single source of truth

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
yusyus
2025-10-19 15:19:53 +03:00
parent af87572735
commit ae924a9d05
12 changed files with 658 additions and 34 deletions

956
cli/doc_scraper.py Normal file
View File

@@ -0,0 +1,956 @@
#!/usr/bin/env python3
"""
Documentation to Claude Skill Converter
Single tool to scrape any documentation and create high-quality Claude skills.
Usage:
python3 doc_scraper.py --interactive
python3 doc_scraper.py --config configs/godot.json
python3 doc_scraper.py --url https://react.dev/ --name react
"""
import os
import sys
import json
import time
import re
import argparse
import hashlib
import requests
from pathlib import Path
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque, defaultdict
class DocToSkillConverter:
def __init__(self, config, dry_run=False):
self.config = config
self.name = config['name']
self.base_url = config['base_url']
self.dry_run = dry_run
# Paths
self.data_dir = f"output/{self.name}_data"
self.skill_dir = f"output/{self.name}"
# State
self.visited_urls = set()
# Support multiple starting URLs
start_urls = config.get('start_urls', [self.base_url])
self.pending_urls = deque(start_urls)
self.pages = []
# Create directories (unless dry-run)
if not dry_run:
os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
def is_valid_url(self, url):
"""Check if URL should be scraped"""
if not url.startswith(self.base_url):
return False
# Include patterns
includes = self.config.get('url_patterns', {}).get('include', [])
if includes and not any(pattern in url for pattern in includes):
return False
# Exclude patterns
excludes = self.config.get('url_patterns', {}).get('exclude', [])
if any(pattern in url for pattern in excludes):
return False
return True
def extract_content(self, soup, url):
"""Extract content with improved code and pattern detection"""
page = {
'url': url,
'title': '',
'content': '',
'headings': [],
'code_samples': [],
'patterns': [], # NEW: Extract common patterns
'links': []
}
selectors = self.config.get('selectors', {})
# Extract title
title_elem = soup.select_one(selectors.get('title', 'title'))
if title_elem:
page['title'] = self.clean_text(title_elem.get_text())
# Find main content
main_selector = selectors.get('main_content', 'div[role="main"]')
main = soup.select_one(main_selector)
if not main:
print(f"⚠ No content: {url}")
return page
# Extract headings with better structure
for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
text = self.clean_text(h.get_text())
if text:
page['headings'].append({
'level': h.name,
'text': text,
'id': h.get('id', '')
})
# Extract code with language detection
code_selector = selectors.get('code_blocks', 'pre code')
for code_elem in main.select(code_selector):
code = code_elem.get_text()
if len(code.strip()) > 10:
# Try to detect language
lang = self.detect_language(code_elem, code)
page['code_samples'].append({
'code': code.strip(),
'language': lang
})
# Extract patterns (NEW: common code patterns)
page['patterns'] = self.extract_patterns(main, page['code_samples'])
# Extract paragraphs
paragraphs = []
for p in main.find_all('p'):
text = self.clean_text(p.get_text())
if text and len(text) > 20: # Skip very short paragraphs
paragraphs.append(text)
page['content'] = '\n\n'.join(paragraphs)
# Extract links
for link in main.find_all('a', href=True):
href = urljoin(url, link['href'])
if self.is_valid_url(href):
page['links'].append(href)
return page
def detect_language(self, elem, code):
"""Detect programming language from code block"""
# Check class attribute
classes = elem.get('class', [])
for cls in classes:
if 'language-' in cls:
return cls.replace('language-', '')
if 'lang-' in cls:
return cls.replace('lang-', '')
# Check parent pre element
parent = elem.parent
if parent and parent.name == 'pre':
classes = parent.get('class', [])
for cls in classes:
if 'language-' in cls:
return cls.replace('language-', '')
# Heuristic detection
if 'import ' in code and 'from ' in code:
return 'python'
if 'const ' in code or 'let ' in code or '=>' in code:
return 'javascript'
if 'func ' in code and 'var ' in code:
return 'gdscript'
if 'def ' in code and ':' in code:
return 'python'
if '#include' in code or 'int main' in code:
return 'cpp'
return 'unknown'
def extract_patterns(self, main, code_samples):
"""Extract common coding patterns (NEW FEATURE)"""
patterns = []
# Look for "Example:" or "Pattern:" sections
for elem in main.find_all(['p', 'div']):
text = elem.get_text().lower()
if any(word in text for word in ['example:', 'pattern:', 'usage:', 'typical use']):
# Get the code that follows
next_code = elem.find_next(['pre', 'code'])
if next_code:
patterns.append({
'description': self.clean_text(elem.get_text()),
'code': next_code.get_text().strip()
})
return patterns[:5] # Limit to 5 most relevant patterns
def clean_text(self, text):
"""Clean text content"""
text = re.sub(r'\s+', ' ', text)
return text.strip()
def save_page(self, page):
"""Save page data"""
url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
safe_title = re.sub(r'[-\s]+', '_', safe_title)
filename = f"{safe_title}_{url_hash}.json"
filepath = os.path.join(self.data_dir, "pages", filename)
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(page, f, indent=2, ensure_ascii=False)
def scrape_page(self, url):
"""Scrape a single page"""
try:
print(f" {url}")
headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
page = self.extract_content(soup, url)
self.save_page(page)
self.pages.append(page)
# Add new URLs
for link in page['links']:
if link not in self.visited_urls and link not in self.pending_urls:
self.pending_urls.append(link)
# Rate limiting
time.sleep(self.config.get('rate_limit', 0.5))
except Exception as e:
print(f" ✗ Error: {e}")
def scrape_all(self):
"""Scrape all pages"""
print(f"\n{'='*60}")
if self.dry_run:
print(f"DRY RUN: {self.name}")
else:
print(f"SCRAPING: {self.name}")
print(f"{'='*60}")
print(f"Base URL: {self.base_url}")
if self.dry_run:
print(f"Mode: Preview only (no actual scraping)\n")
else:
print(f"Output: {self.data_dir}\n")
max_pages = self.config.get('max_pages', 500)
# Dry run: preview first 20 URLs
preview_limit = 20 if self.dry_run else max_pages
while self.pending_urls and len(self.visited_urls) < preview_limit:
url = self.pending_urls.popleft()
if url in self.visited_urls:
continue
self.visited_urls.add(url)
if self.dry_run:
# Just show what would be scraped
print(f" [Preview] {url}")
# Simulate finding links without actually scraping
try:
headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper - Dry Run)'}
response = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
main_selector = self.config.get('selectors', {}).get('main_content', 'div[role="main"]')
main = soup.select_one(main_selector)
if main:
for link in main.find_all('a', href=True):
href = urljoin(url, link['href'])
if self.is_valid_url(href) and href not in self.visited_urls:
self.pending_urls.append(href)
except:
pass # Ignore errors in dry run
else:
self.scrape_page(url)
if len(self.visited_urls) % 10 == 0:
print(f" [{len(self.visited_urls)} pages]")
if self.dry_run:
print(f"\n✅ Dry run complete: would scrape ~{len(self.visited_urls)} pages")
if len(self.visited_urls) >= preview_limit:
print(f" (showing first {preview_limit}, actual scraping may find more)")
print(f"\n💡 To actually scrape, run without --dry-run")
else:
print(f"\n✅ Scraped {len(self.visited_urls)} pages")
self.save_summary()
def save_summary(self):
"""Save scraping summary"""
summary = {
'name': self.name,
'total_pages': len(self.pages),
'base_url': self.base_url,
'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages]
}
with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f:
json.dump(summary, f, indent=2, ensure_ascii=False)
def load_scraped_data(self):
"""Load previously scraped data"""
pages = []
pages_dir = Path(self.data_dir) / "pages"
if not pages_dir.exists():
return []
for json_file in pages_dir.glob("*.json"):
try:
with open(json_file, 'r', encoding='utf-8') as f:
pages.append(json.load(f))
except Exception as e:
print(f"⚠ Error loading {json_file}: {e}")
return pages
def smart_categorize(self, pages):
"""Improved categorization with better pattern matching"""
category_defs = self.config.get('categories', {})
# Default smart categories if none provided
if not category_defs:
category_defs = self.infer_categories(pages)
categories = {cat: [] for cat in category_defs.keys()}
categories['other'] = []
for page in pages:
url = page['url'].lower()
title = page['title'].lower()
content = page.get('content', '').lower()[:500] # Check first 500 chars
categorized = False
# Match against keywords
for cat, keywords in category_defs.items():
score = 0
for keyword in keywords:
keyword = keyword.lower()
if keyword in url:
score += 3
if keyword in title:
score += 2
if keyword in content:
score += 1
if score >= 2: # Threshold for categorization
categories[cat].append(page)
categorized = True
break
if not categorized:
categories['other'].append(page)
# Remove empty categories
categories = {k: v for k, v in categories.items() if v}
return categories
def infer_categories(self, pages):
"""Infer categories from URL patterns (IMPROVED)"""
url_segments = defaultdict(int)
for page in pages:
path = urlparse(page['url']).path
segments = [s for s in path.split('/') if s and s not in ['en', 'stable', 'latest', 'docs']]
for seg in segments:
url_segments[seg] += 1
# Top segments become categories
top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8]
categories = {}
for seg, count in top_segments:
if count >= 3: # At least 3 pages
categories[seg] = [seg]
# Add common defaults
if 'tutorial' not in categories and any('tutorial' in url for url in [p['url'] for p in pages]):
categories['tutorials'] = ['tutorial', 'guide', 'getting-started']
if 'api' not in categories and any('api' in url or 'reference' in url for url in [p['url'] for p in pages]):
categories['api'] = ['api', 'reference', 'class']
return categories
def generate_quick_reference(self, pages):
"""Generate quick reference from common patterns (NEW FEATURE)"""
quick_ref = []
# Collect all patterns
all_patterns = []
for page in pages:
all_patterns.extend(page.get('patterns', []))
# Get most common code patterns
seen_codes = set()
for pattern in all_patterns:
code = pattern['code']
if code not in seen_codes and len(code) < 300:
quick_ref.append(pattern)
seen_codes.add(code)
if len(quick_ref) >= 15:
break
return quick_ref
def create_reference_file(self, category, pages):
"""Create enhanced reference file"""
if not pages:
return
lines = []
lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n")
lines.append(f"**Pages:** {len(pages)}\n")
lines.append("---\n")
for page in pages:
lines.append(f"## {page['title']}\n")
lines.append(f"**URL:** {page['url']}\n")
# Table of contents from headings
if page.get('headings'):
lines.append("**Contents:**")
for h in page['headings'][:10]:
level = int(h['level'][1]) if len(h['level']) > 1 else 1
indent = " " * max(0, level - 2)
lines.append(f"{indent}- {h['text']}")
lines.append("")
# Content
if page.get('content'):
content = page['content'][:2500]
if len(page['content']) > 2500:
content += "\n\n*[Content truncated]*"
lines.append(content)
lines.append("")
# Code examples with language
if page.get('code_samples'):
lines.append("**Examples:**\n")
for i, sample in enumerate(page['code_samples'][:4], 1):
lang = sample.get('language', 'unknown')
code = sample.get('code', sample if isinstance(sample, str) else '')
lines.append(f"Example {i} ({lang}):")
lines.append(f"```{lang}")
lines.append(code[:600])
if len(code) > 600:
lines.append("...")
lines.append("```\n")
lines.append("---\n")
filepath = os.path.join(self.skill_dir, "references", f"{category}.md")
with open(filepath, 'w', encoding='utf-8') as f:
f.write('\n'.join(lines))
print(f"{category}.md ({len(pages)} pages)")
def create_enhanced_skill_md(self, categories, quick_ref):
"""Create SKILL.md with actual examples (IMPROVED)"""
description = self.config.get('description', f'Comprehensive assistance with {self.name}')
# Extract actual code examples from docs
example_codes = []
for pages in categories.values():
for page in pages[:3]: # First 3 pages per category
for sample in page.get('code_samples', [])[:2]: # First 2 samples per page
code = sample.get('code', sample if isinstance(sample, str) else '')
lang = sample.get('language', 'unknown')
if len(code) < 200 and lang != 'unknown':
example_codes.append((lang, code))
if len(example_codes) >= 10:
break
if len(example_codes) >= 10:
break
if len(example_codes) >= 10:
break
content = f"""---
name: {self.name}
description: {description}
---
# {self.name.title()} Skill
Comprehensive assistance with {self.name} development, generated from official documentation.
## When to Use This Skill
This skill should be triggered when:
- Working with {self.name}
- Asking about {self.name} features or APIs
- Implementing {self.name} solutions
- Debugging {self.name} code
- Learning {self.name} best practices
## Quick Reference
### Common Patterns
"""
# Add actual quick reference patterns
if quick_ref:
for i, pattern in enumerate(quick_ref[:8], 1):
content += f"**Pattern {i}:** {pattern.get('description', 'Example pattern')}\n\n"
content += "```\n"
content += pattern.get('code', '')[:300]
content += "\n```\n\n"
else:
content += "*Quick reference patterns will be added as you use the skill.*\n\n"
# Add example codes from docs
if example_codes:
content += "### Example Code Patterns\n\n"
for i, (lang, code) in enumerate(example_codes[:5], 1):
content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n"
content += f"""## Reference Files
This skill includes comprehensive documentation in `references/`:
"""
for cat in sorted(categories.keys()):
content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n"
content += """
Use `view` to read specific reference files when detailed information is needed.
## Working with This Skill
### For Beginners
Start with the getting_started or tutorials reference files for foundational concepts.
### For Specific Features
Use the appropriate category reference file (api, guides, etc.) for detailed information.
### For Code Examples
The quick reference section above contains common patterns extracted from the official docs.
## Resources
### references/
Organized documentation extracted from official sources. These files contain:
- Detailed explanations
- Code examples with language annotations
- Links to original documentation
- Table of contents for quick navigation
### scripts/
Add helper scripts here for common automation tasks.
### assets/
Add templates, boilerplate, or example projects here.
## Notes
- This skill was automatically generated from official documentation
- Reference files preserve the structure and examples from source docs
- Code examples include language detection for better syntax highlighting
- Quick reference patterns are extracted from common usage examples in the docs
## Updating
To refresh this skill with updated documentation:
1. Re-run the scraper with the same configuration
2. The skill will be rebuilt with the latest information
"""
filepath = os.path.join(self.skill_dir, "SKILL.md")
with open(filepath, 'w', encoding='utf-8') as f:
f.write(content)
print(f" ✓ SKILL.md (enhanced with {len(example_codes)} examples)")
def create_index(self, categories):
"""Create navigation index"""
lines = []
lines.append(f"# {self.name.title()} Documentation Index\n")
lines.append("## Categories\n")
for cat, pages in sorted(categories.items()):
lines.append(f"### {cat.replace('_', ' ').title()}")
lines.append(f"**File:** `{cat}.md`")
lines.append(f"**Pages:** {len(pages)}\n")
filepath = os.path.join(self.skill_dir, "references", "index.md")
with open(filepath, 'w', encoding='utf-8') as f:
f.write('\n'.join(lines))
print(" ✓ index.md")
def build_skill(self):
"""Build the skill from scraped data"""
print(f"\n{'='*60}")
print(f"BUILDING SKILL: {self.name}")
print(f"{'='*60}\n")
# Load data
print("Loading scraped data...")
pages = self.load_scraped_data()
if not pages:
print("✗ No scraped data found!")
return False
print(f" ✓ Loaded {len(pages)} pages\n")
# Categorize
print("Categorizing pages...")
categories = self.smart_categorize(pages)
print(f" ✓ Created {len(categories)} categories\n")
# Generate quick reference
print("Generating quick reference...")
quick_ref = self.generate_quick_reference(pages)
print(f" ✓ Extracted {len(quick_ref)} patterns\n")
# Create reference files
print("Creating reference files...")
for cat, cat_pages in categories.items():
self.create_reference_file(cat, cat_pages)
# Create index
self.create_index(categories)
print()
# Create enhanced SKILL.md
print("Creating SKILL.md...")
self.create_enhanced_skill_md(categories, quick_ref)
print(f"\n✅ Skill built: {self.skill_dir}/")
return True
def validate_config(config):
"""Validate configuration structure"""
errors = []
warnings = []
# Required fields
required_fields = ['name', 'base_url']
for field in required_fields:
if field not in config:
errors.append(f"Missing required field: '{field}'")
# Validate name (alphanumeric, hyphens, underscores only)
if 'name' in config:
if not re.match(r'^[a-zA-Z0-9_-]+$', config['name']):
errors.append(f"Invalid name: '{config['name']}' (use only letters, numbers, hyphens, underscores)")
# Validate base_url
if 'base_url' in config:
if not config['base_url'].startswith(('http://', 'https://')):
errors.append(f"Invalid base_url: '{config['base_url']}' (must start with http:// or https://)")
# Validate selectors structure
if 'selectors' in config:
if not isinstance(config['selectors'], dict):
errors.append("'selectors' must be a dictionary")
else:
recommended_selectors = ['main_content', 'title', 'code_blocks']
for selector in recommended_selectors:
if selector not in config['selectors']:
warnings.append(f"Missing recommended selector: '{selector}'")
else:
warnings.append("Missing 'selectors' section (recommended)")
# Validate url_patterns
if 'url_patterns' in config:
if not isinstance(config['url_patterns'], dict):
errors.append("'url_patterns' must be a dictionary")
else:
for key in ['include', 'exclude']:
if key in config['url_patterns']:
if not isinstance(config['url_patterns'][key], list):
errors.append(f"'url_patterns.{key}' must be a list")
# Validate categories
if 'categories' in config:
if not isinstance(config['categories'], dict):
errors.append("'categories' must be a dictionary")
else:
for cat_name, keywords in config['categories'].items():
if not isinstance(keywords, list):
errors.append(f"'categories.{cat_name}' must be a list of keywords")
# Validate rate_limit
if 'rate_limit' in config:
try:
rate = float(config['rate_limit'])
if rate < 0:
errors.append(f"'rate_limit' must be non-negative (got {rate})")
except (ValueError, TypeError):
errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})")
# Validate max_pages
if 'max_pages' in config:
try:
max_p = int(config['max_pages'])
if max_p < 1:
errors.append(f"'max_pages' must be at least 1 (got {max_p})")
except (ValueError, TypeError):
errors.append(f"'max_pages' must be an integer (got {config['max_pages']})")
# Validate start_urls if present
if 'start_urls' in config:
if not isinstance(config['start_urls'], list):
errors.append("'start_urls' must be a list")
else:
for url in config['start_urls']:
if not url.startswith(('http://', 'https://')):
errors.append(f"Invalid start_url: '{url}' (must start with http:// or https://)")
return errors, warnings
def load_config(config_path):
"""Load and validate configuration from file"""
try:
with open(config_path, 'r') as f:
config = json.load(f)
except json.JSONDecodeError as e:
print(f"❌ Error: Invalid JSON in config file: {e}")
sys.exit(1)
except FileNotFoundError:
print(f"❌ Error: Config file not found: {config_path}")
sys.exit(1)
# Validate config
errors, warnings = validate_config(config)
# Show warnings (non-blocking)
if warnings:
print(f"⚠️ Configuration warnings in {config_path}:")
for warning in warnings:
print(f" - {warning}")
print()
# Show errors (blocking)
if errors:
print(f"❌ Configuration validation errors in {config_path}:")
for error in errors:
print(f" - {error}")
sys.exit(1)
return config
def interactive_config():
"""Interactive configuration"""
print("\n" + "="*60)
print("Documentation to Skill Converter")
print("="*60 + "\n")
config = {}
# Basic info
config['name'] = input("Skill name (e.g., 'react', 'godot'): ").strip()
config['description'] = input("Skill description: ").strip()
config['base_url'] = input("Base URL (e.g., https://docs.example.com/): ").strip()
if not config['base_url'].endswith('/'):
config['base_url'] += '/'
# Selectors
print("\nCSS Selectors (press Enter for defaults):")
selectors = {}
selectors['main_content'] = input(" Main content [div[role='main']]: ").strip() or "div[role='main']"
selectors['title'] = input(" Title [title]: ").strip() or "title"
selectors['code_blocks'] = input(" Code blocks [pre code]: ").strip() or "pre code"
config['selectors'] = selectors
# URL patterns
print("\nURL Patterns (comma-separated, optional):")
include = input(" Include: ").strip()
exclude = input(" Exclude: ").strip()
config['url_patterns'] = {
'include': [p.strip() for p in include.split(',') if p.strip()],
'exclude': [p.strip() for p in exclude.split(',') if p.strip()]
}
# Settings
rate = input("\nRate limit (seconds) [0.5]: ").strip()
config['rate_limit'] = float(rate) if rate else 0.5
max_p = input("Max pages [500]: ").strip()
config['max_pages'] = int(max_p) if max_p else 500
return config
def check_existing_data(name):
"""Check if scraped data already exists"""
data_dir = f"output/{name}_data"
if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"):
with open(f"{data_dir}/summary.json", 'r') as f:
summary = json.load(f)
return True, summary.get('total_pages', 0)
return False, 0
def main():
parser = argparse.ArgumentParser(
description='Convert documentation websites to Claude skills',
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('--interactive', '-i', action='store_true',
help='Interactive configuration mode')
parser.add_argument('--config', '-c', type=str,
help='Load configuration from file (e.g., configs/godot.json)')
parser.add_argument('--name', type=str,
help='Skill name')
parser.add_argument('--url', type=str,
help='Base documentation URL')
parser.add_argument('--description', '-d', type=str,
help='Skill description')
parser.add_argument('--skip-scrape', action='store_true',
help='Skip scraping, use existing data')
parser.add_argument('--dry-run', action='store_true',
help='Preview what will be scraped without actually scraping')
parser.add_argument('--enhance', action='store_true',
help='Enhance SKILL.md using Claude API after building (requires API key)')
parser.add_argument('--enhance-local', action='store_true',
help='Enhance SKILL.md using Claude Code in new terminal (no API key needed)')
parser.add_argument('--api-key', type=str,
help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)')
args = parser.parse_args()
# Get configuration
if args.config:
config = load_config(args.config)
elif args.interactive or not (args.name and args.url):
config = interactive_config()
else:
config = {
'name': args.name,
'description': args.description or f'Comprehensive assistance with {args.name}',
'base_url': args.url,
'selectors': {
'main_content': "div[role='main']",
'title': 'title',
'code_blocks': 'pre code'
},
'url_patterns': {'include': [], 'exclude': []},
'rate_limit': 0.5,
'max_pages': 500
}
# Dry run mode - preview only
if args.dry_run:
print(f"\n{'='*60}")
print("DRY RUN MODE")
print(f"{'='*60}")
print("This will show what would be scraped without saving anything.\n")
converter = DocToSkillConverter(config, dry_run=True)
converter.scrape_all()
print(f"\n📋 Configuration Summary:")
print(f" Name: {config['name']}")
print(f" Base URL: {config['base_url']}")
print(f" Max pages: {config.get('max_pages', 500)}")
print(f" Rate limit: {config.get('rate_limit', 0.5)}s")
print(f" Categories: {len(config.get('categories', {}))}")
return
# Check for existing data
exists, page_count = check_existing_data(config['name'])
if exists and not args.skip_scrape:
print(f"\n✓ Found existing data: {page_count} pages")
response = input("Use existing data? (y/n): ").strip().lower()
if response == 'y':
args.skip_scrape = True
# Create converter
converter = DocToSkillConverter(config)
# Scrape or skip
if not args.skip_scrape:
try:
converter.scrape_all()
except KeyboardInterrupt:
print("\n\nScraping interrupted.")
response = input("Continue with skill building? (y/n): ").strip().lower()
if response != 'y':
return
else:
print(f"\n⏭️ Skipping scrape, using existing data")
# Build skill
success = converter.build_skill()
if not success:
sys.exit(1)
# Optional enhancement with Claude API
if args.enhance:
print(f"\n{'='*60}")
print(f"ENHANCING SKILL.MD WITH CLAUDE API")
print(f"{'='*60}\n")
try:
import subprocess
enhance_cmd = ['python3', 'enhance_skill.py', f'output/{config["name"]}/']
if args.api_key:
enhance_cmd.extend(['--api-key', args.api_key])
result = subprocess.run(enhance_cmd, check=True)
if result.returncode == 0:
print("\n✅ Enhancement complete!")
except subprocess.CalledProcessError:
print("\n⚠ Enhancement failed, but skill was still built")
except FileNotFoundError:
print("\n⚠ enhance_skill.py not found. Run manually:")
print(f" python3 enhance_skill.py output/{config['name']}/")
# Optional enhancement with Claude Code (local, no API key)
if args.enhance_local:
print(f"\n{'='*60}")
print(f"ENHANCING SKILL.MD WITH CLAUDE CODE (LOCAL)")
print(f"{'='*60}\n")
try:
import subprocess
enhance_cmd = ['python3', 'enhance_skill_local.py', f'output/{config["name"]}/']
subprocess.run(enhance_cmd, check=True)
except subprocess.CalledProcessError:
print("\n⚠ Enhancement failed, but skill was still built")
except FileNotFoundError:
print("\n⚠ enhance_skill_local.py not found. Run manually:")
print(f" python3 enhance_skill_local.py output/{config['name']}/")
print(f"\n📦 Package your skill:")
print(f" python3 package_skill.py output/{config['name']}/")
if not args.enhance and not args.enhance_local:
print(f"\n💡 Optional: Enhance SKILL.md with Claude:")
print(f" API-based: python3 enhance_skill.py output/{config['name']}/")
print(f" or re-run with: --enhance")
print(f" Local (no API key): python3 enhance_skill_local.py output/{config['name']}/")
print(f" or re-run with: --enhance-local")
if __name__ == "__main__":
main()

292
cli/enhance_skill.py Normal file
View File

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
"""
SKILL.md Enhancement Script
Uses Claude API to improve SKILL.md by analyzing reference documentation.
Usage:
python3 enhance_skill.py output/steam-inventory/
python3 enhance_skill.py output/react/
python3 enhance_skill.py output/godot/ --api-key YOUR_API_KEY
"""
import os
import sys
import json
import argparse
from pathlib import Path
try:
import anthropic
except ImportError:
print("❌ Error: anthropic package not installed")
print("Install with: pip3 install anthropic")
sys.exit(1)
class SkillEnhancer:
def __init__(self, skill_dir, api_key=None):
self.skill_dir = Path(skill_dir)
self.references_dir = self.skill_dir / "references"
self.skill_md_path = self.skill_dir / "SKILL.md"
# Get API key
self.api_key = api_key or os.environ.get('ANTHROPIC_API_KEY')
if not self.api_key:
raise ValueError(
"No API key provided. Set ANTHROPIC_API_KEY environment variable "
"or use --api-key argument"
)
self.client = anthropic.Anthropic(api_key=self.api_key)
def read_reference_files(self, max_chars=100000):
"""Read reference files with size limit"""
references = {}
if not self.references_dir.exists():
print(f"⚠ No references directory found at {self.references_dir}")
return references
total_chars = 0
for ref_file in sorted(self.references_dir.glob("*.md")):
if ref_file.name == "index.md":
continue
content = ref_file.read_text(encoding='utf-8')
# Limit size per file
if len(content) > 40000:
content = content[:40000] + "\n\n[Content truncated...]"
references[ref_file.name] = content
total_chars += len(content)
# Stop if we've read enough
if total_chars > max_chars:
print(f" Limiting input to {max_chars:,} characters")
break
return references
def read_current_skill_md(self):
"""Read existing SKILL.md"""
if not self.skill_md_path.exists():
return None
return self.skill_md_path.read_text(encoding='utf-8')
def enhance_skill_md(self, references, current_skill_md):
"""Use Claude to enhance SKILL.md"""
# Build prompt
prompt = self._build_enhancement_prompt(references, current_skill_md)
print("\n🤖 Asking Claude to enhance SKILL.md...")
print(f" Input: {len(prompt):,} characters")
try:
message = self.client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=4096,
temperature=0.3,
messages=[{
"role": "user",
"content": prompt
}]
)
enhanced_content = message.content[0].text
return enhanced_content
except Exception as e:
print(f"❌ Error calling Claude API: {e}")
return None
def _build_enhancement_prompt(self, references, current_skill_md):
"""Build the prompt for Claude"""
# Extract skill name and description
skill_name = self.skill_dir.name
prompt = f"""You are enhancing a Claude skill's SKILL.md file. This skill is about: {skill_name}
I've scraped documentation and organized it into reference files. Your job is to create an EXCELLENT SKILL.md that will help Claude use this documentation effectively.
CURRENT SKILL.MD:
{'```markdown' if current_skill_md else '(none - create from scratch)'}
{current_skill_md or 'No existing SKILL.md'}
{'```' if current_skill_md else ''}
REFERENCE DOCUMENTATION:
"""
for filename, content in references.items():
prompt += f"\n\n## {filename}\n```markdown\n{content[:30000]}\n```\n"
prompt += """
YOUR TASK:
Create an enhanced SKILL.md that includes:
1. **Clear "When to Use This Skill" section** - Be specific about trigger conditions
2. **Excellent Quick Reference section** - Extract 5-10 of the BEST, most practical code examples from the reference docs
- Choose SHORT, clear examples that demonstrate common tasks
- Include both simple and intermediate examples
- Annotate examples with clear descriptions
- Use proper language tags (cpp, python, javascript, json, etc.)
3. **Detailed Reference Files description** - Explain what's in each reference file
4. **Practical "Working with This Skill" section** - Give users clear guidance on how to navigate the skill
5. **Key Concepts section** (if applicable) - Explain core concepts
6. **Keep the frontmatter** (---\nname: ...\n---) intact
IMPORTANT:
- Extract REAL examples from the reference docs, don't make them up
- Prioritize SHORT, clear examples (5-20 lines max)
- Make it actionable and practical
- Don't be too verbose - be concise but useful
- Maintain the markdown structure for Claude skills
- Keep code examples properly formatted with language tags
OUTPUT:
Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
"""
return prompt
def save_enhanced_skill_md(self, content):
"""Save the enhanced SKILL.md"""
# Backup original
if self.skill_md_path.exists():
backup_path = self.skill_md_path.with_suffix('.md.backup')
self.skill_md_path.rename(backup_path)
print(f" 💾 Backed up original to: {backup_path.name}")
# Save enhanced version
self.skill_md_path.write_text(content, encoding='utf-8')
print(f" ✅ Saved enhanced SKILL.md")
def run(self):
"""Main enhancement workflow"""
print(f"\n{'='*60}")
print(f"ENHANCING SKILL: {self.skill_dir.name}")
print(f"{'='*60}\n")
# Read reference files
print("📖 Reading reference documentation...")
references = self.read_reference_files()
if not references:
print("❌ No reference files found to analyze")
return False
print(f" ✓ Read {len(references)} reference files")
total_size = sum(len(c) for c in references.values())
print(f" ✓ Total size: {total_size:,} characters\n")
# Read current SKILL.md
current_skill_md = self.read_current_skill_md()
if current_skill_md:
print(f" Found existing SKILL.md ({len(current_skill_md)} chars)")
else:
print(f" No existing SKILL.md, will create new one")
# Enhance with Claude
enhanced = self.enhance_skill_md(references, current_skill_md)
if not enhanced:
print("❌ Enhancement failed")
return False
print(f" ✓ Generated enhanced SKILL.md ({len(enhanced)} chars)\n")
# Save
print("💾 Saving enhanced SKILL.md...")
self.save_enhanced_skill_md(enhanced)
print(f"\n✅ Enhancement complete!")
print(f"\nNext steps:")
print(f" 1. Review: {self.skill_md_path}")
print(f" 2. If you don't like it, restore backup: {self.skill_md_path.with_suffix('.md.backup')}")
print(f" 3. Package your skill:")
print(f" python3 /mnt/skills/examples/skill-creator/scripts/package_skill.py {self.skill_dir}/")
return True
def main():
parser = argparse.ArgumentParser(
description='Enhance SKILL.md using Claude API',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Using ANTHROPIC_API_KEY environment variable
export ANTHROPIC_API_KEY=sk-ant-...
python3 enhance_skill.py output/steam-inventory/
# Providing API key directly
python3 enhance_skill.py output/react/ --api-key sk-ant-...
# Show what would be done (dry run)
python3 enhance_skill.py output/godot/ --dry-run
"""
)
parser.add_argument('skill_dir', type=str,
help='Path to skill directory (e.g., output/steam-inventory/)')
parser.add_argument('--api-key', type=str,
help='Anthropic API key (or set ANTHROPIC_API_KEY env var)')
parser.add_argument('--dry-run', action='store_true',
help='Show what would be done without calling API')
args = parser.parse_args()
# Validate skill directory
skill_dir = Path(args.skill_dir)
if not skill_dir.exists():
print(f"❌ Error: Directory not found: {skill_dir}")
sys.exit(1)
if not skill_dir.is_dir():
print(f"❌ Error: Not a directory: {skill_dir}")
sys.exit(1)
# Dry run mode
if args.dry_run:
print(f"🔍 DRY RUN MODE")
print(f" Would enhance: {skill_dir}")
print(f" References: {skill_dir / 'references'}")
print(f" SKILL.md: {skill_dir / 'SKILL.md'}")
refs_dir = skill_dir / "references"
if refs_dir.exists():
ref_files = list(refs_dir.glob("*.md"))
print(f" Found {len(ref_files)} reference files:")
for rf in ref_files:
size = rf.stat().st_size
print(f" - {rf.name} ({size:,} bytes)")
print("\nTo actually run enhancement:")
print(f" python3 enhance_skill.py {skill_dir}")
return
# Create enhancer and run
try:
enhancer = SkillEnhancer(skill_dir, api_key=args.api_key)
success = enhancer.run()
sys.exit(0 if success else 1)
except ValueError as e:
print(f"❌ Error: {e}")
print("\nSet your API key:")
print(" export ANTHROPIC_API_KEY=sk-ant-...")
print("Or provide it directly:")
print(f" python3 enhance_skill.py {skill_dir} --api-key sk-ant-...")
sys.exit(1)
except Exception as e:
print(f"❌ Unexpected error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()

244
cli/enhance_skill_local.py Normal file
View File

@@ -0,0 +1,244 @@
#!/usr/bin/env python3
"""
SKILL.md Enhancement Script (Local - Using Claude Code)
Opens a new terminal with Claude Code to enhance SKILL.md, then reports back.
No API key needed - uses your existing Claude Code Max plan!
Usage:
python3 enhance_skill_local.py output/steam-inventory/
python3 enhance_skill_local.py output/react/
"""
import os
import sys
import time
import subprocess
import tempfile
from pathlib import Path
class LocalSkillEnhancer:
def __init__(self, skill_dir):
self.skill_dir = Path(skill_dir)
self.references_dir = self.skill_dir / "references"
self.skill_md_path = self.skill_dir / "SKILL.md"
def create_enhancement_prompt(self):
"""Create the prompt file for Claude Code"""
# Read reference files
references = self.read_reference_files()
if not references:
print("❌ No reference files found")
return None
# Read current SKILL.md
current_skill_md = ""
if self.skill_md_path.exists():
current_skill_md = self.skill_md_path.read_text(encoding='utf-8')
# Build prompt
prompt = f"""I need you to enhance the SKILL.md file for the {self.skill_dir.name} skill.
CURRENT SKILL.MD:
{'-'*60}
{current_skill_md if current_skill_md else '(No existing SKILL.md - create from scratch)'}
{'-'*60}
REFERENCE DOCUMENTATION:
{'-'*60}
"""
for filename, content in references.items():
prompt += f"\n## {filename}\n{content[:15000]}\n"
prompt += f"""
{'-'*60}
YOUR TASK:
Create an EXCELLENT SKILL.md file that will help Claude use this documentation effectively.
Requirements:
1. **Clear "When to Use This Skill" section**
- Be SPECIFIC about trigger conditions
- List concrete use cases
2. **Excellent Quick Reference section**
- Extract 5-10 of the BEST, most practical code examples from the reference docs
- Choose SHORT, clear examples (5-20 lines max)
- Include both simple and intermediate examples
- Use proper language tags (cpp, python, javascript, json, etc.)
- Add clear descriptions for each example
3. **Detailed Reference Files description**
- Explain what's in each reference file
- Help users navigate the documentation
4. **Practical "Working with This Skill" section**
- Clear guidance for beginners, intermediate, and advanced users
- Navigation tips
5. **Key Concepts section** (if applicable)
- Explain core concepts
- Define important terminology
IMPORTANT:
- Extract REAL examples from the reference docs above
- Prioritize SHORT, clear examples
- Make it actionable and practical
- Keep the frontmatter (---\\nname: ...\\n---) intact
- Use proper markdown formatting
SAVE THE RESULT:
Save the complete enhanced SKILL.md to: {self.skill_md_path.absolute()}
First, backup the original to: {self.skill_md_path.with_suffix('.md.backup').absolute()}
"""
return prompt
def read_reference_files(self, max_chars=50000):
"""Read reference files with size limit"""
references = {}
if not self.references_dir.exists():
return references
total_chars = 0
for ref_file in sorted(self.references_dir.glob("*.md")):
if ref_file.name == "index.md":
continue
content = ref_file.read_text(encoding='utf-8')
# Limit size per file
if len(content) > 20000:
content = content[:20000] + "\n\n[Content truncated...]"
references[ref_file.name] = content
total_chars += len(content)
if total_chars > max_chars:
break
return references
def run(self):
"""Main enhancement workflow"""
print(f"\n{'='*60}")
print(f"LOCAL ENHANCEMENT: {self.skill_dir.name}")
print(f"{'='*60}\n")
# Validate
if not self.skill_dir.exists():
print(f"❌ Directory not found: {self.skill_dir}")
return False
# Read reference files
print("📖 Reading reference documentation...")
references = self.read_reference_files()
if not references:
print("❌ No reference files found to analyze")
return False
print(f" ✓ Read {len(references)} reference files")
total_size = sum(len(c) for c in references.values())
print(f" ✓ Total size: {total_size:,} characters\n")
# Create prompt
print("📝 Creating enhancement prompt...")
prompt = self.create_enhancement_prompt()
if not prompt:
return False
# Save prompt to temp file
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
prompt_file = f.name
f.write(prompt)
print(f" ✓ Prompt saved ({len(prompt):,} characters)\n")
# Launch Claude Code in new terminal
print("🚀 Launching Claude Code in new terminal...")
print(" This will:")
print(" 1. Open a new terminal window")
print(" 2. Run Claude Code with the enhancement task")
print(" 3. Claude will read the docs and enhance SKILL.md")
print(" 4. Terminal will auto-close when done")
print()
# Create a shell script to run in the terminal
shell_script = f'''#!/bin/bash
claude {prompt_file}
echo ""
echo "✅ Enhancement complete!"
echo "Press any key to close..."
read -n 1
rm {prompt_file}
'''
# Save shell script
with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f:
script_file = f.name
f.write(shell_script)
os.chmod(script_file, 0o755)
# Launch in new terminal (macOS specific)
if sys.platform == 'darwin':
# macOS Terminal - simple approach
try:
subprocess.Popen(['open', '-a', 'Terminal', script_file])
except Exception as e:
print(f"⚠️ Error launching terminal: {e}")
print(f"\nManually run: {script_file}")
return False
else:
print("⚠️ Auto-launch only works on macOS")
print(f"\nManually run this command in a new terminal:")
print(f" claude '{prompt_file}'")
print(f"\nThen delete the prompt file:")
print(f" rm '{prompt_file}'")
return False
print("✅ New terminal launched with Claude Code!")
print()
print("📊 Status:")
print(f" - Prompt file: {prompt_file}")
print(f" - Skill directory: {self.skill_dir.absolute()}")
print(f" - SKILL.md will be saved to: {self.skill_md_path.absolute()}")
print(f" - Original backed up to: {self.skill_md_path.with_suffix('.md.backup').absolute()}")
print()
print("⏳ Wait for Claude Code to finish in the other terminal...")
print(" (Usually takes 30-60 seconds)")
print()
print("💡 When done:")
print(f" 1. Check the enhanced SKILL.md: {self.skill_md_path}")
print(f" 2. If you don't like it, restore: mv {self.skill_md_path.with_suffix('.md.backup')} {self.skill_md_path}")
print(f" 3. Package: python3 /mnt/skills/examples/skill-creator/scripts/package_skill.py {self.skill_dir}/")
return True
def main():
if len(sys.argv) < 2:
print("Usage: python3 enhance_skill_local.py <skill_directory>")
print()
print("Examples:")
print(" python3 enhance_skill_local.py output/steam-inventory/")
print(" python3 enhance_skill_local.py output/react/")
sys.exit(1)
skill_dir = sys.argv[1]
enhancer = LocalSkillEnhancer(skill_dir)
success = enhancer.run()
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

258
cli/estimate_pages.py Executable file
View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""
Page Count Estimator for Skill Seeker
Quickly estimates how many pages a config will scrape without downloading content
"""
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
def estimate_pages(config, max_discovery=1000, timeout=30):
"""
Estimate total pages that will be scraped
Args:
config: Configuration dictionary
max_discovery: Maximum pages to discover (safety limit)
timeout: Timeout for HTTP requests in seconds
Returns:
dict with estimation results
"""
base_url = config['base_url']
start_urls = config.get('start_urls', [base_url])
url_patterns = config.get('url_patterns', {'include': [], 'exclude': []})
rate_limit = config.get('rate_limit', 0.5)
visited = set()
pending = list(start_urls)
discovered = 0
include_patterns = url_patterns.get('include', [])
exclude_patterns = url_patterns.get('exclude', [])
print(f"🔍 Estimating pages for: {config['name']}")
print(f"📍 Base URL: {base_url}")
print(f"🎯 Start URLs: {len(start_urls)}")
print(f"⏱️ Rate limit: {rate_limit}s")
print(f"🔢 Max discovery: {max_discovery}")
print()
start_time = time.time()
while pending and discovered < max_discovery:
url = pending.pop(0)
# Skip if already visited
if url in visited:
continue
visited.add(url)
discovered += 1
# Progress indicator
if discovered % 10 == 0:
elapsed = time.time() - start_time
rate = discovered / elapsed if elapsed > 0 else 0
print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end='\r')
try:
# HEAD request first to check if page exists (faster)
head_response = requests.head(url, timeout=timeout, allow_redirects=True)
# Skip non-HTML content
content_type = head_response.headers.get('Content-Type', '')
if 'text/html' not in content_type:
continue
# Now GET the page to find links
response = requests.get(url, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Find all links
for link in soup.find_all('a', href=True):
href = link['href']
full_url = urljoin(url, href)
# Normalize URL
parsed = urlparse(full_url)
full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
# Check if URL is valid
if not is_valid_url(full_url, base_url, include_patterns, exclude_patterns):
continue
# Add to pending if not visited
if full_url not in visited and full_url not in pending:
pending.append(full_url)
# Rate limiting
time.sleep(rate_limit)
except requests.RequestException as e:
# Silently skip errors during estimation
pass
except Exception as e:
# Silently skip other errors
pass
elapsed = time.time() - start_time
# Results
results = {
'discovered': discovered,
'pending': len(pending),
'estimated_total': discovered + len(pending),
'elapsed_seconds': round(elapsed, 2),
'discovery_rate': round(discovered / elapsed if elapsed > 0 else 0, 2),
'hit_limit': discovered >= max_discovery
}
return results
def is_valid_url(url, base_url, include_patterns, exclude_patterns):
"""Check if URL should be crawled"""
# Must be same domain
if not url.startswith(base_url.rstrip('/')):
return False
# Check exclude patterns first
if exclude_patterns:
for pattern in exclude_patterns:
if pattern in url:
return False
# Check include patterns (if specified)
if include_patterns:
for pattern in include_patterns:
if pattern in url:
return True
return False
# If no include patterns, accept by default
return True
def print_results(results, config):
"""Print estimation results"""
print()
print("=" * 70)
print("📊 ESTIMATION RESULTS")
print("=" * 70)
print()
print(f"Config: {config['name']}")
print(f"Base URL: {config['base_url']}")
print()
print(f"✅ Pages Discovered: {results['discovered']}")
print(f"⏳ Pages Pending: {results['pending']}")
print(f"📈 Estimated Total: {results['estimated_total']}")
print()
print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
if results['hit_limit']:
print()
print("⚠️ Hit discovery limit - actual total may be higher")
print(" Increase max_discovery parameter for more accurate estimate")
print()
print("=" * 70)
print("💡 RECOMMENDATIONS")
print("=" * 70)
print()
estimated = results['estimated_total']
current_max = config.get('max_pages', 100)
if estimated <= current_max:
print(f"✅ Current max_pages ({current_max}) is sufficient")
else:
recommended = min(estimated + 50, 10000) # Add 50 buffer, cap at 10k
print(f"⚠️ Current max_pages ({current_max}) may be too low")
print(f"📝 Recommended max_pages: {recommended}")
print(f" (Estimated {estimated} + 50 buffer)")
# Estimate time for full scrape
rate_limit = config.get('rate_limit', 0.5)
estimated_time = (estimated * rate_limit) / 60 # in minutes
print()
print(f"⏱️ Estimated full scrape time: {estimated_time:.1f} minutes")
print(f" (Based on rate_limit: {rate_limit}s)")
print()
def load_config(config_path):
"""Load configuration from JSON file"""
try:
with open(config_path, 'r') as f:
config = json.load(f)
return config
except FileNotFoundError:
print(f"❌ Error: Config file not found: {config_path}")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"❌ Error: Invalid JSON in config file: {e}")
sys.exit(1)
def main():
"""Main entry point"""
import argparse
parser = argparse.ArgumentParser(
description='Estimate page count for Skill Seeker configs',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Estimate pages for a config
python3 estimate_pages.py configs/react.json
# Estimate with higher discovery limit
python3 estimate_pages.py configs/godot.json --max-discovery 2000
# Quick estimate (stop at 100 pages)
python3 estimate_pages.py configs/vue.json --max-discovery 100
"""
)
parser.add_argument('config', help='Path to config JSON file')
parser.add_argument('--max-discovery', '-m', type=int, default=1000,
help='Maximum pages to discover (default: 1000)')
parser.add_argument('--timeout', '-t', type=int, default=30,
help='HTTP request timeout in seconds (default: 30)')
args = parser.parse_args()
# Load config
config = load_config(args.config)
# Run estimation
try:
results = estimate_pages(config, args.max_discovery, args.timeout)
print_results(results, config)
# Return exit code based on results
if results['hit_limit']:
return 2 # Warning: hit limit
return 0 # Success
except KeyboardInterrupt:
print("\n\n⚠️ Estimation interrupted by user")
return 1
except Exception as e:
print(f"\n\n❌ Error during estimation: {e}")
return 1
if __name__ == '__main__':
sys.exit(main())

78
cli/package_skill.py Normal file
View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Simple Skill Packager
Packages a skill directory into a .zip file for Claude.
Usage:
python3 package_skill.py output/steam-inventory/
python3 package_skill.py output/react/
"""
import os
import sys
import zipfile
from pathlib import Path
def package_skill(skill_dir):
"""Package a skill directory into a .zip file"""
skill_path = Path(skill_dir)
if not skill_path.exists():
print(f"❌ Error: Directory not found: {skill_dir}")
return False
if not skill_path.is_dir():
print(f"❌ Error: Not a directory: {skill_dir}")
return False
# Verify SKILL.md exists
skill_md = skill_path / "SKILL.md"
if not skill_md.exists():
print(f"❌ Error: SKILL.md not found in {skill_dir}")
return False
# Create zip filename
skill_name = skill_path.name
zip_path = skill_path.parent / f"{skill_name}.zip"
print(f"📦 Packaging skill: {skill_name}")
print(f" Source: {skill_path}")
print(f" Output: {zip_path}")
# Create zip file
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
for root, dirs, files in os.walk(skill_path):
# Skip backup files
files = [f for f in files if not f.endswith('.backup')]
for file in files:
file_path = Path(root) / file
arcname = file_path.relative_to(skill_path)
zf.write(file_path, arcname)
print(f" + {arcname}")
# Get zip size
zip_size = zip_path.stat().st_size
print(f"\n✅ Package created: {zip_path}")
print(f" Size: {zip_size:,} bytes ({zip_size / 1024:.1f} KB)")
return True
def main():
if len(sys.argv) < 2:
print("Usage: python3 package_skill.py <skill_directory>")
print()
print("Examples:")
print(" python3 package_skill.py output/steam-inventory/")
print(" python3 package_skill.py output/react/")
sys.exit(1)
skill_dir = sys.argv[1]
success = package_skill(skill_dir)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

228
cli/run_tests.py Executable file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Test Runner for Skill Seeker
Runs all test suites and generates a comprehensive test report
"""
import sys
import unittest
import os
from io import StringIO
from pathlib import Path
class ColoredTextTestResult(unittest.TextTestResult):
"""Custom test result class with colored output"""
# ANSI color codes
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
RESET = '\033[0m'
BOLD = '\033[1m'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.test_results = []
def addSuccess(self, test):
super().addSuccess(test)
self.test_results.append(('PASS', test))
if self.showAll:
self.stream.write(f"{self.GREEN}✓ PASS{self.RESET}\n")
elif self.dots:
self.stream.write(f"{self.GREEN}.{self.RESET}")
self.stream.flush()
def addError(self, test, err):
super().addError(test, err)
self.test_results.append(('ERROR', test))
if self.showAll:
self.stream.write(f"{self.RED}✗ ERROR{self.RESET}\n")
elif self.dots:
self.stream.write(f"{self.RED}E{self.RESET}")
self.stream.flush()
def addFailure(self, test, err):
super().addFailure(test, err)
self.test_results.append(('FAIL', test))
if self.showAll:
self.stream.write(f"{self.RED}✗ FAIL{self.RESET}\n")
elif self.dots:
self.stream.write(f"{self.RED}F{self.RESET}")
self.stream.flush()
def addSkip(self, test, reason):
super().addSkip(test, reason)
self.test_results.append(('SKIP', test))
if self.showAll:
self.stream.write(f"{self.YELLOW}⊘ SKIP{self.RESET}\n")
elif self.dots:
self.stream.write(f"{self.YELLOW}s{self.RESET}")
self.stream.flush()
class ColoredTextTestRunner(unittest.TextTestRunner):
"""Custom test runner with colored output"""
resultclass = ColoredTextTestResult
def discover_tests(test_dir='tests'):
"""Discover all test files in the tests directory"""
loader = unittest.TestLoader()
start_dir = test_dir
pattern = 'test_*.py'
suite = loader.discover(start_dir, pattern=pattern)
return suite
def run_specific_suite(suite_name):
"""Run a specific test suite"""
loader = unittest.TestLoader()
suite_map = {
'config': 'tests.test_config_validation',
'features': 'tests.test_scraper_features',
'integration': 'tests.test_integration'
}
if suite_name not in suite_map:
print(f"Unknown test suite: {suite_name}")
print(f"Available suites: {', '.join(suite_map.keys())}")
return None
module_name = suite_map[suite_name]
try:
suite = loader.loadTestsFromName(module_name)
return suite
except Exception as e:
print(f"Error loading test suite '{suite_name}': {e}")
return None
def print_summary(result):
"""Print a detailed test summary"""
total = result.testsRun
passed = total - len(result.failures) - len(result.errors) - len(result.skipped)
failed = len(result.failures)
errors = len(result.errors)
skipped = len(result.skipped)
print("\n" + "="*70)
print("TEST SUMMARY")
print("="*70)
# Overall stats
print(f"\n{ColoredTextTestResult.BOLD}Total Tests:{ColoredTextTestResult.RESET} {total}")
print(f"{ColoredTextTestResult.GREEN}✓ Passed:{ColoredTextTestResult.RESET} {passed}")
if failed > 0:
print(f"{ColoredTextTestResult.RED}✗ Failed:{ColoredTextTestResult.RESET} {failed}")
if errors > 0:
print(f"{ColoredTextTestResult.RED}✗ Errors:{ColoredTextTestResult.RESET} {errors}")
if skipped > 0:
print(f"{ColoredTextTestResult.YELLOW}⊘ Skipped:{ColoredTextTestResult.RESET} {skipped}")
# Success rate
if total > 0:
success_rate = (passed / total) * 100
color = ColoredTextTestResult.GREEN if success_rate == 100 else \
ColoredTextTestResult.YELLOW if success_rate >= 80 else \
ColoredTextTestResult.RED
print(f"\n{color}Success Rate: {success_rate:.1f}%{ColoredTextTestResult.RESET}")
# Category breakdown
if hasattr(result, 'test_results'):
print(f"\n{ColoredTextTestResult.BOLD}Test Breakdown by Category:{ColoredTextTestResult.RESET}")
categories = {}
for status, test in result.test_results:
test_name = str(test)
# Extract test class name
if '.' in test_name:
class_name = test_name.split('.')[0].split()[-1]
if class_name not in categories:
categories[class_name] = {'PASS': 0, 'FAIL': 0, 'ERROR': 0, 'SKIP': 0}
categories[class_name][status] += 1
for category, stats in sorted(categories.items()):
total_cat = sum(stats.values())
passed_cat = stats['PASS']
print(f" {category}: {passed_cat}/{total_cat} passed")
print("\n" + "="*70)
# Return status
return failed == 0 and errors == 0
def main():
"""Main test runner"""
import argparse
parser = argparse.ArgumentParser(
description='Run tests for Skill Seeker',
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('--suite', '-s', type=str,
help='Run specific test suite (config, features, integration)')
parser.add_argument('--verbose', '-v', action='store_true',
help='Verbose output (show each test)')
parser.add_argument('--quiet', '-q', action='store_true',
help='Quiet output (minimal output)')
parser.add_argument('--failfast', '-f', action='store_true',
help='Stop on first failure')
parser.add_argument('--list', '-l', action='store_true',
help='List all available tests')
args = parser.parse_args()
# Set verbosity
verbosity = 1
if args.verbose:
verbosity = 2
elif args.quiet:
verbosity = 0
print(f"\n{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}")
print(f"{ColoredTextTestResult.BOLD}SKILL SEEKER TEST SUITE{ColoredTextTestResult.RESET}")
print(f"{ColoredTextTestResult.BOLD}{'='*70}{ColoredTextTestResult.RESET}\n")
# Discover or load specific suite
if args.suite:
print(f"Running test suite: {ColoredTextTestResult.BLUE}{args.suite}{ColoredTextTestResult.RESET}\n")
suite = run_specific_suite(args.suite)
if suite is None:
return 1
else:
print(f"Running {ColoredTextTestResult.BLUE}all tests{ColoredTextTestResult.RESET}\n")
suite = discover_tests()
# List tests
if args.list:
print("\nAvailable tests:\n")
for test_group in suite:
for test in test_group:
print(f" - {test}")
print()
return 0
# Run tests
runner = ColoredTextTestRunner(
verbosity=verbosity,
failfast=args.failfast
)
result = runner.run(suite)
# Print summary
success = print_summary(result)
# Return appropriate exit code
return 0 if success else 1
if __name__ == '__main__':
sys.exit(main())