790 lines
28 KiB
Python
790 lines
28 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Documentation to Claude Skill Converter
|
|
Single tool to scrape any documentation and create high-quality Claude skills.
|
|
|
|
Usage:
|
|
python3 doc_scraper.py --interactive
|
|
python3 doc_scraper.py --config configs/godot.json
|
|
python3 doc_scraper.py --url https://react.dev/ --name react
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import time
|
|
import re
|
|
import argparse
|
|
import hashlib
|
|
import requests
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin, urlparse
|
|
from bs4 import BeautifulSoup
|
|
from collections import deque, defaultdict
|
|
|
|
|
|
class DocToSkillConverter:
|
|
def __init__(self, config):
|
|
self.config = config
|
|
self.name = config['name']
|
|
self.base_url = config['base_url']
|
|
|
|
# Paths
|
|
self.data_dir = f"output/{self.name}_data"
|
|
self.skill_dir = f"output/{self.name}"
|
|
|
|
# State
|
|
self.visited_urls = set()
|
|
# Support multiple starting URLs
|
|
start_urls = config.get('start_urls', [self.base_url])
|
|
self.pending_urls = deque(start_urls)
|
|
self.pages = []
|
|
|
|
# Create directories
|
|
os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
|
|
os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
|
|
os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
|
|
os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
|
|
|
|
def is_valid_url(self, url):
|
|
"""Check if URL should be scraped"""
|
|
if not url.startswith(self.base_url):
|
|
return False
|
|
|
|
# Include patterns
|
|
includes = self.config.get('url_patterns', {}).get('include', [])
|
|
if includes and not any(pattern in url for pattern in includes):
|
|
return False
|
|
|
|
# Exclude patterns
|
|
excludes = self.config.get('url_patterns', {}).get('exclude', [])
|
|
if any(pattern in url for pattern in excludes):
|
|
return False
|
|
|
|
return True
|
|
|
|
def extract_content(self, soup, url):
|
|
"""Extract content with improved code and pattern detection"""
|
|
page = {
|
|
'url': url,
|
|
'title': '',
|
|
'content': '',
|
|
'headings': [],
|
|
'code_samples': [],
|
|
'patterns': [], # NEW: Extract common patterns
|
|
'links': []
|
|
}
|
|
|
|
selectors = self.config.get('selectors', {})
|
|
|
|
# Extract title
|
|
title_elem = soup.select_one(selectors.get('title', 'title'))
|
|
if title_elem:
|
|
page['title'] = self.clean_text(title_elem.get_text())
|
|
|
|
# Find main content
|
|
main_selector = selectors.get('main_content', 'div[role="main"]')
|
|
main = soup.select_one(main_selector)
|
|
|
|
if not main:
|
|
print(f"⚠ No content: {url}")
|
|
return page
|
|
|
|
# Extract headings with better structure
|
|
for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
|
|
text = self.clean_text(h.get_text())
|
|
if text:
|
|
page['headings'].append({
|
|
'level': h.name,
|
|
'text': text,
|
|
'id': h.get('id', '')
|
|
})
|
|
|
|
# Extract code with language detection
|
|
code_selector = selectors.get('code_blocks', 'pre code')
|
|
for code_elem in main.select(code_selector):
|
|
code = code_elem.get_text()
|
|
if len(code.strip()) > 10:
|
|
# Try to detect language
|
|
lang = self.detect_language(code_elem, code)
|
|
page['code_samples'].append({
|
|
'code': code.strip(),
|
|
'language': lang
|
|
})
|
|
|
|
# Extract patterns (NEW: common code patterns)
|
|
page['patterns'] = self.extract_patterns(main, page['code_samples'])
|
|
|
|
# Extract paragraphs
|
|
paragraphs = []
|
|
for p in main.find_all('p'):
|
|
text = self.clean_text(p.get_text())
|
|
if text and len(text) > 20: # Skip very short paragraphs
|
|
paragraphs.append(text)
|
|
|
|
page['content'] = '\n\n'.join(paragraphs)
|
|
|
|
# Extract links
|
|
for link in main.find_all('a', href=True):
|
|
href = urljoin(url, link['href'])
|
|
if self.is_valid_url(href):
|
|
page['links'].append(href)
|
|
|
|
return page
|
|
|
|
def detect_language(self, elem, code):
|
|
"""Detect programming language from code block"""
|
|
# Check class attribute
|
|
classes = elem.get('class', [])
|
|
for cls in classes:
|
|
if 'language-' in cls:
|
|
return cls.replace('language-', '')
|
|
if 'lang-' in cls:
|
|
return cls.replace('lang-', '')
|
|
|
|
# Check parent pre element
|
|
parent = elem.parent
|
|
if parent and parent.name == 'pre':
|
|
classes = parent.get('class', [])
|
|
for cls in classes:
|
|
if 'language-' in cls:
|
|
return cls.replace('language-', '')
|
|
|
|
# Heuristic detection
|
|
if 'import ' in code and 'from ' in code:
|
|
return 'python'
|
|
if 'const ' in code or 'let ' in code or '=>' in code:
|
|
return 'javascript'
|
|
if 'func ' in code and 'var ' in code:
|
|
return 'gdscript'
|
|
if 'def ' in code and ':' in code:
|
|
return 'python'
|
|
if '#include' in code or 'int main' in code:
|
|
return 'cpp'
|
|
|
|
return 'unknown'
|
|
|
|
def extract_patterns(self, main, code_samples):
|
|
"""Extract common coding patterns (NEW FEATURE)"""
|
|
patterns = []
|
|
|
|
# Look for "Example:" or "Pattern:" sections
|
|
for elem in main.find_all(['p', 'div']):
|
|
text = elem.get_text().lower()
|
|
if any(word in text for word in ['example:', 'pattern:', 'usage:', 'typical use']):
|
|
# Get the code that follows
|
|
next_code = elem.find_next(['pre', 'code'])
|
|
if next_code:
|
|
patterns.append({
|
|
'description': self.clean_text(elem.get_text()),
|
|
'code': next_code.get_text().strip()
|
|
})
|
|
|
|
return patterns[:5] # Limit to 5 most relevant patterns
|
|
|
|
def clean_text(self, text):
|
|
"""Clean text content"""
|
|
text = re.sub(r'\s+', ' ', text)
|
|
return text.strip()
|
|
|
|
def save_page(self, page):
|
|
"""Save page data"""
|
|
url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
|
|
safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
|
|
safe_title = re.sub(r'[-\s]+', '_', safe_title)
|
|
|
|
filename = f"{safe_title}_{url_hash}.json"
|
|
filepath = os.path.join(self.data_dir, "pages", filename)
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
json.dump(page, f, indent=2, ensure_ascii=False)
|
|
|
|
def scrape_page(self, url):
|
|
"""Scrape a single page"""
|
|
try:
|
|
print(f" {url}")
|
|
|
|
headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'}
|
|
response = requests.get(url, headers=headers, timeout=30)
|
|
response.raise_for_status()
|
|
|
|
soup = BeautifulSoup(response.content, 'html.parser')
|
|
page = self.extract_content(soup, url)
|
|
|
|
self.save_page(page)
|
|
self.pages.append(page)
|
|
|
|
# Add new URLs
|
|
for link in page['links']:
|
|
if link not in self.visited_urls and link not in self.pending_urls:
|
|
self.pending_urls.append(link)
|
|
|
|
# Rate limiting
|
|
time.sleep(self.config.get('rate_limit', 0.5))
|
|
|
|
except Exception as e:
|
|
print(f" ✗ Error: {e}")
|
|
|
|
def scrape_all(self):
|
|
"""Scrape all pages"""
|
|
print(f"\n{'='*60}")
|
|
print(f"SCRAPING: {self.name}")
|
|
print(f"{'='*60}")
|
|
print(f"Base URL: {self.base_url}")
|
|
print(f"Output: {self.data_dir}\n")
|
|
|
|
max_pages = self.config.get('max_pages', 500)
|
|
|
|
while self.pending_urls and len(self.visited_urls) < max_pages:
|
|
url = self.pending_urls.popleft()
|
|
|
|
if url in self.visited_urls:
|
|
continue
|
|
|
|
self.visited_urls.add(url)
|
|
self.scrape_page(url)
|
|
|
|
if len(self.visited_urls) % 10 == 0:
|
|
print(f" [{len(self.visited_urls)} pages]")
|
|
|
|
print(f"\n✅ Scraped {len(self.visited_urls)} pages")
|
|
self.save_summary()
|
|
|
|
def save_summary(self):
|
|
"""Save scraping summary"""
|
|
summary = {
|
|
'name': self.name,
|
|
'total_pages': len(self.pages),
|
|
'base_url': self.base_url,
|
|
'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages]
|
|
}
|
|
|
|
with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f:
|
|
json.dump(summary, f, indent=2, ensure_ascii=False)
|
|
|
|
def load_scraped_data(self):
|
|
"""Load previously scraped data"""
|
|
pages = []
|
|
pages_dir = Path(self.data_dir) / "pages"
|
|
|
|
if not pages_dir.exists():
|
|
return []
|
|
|
|
for json_file in pages_dir.glob("*.json"):
|
|
try:
|
|
with open(json_file, 'r', encoding='utf-8') as f:
|
|
pages.append(json.load(f))
|
|
except Exception as e:
|
|
print(f"⚠ Error loading {json_file}: {e}")
|
|
|
|
return pages
|
|
|
|
def smart_categorize(self, pages):
|
|
"""Improved categorization with better pattern matching"""
|
|
category_defs = self.config.get('categories', {})
|
|
|
|
# Default smart categories if none provided
|
|
if not category_defs:
|
|
category_defs = self.infer_categories(pages)
|
|
|
|
categories = {cat: [] for cat in category_defs.keys()}
|
|
categories['other'] = []
|
|
|
|
for page in pages:
|
|
url = page['url'].lower()
|
|
title = page['title'].lower()
|
|
content = page.get('content', '').lower()[:500] # Check first 500 chars
|
|
|
|
categorized = False
|
|
|
|
# Match against keywords
|
|
for cat, keywords in category_defs.items():
|
|
score = 0
|
|
for keyword in keywords:
|
|
keyword = keyword.lower()
|
|
if keyword in url:
|
|
score += 3
|
|
if keyword in title:
|
|
score += 2
|
|
if keyword in content:
|
|
score += 1
|
|
|
|
if score >= 2: # Threshold for categorization
|
|
categories[cat].append(page)
|
|
categorized = True
|
|
break
|
|
|
|
if not categorized:
|
|
categories['other'].append(page)
|
|
|
|
# Remove empty categories
|
|
categories = {k: v for k, v in categories.items() if v}
|
|
|
|
return categories
|
|
|
|
def infer_categories(self, pages):
|
|
"""Infer categories from URL patterns (IMPROVED)"""
|
|
url_segments = defaultdict(int)
|
|
|
|
for page in pages:
|
|
path = urlparse(page['url']).path
|
|
segments = [s for s in path.split('/') if s and s not in ['en', 'stable', 'latest', 'docs']]
|
|
|
|
for seg in segments:
|
|
url_segments[seg] += 1
|
|
|
|
# Top segments become categories
|
|
top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8]
|
|
|
|
categories = {}
|
|
for seg, count in top_segments:
|
|
if count >= 3: # At least 3 pages
|
|
categories[seg] = [seg]
|
|
|
|
# Add common defaults
|
|
if 'tutorial' not in categories and any('tutorial' in url for url in [p['url'] for p in pages]):
|
|
categories['tutorials'] = ['tutorial', 'guide', 'getting-started']
|
|
|
|
if 'api' not in categories and any('api' in url or 'reference' in url for url in [p['url'] for p in pages]):
|
|
categories['api'] = ['api', 'reference', 'class']
|
|
|
|
return categories
|
|
|
|
def generate_quick_reference(self, pages):
|
|
"""Generate quick reference from common patterns (NEW FEATURE)"""
|
|
quick_ref = []
|
|
|
|
# Collect all patterns
|
|
all_patterns = []
|
|
for page in pages:
|
|
all_patterns.extend(page.get('patterns', []))
|
|
|
|
# Get most common code patterns
|
|
seen_codes = set()
|
|
for pattern in all_patterns:
|
|
code = pattern['code']
|
|
if code not in seen_codes and len(code) < 300:
|
|
quick_ref.append(pattern)
|
|
seen_codes.add(code)
|
|
if len(quick_ref) >= 15:
|
|
break
|
|
|
|
return quick_ref
|
|
|
|
def create_reference_file(self, category, pages):
|
|
"""Create enhanced reference file"""
|
|
if not pages:
|
|
return
|
|
|
|
lines = []
|
|
lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n")
|
|
lines.append(f"**Pages:** {len(pages)}\n")
|
|
lines.append("---\n")
|
|
|
|
for page in pages:
|
|
lines.append(f"## {page['title']}\n")
|
|
lines.append(f"**URL:** {page['url']}\n")
|
|
|
|
# Table of contents from headings
|
|
if page.get('headings'):
|
|
lines.append("**Contents:**")
|
|
for h in page['headings'][:10]:
|
|
level = int(h['level'][1]) if len(h['level']) > 1 else 1
|
|
indent = " " * max(0, level - 2)
|
|
lines.append(f"{indent}- {h['text']}")
|
|
lines.append("")
|
|
|
|
# Content
|
|
if page.get('content'):
|
|
content = page['content'][:2500]
|
|
if len(page['content']) > 2500:
|
|
content += "\n\n*[Content truncated]*"
|
|
lines.append(content)
|
|
lines.append("")
|
|
|
|
# Code examples with language
|
|
if page.get('code_samples'):
|
|
lines.append("**Examples:**\n")
|
|
for i, sample in enumerate(page['code_samples'][:4], 1):
|
|
lang = sample.get('language', 'unknown')
|
|
code = sample.get('code', sample if isinstance(sample, str) else '')
|
|
lines.append(f"Example {i} ({lang}):")
|
|
lines.append(f"```{lang}")
|
|
lines.append(code[:600])
|
|
if len(code) > 600:
|
|
lines.append("...")
|
|
lines.append("```\n")
|
|
|
|
lines.append("---\n")
|
|
|
|
filepath = os.path.join(self.skill_dir, "references", f"{category}.md")
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write('\n'.join(lines))
|
|
|
|
print(f" ✓ {category}.md ({len(pages)} pages)")
|
|
|
|
def create_enhanced_skill_md(self, categories, quick_ref):
|
|
"""Create SKILL.md with actual examples (IMPROVED)"""
|
|
description = self.config.get('description', f'Comprehensive assistance with {self.name}')
|
|
|
|
# Extract actual code examples from docs
|
|
example_codes = []
|
|
for pages in categories.values():
|
|
for page in pages[:3]: # First 3 pages per category
|
|
for sample in page.get('code_samples', [])[:2]: # First 2 samples per page
|
|
code = sample.get('code', sample if isinstance(sample, str) else '')
|
|
lang = sample.get('language', 'unknown')
|
|
if len(code) < 200 and lang != 'unknown':
|
|
example_codes.append((lang, code))
|
|
if len(example_codes) >= 10:
|
|
break
|
|
if len(example_codes) >= 10:
|
|
break
|
|
if len(example_codes) >= 10:
|
|
break
|
|
|
|
content = f"""---
|
|
name: {self.name}
|
|
description: {description}
|
|
---
|
|
|
|
# {self.name.title()} Skill
|
|
|
|
Comprehensive assistance with {self.name} development, generated from official documentation.
|
|
|
|
## When to Use This Skill
|
|
|
|
This skill should be triggered when:
|
|
- Working with {self.name}
|
|
- Asking about {self.name} features or APIs
|
|
- Implementing {self.name} solutions
|
|
- Debugging {self.name} code
|
|
- Learning {self.name} best practices
|
|
|
|
## Quick Reference
|
|
|
|
### Common Patterns
|
|
|
|
"""
|
|
|
|
# Add actual quick reference patterns
|
|
if quick_ref:
|
|
for i, pattern in enumerate(quick_ref[:8], 1):
|
|
content += f"**Pattern {i}:** {pattern.get('description', 'Example pattern')}\n\n"
|
|
content += "```\n"
|
|
content += pattern.get('code', '')[:300]
|
|
content += "\n```\n\n"
|
|
else:
|
|
content += "*Quick reference patterns will be added as you use the skill.*\n\n"
|
|
|
|
# Add example codes from docs
|
|
if example_codes:
|
|
content += "### Example Code Patterns\n\n"
|
|
for i, (lang, code) in enumerate(example_codes[:5], 1):
|
|
content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n"
|
|
|
|
content += f"""## Reference Files
|
|
|
|
This skill includes comprehensive documentation in `references/`:
|
|
|
|
"""
|
|
|
|
for cat in sorted(categories.keys()):
|
|
content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n"
|
|
|
|
content += """
|
|
Use `view` to read specific reference files when detailed information is needed.
|
|
|
|
## Working with This Skill
|
|
|
|
### For Beginners
|
|
Start with the getting_started or tutorials reference files for foundational concepts.
|
|
|
|
### For Specific Features
|
|
Use the appropriate category reference file (api, guides, etc.) for detailed information.
|
|
|
|
### For Code Examples
|
|
The quick reference section above contains common patterns extracted from the official docs.
|
|
|
|
## Resources
|
|
|
|
### references/
|
|
Organized documentation extracted from official sources. These files contain:
|
|
- Detailed explanations
|
|
- Code examples with language annotations
|
|
- Links to original documentation
|
|
- Table of contents for quick navigation
|
|
|
|
### scripts/
|
|
Add helper scripts here for common automation tasks.
|
|
|
|
### assets/
|
|
Add templates, boilerplate, or example projects here.
|
|
|
|
## Notes
|
|
|
|
- This skill was automatically generated from official documentation
|
|
- Reference files preserve the structure and examples from source docs
|
|
- Code examples include language detection for better syntax highlighting
|
|
- Quick reference patterns are extracted from common usage examples in the docs
|
|
|
|
## Updating
|
|
|
|
To refresh this skill with updated documentation:
|
|
1. Re-run the scraper with the same configuration
|
|
2. The skill will be rebuilt with the latest information
|
|
"""
|
|
|
|
filepath = os.path.join(self.skill_dir, "SKILL.md")
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write(content)
|
|
|
|
print(f" ✓ SKILL.md (enhanced with {len(example_codes)} examples)")
|
|
|
|
def create_index(self, categories):
|
|
"""Create navigation index"""
|
|
lines = []
|
|
lines.append(f"# {self.name.title()} Documentation Index\n")
|
|
lines.append("## Categories\n")
|
|
|
|
for cat, pages in sorted(categories.items()):
|
|
lines.append(f"### {cat.replace('_', ' ').title()}")
|
|
lines.append(f"**File:** `{cat}.md`")
|
|
lines.append(f"**Pages:** {len(pages)}\n")
|
|
|
|
filepath = os.path.join(self.skill_dir, "references", "index.md")
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write('\n'.join(lines))
|
|
|
|
print(" ✓ index.md")
|
|
|
|
def build_skill(self):
|
|
"""Build the skill from scraped data"""
|
|
print(f"\n{'='*60}")
|
|
print(f"BUILDING SKILL: {self.name}")
|
|
print(f"{'='*60}\n")
|
|
|
|
# Load data
|
|
print("Loading scraped data...")
|
|
pages = self.load_scraped_data()
|
|
|
|
if not pages:
|
|
print("✗ No scraped data found!")
|
|
return False
|
|
|
|
print(f" ✓ Loaded {len(pages)} pages\n")
|
|
|
|
# Categorize
|
|
print("Categorizing pages...")
|
|
categories = self.smart_categorize(pages)
|
|
print(f" ✓ Created {len(categories)} categories\n")
|
|
|
|
# Generate quick reference
|
|
print("Generating quick reference...")
|
|
quick_ref = self.generate_quick_reference(pages)
|
|
print(f" ✓ Extracted {len(quick_ref)} patterns\n")
|
|
|
|
# Create reference files
|
|
print("Creating reference files...")
|
|
for cat, cat_pages in categories.items():
|
|
self.create_reference_file(cat, cat_pages)
|
|
|
|
# Create index
|
|
self.create_index(categories)
|
|
print()
|
|
|
|
# Create enhanced SKILL.md
|
|
print("Creating SKILL.md...")
|
|
self.create_enhanced_skill_md(categories, quick_ref)
|
|
|
|
print(f"\n✅ Skill built: {self.skill_dir}/")
|
|
return True
|
|
|
|
|
|
def load_config(config_path):
    """Load a JSON configuration file and return it as a dict."""
    with open(config_path, 'r') as fh:
        return json.load(fh)
|
|
|
def interactive_config():
    """Build a scraper configuration by prompting the user on stdin."""
    print("\n" + "=" * 60)
    print("Documentation to Skill Converter")
    print("=" * 60 + "\n")

    # Basic identity first (prompt order matters for the user).
    config = {
        'name': input("Skill name (e.g., 'react', 'godot'): ").strip(),
        'description': input("Skill description: ").strip(),
        'base_url': input("Base URL (e.g., https://docs.example.com/): ").strip(),
    }

    # Normalize the base URL so prefix checks behave consistently.
    if not config['base_url'].endswith('/'):
        config['base_url'] += '/'

    # CSS selectors, with sensible defaults for common doc layouts.
    print("\nCSS Selectors (press Enter for defaults):")
    config['selectors'] = {
        'main_content': input(" Main content [div[role='main']]: ").strip() or "div[role='main']",
        'title': input(" Title [title]: ").strip() or "title",
        'code_blocks': input(" Code blocks [pre code]: ").strip() or "pre code",
    }

    # Optional comma-separated URL filters.
    print("\nURL Patterns (comma-separated, optional):")
    raw_include = input(" Include: ").strip()
    raw_exclude = input(" Exclude: ").strip()
    config['url_patterns'] = {
        'include': [p.strip() for p in raw_include.split(',') if p.strip()],
        'exclude': [p.strip() for p in raw_exclude.split(',') if p.strip()],
    }

    # Crawl throttling and limits.
    rate = input("\nRate limit (seconds) [0.5]: ").strip()
    config['rate_limit'] = float(rate) if rate else 0.5

    max_p = input("Max pages [500]: ").strip()
    config['max_pages'] = int(max_p) if max_p else 500

    return config
|
|
|
def check_existing_data(name):
    """Return (exists, page_count) for a previously scraped data set.

    Looks for output/<name>_data/summary.json; when present, reports the
    recorded page count.
    """
    data_dir = f"output/{name}_data"
    summary_path = f"{data_dir}/summary.json"
    if os.path.exists(data_dir) and os.path.exists(summary_path):
        with open(summary_path, 'r') as fh:
            return True, json.load(fh).get('total_pages', 0)
    return False, 0
|
|
|
def main():
    """CLI entry point: resolve configuration, scrape (unless skipped),
    build the skill, and optionally run the enhancement helpers."""
    parser = argparse.ArgumentParser(
        description='Convert documentation websites to Claude skills',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('--interactive', '-i', action='store_true',
                        help='Interactive configuration mode')
    parser.add_argument('--config', '-c', type=str,
                        help='Load configuration from file (e.g., configs/godot.json)')
    parser.add_argument('--name', type=str,
                        help='Skill name')
    parser.add_argument('--url', type=str,
                        help='Base documentation URL')
    parser.add_argument('--description', '-d', type=str,
                        help='Skill description')
    parser.add_argument('--skip-scrape', action='store_true',
                        help='Skip scraping, use existing data')
    parser.add_argument('--enhance', action='store_true',
                        help='Enhance SKILL.md using Claude API after building (requires API key)')
    parser.add_argument('--enhance-local', action='store_true',
                        help='Enhance SKILL.md using Claude Code in new terminal (no API key needed)')
    parser.add_argument('--api-key', type=str,
                        help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)')
    args = parser.parse_args()

    # Configuration precedence: config file > interactive > bare CLI flags.
    if args.config:
        config = load_config(args.config)
    elif args.interactive or not (args.name and args.url):
        config = interactive_config()
    else:
        config = {
            'name': args.name,
            'description': args.description or f'Comprehensive assistance with {args.name}',
            'base_url': args.url,
            'selectors': {
                'main_content': "div[role='main']",
                'title': 'title',
                'code_blocks': 'pre code'
            },
            'url_patterns': {'include': [], 'exclude': []},
            'rate_limit': 0.5,
            'max_pages': 500
        }

    # Offer to reuse a previous crawl instead of re-scraping.
    exists, page_count = check_existing_data(config['name'])
    if exists and not args.skip_scrape:
        print(f"\n✓ Found existing data: {page_count} pages")
        if input("Use existing data? (y/n): ").strip().lower() == 'y':
            args.skip_scrape = True

    converter = DocToSkillConverter(config)

    # Scrape unless the user opted out; a Ctrl-C mid-crawl may still
    # proceed to the build step with whatever was collected.
    if args.skip_scrape:
        print(f"\n⏭️ Skipping scrape, using existing data")
    else:
        try:
            converter.scrape_all()
        except KeyboardInterrupt:
            print("\n\nScraping interrupted.")
            if input("Continue with skill building? (y/n): ").strip().lower() != 'y':
                return

    if not converter.build_skill():
        sys.exit(1)

    # Optional enhancement with Claude API
    if args.enhance:
        print(f"\n{'='*60}")
        print(f"ENHANCING SKILL.MD WITH CLAUDE API")
        print(f"{'='*60}\n")
        try:
            import subprocess
            enhance_cmd = ['python3', 'enhance_skill.py', f'output/{config["name"]}/']
            if args.api_key:
                enhance_cmd.extend(['--api-key', args.api_key])
            if subprocess.run(enhance_cmd, check=True).returncode == 0:
                print("\n✅ Enhancement complete!")
        except subprocess.CalledProcessError:
            print("\n⚠ Enhancement failed, but skill was still built")
        except FileNotFoundError:
            print("\n⚠ enhance_skill.py not found. Run manually:")
            print(f" python3 enhance_skill.py output/{config['name']}/")

    # Optional enhancement with Claude Code (local, no API key)
    if args.enhance_local:
        print(f"\n{'='*60}")
        print(f"ENHANCING SKILL.MD WITH CLAUDE CODE (LOCAL)")
        print(f"{'='*60}\n")
        try:
            import subprocess
            subprocess.run(['python3', 'enhance_skill_local.py', f'output/{config["name"]}/'],
                           check=True)
        except subprocess.CalledProcessError:
            print("\n⚠ Enhancement failed, but skill was still built")
        except FileNotFoundError:
            print("\n⚠ enhance_skill_local.py not found. Run manually:")
            print(f" python3 enhance_skill_local.py output/{config['name']}/")

    print(f"\n📦 Package your skill:")
    print(f" python3 /mnt/skills/examples/skill-creator/scripts/package_skill.py output/{config['name']}/")

    if not args.enhance and not args.enhance_local:
        print(f"\n💡 Optional: Enhance SKILL.md with Claude:")
        print(f" API-based: python3 enhance_skill.py output/{config['name']}/")
        print(f" or re-run with: --enhance")
        print(f" Local (no API key): python3 enhance_skill_local.py output/{config['name']}/")
        print(f" or re-run with: --enhance-local")
|
|
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|