- Add Markdown file parsing in doc_scraper (_extract_markdown_content, _extract_html_as_markdown) - Add URL extraction and cleaning in llms_txt_parser (extract_urls, _clean_url) - Support multiple documentation/github/pdf sources in unified_scraper - Generate separate reference directories per source in unified_skill_builder - Skip pages with empty/short content (<50 chars) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
513 lines
18 KiB
Python
513 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Unified Multi-Source Scraper
|
|
|
|
Orchestrates scraping from multiple sources (documentation, GitHub, PDF),
|
|
detects conflicts, merges intelligently, and builds unified skills.
|
|
|
|
This is the main entry point for unified config workflow.
|
|
|
|
Usage:
|
|
skill-seekers unified --config configs/godot_unified.json
|
|
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import json
|
|
import logging
|
|
import argparse
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
# Import validators and scrapers
|
|
try:
|
|
from skill_seekers.cli.config_validator import ConfigValidator, validate_config
|
|
from skill_seekers.cli.conflict_detector import ConflictDetector
|
|
from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
|
|
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
|
|
except ImportError as e:
|
|
print(f"Error importing modules: {e}")
|
|
print("Make sure you're running from the project root directory")
|
|
sys.exit(1)
|
|
|
|
# Module-wide logging: timestamped, INFO-level messages for all phases.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
class UnifiedScraper:
|
|
"""
|
|
Orchestrates multi-source scraping and merging.
|
|
|
|
Main workflow:
|
|
1. Load and validate unified config
|
|
2. Scrape all sources (docs, GitHub, PDF)
|
|
3. Detect conflicts between sources
|
|
4. Merge intelligently (rule-based or Claude-enhanced)
|
|
5. Build unified skill
|
|
"""
|
|
|
|
def __init__(self, config_path: str, merge_mode: Optional[str] = None):
|
|
"""
|
|
Initialize unified scraper.
|
|
|
|
Args:
|
|
config_path: Path to unified config JSON
|
|
merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced')
|
|
"""
|
|
self.config_path = config_path
|
|
|
|
# Validate and load config
|
|
logger.info(f"Loading config: {config_path}")
|
|
self.validator = validate_config(config_path)
|
|
self.config = self.validator.config
|
|
|
|
# Determine merge mode
|
|
self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
|
|
logger.info(f"Merge mode: {self.merge_mode}")
|
|
|
|
# Storage for scraped data - use lists to support multiple sources of same type
|
|
self.scraped_data = {
|
|
'documentation': [], # List of doc sources
|
|
'github': [], # List of github sources
|
|
'pdf': [] # List of pdf sources
|
|
}
|
|
|
|
# Output paths
|
|
self.name = self.config['name']
|
|
self.output_dir = f"output/{self.name}"
|
|
self.data_dir = f"output/{self.name}_unified_data"
|
|
|
|
os.makedirs(self.output_dir, exist_ok=True)
|
|
os.makedirs(self.data_dir, exist_ok=True)
|
|
|
|
# Track source index for unique naming
|
|
self._source_counters = {'documentation': 0, 'github': 0, 'pdf': 0}
|
|
|
|
def scrape_all_sources(self):
|
|
"""
|
|
Scrape all configured sources.
|
|
|
|
Routes to appropriate scraper based on source type.
|
|
"""
|
|
logger.info("=" * 60)
|
|
logger.info("PHASE 1: Scraping all sources")
|
|
logger.info("=" * 60)
|
|
|
|
if not self.validator.is_unified:
|
|
logger.warning("Config is not unified format, converting...")
|
|
self.config = self.validator.convert_legacy_to_unified()
|
|
|
|
sources = self.config.get('sources', [])
|
|
|
|
for i, source in enumerate(sources):
|
|
source_type = source['type']
|
|
logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
|
|
|
|
try:
|
|
if source_type == 'documentation':
|
|
self._scrape_documentation(source)
|
|
elif source_type == 'github':
|
|
self._scrape_github(source)
|
|
elif source_type == 'pdf':
|
|
self._scrape_pdf(source)
|
|
else:
|
|
logger.warning(f"Unknown source type: {source_type}")
|
|
except Exception as e:
|
|
logger.error(f"Error scraping {source_type}: {e}")
|
|
logger.info("Continuing with other sources...")
|
|
|
|
logger.info(f"\n✅ Scraped {sum(len(v) for v in self.scraped_data.values())} sources successfully")
|
|
|
|
def _scrape_documentation(self, source: Dict[str, Any]):
|
|
"""Scrape documentation website."""
|
|
# Get unique index for this documentation source
|
|
idx = self._source_counters['documentation']
|
|
self._source_counters['documentation'] += 1
|
|
|
|
# Extract source identifier from URL for unique naming
|
|
from urllib.parse import urlparse
|
|
parsed = urlparse(source['base_url'])
|
|
source_id = parsed.netloc.replace('.', '_').replace(':', '_')
|
|
|
|
# Create temporary config for doc scraper with unique name
|
|
doc_config = {
|
|
'name': f"{self.name}_docs_{idx}_{source_id}",
|
|
'base_url': source['base_url'],
|
|
'selectors': source.get('selectors', {}),
|
|
'url_patterns': source.get('url_patterns', {}),
|
|
'categories': source.get('categories', {}),
|
|
'rate_limit': source.get('rate_limit', 0.5),
|
|
'max_pages': source.get('max_pages', 100)
|
|
}
|
|
|
|
# Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
|
|
if 'llms_txt_url' in source:
|
|
doc_config['llms_txt_url'] = source.get('llms_txt_url')
|
|
|
|
if 'skip_llms_txt' in source:
|
|
doc_config['skip_llms_txt'] = source.get('skip_llms_txt')
|
|
|
|
# Optional: support overriding start URLs
|
|
if 'start_urls' in source:
|
|
doc_config['start_urls'] = source.get('start_urls')
|
|
|
|
# Write temporary config
|
|
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
|
|
with open(temp_config_path, 'w', encoding='utf-8') as f:
|
|
json.dump(doc_config, f, indent=2)
|
|
|
|
# Run doc_scraper as subprocess
|
|
logger.info(f"Scraping documentation from {source['base_url']}")
|
|
|
|
doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
|
|
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path]
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
|
|
if result.returncode != 0:
|
|
logger.error(f"Documentation scraping failed: {result.stderr}")
|
|
return
|
|
|
|
# Load scraped data
|
|
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
|
|
|
|
if os.path.exists(docs_data_file):
|
|
with open(docs_data_file, 'r', encoding='utf-8') as f:
|
|
summary = json.load(f)
|
|
|
|
# Append to list instead of overwriting
|
|
self.scraped_data['documentation'].append({
|
|
'source_id': source_id,
|
|
'base_url': source['base_url'],
|
|
'pages': summary.get('pages', []),
|
|
'total_pages': summary.get('total_pages', 0),
|
|
'data_file': docs_data_file,
|
|
'refs_dir': f"output/{doc_config['name']}/references"
|
|
})
|
|
|
|
logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
|
|
else:
|
|
logger.warning("Documentation data file not found")
|
|
|
|
# Clean up temp config
|
|
if os.path.exists(temp_config_path):
|
|
os.remove(temp_config_path)
|
|
|
|
def _scrape_github(self, source: Dict[str, Any]):
|
|
"""Scrape GitHub repository."""
|
|
try:
|
|
from skill_seekers.cli.github_scraper import GitHubScraper
|
|
except ImportError:
|
|
logger.error("github_scraper.py not found")
|
|
return
|
|
|
|
# Get unique index for this GitHub source
|
|
idx = self._source_counters['github']
|
|
self._source_counters['github'] += 1
|
|
|
|
# Extract repo identifier for unique naming
|
|
repo = source['repo']
|
|
repo_id = repo.replace('/', '_')
|
|
|
|
# Create config for GitHub scraper
|
|
github_config = {
|
|
'repo': repo,
|
|
'name': f"{self.name}_github_{idx}_{repo_id}",
|
|
'github_token': source.get('github_token'),
|
|
'include_issues': source.get('include_issues', True),
|
|
'max_issues': source.get('max_issues', 100),
|
|
'include_changelog': source.get('include_changelog', True),
|
|
'include_releases': source.get('include_releases', True),
|
|
'include_code': source.get('include_code', True),
|
|
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
|
|
'file_patterns': source.get('file_patterns', []),
|
|
'local_repo_path': source.get('local_repo_path')
|
|
}
|
|
|
|
# Pass directory exclusions if specified (optional)
|
|
if 'exclude_dirs' in source:
|
|
github_config['exclude_dirs'] = source['exclude_dirs']
|
|
if 'exclude_dirs_additional' in source:
|
|
github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
|
|
|
|
# Scrape
|
|
logger.info(f"Scraping GitHub repository: {repo}")
|
|
scraper = GitHubScraper(github_config)
|
|
github_data = scraper.scrape()
|
|
|
|
# Save data with unique filename
|
|
github_data_file = os.path.join(self.data_dir, f'github_data_{idx}_{repo_id}.json')
|
|
with open(github_data_file, 'w', encoding='utf-8') as f:
|
|
json.dump(github_data, f, indent=2, ensure_ascii=False)
|
|
|
|
# Append to list instead of overwriting
|
|
self.scraped_data['github'].append({
|
|
'repo': repo,
|
|
'repo_id': repo_id,
|
|
'data': github_data,
|
|
'data_file': github_data_file
|
|
})
|
|
|
|
logger.info(f"✅ GitHub: Repository scraped successfully")
|
|
|
|
def _scrape_pdf(self, source: Dict[str, Any]):
|
|
"""Scrape PDF document."""
|
|
try:
|
|
from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
|
|
except ImportError:
|
|
logger.error("pdf_scraper.py not found")
|
|
return
|
|
|
|
# Create config for PDF scraper
|
|
pdf_config = {
|
|
'name': f"{self.name}_pdf",
|
|
'pdf': source['path'],
|
|
'extract_tables': source.get('extract_tables', False),
|
|
'ocr': source.get('ocr', False),
|
|
'password': source.get('password')
|
|
}
|
|
|
|
# Scrape
|
|
logger.info(f"Scraping PDF: {source['path']}")
|
|
converter = PDFToSkillConverter(pdf_config)
|
|
pdf_data = converter.extract_all()
|
|
|
|
# Save data
|
|
pdf_data_file = os.path.join(self.data_dir, 'pdf_data.json')
|
|
with open(pdf_data_file, 'w', encoding='utf-8') as f:
|
|
json.dump(pdf_data, f, indent=2, ensure_ascii=False)
|
|
|
|
self.scraped_data['pdf'] = {
|
|
'data': pdf_data,
|
|
'data_file': pdf_data_file
|
|
}
|
|
|
|
logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
|
|
|
|
def detect_conflicts(self) -> List:
|
|
"""
|
|
Detect conflicts between documentation and code.
|
|
|
|
Only applicable if both documentation and GitHub sources exist.
|
|
|
|
Returns:
|
|
List of conflicts
|
|
"""
|
|
logger.info("\n" + "=" * 60)
|
|
logger.info("PHASE 2: Detecting conflicts")
|
|
logger.info("=" * 60)
|
|
|
|
if not self.validator.needs_api_merge():
|
|
logger.info("No API merge needed (only one API source)")
|
|
return []
|
|
|
|
# Get documentation and GitHub data (now lists)
|
|
docs_list = self.scraped_data.get('documentation', [])
|
|
github_list = self.scraped_data.get('github', [])
|
|
|
|
if not docs_list or not github_list:
|
|
logger.warning("Missing documentation or GitHub data for conflict detection")
|
|
return []
|
|
|
|
# For conflict detection, combine all docs and all github data
|
|
# Use the first of each for now (conflict detection is optional)
|
|
docs_data = docs_list[0] if docs_list else {}
|
|
github_data = github_list[0] if github_list else {}
|
|
|
|
if not docs_data.get('data_file') or not github_data.get('data_file'):
|
|
logger.warning("Missing data files for conflict detection")
|
|
return []
|
|
|
|
# Load data files
|
|
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
|
docs_json = json.load(f)
|
|
|
|
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
|
github_json = json.load(f)
|
|
|
|
# Detect conflicts
|
|
detector = ConflictDetector(docs_json, github_json)
|
|
conflicts = detector.detect_all_conflicts()
|
|
|
|
# Save conflicts
|
|
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
|
detector.save_conflicts(conflicts, conflicts_file)
|
|
|
|
# Print summary
|
|
summary = detector.generate_summary(conflicts)
|
|
logger.info(f"\n📊 Conflict Summary:")
|
|
logger.info(f" Total: {summary['total']}")
|
|
logger.info(f" By Type:")
|
|
for ctype, count in summary['by_type'].items():
|
|
if count > 0:
|
|
logger.info(f" - {ctype}: {count}")
|
|
logger.info(f" By Severity:")
|
|
for severity, count in summary['by_severity'].items():
|
|
if count > 0:
|
|
emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
|
|
logger.info(f" {emoji} {severity}: {count}")
|
|
|
|
return conflicts
|
|
|
|
def merge_sources(self, conflicts: List):
|
|
"""
|
|
Merge data from multiple sources.
|
|
|
|
Args:
|
|
conflicts: List of detected conflicts
|
|
"""
|
|
logger.info("\n" + "=" * 60)
|
|
logger.info(f"PHASE 3: Merging sources ({self.merge_mode})")
|
|
logger.info("=" * 60)
|
|
|
|
if not conflicts:
|
|
logger.info("No conflicts to merge")
|
|
return None
|
|
|
|
# Get data files (now lists)
|
|
docs_list = self.scraped_data.get('documentation', [])
|
|
github_list = self.scraped_data.get('github', [])
|
|
|
|
if not docs_list or not github_list:
|
|
logger.warning("Missing data for merge")
|
|
return None
|
|
|
|
# Use first source of each type for merge
|
|
docs_data = docs_list[0]
|
|
github_data = github_list[0]
|
|
|
|
# Load data
|
|
with open(docs_data['data_file'], 'r', encoding='utf-8') as f:
|
|
docs_json = json.load(f)
|
|
|
|
with open(github_data['data_file'], 'r', encoding='utf-8') as f:
|
|
github_json = json.load(f)
|
|
|
|
# Choose merger
|
|
if self.merge_mode == 'claude-enhanced':
|
|
merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
|
|
else:
|
|
merger = RuleBasedMerger(docs_json, github_json, conflicts)
|
|
|
|
# Merge
|
|
merged_data = merger.merge_all()
|
|
|
|
# Save merged data
|
|
merged_file = os.path.join(self.data_dir, 'merged_data.json')
|
|
with open(merged_file, 'w', encoding='utf-8') as f:
|
|
json.dump(merged_data, f, indent=2, ensure_ascii=False)
|
|
|
|
logger.info(f"✅ Merged data saved: {merged_file}")
|
|
|
|
return merged_data
|
|
|
|
def build_skill(self, merged_data: Optional[Dict] = None):
|
|
"""
|
|
Build final unified skill.
|
|
|
|
Args:
|
|
merged_data: Merged API data (if conflicts were resolved)
|
|
"""
|
|
logger.info("\n" + "=" * 60)
|
|
logger.info("PHASE 4: Building unified skill")
|
|
logger.info("=" * 60)
|
|
|
|
# Load conflicts if they exist
|
|
conflicts = []
|
|
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
|
if os.path.exists(conflicts_file):
|
|
with open(conflicts_file, 'r', encoding='utf-8') as f:
|
|
conflicts_data = json.load(f)
|
|
conflicts = conflicts_data.get('conflicts', [])
|
|
|
|
# Build skill
|
|
builder = UnifiedSkillBuilder(
|
|
self.config,
|
|
self.scraped_data,
|
|
merged_data,
|
|
conflicts
|
|
)
|
|
|
|
builder.build()
|
|
|
|
logger.info(f"✅ Unified skill built: {self.output_dir}/")
|
|
|
|
def run(self):
|
|
"""
|
|
Execute complete unified scraping workflow.
|
|
"""
|
|
logger.info("\n" + "🚀 " * 20)
|
|
logger.info(f"Unified Scraper: {self.config['name']}")
|
|
logger.info("🚀 " * 20 + "\n")
|
|
|
|
try:
|
|
# Phase 1: Scrape all sources
|
|
self.scrape_all_sources()
|
|
|
|
# Phase 2: Detect conflicts (if applicable)
|
|
conflicts = self.detect_conflicts()
|
|
|
|
# Phase 3: Merge sources (if conflicts exist)
|
|
merged_data = None
|
|
if conflicts:
|
|
merged_data = self.merge_sources(conflicts)
|
|
|
|
# Phase 4: Build skill
|
|
self.build_skill(merged_data)
|
|
|
|
logger.info("\n" + "✅ " * 20)
|
|
logger.info("Unified scraping complete!")
|
|
logger.info("✅ " * 20 + "\n")
|
|
|
|
logger.info(f"📁 Output: {self.output_dir}/")
|
|
logger.info(f"📁 Data: {self.data_dir}/")
|
|
|
|
except KeyboardInterrupt:
|
|
logger.info("\n\n⚠️ Scraping interrupted by user")
|
|
sys.exit(1)
|
|
except Exception as e:
|
|
logger.error(f"\n\n❌ Error during scraping: {e}")
|
|
import traceback
|
|
traceback.print_exc()
|
|
sys.exit(1)
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the unified scraper."""
    parser = argparse.ArgumentParser(
        description='Unified multi-source scraper',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic usage with unified config
skill-seekers unified --config configs/godot_unified.json

# Override merge mode
skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced

# Backward compatible with legacy configs
skill-seekers unified --config configs/react.json
"""
    )

    # Required config path plus an optional merge-mode override.
    parser.add_argument('--config', '-c', required=True,
                        help='Path to unified config JSON file')
    parser.add_argument('--merge-mode', '-m',
                        choices=['rule-based', 'claude-enhanced'],
                        help='Override config merge mode')

    cli_args = parser.parse_args()

    # Create and run scraper
    UnifiedScraper(cli_args.config, cli_args.merge_mode).run()
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()