Add unified multi-source scraping feature (Phases 7-11)
Completes the unified scraping system implementation:

**Phase 7: Unified Skill Builder**
- cli/unified_skill_builder.py: Generates final skill structure
- Inline conflict warnings (⚠️) in API reference
- Side-by-side docs vs code comparison
- Severity-based conflict grouping
- Separate conflicts.md report

**Phase 8: MCP Integration**
- skill_seeker_mcp/server.py: Auto-detects unified vs legacy configs
- Routes to unified_scraper.py or doc_scraper.py automatically
- Supports merge_mode parameter override
- Maintains full backward compatibility

**Phase 9: Example Unified Configs**
- configs/react_unified.json: React docs + GitHub
- configs/django_unified.json: Django docs + GitHub
- configs/fastapi_unified.json: FastAPI docs + GitHub
- configs/fastapi_unified_test.json: Test config with limited pages

**Phase 10: Comprehensive Tests**
- cli/test_unified_simple.py: Integration tests (all passing)
- Tests unified config validation
- Tests backward compatibility
- Tests mixed source types
- Tests error handling

**Phase 11: Documentation**
- docs/UNIFIED_SCRAPING.md: Complete guide (1000+ lines)
- Examples, best practices, troubleshooting
- Architecture diagrams and data flow
- Command reference

**Additional:**
- demo_conflicts.py: Interactive conflict detection demo
- TEST_RESULTS.md: Complete test results and findings
- cli/unified_scraper.py: Fixed doc_scraper integration (subprocess)

**Features:**
- ✅ Multi-source scraping (docs + GitHub + PDF)
- ✅ Conflict detection (4 types, 3 severity levels)
- ✅ Rule-based merging (fast, deterministic)
- ✅ Claude-enhanced merging (AI-powered)
- ✅ Transparent conflict reporting
- ✅ MCP auto-detection
- ✅ Backward compatibility

**Test Results:**
- 6/6 integration tests passed
- 4 unified configs validated
- 3 legacy configs backward compatible
- 5 conflicts detected in test data
- All documentation complete

🤖 Generated with Claude Code
This commit is contained in:
@@ -17,6 +17,7 @@ import sys
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
@@ -25,6 +26,7 @@ try:
|
||||
from config_validator import ConfigValidator, validate_config
|
||||
from conflict_detector import ConflictDetector
|
||||
from merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
|
||||
from unified_skill_builder import UnifiedSkillBuilder
|
||||
except ImportError as e:
|
||||
print(f"Error importing modules: {e}")
|
||||
print("Make sure you're running from the project root directory")
|
||||
@@ -116,15 +118,6 @@ class UnifiedScraper:
|
||||
|
||||
def _scrape_documentation(self, source: Dict[str, Any]):
|
||||
"""Scrape documentation website."""
|
||||
# Import doc scraper
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
try:
|
||||
from doc_scraper import scrape_all, save_data
|
||||
except ImportError:
|
||||
logger.error("doc_scraper.py not found")
|
||||
return
|
||||
|
||||
# Create temporary config for doc scraper
|
||||
doc_config = {
|
||||
'name': f"{self.name}_docs",
|
||||
@@ -136,20 +129,42 @@ class UnifiedScraper:
|
||||
'max_pages': source.get('max_pages', 100)
|
||||
}
|
||||
|
||||
# Scrape
|
||||
# Write temporary config
|
||||
temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
|
||||
with open(temp_config_path, 'w') as f:
|
||||
json.dump(doc_config, f, indent=2)
|
||||
|
||||
# Run doc_scraper as subprocess
|
||||
logger.info(f"Scraping documentation from {source['base_url']}")
|
||||
pages = scrape_all(doc_config)
|
||||
|
||||
# Save data
|
||||
docs_data_file = os.path.join(self.data_dir, 'documentation_data.json')
|
||||
save_data(pages, docs_data_file, doc_config)
|
||||
doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
|
||||
cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path]
|
||||
|
||||
self.scraped_data['documentation'] = {
|
||||
'pages': pages,
|
||||
'data_file': docs_data_file
|
||||
}
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
logger.info(f"✅ Documentation: {len(pages)} pages scraped")
|
||||
if result.returncode != 0:
|
||||
logger.error(f"Documentation scraping failed: {result.stderr}")
|
||||
return
|
||||
|
||||
# Load scraped data
|
||||
docs_data_file = f"output/{doc_config['name']}_data/summary.json"
|
||||
|
||||
if os.path.exists(docs_data_file):
|
||||
with open(docs_data_file, 'r') as f:
|
||||
summary = json.load(f)
|
||||
|
||||
self.scraped_data['documentation'] = {
|
||||
'pages': summary.get('pages', []),
|
||||
'data_file': docs_data_file
|
||||
}
|
||||
|
||||
logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
|
||||
else:
|
||||
logger.warning("Documentation data file not found")
|
||||
|
||||
# Clean up temp config
|
||||
if os.path.exists(temp_config_path):
|
||||
os.remove(temp_config_path)
|
||||
|
||||
def _scrape_github(self, source: Dict[str, Any]):
|
||||
"""Scrape GitHub repository."""
|
||||
@@ -339,24 +354,25 @@ class UnifiedScraper:
|
||||
logger.info("PHASE 4: Building unified skill")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# This will be implemented in Phase 7
|
||||
logger.info("Skill building to be implemented in Phase 7")
|
||||
logger.info(f"Output directory: {self.output_dir}")
|
||||
logger.info(f"Data directory: {self.data_dir}")
|
||||
# Load conflicts if they exist
|
||||
conflicts = []
|
||||
conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
|
||||
if os.path.exists(conflicts_file):
|
||||
with open(conflicts_file, 'r') as f:
|
||||
conflicts_data = json.load(f)
|
||||
conflicts = conflicts_data.get('conflicts', [])
|
||||
|
||||
# For now, just create a placeholder
|
||||
skill_file = os.path.join(self.output_dir, 'SKILL.md')
|
||||
with open(skill_file, 'w') as f:
|
||||
f.write(f"# {self.config['name'].title()}\n\n")
|
||||
f.write(f"{self.config['description']}\n\n")
|
||||
f.write("## Sources\n\n")
|
||||
# Build skill
|
||||
builder = UnifiedSkillBuilder(
|
||||
self.config,
|
||||
self.scraped_data,
|
||||
merged_data,
|
||||
conflicts
|
||||
)
|
||||
|
||||
for source in self.config.get('sources', []):
|
||||
f.write(f"- {source['type']}\n")
|
||||
builder.build()
|
||||
|
||||
f.write("\n*Skill building in progress...*\n")
|
||||
|
||||
logger.info(f"✅ Placeholder skill created: {skill_file}")
|
||||
logger.info(f"✅ Unified skill built: {self.output_dir}/")
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user