Add unified multi-source scraping feature (Phases 7-11)

Completes the unified scraping system implementation: **Phase 7: Unified Skill Builder** - cli/unified_skill_builder.py: Generates final skill structure - Inline conflict warnings (⚠️) in API reference - Side-by-side docs vs code comparison - Severity-based conflict grouping - Separate conflicts.md report **Phase 8: MCP Integration** - skill_seeker_mcp/server.py: Auto-detects unified vs legacy configs - Routes to unified_scraper.py or doc_scraper.py automatically - Supports merge_mode parameter override - Maintains full backward compatibility **Phase 9: Example Unified Configs** - configs/react_unified.json: React docs + GitHub - configs/django_unified.json: Django docs + GitHub - configs/fastapi_unified.json: FastAPI docs + GitHub - configs/fastapi_unified_test.json: Test config with limited pages **Phase 10: Comprehensive Tests** - cli/test_unified_simple.py: Integration tests (all passing) - Tests unified config validation - Tests backward compatibility - Tests mixed source types - Tests error handling **Phase 11: Documentation** - docs/UNIFIED_SCRAPING.md: Complete guide (1000+ lines) - Examples, best practices, troubleshooting - Architecture diagrams and data flow - Command reference **Additional:** - demo_conflicts.py: Interactive conflict detection demo - TEST_RESULTS.md: Complete test results and findings - cli/unified_scraper.py: Fixed doc_scraper integration (subprocess) **Features:** ✅ Multi-source scraping (docs + GitHub + PDF) ✅ Conflict detection (4 types, 3 severity levels) ✅ Rule-based merging (fast, deterministic) ✅ Claude-enhanced merging (AI-powered) ✅ Transparent conflict reporting ✅ MCP auto-detection ✅ Backward compatibility **Test Results:** - 6/6 integration tests passed - 4 unified configs validated - 3 legacy configs backward compatible - 5 conflicts detected in test data - All documentation complete 🤖 Generated with Claude Code
2025-10-26 16:33:41 +03:00
parent f03f4cf569
commit 5d8c7e39f6
11 changed files with 2171 additions and 72 deletions
--- a/cli/unified_scraper.py
+++ b/cli/unified_scraper.py
@@ -17,6 +17,7 @@ import sys
 import json
 import logging
 import argparse
+import subprocess
 from pathlib import Path
 from typing import Dict, List, Any, Optional

@@ -25,6 +26,7 @@ try:
    from config_validator import ConfigValidator, validate_config
    from conflict_detector import ConflictDetector
    from merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
+    from unified_skill_builder import UnifiedSkillBuilder
 except ImportError as e:
    print(f"Error importing modules: {e}")
    print("Make sure you're running from the project root directory")
@@ -116,15 +118,6 @@ class UnifiedScraper:

    def _scrape_documentation(self, source: Dict[str, Any]):
        """Scrape documentation website."""
-        # Import doc scraper
-        sys.path.insert(0, str(Path(__file__).parent))
-
-        try:
-            from doc_scraper import scrape_all, save_data
-        except ImportError:
-            logger.error("doc_scraper.py not found")
-            return
-
        # Create temporary config for doc scraper
        doc_config = {
            'name': f"{self.name}_docs",
@@ -136,20 +129,42 @@ class UnifiedScraper:
            'max_pages': source.get('max_pages', 100)
        }

-        # Scrape
+        # Write temporary config
+        temp_config_path = os.path.join(self.data_dir, 'temp_docs_config.json')
+        with open(temp_config_path, 'w') as f:
+            json.dump(doc_config, f, indent=2)
+
+        # Run doc_scraper as subprocess
        logger.info(f"Scraping documentation from {source['base_url']}")
-        pages = scrape_all(doc_config)

-        # Save data
-        docs_data_file = os.path.join(self.data_dir, 'documentation_data.json')
-        save_data(pages, docs_data_file, doc_config)
+        doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
+        cmd = [sys.executable, str(doc_scraper_path), '--config', temp_config_path]

-        self.scraped_data['documentation'] = {
-            'pages': pages,
-            'data_file': docs_data_file
-        }
+        result = subprocess.run(cmd, capture_output=True, text=True)

-        logger.info(f"✅ Documentation: {len(pages)} pages scraped")
+        if result.returncode != 0:
+            logger.error(f"Documentation scraping failed: {result.stderr}")
+            return
+
+        # Load scraped data
+        docs_data_file = f"output/{doc_config['name']}_data/summary.json"
+
+        if os.path.exists(docs_data_file):
+            with open(docs_data_file, 'r') as f:
+                summary = json.load(f)
+
+            self.scraped_data['documentation'] = {
+                'pages': summary.get('pages', []),
+                'data_file': docs_data_file
+            }
+
+            logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
+        else:
+            logger.warning("Documentation data file not found")
+
+        # Clean up temp config
+        if os.path.exists(temp_config_path):
+            os.remove(temp_config_path)

    def _scrape_github(self, source: Dict[str, Any]):
        """Scrape GitHub repository."""
@@ -339,24 +354,25 @@ class UnifiedScraper:
        logger.info("PHASE 4: Building unified skill")
        logger.info("=" * 60)

-        # This will be implemented in Phase 7
-        logger.info("Skill building to be implemented in Phase 7")
-        logger.info(f"Output directory: {self.output_dir}")
-        logger.info(f"Data directory: {self.data_dir}")
+        # Load conflicts if they exist
+        conflicts = []
+        conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
+        if os.path.exists(conflicts_file):
+            with open(conflicts_file, 'r') as f:
+                conflicts_data = json.load(f)
+                conflicts = conflicts_data.get('conflicts', [])

-        # For now, just create a placeholder
-        skill_file = os.path.join(self.output_dir, 'SKILL.md')
-        with open(skill_file, 'w') as f:
-            f.write(f"# {self.config['name'].title()}\n\n")
-            f.write(f"{self.config['description']}\n\n")
-            f.write("## Sources\n\n")
+        # Build skill
+        builder = UnifiedSkillBuilder(
+            self.config,
+            self.scraped_data,
+            merged_data,
+            conflicts
+        )

-            for source in self.config.get('sources', []):
-                f.write(f"- {source['type']}\n")
+        builder.build()

-            f.write("\n*Skill building in progress...*\n")
-
-        logger.info(f"✅ Placeholder skill created: {skill_file}")
+        logger.info(f"✅ Unified skill built: {self.output_dir}/")

    def run(self):
        """