feat: Phase 6 - Unified scraper orchestrator

Created main orchestrator that coordinates entire workflow: Architecture: - UnifiedScraper class orchestrates all phases - Routes to appropriate scraper based on source type - Supports any combination of sources 4-Phase Workflow: 1. Scrape all sources (docs, GitHub, PDF) 2. Detect conflicts (if multiple API sources) 3. Merge intelligently (rule-based or Claude-enhanced) 4. Build unified skill (placeholder for Phase 7) Features: ✅ Validates unified config on startup ✅ Backward compatible with legacy configs ✅ Source-specific routing (documentation/github/pdf) ✅ Automatic conflict detection when needed ✅ Merge mode selection (rule-based/claude-enhanced) ✅ Creates organized output structure ✅ Comprehensive logging for each phase ✅ Error handling and graceful failures CLI Usage: - python3 cli/unified_scraper.py --config configs/godot_unified.json - python3 cli/unified_scraper.py -c configs/react_unified.json -m claude-enhanced Output Structure: - output/{name}/ - Final skill directory - output/{name}_unified_data/ - Intermediate data files * documentation_data.json * github_data.json * conflicts.json * merged_data.json Next: Phase 7 - Skill builder to generate final SKILL.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-26 15:32:23 +03:00
parent e7ec923d47
commit f03f4cf569
1 changed files with 433 additions and 0 deletions
--- a/cli/unified_scraper.py
+++ b/cli/unified_scraper.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python3
+"""
+Unified Multi-Source Scraper
+
+Orchestrates scraping from multiple sources (documentation, GitHub, PDF),
+detects conflicts, merges intelligently, and builds unified skills.
+
+This is the main entry point for unified config workflow.
+
+Usage:
+    python3 cli/unified_scraper.py --config configs/godot_unified.json
+    python3 cli/unified_scraper.py --config configs/react_unified.json --merge-mode claude-enhanced
+"""
+
+import os
+import sys
+import json
+import logging
+import argparse
+from pathlib import Path
+from typing import Dict, List, Any, Optional
+
+# Import validators and scrapers
+try:
+    from config_validator import ConfigValidator, validate_config
+    from conflict_detector import ConflictDetector
+    from merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
+except ImportError as e:
+    print(f"Error importing modules: {e}")
+    print("Make sure you're running from the project root directory")
+    sys.exit(1)
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+class UnifiedScraper:
+    """
+    Orchestrates multi-source scraping and merging.
+
+    Main workflow:
+    1. Load and validate unified config
+    2. Scrape all sources (docs, GitHub, PDF)
+    3. Detect conflicts between sources
+    4. Merge intelligently (rule-based or Claude-enhanced)
+    5. Build unified skill
+    """
+
+    def __init__(self, config_path: str, merge_mode: Optional[str] = None):
+        """
+        Initialize unified scraper.
+
+        Args:
+            config_path: Path to unified config JSON
+            merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced')
+        """
+        self.config_path = config_path
+
+        # Validate and load config
+        logger.info(f"Loading config: {config_path}")
+        self.validator = validate_config(config_path)
+        self.config = self.validator.config
+
+        # Determine merge mode
+        self.merge_mode = merge_mode or self.config.get('merge_mode', 'rule-based')
+        logger.info(f"Merge mode: {self.merge_mode}")
+
+        # Storage for scraped data
+        self.scraped_data = {}
+
+        # Output paths
+        self.name = self.config['name']
+        self.output_dir = f"output/{self.name}"
+        self.data_dir = f"output/{self.name}_unified_data"
+
+        os.makedirs(self.output_dir, exist_ok=True)
+        os.makedirs(self.data_dir, exist_ok=True)
+
+    def scrape_all_sources(self):
+        """
+        Scrape all configured sources.
+
+        Routes to appropriate scraper based on source type.
+        """
+        logger.info("=" * 60)
+        logger.info("PHASE 1: Scraping all sources")
+        logger.info("=" * 60)
+
+        if not self.validator.is_unified:
+            logger.warning("Config is not unified format, converting...")
+            self.config = self.validator.convert_legacy_to_unified()
+
+        sources = self.config.get('sources', [])
+
+        for i, source in enumerate(sources):
+            source_type = source['type']
+            logger.info(f"\n[{i+1}/{len(sources)}] Scraping {source_type} source...")
+
+            try:
+                if source_type == 'documentation':
+                    self._scrape_documentation(source)
+                elif source_type == 'github':
+                    self._scrape_github(source)
+                elif source_type == 'pdf':
+                    self._scrape_pdf(source)
+                else:
+                    logger.warning(f"Unknown source type: {source_type}")
+            except Exception as e:
+                logger.error(f"Error scraping {source_type}: {e}")
+                logger.info("Continuing with other sources...")
+
+        logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
+
+    def _scrape_documentation(self, source: Dict[str, Any]):
+        """Scrape documentation website."""
+        # Import doc scraper
+        sys.path.insert(0, str(Path(__file__).parent))
+
+        try:
+            from doc_scraper import scrape_all, save_data
+        except ImportError:
+            logger.error("doc_scraper.py not found")
+            return
+
+        # Create temporary config for doc scraper
+        doc_config = {
+            'name': f"{self.name}_docs",
+            'base_url': source['base_url'],
+            'selectors': source.get('selectors', {}),
+            'url_patterns': source.get('url_patterns', {}),
+            'categories': source.get('categories', {}),
+            'rate_limit': source.get('rate_limit', 0.5),
+            'max_pages': source.get('max_pages', 100)
+        }
+
+        # Scrape
+        logger.info(f"Scraping documentation from {source['base_url']}")
+        pages = scrape_all(doc_config)
+
+        # Save data
+        docs_data_file = os.path.join(self.data_dir, 'documentation_data.json')
+        save_data(pages, docs_data_file, doc_config)
+
+        self.scraped_data['documentation'] = {
+            'pages': pages,
+            'data_file': docs_data_file
+        }
+
+        logger.info(f"✅ Documentation: {len(pages)} pages scraped")
+
+    def _scrape_github(self, source: Dict[str, Any]):
+        """Scrape GitHub repository."""
+        sys.path.insert(0, str(Path(__file__).parent))
+
+        try:
+            from github_scraper import GitHubScraper
+        except ImportError:
+            logger.error("github_scraper.py not found")
+            return
+
+        # Create config for GitHub scraper
+        github_config = {
+            'repo': source['repo'],
+            'name': f"{self.name}_github",
+            'github_token': source.get('github_token'),
+            'include_issues': source.get('include_issues', True),
+            'max_issues': source.get('max_issues', 100),
+            'include_changelog': source.get('include_changelog', True),
+            'include_releases': source.get('include_releases', True),
+            'include_code': source.get('include_code', True),
+            'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
+            'file_patterns': source.get('file_patterns', [])
+        }
+
+        # Scrape
+        logger.info(f"Scraping GitHub repository: {source['repo']}")
+        scraper = GitHubScraper(github_config)
+        github_data = scraper.scrape()
+
+        # Save data
+        github_data_file = os.path.join(self.data_dir, 'github_data.json')
+        with open(github_data_file, 'w') as f:
+            json.dump(github_data, f, indent=2, ensure_ascii=False)
+
+        self.scraped_data['github'] = {
+            'data': github_data,
+            'data_file': github_data_file
+        }
+
+        logger.info(f"✅ GitHub: Repository scraped successfully")
+
+    def _scrape_pdf(self, source: Dict[str, Any]):
+        """Scrape PDF document."""
+        sys.path.insert(0, str(Path(__file__).parent))
+
+        try:
+            from pdf_scraper import PDFToSkillConverter
+        except ImportError:
+            logger.error("pdf_scraper.py not found")
+            return
+
+        # Create config for PDF scraper
+        pdf_config = {
+            'name': f"{self.name}_pdf",
+            'pdf': source['path'],
+            'extract_tables': source.get('extract_tables', False),
+            'ocr': source.get('ocr', False),
+            'password': source.get('password')
+        }
+
+        # Scrape
+        logger.info(f"Scraping PDF: {source['path']}")
+        converter = PDFToSkillConverter(pdf_config)
+        pdf_data = converter.extract_all()
+
+        # Save data
+        pdf_data_file = os.path.join(self.data_dir, 'pdf_data.json')
+        with open(pdf_data_file, 'w') as f:
+            json.dump(pdf_data, f, indent=2, ensure_ascii=False)
+
+        self.scraped_data['pdf'] = {
+            'data': pdf_data,
+            'data_file': pdf_data_file
+        }
+
+        logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
+
+    def detect_conflicts(self) -> List:
+        """
+        Detect conflicts between documentation and code.
+
+        Only applicable if both documentation and GitHub sources exist.
+
+        Returns:
+            List of conflicts
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("PHASE 2: Detecting conflicts")
+        logger.info("=" * 60)
+
+        if not self.validator.needs_api_merge():
+            logger.info("No API merge needed (only one API source)")
+            return []
+
+        # Get documentation and GitHub data
+        docs_data = self.scraped_data.get('documentation', {})
+        github_data = self.scraped_data.get('github', {})
+
+        if not docs_data or not github_data:
+            logger.warning("Missing documentation or GitHub data for conflict detection")
+            return []
+
+        # Load data files
+        with open(docs_data['data_file'], 'r') as f:
+            docs_json = json.load(f)
+
+        with open(github_data['data_file'], 'r') as f:
+            github_json = json.load(f)
+
+        # Detect conflicts
+        detector = ConflictDetector(docs_json, github_json)
+        conflicts = detector.detect_all_conflicts()
+
+        # Save conflicts
+        conflicts_file = os.path.join(self.data_dir, 'conflicts.json')
+        detector.save_conflicts(conflicts, conflicts_file)
+
+        # Print summary
+        summary = detector.generate_summary(conflicts)
+        logger.info(f"\n📊 Conflict Summary:")
+        logger.info(f"   Total: {summary['total']}")
+        logger.info(f"   By Type:")
+        for ctype, count in summary['by_type'].items():
+            if count > 0:
+                logger.info(f"     - {ctype}: {count}")
+        logger.info(f"   By Severity:")
+        for severity, count in summary['by_severity'].items():
+            if count > 0:
+                emoji = '🔴' if severity == 'high' else '🟡' if severity == 'medium' else '🟢'
+                logger.info(f"     {emoji} {severity}: {count}")
+
+        return conflicts
+
+    def merge_sources(self, conflicts: List):
+        """
+        Merge data from multiple sources.
+
+        Args:
+            conflicts: List of detected conflicts
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info(f"PHASE 3: Merging sources ({self.merge_mode})")
+        logger.info("=" * 60)
+
+        if not conflicts:
+            logger.info("No conflicts to merge")
+            return None
+
+        # Get data files
+        docs_data = self.scraped_data.get('documentation', {})
+        github_data = self.scraped_data.get('github', {})
+
+        # Load data
+        with open(docs_data['data_file'], 'r') as f:
+            docs_json = json.load(f)
+
+        with open(github_data['data_file'], 'r') as f:
+            github_json = json.load(f)
+
+        # Choose merger
+        if self.merge_mode == 'claude-enhanced':
+            merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
+        else:
+            merger = RuleBasedMerger(docs_json, github_json, conflicts)
+
+        # Merge
+        merged_data = merger.merge_all()
+
+        # Save merged data
+        merged_file = os.path.join(self.data_dir, 'merged_data.json')
+        with open(merged_file, 'w') as f:
+            json.dump(merged_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"✅ Merged data saved: {merged_file}")
+
+        return merged_data
+
+    def build_skill(self, merged_data: Optional[Dict] = None):
+        """
+        Build final unified skill.
+
+        Args:
+            merged_data: Merged API data (if conflicts were resolved)
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("PHASE 4: Building unified skill")
+        logger.info("=" * 60)
+
+        # This will be implemented in Phase 7
+        logger.info("Skill building to be implemented in Phase 7")
+        logger.info(f"Output directory: {self.output_dir}")
+        logger.info(f"Data directory: {self.data_dir}")
+
+        # For now, just create a placeholder
+        skill_file = os.path.join(self.output_dir, 'SKILL.md')
+        with open(skill_file, 'w') as f:
+            f.write(f"# {self.config['name'].title()}\n\n")
+            f.write(f"{self.config['description']}\n\n")
+            f.write("## Sources\n\n")
+
+            for source in self.config.get('sources', []):
+                f.write(f"- {source['type']}\n")
+
+            f.write("\n*Skill building in progress...*\n")
+
+        logger.info(f"✅ Placeholder skill created: {skill_file}")
+
+    def run(self):
+        """
+        Execute complete unified scraping workflow.
+        """
+        logger.info("\n" + "🚀 " * 20)
+        logger.info(f"Unified Scraper: {self.config['name']}")
+        logger.info("🚀 " * 20 + "\n")
+
+        try:
+            # Phase 1: Scrape all sources
+            self.scrape_all_sources()
+
+            # Phase 2: Detect conflicts (if applicable)
+            conflicts = self.detect_conflicts()
+
+            # Phase 3: Merge sources (if conflicts exist)
+            merged_data = None
+            if conflicts:
+                merged_data = self.merge_sources(conflicts)
+
+            # Phase 4: Build skill
+            self.build_skill(merged_data)
+
+            logger.info("\n" + "✅ " * 20)
+            logger.info("Unified scraping complete!")
+            logger.info("✅ " * 20 + "\n")
+
+            logger.info(f"📁 Output: {self.output_dir}/")
+            logger.info(f"📁 Data: {self.data_dir}/")
+
+        except KeyboardInterrupt:
+            logger.info("\n\n⚠️  Scraping interrupted by user")
+            sys.exit(1)
+        except Exception as e:
+            logger.error(f"\n\n❌ Error during scraping: {e}")
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description='Unified multi-source scraper',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Basic usage with unified config
+  python3 cli/unified_scraper.py --config configs/godot_unified.json
+
+  # Override merge mode
+  python3 cli/unified_scraper.py --config configs/react_unified.json --merge-mode claude-enhanced
+
+  # Backward compatible with legacy configs
+  python3 cli/unified_scraper.py --config configs/react.json
+        """
+    )
+
+    parser.add_argument('--config', '-c', required=True,
+                       help='Path to unified config JSON file')
+    parser.add_argument('--merge-mode', '-m',
+                       choices=['rule-based', 'claude-enhanced'],
+                       help='Override config merge mode')
+
+    args = parser.parse_args()
+
+    # Create and run scraper
+    scraper = UnifiedScraper(args.config, args.merge_mode)
+    scraper.run()
+
+
+if __name__ == '__main__':
+    main()