#!/usr/bin/env python3
"""
Source Merger for Multi-Source Skills

Merges documentation and code data intelligently with GitHub insights:
- Rule-based merge: Fast, deterministic rules
- Claude-enhanced merge: AI-powered reconciliation

Handles conflicts and creates unified API reference with GitHub metadata.

Multi-layer architecture (Phase 3):
- Layer 1: C3.x code (ground truth)
- Layer 2: HTML docs (official intent)
- Layer 3: GitHub docs (README/CONTRIBUTING)
- Layer 4: GitHub insights (issues)
"""
import json
import logging
import os
import subprocess
import tempfile
import time
from typing import Any, Optional

from .conflict_detector import Conflict, ConflictDetector

# Import three-stream data classes (Phase 1)
try:
    from .github_fetcher import CodeStream, DocsStream, InsightsStream, ThreeStreamData
except ImportError:
    # Fallback if github_fetcher not available
    ThreeStreamData = None
    CodeStream = None
    DocsStream = None
    InsightsStream = None

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def categorize_issues_by_topic(
    problems: list[dict], solutions: list[dict], topics: list[str]
) -> dict[str, list[dict]]:
    """
    Categorize GitHub issues by topic keywords.

    Each issue is assigned to the single topic whose keywords match its
    title/labels text the most; unmatched issues fall into "other".

    Args:
        problems: List of common problems (open issues with 5+ comments)
        solutions: List of known solutions (closed issues with comments)
        topics: List of topic keywords to match against

    Returns:
        Dict mapping topic to relevant issues (empty categories removed)
    """
    categorized: dict[str, list[dict]] = {topic: [] for topic in topics}
    categorized["other"] = []

    all_issues = problems + solutions

    for issue in all_issues:
        # Build one lowercase searchable string from title + labels
        title = issue.get("title", "").lower()
        labels = [label.lower() for label in issue.get("labels", [])]
        text = f"{title} {' '.join(labels)}"

        # Find best matching topic (most keyword hits wins; ties keep the first)
        matched_topic = None
        max_matches = 0
        for topic in topics:
            topic_keywords = topic.lower().split()
            matches = sum(1 for keyword in topic_keywords if keyword in text)
            if matches > max_matches:
                max_matches = matches
                matched_topic = topic

        # Categorize by best match or 'other'
        if matched_topic and max_matches > 0:
            categorized[matched_topic].append(issue)
        else:
            categorized["other"].append(issue)

    # Remove empty categories
    return {k: v for k, v in categorized.items() if v}


def generate_hybrid_content(
    api_data: dict,
    github_docs: dict | None,
    github_insights: dict | None,
    conflicts: list[Conflict],
) -> dict[str, Any]:
    """
    Generate hybrid content combining API data with GitHub context.

    Args:
        api_data: Merged API data
        github_docs: GitHub docs stream (README, CONTRIBUTING, docs/*.md)
        github_insights: GitHub insights stream (metadata, issues, labels)
        conflicts: List of detected conflicts

    Returns:
        Hybrid content dict with enriched API reference
    """
    hybrid: dict[str, Any] = {"api_reference": api_data, "github_context": {}}

    # Add GitHub documentation layer (Layer 3)
    if github_docs:
        hybrid["github_context"]["docs"] = {
            "readme": github_docs.get("readme"),
            "contributing": github_docs.get("contributing"),
            "docs_files_count": len(github_docs.get("docs_files", [])),
        }

    # Add GitHub insights layer (Layer 4)
    if github_insights:
        metadata = github_insights.get("metadata", {})
        hybrid["github_context"]["metadata"] = {
            "stars": metadata.get("stars", 0),
            "forks": metadata.get("forks", 0),
            "language": metadata.get("language", "Unknown"),
            "description": metadata.get("description", ""),
        }

        # Add issue insights
        common_problems = github_insights.get("common_problems", [])
        known_solutions = github_insights.get("known_solutions", [])
        hybrid["github_context"]["issues"] = {
            "common_problems_count": len(common_problems),
            "known_solutions_count": len(known_solutions),
            "top_problems": common_problems[:5],  # Top 5 most-discussed
            "top_solutions": known_solutions[:5],
        }
        hybrid["github_context"]["top_labels"] = github_insights.get("top_labels", [])

    # Add conflict summary (counts by type and severity)
    hybrid["conflict_summary"] = {
        "total_conflicts": len(conflicts),
        "by_type": {},
        "by_severity": {},
    }
    for conflict in conflicts:
        # Count by type
        conflict_type = conflict.type
        hybrid["conflict_summary"]["by_type"][conflict_type] = (
            hybrid["conflict_summary"]["by_type"].get(conflict_type, 0) + 1
        )
        # Count by severity
        severity = conflict.severity
        hybrid["conflict_summary"]["by_severity"][severity] = (
            hybrid["conflict_summary"]["by_severity"].get(severity, 0) + 1
        )

    # Add GitHub issue links for relevant APIs
    if github_insights:
        hybrid["issue_links"] = _match_issues_to_apis(
            api_data.get("apis", {}),
            github_insights.get("common_problems", []),
            github_insights.get("known_solutions", []),
        )

    return hybrid


def _match_issues_to_apis(
    apis: dict[str, dict], problems: list[dict], solutions: list[dict]
) -> dict[str, list[dict]]:
    """
    Match GitHub issues to specific APIs by keyword matching.

    Args:
        apis: Dict of API data keyed by name
        problems: List of common problems
        solutions: List of known solutions

    Returns:
        Dict mapping API names to relevant issues
    """
    issue_links: dict[str, list[dict]] = {}
    all_issues = problems + solutions

    for api_name in apis:
        # Extract searchable keywords from API name
        # (underscores become spaces, then split on dots)
        api_keywords = api_name.lower().replace("_", " ").split(".")

        matched_issues = []
        for issue in all_issues:
            title = issue.get("title", "").lower()
            labels = [label.lower() for label in issue.get("labels", [])]
            text = f"{title} {' '.join(labels)}"

            # Check if any API keyword appears in issue title/labels
            if any(keyword in text for keyword in api_keywords):
                matched_issues.append(
                    {
                        "number": issue.get("number"),
                        "title": issue.get("title"),
                        "state": issue.get("state"),
                        "comments": issue.get("comments"),
                    }
                )

        if matched_issues:
            issue_links[api_name] = matched_issues

    return issue_links


class RuleBasedMerger:
    """
    Rule-based API merger using deterministic rules with GitHub insights.

    Multi-layer architecture (Phase 3):
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING)
    - Layer 4: GitHub insights (issues)

    Rules:
    1. If API only in docs → Include with [DOCS_ONLY] tag
    2. If API only in code → Include with [UNDOCUMENTED] tag
    3. If both match perfectly → Include normally
    4. If conflict → Include both versions with [CONFLICT] tag, prefer code signature
    """

    def __init__(
        self,
        docs_data: dict,
        github_data: dict,
        conflicts: list[Conflict],
        github_streams: Optional["ThreeStreamData"] = None,
    ):
        """
        Initialize rule-based merger with GitHub streams support.

        Args:
            docs_data: Documentation scraper data (Layer 2: HTML docs)
            github_data: GitHub scraper data (Layer 1: C3.x code)
            conflicts: List of detected conflicts
            github_streams: Optional ThreeStreamData with docs and insights (Layers 3-4)
        """
        self.docs_data = docs_data
        self.github_data = github_data
        self.conflicts = conflicts
        self.github_streams = github_streams

        # Build conflict index for fast lookup
        self.conflict_index = {c.api_name: c for c in conflicts}

        # Extract APIs from both sources
        detector = ConflictDetector(docs_data, github_data)
        self.docs_apis = detector.docs_apis
        self.code_apis = detector.code_apis

        # Extract GitHub streams if available; either stream may be missing,
        # in which case the corresponding attribute stays None.
        self.github_docs = None
        self.github_insights = None

        if github_streams:
            # Layer 3: GitHub docs
            if github_streams.docs_stream:
                self.github_docs = {
                    "readme": github_streams.docs_stream.readme,
                    "contributing": github_streams.docs_stream.contributing,
                    "docs_files": github_streams.docs_stream.docs_files,
                }
            # Layer 4: GitHub insights
            if github_streams.insights_stream:
                self.github_insights = {
                    "metadata": github_streams.insights_stream.metadata,
                    "common_problems": github_streams.insights_stream.common_problems,
                    "known_solutions": github_streams.insights_stream.known_solutions,
                    "top_labels": github_streams.insights_stream.top_labels,
                }

    def merge_all(self) -> dict[str, Any]:
        """
        Merge all APIs using rule-based logic with GitHub insights (Phase 3).

        Returns:
            Dict containing merged API data with hybrid content
        """
        logger.info("Starting rule-based merge with GitHub streams...")

        merged_apis = {}

        # Get all unique API names from both sources
        all_api_names = set(self.docs_apis.keys()) | set(self.code_apis.keys())

        for api_name in sorted(all_api_names):
            merged_api = self._merge_single_api(api_name)
            merged_apis[api_name] = merged_api

        logger.info(f"Merged {len(merged_apis)} APIs")

        # Build base result
        merged_data = {
            "merge_mode": "rule-based",
            "apis": merged_apis,
            "summary": {
                "total_apis": len(merged_apis),
                "docs_only": sum(1 for api in merged_apis.values() if api["status"] == "docs_only"),
                "code_only": sum(1 for api in merged_apis.values() if api["status"] == "code_only"),
                "matched": sum(1 for api in merged_apis.values() if api["status"] == "matched"),
                "conflict": sum(1 for api in merged_apis.values() if api["status"] == "conflict"),
            },
        }

        # Generate hybrid content if GitHub streams available (Phase 3)
        if self.github_streams:
            logger.info("Generating hybrid content with GitHub insights...")
            hybrid_content = generate_hybrid_content(
                api_data=merged_data,
                github_docs=self.github_docs,
                github_insights=self.github_insights,
                conflicts=self.conflicts,
            )

            # Merge hybrid content into result
            merged_data["github_context"] = hybrid_content.get("github_context", {})
            merged_data["conflict_summary"] = hybrid_content.get("conflict_summary", {})
            merged_data["issue_links"] = hybrid_content.get("issue_links", {})

            # BUGFIX: github_insights is None when streams exist but the
            # insights stream is absent — guard before dereferencing it.
            if self.github_insights:
                logger.info(
                    f"Added GitHub context: {len(self.github_insights.get('common_problems', []))} problems, "
                    f"{len(self.github_insights.get('known_solutions', []))} solutions"
                )

        return merged_data

    def _merge_single_api(self, api_name: str) -> dict[str, Any]:
        """
        Merge a single API using rules.

        Args:
            api_name: Name of the API to merge

        Returns:
            Merged API dict
        """
        in_docs = api_name in self.docs_apis
        in_code = api_name in self.code_apis
        has_conflict = api_name in self.conflict_index

        # Rule 1: Only in docs
        if in_docs and not in_code:
            conflict = self.conflict_index.get(api_name)
            return {
                "name": api_name,
                "status": "docs_only",
                "source": "documentation",
                "data": self.docs_apis[api_name],
                "warning": "This API is documented but not found in codebase",
                "conflict": conflict.__dict__ if conflict else None,
            }

        # Rule 2: Only in code
        if in_code and not in_docs:
            # Leading underscore marks internal/private APIs — different warning
            is_private = api_name.startswith("_")
            conflict = self.conflict_index.get(api_name)
            return {
                "name": api_name,
                "status": "code_only",
                "source": "code",
                "data": self.code_apis[api_name],
                "warning": "This API exists in code but is not documented"
                if not is_private
                else "Internal/private API",
                "conflict": conflict.__dict__ if conflict else None,
            }

        # Both exist - check for conflicts
        docs_info = self.docs_apis[api_name]
        code_info = self.code_apis[api_name]

        # Rule 3: Both match perfectly (no conflict)
        if not has_conflict:
            return {
                "name": api_name,
                "status": "matched",
                "source": "both",
                "docs_data": docs_info,
                "code_data": code_info,
                "merged_signature": self._create_merged_signature(code_info, docs_info),
                "merged_description": docs_info.get("docstring") or code_info.get("docstring"),
            }

        # Rule 4: Conflict exists - prefer code signature, keep docs description
        conflict = self.conflict_index[api_name]
        return {
            "name": api_name,
            "status": "conflict",
            "source": "both",
            "docs_data": docs_info,
            "code_data": code_info,
            "conflict": conflict.__dict__,
            "resolution": "prefer_code_signature",
            "merged_signature": self._create_merged_signature(code_info, docs_info),
            "merged_description": docs_info.get("docstring") or code_info.get("docstring"),
            "warning": conflict.difference,
        }

    def _create_merged_signature(self, code_info: dict, docs_info: dict) -> str:
        """
        Create merged signature preferring code data.

        Args:
            code_info: API info from code
            docs_info: API info from docs

        Returns:
            Merged signature string, e.g. "name(a: int = 0) -> str"
        """
        name = code_info.get("name", docs_info.get("name"))
        params = code_info.get("parameters", docs_info.get("parameters", []))
        return_type = code_info.get("return_type", docs_info.get("return_type"))

        # Build parameter string
        param_strs = []
        for param in params:
            param_str = param["name"]
            if param.get("type_hint"):
                param_str += f": {param['type_hint']}"
            # BUGFIX: check "is not None" so falsy defaults (0, False, "")
            # are still rendered in the signature.
            if param.get("default") is not None:
                param_str += f" = {param['default']}"
            param_strs.append(param_str)

        signature = f"{name}({', '.join(param_strs)})"
        if return_type:
            signature += f" -> {return_type}"

        return signature


class ClaudeEnhancedMerger:
    """
    Claude-enhanced API merger using local Claude Code with GitHub insights.

    Opens Claude Code in a new terminal to intelligently reconcile conflicts.
    Uses the same approach as enhance_skill_local.py.

    Multi-layer architecture (Phase 3):
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING)
    - Layer 4: GitHub insights (issues)
    """

    def __init__(
        self,
        docs_data: dict,
        github_data: dict,
        conflicts: list[Conflict],
        github_streams: Optional["ThreeStreamData"] = None,
    ):
        """
        Initialize Claude-enhanced merger with GitHub streams support.

        Args:
            docs_data: Documentation scraper data (Layer 2: HTML docs)
            github_data: GitHub scraper data (Layer 1: C3.x code)
            conflicts: List of detected conflicts
            github_streams: Optional ThreeStreamData with docs and insights (Layers 3-4)
        """
        self.docs_data = docs_data
        self.github_data = github_data
        self.conflicts = conflicts
        self.github_streams = github_streams

        # First do rule-based merge as baseline (also serves as fallback)
        self.rule_merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)

    def merge_all(self) -> dict[str, Any]:
        """
        Merge all APIs using Claude enhancement.

        Falls back to the rule-based merge if the Claude workflow fails.

        Returns:
            Dict containing merged API data
        """
        logger.info("Starting Claude-enhanced merge...")

        # Create temporary workspace
        workspace_dir = self._create_workspace()

        # Launch Claude Code for enhancement
        logger.info("Launching Claude Code for intelligent merging...")
        logger.info("Claude will analyze conflicts and create reconciled API reference")

        try:
            self._launch_claude_merge(workspace_dir)

            # Read enhanced results
            merged_data = self._read_merged_results(workspace_dir)

            logger.info("Claude-enhanced merge complete")
            return merged_data

        except Exception as e:
            logger.error(f"Claude enhancement failed: {e}")
            logger.info("Falling back to rule-based merge")
            return self.rule_merger.merge_all()

    def _create_workspace(self) -> str:
        """
        Create temporary workspace with merge context.

        Returns:
            Path to workspace directory
        """
        workspace = tempfile.mkdtemp(prefix="skill_merge_")
        logger.info(f"Created merge workspace: {workspace}")

        # Write context files for Claude
        self._write_context_files(workspace)

        return workspace

    def _write_context_files(self, workspace: str):
        """Write context files (conflicts, APIs, instructions) for Claude to analyze."""
        # 1. Write conflicts summary
        conflicts_file = os.path.join(workspace, "conflicts.json")
        with open(conflicts_file, "w") as f:
            json.dump(
                {
                    "conflicts": [c.__dict__ for c in self.conflicts],
                    "summary": {
                        "total": len(self.conflicts),
                        "by_type": self._count_by_field("type"),
                        "by_severity": self._count_by_field("severity"),
                    },
                },
                f,
                indent=2,
            )

        # 2. Write documentation APIs
        docs_apis_file = os.path.join(workspace, "docs_apis.json")
        detector = ConflictDetector(self.docs_data, self.github_data)
        with open(docs_apis_file, "w") as f:
            json.dump(detector.docs_apis, f, indent=2)

        # 3. Write code APIs
        code_apis_file = os.path.join(workspace, "code_apis.json")
        with open(code_apis_file, "w") as f:
            json.dump(detector.code_apis, f, indent=2)

        # 4. Write merge instructions for Claude
        instructions = """# API Merge Task

You are merging API documentation from two sources:
1. Official documentation (user-facing)
2. Source code analysis (implementation reality)

## Context Files:
- `conflicts.json` - All detected conflicts between sources
- `docs_apis.json` - APIs from documentation
- `code_apis.json` - APIs from source code

## Your Task:

For each conflict, reconcile the differences intelligently:

1. **Prefer code signatures as source of truth**
   - Use actual parameter names, types, defaults from code
   - Code is what actually runs, docs might be outdated

2. **Keep documentation descriptions**
   - Docs are user-friendly, code comments might be technical
   - Keep the docs' explanation of what the API does

3. **Add implementation notes for discrepancies**
   - If docs differ from code, explain the difference
   - Example: "⚠️ The `snap` parameter exists in code but is not documented"

4. **Flag missing APIs clearly**
   - Missing in docs → Add [UNDOCUMENTED] tag
   - Missing in code → Add [REMOVED] or [DOCS_ERROR] tag

5. **Create unified API reference**
   - One definitive signature per API
   - Clear warnings about conflicts
   - Implementation notes where helpful

## Output Format:

Create `merged_apis.json` with this structure:

```json
{
  "apis": {
    "API.name": {
      "signature": "final_signature_here",
      "parameters": [...],
      "return_type": "type",
      "description": "user-friendly description",
      "implementation_notes": "Any discrepancies or warnings",
      "source": "both|docs_only|code_only",
      "confidence": "high|medium|low"
    }
  }
}
```

Take your time to analyze each conflict carefully.
The goal is to create the most accurate and helpful API reference possible.
"""
        instructions_file = os.path.join(workspace, "MERGE_INSTRUCTIONS.md")
        with open(instructions_file, "w") as f:
            f.write(instructions)

        logger.info(f"Wrote context files to {workspace}")

    def _count_by_field(self, field: str) -> dict[str, int]:
        """Count conflicts by a specific field (e.g. "type" or "severity")."""
        counts: dict[str, int] = {}
        for conflict in self.conflicts:
            value = getattr(conflict, field)
            counts[value] = counts.get(value, 0) + 1
        return counts

    def _launch_claude_merge(self, workspace: str):
        """
        Launch Claude Code to perform merge.

        Similar to enhance_skill_local.py approach.

        Raises:
            RuntimeError: If no supported terminal emulator is found.
            TimeoutError: If merged_apis.json does not appear within 1 hour.
        """
        # Create a script that Claude will execute
        script_path = os.path.join(workspace, "merge_script.sh")

        script_content = f"""#!/bin/bash
# Automatic merge script for Claude Code

cd "{workspace}"

echo "📊 Analyzing conflicts..."
cat conflicts.json | head -20

echo ""
echo "📖 Documentation APIs: $(cat docs_apis.json | grep -c '\\"name\\"')"
echo "💻 Code APIs: $(cat code_apis.json | grep -c '\\"name\\"')"
echo ""
echo "Please review the conflicts and create merged_apis.json"
echo "Follow the instructions in MERGE_INSTRUCTIONS.md"
echo ""
echo "When done, save merged_apis.json and close this terminal."

# Wait for user to complete merge
read -p "Press Enter when merge is complete..."
"""

        with open(script_path, "w") as f:
            f.write(script_content)
        os.chmod(script_path, 0o755)

        # Open new terminal with Claude Code.
        # Try different terminal emulators until one is found.
        terminals = [
            ["x-terminal-emulator", "-e"],
            ["gnome-terminal", "--"],
            ["xterm", "-e"],
            ["konsole", "-e"],
        ]

        for terminal_cmd in terminals:
            try:
                cmd = terminal_cmd + ["bash", script_path]
                subprocess.Popen(cmd)
                logger.info(f"Opened terminal with {terminal_cmd[0]}")
                break
            except FileNotFoundError:
                continue
        else:
            # BUGFIX: previously fell through to a 1-hour poll with no
            # terminal ever opened; fail fast so merge_all() can fall back.
            raise RuntimeError("No supported terminal emulator found")

        # Wait for merge to complete
        merged_file = os.path.join(workspace, "merged_apis.json")
        logger.info(f"Waiting for merged results at: {merged_file}")
        logger.info("Close the terminal when done to continue...")

        # Poll for file existence
        timeout = 3600  # 1 hour max
        elapsed = 0
        while not os.path.exists(merged_file) and elapsed < timeout:
            time.sleep(5)
            elapsed += 5

        if not os.path.exists(merged_file):
            raise TimeoutError("Claude merge timed out after 1 hour")

    def _read_merged_results(self, workspace: str) -> dict[str, Any]:
        """Read merged results from workspace, tagging them as claude-enhanced."""
        merged_file = os.path.join(workspace, "merged_apis.json")

        if not os.path.exists(merged_file):
            raise FileNotFoundError(f"Merged results not found: {merged_file}")

        with open(merged_file) as f:
            merged_data = json.load(f)

        return {"merge_mode": "claude-enhanced", **merged_data}


def merge_sources(
    docs_data_path: str,
    github_data_path: str,
    output_path: str,
    mode: str = "rule-based",
    github_streams: Optional["ThreeStreamData"] = None,
) -> dict[str, Any]:
    """
    Merge documentation and GitHub data with optional GitHub streams (Phase 3).

    Multi-layer architecture:
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING) - from github_streams
    - Layer 4: GitHub insights (issues) - from github_streams

    Args:
        docs_data_path: Path to documentation data JSON
        github_data_path: Path to GitHub data JSON
        output_path: Path to save merged output
        mode: 'rule-based' or 'claude-enhanced'
        github_streams: Optional ThreeStreamData with docs and insights

    Returns:
        Merged data dict with hybrid content
    """
    # Load data
    with open(docs_data_path) as f:
        docs_data = json.load(f)
    with open(github_data_path) as f:
        github_data = json.load(f)

    # Detect conflicts
    detector = ConflictDetector(docs_data, github_data)
    conflicts = detector.detect_all_conflicts()
    logger.info(f"Detected {len(conflicts)} conflicts")

    # Log GitHub streams availability
    if github_streams:
        logger.info("GitHub streams available for multi-layer merge")
        if github_streams.docs_stream:
            logger.info(
                f"  - Docs stream: README, {len(github_streams.docs_stream.docs_files)} docs files"
            )
        if github_streams.insights_stream:
            problems = len(github_streams.insights_stream.common_problems)
            solutions = len(github_streams.insights_stream.known_solutions)
            logger.info(f"  - Insights stream: {problems} problems, {solutions} solutions")

    # Merge based on mode
    if mode == "claude-enhanced":
        merger = ClaudeEnhancedMerger(docs_data, github_data, conflicts, github_streams)
    else:
        merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)

    merged_data = merger.merge_all()

    # Save merged data
    with open(output_path, "w") as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Merged data saved to: {output_path}")
    return merged_data


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Merge documentation and code sources")
    parser.add_argument("docs_data", help="Path to documentation data JSON")
    parser.add_argument("github_data", help="Path to GitHub data JSON")
    parser.add_argument("--output", "-o", default="merged_data.json", help="Output file path")
    parser.add_argument(
        "--mode",
        "-m",
        choices=["rule-based", "claude-enhanced"],
        default="rule-based",
        help="Merge mode",
    )

    args = parser.parse_args()

    merged = merge_sources(args.docs_data, args.github_data, args.output, args.mode)

    # Print summary
    summary = merged.get("summary", {})
    print(f"\n✅ Merge complete ({merged.get('merge_mode')})")
    print(f"   Total APIs: {summary.get('total_apis', 0)}")
    print(f"   Matched: {summary.get('matched', 0)}")
    print(f"   Docs only: {summary.get('docs_only', 0)}")
    print(f"   Code only: {summary.get('code_only', 0)}")
    print(f"   Conflicts: {summary.get('conflict', 0)}")
    print(f"\n📄 Saved to: {args.output}")