skill-seekers-reference/src/skill_seekers/cli/merge_sources.py

#!/usr/bin/env python3
"""
Source Merger for Multi-Source Skills

Merges documentation and code data intelligently with GitHub insights:
- Rule-based merge: Fast, deterministic rules
- Claude-enhanced merge: AI-powered reconciliation

Handles conflicts and creates unified API reference with GitHub metadata.

Multi-layer architecture (Phase 3):
- Layer 1: C3.x code (ground truth)
- Layer 2: HTML docs (official intent)
- Layer 3: GitHub docs (README/CONTRIBUTING)
- Layer 4: GitHub insights (issues)
"""

import json
import logging
import os
import subprocess
import tempfile
from typing import Any, Optional

from .conflict_detector import Conflict, ConflictDetector

# Import three-stream data classes (Phase 1)
try:
    from .github_fetcher import CodeStream, DocsStream, InsightsStream, ThreeStreamData
except ImportError:
    # Fallback if github_fetcher not available
    ThreeStreamData = None
    CodeStream = None
    DocsStream = None
    InsightsStream = None

# Configure the root logger at INFO level when this module is imported.
# NOTE(review): this runs on plain import as well as on script execution;
# confirm that configuring logging for importing applications is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every function and class in this file.
logger = logging.getLogger(__name__)


def categorize_issues_by_topic(
    problems: list[dict], solutions: list[dict], topics: list[str]
) -> dict[str, list[dict]]:
    """
    Group GitHub issues under the topic whose keywords they mention most.

    Each issue is reduced to a lowercase search string made of its title
    followed by its labels. Every topic is split on whitespace into keywords
    and scored by how many of those keywords occur (as substrings) in the
    search string. The issue goes to the highest-scoring topic; on a tie the
    topic listed first wins. Issues that match no topic go to ``"other"``.

    Args:
        problems: Issue dicts; ``title`` and ``labels`` keys are read if present
        solutions: Issue dicts, processed after ``problems``
        topics: Topic strings; each may contain several space-separated keywords

    Returns:
        Dict mapping topic (or ``"other"``) to its issues; topics that
        received no issues are omitted
    """
    buckets = {name: [] for name in topics}
    buckets["other"] = []

    def hit_count(topic_name: str, haystack: str) -> int:
        # Number of the topic's keywords that appear somewhere in the haystack
        return sum(word in haystack for word in topic_name.lower().split())

    for entry in problems + solutions:
        lowered_title = entry.get("title", "").lower()
        lowered_labels = " ".join(tag.lower() for tag in entry.get("labels", []))
        haystack = f"{lowered_title} {lowered_labels}"

        # Strict ">" keeps the earliest topic when several share the top score
        winner, winner_hits = None, 0
        for candidate in topics:
            hits = hit_count(candidate, haystack)
            if hits > winner_hits:
                winner, winner_hits = candidate, hits

        destination = "other" if winner is None else winner
        buckets[destination].append(entry)

    # Drop topics that ended up with no issues
    return {name: found for name, found in buckets.items() if found}


def generate_hybrid_content(
    api_data: dict,
    github_docs: dict | None,
    github_insights: dict | None,
    conflicts: list[Conflict],
) -> dict[str, Any]:
    """
    Wrap merged API data together with GitHub context and a conflict summary.

    Args:
        api_data: Merged API data; its ``"apis"`` entry is used for issue linking
        github_docs: Optional dict with ``readme``, ``contributing`` and
            ``docs_files`` entries
        github_insights: Optional dict with ``metadata``, ``common_problems``,
            ``known_solutions`` and ``top_labels`` entries
        conflicts: Detected conflicts; each exposes ``type`` and ``severity``

    Returns:
        Dict with ``api_reference``, ``github_context`` and ``conflict_summary``
        keys, plus ``issue_links`` when ``github_insights`` is provided
    """
    context: dict[str, Any] = {}

    # GitHub documentation layer
    if github_docs:
        context["docs"] = {
            "readme": github_docs.get("readme"),
            "contributing": github_docs.get("contributing"),
            "docs_files_count": len(github_docs.get("docs_files", [])),
        }

    # GitHub insights layer: repository metadata, issue digest, labels
    if github_insights:
        repo_meta = github_insights.get("metadata", {})
        context["metadata"] = {
            "stars": repo_meta.get("stars", 0),
            "forks": repo_meta.get("forks", 0),
            "language": repo_meta.get("language", "Unknown"),
            "description": repo_meta.get("description", ""),
        }

        problem_list = github_insights.get("common_problems", [])
        solution_list = github_insights.get("known_solutions", [])
        context["issues"] = {
            "common_problems_count": len(problem_list),
            "known_solutions_count": len(solution_list),
            "top_problems": problem_list[:5],  # first five entries only
            "top_solutions": solution_list[:5],
        }

        context["top_labels"] = github_insights.get("top_labels", [])

    # Tally conflicts per type and per severity
    total = len(conflicts)
    type_tally: dict = {}
    severity_tally: dict = {}
    for item in conflicts:
        kind = item.type
        type_tally[kind] = type_tally.get(kind, 0) + 1
        level = item.severity
        severity_tally[level] = severity_tally.get(level, 0) + 1

    hybrid = {
        "api_reference": api_data,
        "github_context": context,
        "conflict_summary": {
            "total_conflicts": total,
            "by_type": type_tally,
            "by_severity": severity_tally,
        },
    }

    # Link issues to the APIs they mention
    if github_insights:
        hybrid["issue_links"] = _match_issues_to_apis(
            api_data.get("apis", {}),
            github_insights.get("common_problems", []),
            github_insights.get("known_solutions", []),
        )

    return hybrid


def _match_issues_to_apis(
    apis: dict[str, dict], problems: list[dict], solutions: list[dict]
) -> dict[str, list[dict]]:
    """
    Match GitHub issues to specific APIs by keyword matching.

    Args:
        apis: Dict of API data keyed by name
        problems: List of common problems
        solutions: List of known solutions

    Returns:
        Dict mapping API names to relevant issues
    """
    issue_links = {}
    all_issues = problems + solutions

    for api_name in apis:
        # Extract searchable keywords from API name
        api_keywords = api_name.lower().replace("_", " ").split(".")

        matched_issues = []
        for issue in all_issues:
            title = issue.get("title", "").lower()
            labels = [label.lower() for label in issue.get("labels", [])]
            text = f"{title} {' '.join(labels)}"

            # Check if any API keyword appears in issue
            if any(keyword in text for keyword in api_keywords):
                matched_issues.append(
                    {
                        "number": issue.get("number"),
                        "title": issue.get("title"),
                        "state": issue.get("state"),
                        "comments": issue.get("comments"),
                    }
                )

        if matched_issues:
            issue_links[api_name] = matched_issues

    return issue_links


class RuleBasedMerger:
    """
    Rule-based API merger using deterministic rules with GitHub insights.

    Multi-layer architecture (Phase 3):
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING)
    - Layer 4: GitHub insights (issues)

    Rules (the ``status`` value stored on each merged API is given in brackets):
    1. If API only in docs → included with a "not found in codebase" warning [docs_only]
    2. If API only in code → included with an "undocumented"/"private" warning [code_only]
    3. If both exist and no conflict is recorded → included normally [matched]
    4. If a conflict is recorded → both versions kept, code signature preferred [conflict]
    """

    def __init__(
        self,
        docs_data: dict,
        github_data: dict,
        conflicts: list[Conflict],
        github_streams: Optional["ThreeStreamData"] = None,
    ):
        """
        Initialize rule-based merger with GitHub streams support.

        Args:
            docs_data: Documentation scraper data (Layer 2: HTML docs)
            github_data: GitHub scraper data (Layer 1: C3.x code)
            conflicts: List of detected conflicts
            github_streams: Optional ThreeStreamData with docs and insights (Layers 3-4)
        """
        self.docs_data = docs_data
        self.github_data = github_data
        self.conflicts = conflicts
        self.github_streams = github_streams

        # Index conflicts by API name for O(1) lookup. If several conflicts
        # share an api_name, the last one in the list is the one kept.
        self.conflict_index = {c.api_name: c for c in conflicts}

        # Extract APIs from both sources
        detector = ConflictDetector(docs_data, github_data)
        self.docs_apis = detector.docs_apis
        self.code_apis = detector.code_apis

        # Flatten the optional GitHub streams into plain dicts. Either may
        # stay None when the corresponding stream is absent.
        self.github_docs = None
        self.github_insights = None
        if github_streams:
            # Layer 3: GitHub docs
            if github_streams.docs_stream:
                self.github_docs = {
                    "readme": github_streams.docs_stream.readme,
                    "contributing": github_streams.docs_stream.contributing,
                    "docs_files": github_streams.docs_stream.docs_files,
                }

            # Layer 4: GitHub insights
            if github_streams.insights_stream:
                self.github_insights = {
                    "metadata": github_streams.insights_stream.metadata,
                    "common_problems": github_streams.insights_stream.common_problems,
                    "known_solutions": github_streams.insights_stream.known_solutions,
                    "top_labels": github_streams.insights_stream.top_labels,
                }

    def merge_all(self) -> dict[str, Any]:
        """
        Merge all APIs using rule-based logic with GitHub insights (Phase 3).

        Returns:
            Dict with ``merge_mode``, ``apis`` (keyed by API name, in sorted
            order) and ``summary`` counts. When GitHub streams were supplied
            it also carries ``github_context``, ``conflict_summary`` and
            ``issue_links``.
        """
        logger.info("Starting rule-based merge with GitHub streams...")

        # Union of names from both sources, merged in sorted order so the
        # output is deterministic
        all_api_names = set(self.docs_apis.keys()) | set(self.code_apis.keys())
        merged_apis = {
            api_name: self._merge_single_api(api_name) for api_name in sorted(all_api_names)
        }

        logger.info(f"Merged {len(merged_apis)} APIs")

        # Count statuses in a single pass
        status_counts = dict.fromkeys(("docs_only", "code_only", "matched", "conflict"), 0)
        for api in merged_apis.values():
            status = api["status"]
            if status in status_counts:
                status_counts[status] += 1

        merged_data = {
            "merge_mode": "rule-based",
            "apis": merged_apis,
            "summary": {"total_apis": len(merged_apis), **status_counts},
        }

        # Generate hybrid content if GitHub streams available (Phase 3)
        if self.github_streams:
            logger.info("Generating hybrid content with GitHub insights...")
            hybrid_content = generate_hybrid_content(
                api_data=merged_data,
                github_docs=self.github_docs,
                github_insights=self.github_insights,
                conflicts=self.conflicts,
            )

            # Merge hybrid content into result
            merged_data["github_context"] = hybrid_content.get("github_context", {})
            merged_data["conflict_summary"] = hybrid_content.get("conflict_summary", {})
            merged_data["issue_links"] = hybrid_content.get("issue_links", {})

            # github_insights stays None when the streams object carries no
            # insights stream, so fall back to an empty dict for the counts.
            insights = self.github_insights or {}
            logger.info(
                f"Added GitHub context: {len(insights.get('common_problems', []))} problems, "
                f"{len(insights.get('known_solutions', []))} solutions"
            )

        return merged_data

    def _merge_single_api(self, api_name: str) -> dict[str, Any]:
        """
        Merge a single API using rules.

        Args:
            api_name: Name of the API to merge; expected to be present in at
                least one of ``docs_apis`` / ``code_apis``

        Returns:
            Merged API dict whose ``status`` is one of ``docs_only``,
            ``code_only``, ``matched`` or ``conflict``
        """
        in_docs = api_name in self.docs_apis
        in_code = api_name in self.code_apis
        conflict = self.conflict_index.get(api_name)

        # Rule 1: Only in docs
        if in_docs and not in_code:
            return {
                "name": api_name,
                "status": "docs_only",
                "source": "documentation",
                "data": self.docs_apis[api_name],
                "warning": "This API is documented but not found in codebase",
                "conflict": vars(conflict) if conflict else None,
            }

        # Rule 2: Only in code
        if in_code and not in_docs:
            # A leading underscore marks the API as private by naming convention
            if api_name.startswith("_"):
                warning = "Internal/private API"
            else:
                warning = "This API exists in code but is not documented"
            return {
                "name": api_name,
                "status": "code_only",
                "source": "code",
                "data": self.code_apis[api_name],
                "warning": warning,
                "conflict": vars(conflict) if conflict else None,
            }

        # Both exist - check for conflicts
        docs_info = self.docs_apis[api_name]
        code_info = self.code_apis[api_name]
        merged_signature = self._create_merged_signature(code_info, docs_info)
        # Prefer the docs description; fall back to the code docstring
        merged_description = docs_info.get("docstring") or code_info.get("docstring")

        # Rule 3: Both present and no conflict recorded
        if api_name not in self.conflict_index:
            return {
                "name": api_name,
                "status": "matched",
                "source": "both",
                "docs_data": docs_info,
                "code_data": code_info,
                "merged_signature": merged_signature,
                "merged_description": merged_description,
            }

        # Rule 4: Conflict exists - prefer code signature, keep docs description
        conflict = self.conflict_index[api_name]
        return {
            "name": api_name,
            "status": "conflict",
            "source": "both",
            "docs_data": docs_info,
            "code_data": code_info,
            "conflict": vars(conflict),
            "resolution": "prefer_code_signature",
            "merged_signature": merged_signature,
            "merged_description": merged_description,
            "warning": conflict.difference,
        }

    def _create_merged_signature(self, code_info: dict, docs_info: dict) -> str:
        """
        Create merged signature preferring code data.

        For ``name``, ``parameters`` and ``return_type`` the value from
        ``code_info`` is used whenever the key is present there; the docs
        value is only a fallback for a missing key.

        Args:
            code_info: API info from code
            docs_info: API info from docs

        Returns:
            Signature string of the form ``name(p1: type = default, ...) -> ret``
        """
        name = code_info.get("name", docs_info.get("name"))
        # Treat an explicit None parameter list as "no parameters"
        params = code_info.get("parameters", docs_info.get("parameters", [])) or []
        return_type = code_info.get("return_type", docs_info.get("return_type"))

        # Build parameter string
        param_strs = []
        for param in params:
            param_str = param["name"]
            if param.get("type_hint"):
                param_str += f": {param['type_hint']}"
            if param.get("default"):
                param_str += f" = {param['default']}"
            param_strs.append(param_str)

        signature = f"{name}({', '.join(param_strs)})"

        if return_type:
            signature += f" -> {return_type}"

        return signature


class ClaudeEnhancedMerger:
    """
    Claude-enhanced API merger with a rule-based fallback.

    Writes the merge context (conflicts, docs APIs, code APIs, instructions)
    into a temporary workspace, opens a terminal window running a helper
    script in that workspace, and waits for ``merged_apis.json`` to appear
    there. If any step fails or the wait times out, the result of
    ``RuleBasedMerger.merge_all()`` is returned instead.

    Multi-layer architecture (Phase 3):
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING)
    - Layer 4: GitHub insights (issues)
    """

    def __init__(
        self,
        docs_data: dict,
        github_data: dict,
        conflicts: list[Conflict],
        github_streams: Optional["ThreeStreamData"] = None,
    ):
        """
        Initialize Claude-enhanced merger with GitHub streams support.

        Args:
            docs_data: Documentation scraper data (Layer 2: HTML docs)
            github_data: GitHub scraper data (Layer 1: C3.x code)
            conflicts: List of detected conflicts
            github_streams: Optional ThreeStreamData with docs and insights (Layers 3-4)
        """
        self.docs_data = docs_data
        self.github_data = github_data
        self.conflicts = conflicts
        self.github_streams = github_streams

        # Rule-based merger used as the fallback when enhancement fails
        self.rule_merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)

    def merge_all(self) -> dict[str, Any]:
        """
        Merge all APIs using Claude enhancement.

        Returns:
            Dict containing merged API data. On any failure (workspace
            creation, terminal launch, timeout, unreadable result) the
            rule-based merge result is returned instead.
        """
        logger.info("Starting Claude-enhanced merge...")

        try:
            # Workspace creation is inside the try so that a failure while
            # writing context files also falls back to the rule-based merge.
            workspace_dir = self._create_workspace()

            logger.info("Launching Claude Code for intelligent merging...")
            logger.info("Claude will analyze conflicts and create reconciled API reference")

            self._launch_claude_merge(workspace_dir)

            # Read enhanced results
            merged_data = self._read_merged_results(workspace_dir)

            logger.info("Claude-enhanced merge complete")
            return merged_data

        except Exception as e:
            logger.error(f"Claude enhancement failed: {e}")
            logger.info("Falling back to rule-based merge")
            return self.rule_merger.merge_all()

    def _create_workspace(self) -> str:
        """
        Create temporary workspace with merge context.

        The directory is created with ``tempfile.mkdtemp`` and is not removed
        by this class.

        Returns:
            Path to workspace directory
        """
        workspace = tempfile.mkdtemp(prefix="skill_merge_")
        logger.info(f"Created merge workspace: {workspace}")

        # Write context files for Claude
        self._write_context_files(workspace)

        return workspace

    def _write_context_files(self, workspace: str):
        """
        Write context files for Claude to analyze.

        Creates ``conflicts.json``, ``docs_apis.json``, ``code_apis.json`` and
        ``MERGE_INSTRUCTIONS.md`` inside ``workspace``. All files are written
        as UTF-8 because the instructions contain non-ASCII characters.

        Args:
            workspace: Directory to write the files into
        """

        # 1. Write conflicts summary
        conflicts_file = os.path.join(workspace, "conflicts.json")
        with open(conflicts_file, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "conflicts": [c.__dict__ for c in self.conflicts],
                    "summary": {
                        "total": len(self.conflicts),
                        "by_type": self._count_by_field("type"),
                        "by_severity": self._count_by_field("severity"),
                    },
                },
                f,
                indent=2,
            )

        # 2. Write documentation APIs
        docs_apis_file = os.path.join(workspace, "docs_apis.json")
        detector = ConflictDetector(self.docs_data, self.github_data)
        with open(docs_apis_file, "w", encoding="utf-8") as f:
            json.dump(detector.docs_apis, f, indent=2)

        # 3. Write code APIs
        code_apis_file = os.path.join(workspace, "code_apis.json")
        with open(code_apis_file, "w", encoding="utf-8") as f:
            json.dump(detector.code_apis, f, indent=2)

        # 4. Write merge instructions for Claude
        instructions = """# API Merge Task

You are merging API documentation from two sources:
1. Official documentation (user-facing)
2. Source code analysis (implementation reality)

## Context Files:
- `conflicts.json` - All detected conflicts between sources
- `docs_apis.json` - APIs from documentation
- `code_apis.json` - APIs from source code

## Your Task:
For each conflict, reconcile the differences intelligently:

1. **Prefer code signatures as source of truth**
   - Use actual parameter names, types, defaults from code
   - Code is what actually runs, docs might be outdated

2. **Keep documentation descriptions**
   - Docs are user-friendly, code comments might be technical
   - Keep the docs' explanation of what the API does

3. **Add implementation notes for discrepancies**
   - If docs differ from code, explain the difference
   - Example: "⚠️ The `snap` parameter exists in code but is not documented"

4. **Flag missing APIs clearly**
   - Missing in docs → Add [UNDOCUMENTED] tag
   - Missing in code → Add [REMOVED] or [DOCS_ERROR] tag

5. **Create unified API reference**
   - One definitive signature per API
   - Clear warnings about conflicts
   - Implementation notes where helpful

## Output Format:
Create `merged_apis.json` with this structure:

```json
{
  "apis": {
    "API.name": {
      "signature": "final_signature_here",
      "parameters": [...],
      "return_type": "type",
      "description": "user-friendly description",
      "implementation_notes": "Any discrepancies or warnings",
      "source": "both|docs_only|code_only",
      "confidence": "high|medium|low"
    }
  }
}
```

Take your time to analyze each conflict carefully. The goal is to create the most accurate and helpful API reference possible.
"""

        instructions_file = os.path.join(workspace, "MERGE_INSTRUCTIONS.md")
        with open(instructions_file, "w", encoding="utf-8") as f:
            f.write(instructions)

        logger.info(f"Wrote context files to {workspace}")

    def _count_by_field(self, field: str) -> dict[str, int]:
        """
        Count conflicts by a specific field.

        Args:
            field: Attribute name to read from each conflict (e.g. ``"type"``)

        Returns:
            Dict mapping each distinct attribute value to its occurrence count
        """
        counts = {}
        for conflict in self.conflicts:
            value = getattr(conflict, field)
            counts[value] = counts.get(value, 0) + 1
        return counts

    def _launch_claude_merge(self, workspace: str):
        """
        Open a terminal running the merge helper script and wait for the result.

        Writes ``merge_script.sh`` into ``workspace``, starts it in the first
        available terminal emulator, then polls every 5 seconds (for up to
        one hour) until ``merged_apis.json`` exists in ``workspace``.

        Args:
            workspace: Workspace directory containing the context files

        Raises:
            RuntimeError: If none of the known terminal emulators can be started
            TimeoutError: If ``merged_apis.json`` does not appear within one hour
        """
        import time

        # Create a script that Claude will execute
        script_path = os.path.join(workspace, "merge_script.sh")

        script_content = f"""#!/bin/bash
# Automatic merge script for Claude Code

cd "{workspace}"

echo "📊 Analyzing conflicts..."
cat conflicts.json | head -20

echo ""
echo "📖 Documentation APIs: $(cat docs_apis.json | grep -c '\"name\"')"
echo "💻 Code APIs: $(cat code_apis.json | grep -c '\"name\"')"
echo ""
echo "Please review the conflicts and create merged_apis.json"
echo "Follow the instructions in MERGE_INSTRUCTIONS.md"
echo ""
echo "When done, save merged_apis.json and close this terminal."

# Wait for user to complete merge
read -p "Press Enter when merge is complete..."
"""

        # The script contains emoji, so write it explicitly as UTF-8
        with open(script_path, "w", encoding="utf-8") as f:
            f.write(script_content)

        os.chmod(script_path, 0o755)

        # Try different terminal emulators, first one that starts wins
        terminals = [
            ["x-terminal-emulator", "-e"],
            ["gnome-terminal", "--"],
            ["xterm", "-e"],
            ["konsole", "-e"],
        ]

        for terminal_cmd in terminals:
            try:
                subprocess.Popen(terminal_cmd + ["bash", script_path])
            except FileNotFoundError:
                continue
            logger.info(f"Opened terminal with {terminal_cmd[0]}")
            break
        else:
            # Without a terminal nobody can produce the result file, so fail
            # now instead of polling for an hour; the caller falls back.
            tried = ", ".join(cmd[0] for cmd in terminals)
            raise RuntimeError(f"No supported terminal emulator found (tried: {tried})")

        # Wait for merge to complete
        merged_file = os.path.join(workspace, "merged_apis.json")
        logger.info(f"Waiting for merged results at: {merged_file}")
        logger.info("Close the terminal when done to continue...")

        # Poll for file existence
        timeout = 3600  # 1 hour max
        poll_interval = 5
        elapsed = 0
        while not os.path.exists(merged_file) and elapsed < timeout:
            time.sleep(poll_interval)
            elapsed += poll_interval

        if not os.path.exists(merged_file):
            raise TimeoutError("Claude merge timed out after 1 hour")

    def _read_merged_results(self, workspace: str) -> dict[str, Any]:
        """
        Read merged results from workspace.

        Args:
            workspace: Workspace directory expected to contain ``merged_apis.json``

        Returns:
            The parsed JSON object with ``merge_mode`` defaulting to
            ``"claude-enhanced"`` (a ``merge_mode`` key in the file overrides it)

        Raises:
            FileNotFoundError: If ``merged_apis.json`` does not exist
        """
        merged_file = os.path.join(workspace, "merged_apis.json")

        if not os.path.exists(merged_file):
            raise FileNotFoundError(f"Merged results not found: {merged_file}")

        with open(merged_file, encoding="utf-8") as f:
            merged_data = json.load(f)

        return {"merge_mode": "claude-enhanced", **merged_data}


def merge_sources(
    docs_data_path: str,
    github_data_path: str,
    output_path: str,
    mode: str = "rule-based",
    github_streams: Optional["ThreeStreamData"] = None,
) -> dict[str, Any]:
    """
    Merge documentation and GitHub data with optional GitHub streams (Phase 3).

    Loads both JSON inputs, detects conflicts between them, runs the selected
    merger and writes the merged result to ``output_path``. All files are
    read and written as UTF-8; the output keeps non-ASCII characters
    unescaped.

    Multi-layer architecture:
    - Layer 1: C3.x code (ground truth)
    - Layer 2: HTML docs (official intent)
    - Layer 3: GitHub docs (README/CONTRIBUTING) - from github_streams
    - Layer 4: GitHub insights (issues) - from github_streams

    Args:
        docs_data_path: Path to documentation data JSON
        github_data_path: Path to GitHub data JSON
        output_path: Path to save merged output
        mode: 'rule-based' or 'claude-enhanced'; any other value is treated
            as 'rule-based' (a warning is logged)
        github_streams: Optional ThreeStreamData with docs and insights

    Returns:
        Merged data dict with hybrid content
    """
    # Load data
    with open(docs_data_path, encoding="utf-8") as f:
        docs_data = json.load(f)

    with open(github_data_path, encoding="utf-8") as f:
        github_data = json.load(f)

    # Detect conflicts
    detector = ConflictDetector(docs_data, github_data)
    conflicts = detector.detect_all_conflicts()

    logger.info(f"Detected {len(conflicts)} conflicts")

    # Log GitHub streams availability
    if github_streams:
        logger.info("GitHub streams available for multi-layer merge")
        if github_streams.docs_stream:
            logger.info(
                f"  - Docs stream: README, {len(github_streams.docs_stream.docs_files)} docs files"
            )
        if github_streams.insights_stream:
            problems = len(github_streams.insights_stream.common_problems)
            solutions = len(github_streams.insights_stream.known_solutions)
            logger.info(f"  - Insights stream: {problems} problems, {solutions} solutions")

    # Merge based on mode
    if mode == "claude-enhanced":
        merger = ClaudeEnhancedMerger(docs_data, github_data, conflicts, github_streams)
    else:
        if mode != "rule-based":
            # Keep the historical fallback but make the substitution visible
            logger.warning(f"Unknown merge mode {mode!r}; using rule-based merge")
        merger = RuleBasedMerger(docs_data, github_data, conflicts, github_streams)

    merged_data = merger.merge_all()

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them on every platform.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Merged data saved to: {output_path}")

    return merged_data


if __name__ == "__main__":
    # Command-line entry point: merge two JSON inputs and print a summary.
    import argparse

    cli = argparse.ArgumentParser(description="Merge documentation and code sources")
    cli.add_argument("docs_data", help="Path to documentation data JSON")
    cli.add_argument("github_data", help="Path to GitHub data JSON")
    cli.add_argument("--output", "-o", default="merged_data.json", help="Output file path")
    cli.add_argument(
        "--mode",
        "-m",
        choices=["rule-based", "claude-enhanced"],
        default="rule-based",
        help="Merge mode",
    )
    options = cli.parse_args()

    result = merge_sources(
        docs_data_path=options.docs_data,
        github_data_path=options.github_data,
        output_path=options.output,
        mode=options.mode,
    )

    # Report the per-status counts; a result without a summary prints zeros
    stats = result.get("summary", {})
    print(f"\n✅ Merge complete ({result.get('merge_mode')})")
    print(f"   Total APIs: {stats.get('total_apis', 0)}")
    print(f"   Matched: {stats.get('matched', 0)}")
    print(f"   Docs only: {stats.get('docs_only', 0)}")
    print(f"   Code only: {stats.get('code_only', 0)}")
    print(f"   Conflicts: {stats.get('conflict', 0)}")
    print(f"\n📄 Saved to: {options.output}")