# skill-seekers-reference/tests/test_merge_sources_github.py

"""
Tests for Phase 3: Enhanced Source Merging with GitHub Streams

Tests the multi-layer merging architecture:
- Layer 1: C3.x code (ground truth)
- Layer 2: HTML docs (official intent)
- Layer 3: GitHub docs (README/CONTRIBUTING)
- Layer 4: GitHub insights (issues)
"""

import pytest
from pathlib import Path
from unittest.mock import Mock
from skill_seekers.cli.merge_sources import (
    categorize_issues_by_topic,
    generate_hybrid_content,
    RuleBasedMerger,
    _match_issues_to_apis
)
from skill_seekers.cli.github_fetcher import (
    CodeStream,
    DocsStream,
    InsightsStream,
    ThreeStreamData
)
from skill_seekers.cli.conflict_detector import Conflict


class TestIssueCategorization:
    """Checks for categorize_issues_by_topic() using small hand-built issue lists."""

    def test_categorize_issues_basic(self):
        """Problems and solutions for the same topic end up in one bucket."""
        open_issues = [
            {
                'title': 'OAuth setup fails',
                'labels': ['bug', 'oauth'],
                'number': 1,
                'state': 'open',
                'comments': 10,
            },
            {
                'title': 'Testing framework issue',
                'labels': ['testing'],
                'number': 2,
                'state': 'open',
                'comments': 5,
            },
        ]
        closed_issues = [
            {
                'title': 'Fixed OAuth redirect',
                'labels': ['oauth'],
                'number': 3,
                'state': 'closed',
                'comments': 3,
            },
        ]
        topic_names = ['oauth', 'testing', 'async']

        buckets = categorize_issues_by_topic(open_issues, closed_issues, topic_names)

        # The oauth bucket holds one problem and one solution.
        assert 'oauth' in buckets
        assert len(buckets['oauth']) == 2
        assert 'testing' in buckets
        assert len(buckets['testing']) == 1

    def test_categorize_issues_keyword_matching(self):
        """Run a 'database' topic against an issue titled 'Database ...' and labelled 'db'."""
        open_issues = [
            {
                'title': 'Database connection timeout',
                'labels': ['db'],
                'number': 1,
                'state': 'open',
                'comments': 7,
            },
        ]
        closed_issues = []
        topic_names = ['database']

        buckets = categorize_issues_by_topic(open_issues, closed_issues, topic_names)

        # Lenient on purpose: the issue may be filed under 'database' or fall
        # back to 'other'; this test accepts either outcome.
        assert 'database' in buckets or 'other' in buckets

    def test_categorize_issues_multi_keyword_topic(self):
        """A topic made of several words ('async api') is matched as one category."""
        open_issues = [
            {
                'title': 'Async API call fails',
                'labels': ['async', 'api'],
                'number': 1,
                'state': 'open',
                'comments': 8,
            },
        ]
        closed_issues = []
        topic_names = ['async api']

        buckets = categorize_issues_by_topic(open_issues, closed_issues, topic_names)

        # The issue carries both 'async' and 'api' labels.
        assert 'async api' in buckets
        assert len(buckets['async api']) == 1

    def test_categorize_issues_no_match_goes_to_other(self):
        """Issues that fit none of the topics are collected under 'other'."""
        open_issues = [
            {
                'title': 'Random issue',
                'labels': ['misc'],
                'number': 1,
                'state': 'open',
                'comments': 5,
            },
        ]
        closed_issues = []
        topic_names = ['oauth', 'testing']

        buckets = categorize_issues_by_topic(open_issues, closed_issues, topic_names)

        assert 'other' in buckets
        assert len(buckets['other']) == 1

    def test_categorize_issues_empty_lists(self):
        """With no problems and no solutions, no category is returned at all."""
        buckets = categorize_issues_by_topic([], [], ['oauth'])

        # Categories without any issues are expected to be absent.
        assert len(buckets) == 0


class TestHybridContent:
    """Checks for generate_hybrid_content() and the layers of its result."""

    def test_generate_hybrid_content_basic(self):
        """All inputs supplied: verify top-level keys and the GitHub layers."""
        merged_apis = {
            'apis': {
                'oauth_login': {'name': 'oauth_login', 'status': 'matched'},
            },
            'summary': {'total_apis': 1},
        }
        docs_layer = {
            'readme': '# Project README',
            'contributing': None,
            'docs_files': [
                {'path': 'docs/oauth.md', 'content': 'OAuth guide'},
            ],
        }
        insights_layer = {
            'metadata': {
                'stars': 1234,
                'forks': 56,
                'language': 'Python',
                'description': 'Test project',
            },
            'common_problems': [
                {
                    'title': 'OAuth fails',
                    'number': 42,
                    'state': 'open',
                    'comments': 10,
                    'labels': ['bug'],
                },
            ],
            'known_solutions': [
                {
                    'title': 'Fixed OAuth',
                    'number': 35,
                    'state': 'closed',
                    'comments': 5,
                    'labels': ['bug'],
                },
            ],
            'top_labels': [
                {'label': 'bug', 'count': 10},
                {'label': 'enhancement', 'count': 5},
            ],
        }
        no_conflicts = []

        hybrid = generate_hybrid_content(merged_apis, docs_layer, insights_layer, no_conflicts)

        # Top-level structure of the hybrid result.
        for expected_key in ('api_reference', 'github_context', 'conflict_summary', 'issue_links'):
            assert expected_key in hybrid

        context = hybrid['github_context']

        # GitHub docs layer.
        assert context['docs']['readme'] == '# Project README'
        assert context['docs']['docs_files_count'] == 1

        # GitHub insights layer.
        assert context['metadata']['stars'] == 1234
        assert context['metadata']['language'] == 'Python'
        assert context['issues']['common_problems_count'] == 1
        assert context['issues']['known_solutions_count'] == 1
        assert len(context['issues']['top_problems']) == 1
        assert len(context['top_labels']) == 2

    def test_generate_hybrid_content_with_conflicts(self):
        """Two conflicts of different type/severity are tallied in the summary."""
        signature_conflict = Conflict(
            api_name='test_api',
            type='signature_mismatch',
            severity='medium',
            difference='Parameter count differs',
            docs_info={'parameters': ['a', 'b']},
            code_info={'parameters': ['a', 'b', 'c']}
        )
        undocumented_conflict = Conflict(
            api_name='test_api_2',
            type='missing_in_docs',
            severity='low',
            difference='API not documented',
            docs_info=None,
            code_info={'name': 'test_api_2'}
        )
        merged_apis = {'apis': {}, 'summary': {}}

        # No GitHub docs or insights are passed for this scenario.
        hybrid = generate_hybrid_content(
            merged_apis,
            None,
            None,
            [signature_conflict, undocumented_conflict],
        )

        summary = hybrid['conflict_summary']
        assert summary['total_conflicts'] == 2
        assert summary['by_type']['signature_mismatch'] == 1
        assert summary['by_type']['missing_in_docs'] == 1
        assert summary['by_severity']['medium'] == 1
        assert summary['by_severity']['low'] == 1

    def test_generate_hybrid_content_no_github_data(self):
        """Without GitHub inputs the result keeps its keys but the context is empty."""
        merged_apis = {'apis': {}, 'summary': {}}

        hybrid = generate_hybrid_content(merged_apis, None, None, [])

        # The structure is still present; only the GitHub context is empty.
        assert 'api_reference' in hybrid
        assert 'github_context' in hybrid
        assert hybrid['github_context'] == {}
        assert hybrid['conflict_summary']['total_conflicts'] == 0


class TestIssueToAPIMatching:
    """Checks for _match_issues_to_apis(), which links issues to API names."""

    def test_match_issues_to_apis_basic(self):
        """One problem and one solution each link to the API they mention."""
        api_index = {
            'oauth_login': {'name': 'oauth_login'},
            'async_fetch': {'name': 'async_fetch'},
        }
        open_issues = [
            {
                'title': 'OAuth login fails',
                'number': 42,
                'state': 'open',
                'comments': 10,
                'labels': ['bug', 'oauth'],
            },
        ]
        closed_issues = [
            {
                'title': 'Fixed async fetch timeout',
                'number': 35,
                'state': 'closed',
                'comments': 5,
                'labels': ['async'],
            },
        ]

        links = _match_issues_to_apis(api_index, open_issues, closed_issues)

        # The OAuth problem is attached to oauth_login.
        assert 'oauth_login' in links
        assert len(links['oauth_login']) == 1
        assert links['oauth_login'][0]['number'] == 42

        # The async solution is attached to async_fetch.
        assert 'async_fetch' in links
        assert len(links['async_fetch']) == 1
        assert links['async_fetch'][0]['number'] == 35

    def test_match_issues_to_apis_no_matches(self):
        """An unrelated issue produces no links at all."""
        api_index = {
            'database_connect': {'name': 'database_connect'},
        }
        open_issues = [
            {
                'title': 'Random unrelated issue',
                'number': 1,
                'state': 'open',
                'comments': 5,
                'labels': ['misc'],
            },
        ]

        links = _match_issues_to_apis(api_index, open_issues, [])

        # Nothing matched, so the mapping is expected to be empty.
        assert len(links) == 0

    def test_match_issues_to_apis_dotted_names(self):
        """An API with a dotted name is linked to an issue mentioning 'oauth'."""
        api_index = {
            'module.oauth.login': {'name': 'module.oauth.login'},
        }
        open_issues = [
            {
                'title': 'OAuth module fails',
                'number': 42,
                'state': 'open',
                'comments': 10,
                'labels': ['oauth'],
            },
        ]

        links = _match_issues_to_apis(api_index, open_issues, [])

        # Expected to match through the 'oauth' keyword.
        assert 'module.oauth.login' in links
        assert len(links['module.oauth.login']) == 1


class TestRuleBasedMergerWithGitHubStreams:
    """Checks for RuleBasedMerger when built with and without ThreeStreamData."""

    def test_merger_with_github_streams(self, tmp_path):
        """Constructing the merger with streams exposes docs and insights on it."""
        # Assemble the three streams.
        code = CodeStream(directory=tmp_path, files=[])
        docs = DocsStream(
            readme='# README',
            contributing='# Contributing',
            docs_files=[{'path': 'docs/guide.md', 'content': 'Guide content'}]
        )
        insights = InsightsStream(
            metadata={'stars': 1234, 'forks': 56, 'language': 'Python'},
            common_problems=[
                {
                    'title': 'Bug 1',
                    'number': 1,
                    'state': 'open',
                    'comments': 10,
                    'labels': ['bug'],
                },
            ],
            known_solutions=[
                {
                    'title': 'Fix 1',
                    'number': 2,
                    'state': 'closed',
                    'comments': 5,
                    'labels': ['bug'],
                },
            ],
            top_labels=[{'label': 'bug', 'count': 10}]
        )
        streams = ThreeStreamData(code, docs, insights)

        html_docs = {'pages': []}
        code_apis = {'apis': {}}
        found_conflicts = []

        # Build the merger with the streams as the fourth argument.
        merger = RuleBasedMerger(html_docs, code_apis, found_conflicts, streams)

        assert merger.github_streams is not None
        assert merger.github_docs is not None
        assert merger.github_insights is not None
        assert merger.github_docs['readme'] == '# README'
        assert merger.github_insights['metadata']['stars'] == 1234

    def test_merger_merge_all_with_streams(self, tmp_path):
        """merge_all() output carries the GitHub-related keys when streams are given."""
        # Assemble the three streams with nearly empty content.
        code = CodeStream(directory=tmp_path, files=[])
        docs = DocsStream(readme='# README', contributing=None, docs_files=[])
        insights = InsightsStream(
            metadata={'stars': 500},
            common_problems=[],
            known_solutions=[],
            top_labels=[]
        )
        streams = ThreeStreamData(code, docs, insights)

        html_docs = {'pages': []}
        code_apis = {'apis': {}}
        found_conflicts = []

        # Build the merger and run the merge.
        merger = RuleBasedMerger(html_docs, code_apis, found_conflicts, streams)
        merged = merger.merge_all()

        # The merged result includes the GitHub context.
        assert 'github_context' in merged
        assert 'conflict_summary' in merged
        assert 'issue_links' in merged
        assert merged['github_context']['metadata']['stars'] == 500

    def test_merger_without_streams_backward_compat(self):
        """The three-argument constructor still works and adds no GitHub context."""
        html_docs = {'pages': []}
        code_apis = {'apis': {}}
        found_conflicts = []

        # Old-style call: no streams argument.
        merger = RuleBasedMerger(html_docs, code_apis, found_conflicts)

        assert merger.github_streams is None
        assert merger.github_docs is None
        assert merger.github_insights is None

        # Merging must still succeed.
        merged = merger.merge_all()
        assert 'apis' in merged
        assert 'summary' in merged
        # No GitHub context is expected without streams.
        assert 'github_context' not in merged


class TestIntegration:
    """End-to-end check of the Phase 3 merge with three-stream data."""

    def test_full_pipeline_with_streams(self, tmp_path):
        """Run RuleBasedMerger.merge_all() on populated streams and inspect every layer."""
        readme_text = '# Test Project\n\nA test project.'
        contributing_text = '# Contributing\n\nPull requests welcome.'

        # Assemble the three streams.
        code = CodeStream(directory=tmp_path, files=[])
        docs = DocsStream(
            readme=readme_text,
            contributing=contributing_text,
            docs_files=[
                {'path': 'docs/quickstart.md', 'content': '# Quick Start'},
                {'path': 'docs/api.md', 'content': '# API Reference'},
            ]
        )
        insights = InsightsStream(
            metadata={
                'stars': 2500,
                'forks': 123,
                'language': 'Python',
                'description': 'Test framework',
            },
            common_problems=[
                {
                    'title': 'Installation fails on Windows',
                    'number': 150,
                    'state': 'open',
                    'comments': 25,
                    'labels': ['bug', 'windows'],
                },
                {
                    'title': 'Memory leak in async mode',
                    'number': 142,
                    'state': 'open',
                    'comments': 18,
                    'labels': ['bug', 'async'],
                },
            ],
            known_solutions=[
                {
                    'title': 'Fixed config loading',
                    'number': 130,
                    'state': 'closed',
                    'comments': 8,
                    'labels': ['bug'],
                },
                {
                    'title': 'Resolved OAuth timeout',
                    'number': 125,
                    'state': 'closed',
                    'comments': 12,
                    'labels': ['oauth'],
                },
            ],
            top_labels=[
                {'label': 'bug', 'count': 45},
                {'label': 'enhancement', 'count': 20},
                {'label': 'question', 'count': 15},
            ]
        )
        streams = ThreeStreamData(code, docs, insights)

        # Minimal docs/code inputs and no conflicts.
        html_docs = {'pages': []}
        code_apis = {'apis': {}}

        merger = RuleBasedMerger(html_docs, code_apis, [], streams)
        merged = merger.merge_all()

        # Layers 1 & 2 (code + docs) and layers 3 & 4 (GitHub docs + insights).
        assert 'apis' in merged
        assert 'github_context' in merged

        context = merged['github_context']

        # Layer 3: GitHub docs.
        assert context['docs']['readme'] == '# Test Project\n\nA test project.'
        assert context['docs']['contributing'] == '# Contributing\n\nPull requests welcome.'
        assert context['docs']['docs_files_count'] == 2

        # Layer 4: GitHub insights.
        assert context['metadata']['stars'] == 2500
        assert context['metadata']['language'] == 'Python'
        assert context['issues']['common_problems_count'] == 2
        assert context['issues']['known_solutions_count'] == 2
        assert len(context['issues']['top_problems']) == 2
        assert len(context['issues']['top_solutions']) == 2
        assert len(context['top_labels']) == 3

        # Conflict summary.
        assert 'conflict_summary' in merged
        assert merged['conflict_summary']['total_conflicts'] == 0