Files
skill-seekers-reference/tests/test_merge_sources_github.py
Pablo Estevez 5ed767ff9a run ruff
2026-01-17 17:29:21 +00:00

395 lines
15 KiB
Python

"""
Tests for Phase 3: Enhanced Source Merging with GitHub Streams
Tests the multi-layer merging architecture:
- Layer 1: C3.x code (ground truth)
- Layer 2: HTML docs (official intent)
- Layer 3: GitHub docs (README/CONTRIBUTING)
- Layer 4: GitHub insights (issues)
"""
from skill_seekers.cli.conflict_detector import Conflict
from skill_seekers.cli.github_fetcher import CodeStream, DocsStream, InsightsStream, ThreeStreamData
from skill_seekers.cli.merge_sources import (
RuleBasedMerger,
_match_issues_to_apis,
categorize_issues_by_topic,
generate_hybrid_content,
)
class TestIssueCategorization:
    """Checks that ``categorize_issues_by_topic`` groups GitHub issues under topics."""

    def test_categorize_issues_basic(self):
        """Problems and solutions that share a topic are counted in the same bucket."""
        open_issues = [
            {
                "title": "OAuth setup fails",
                "labels": ["bug", "oauth"],
                "number": 1,
                "state": "open",
                "comments": 10,
            },
            {
                "title": "Testing framework issue",
                "labels": ["testing"],
                "number": 2,
                "state": "open",
                "comments": 5,
            },
        ]
        closed_issues = [
            {
                "title": "Fixed OAuth redirect",
                "labels": ["oauth"],
                "number": 3,
                "state": "closed",
                "comments": 3,
            },
        ]

        buckets = categorize_issues_by_topic(open_issues, closed_issues, ["oauth", "testing", "async"])

        # The OAuth bucket holds one open problem plus one closed solution.
        assert "oauth" in buckets
        assert len(buckets["oauth"]) == 2
        assert "testing" in buckets
        assert len(buckets["testing"]) == 1

    def test_categorize_issues_keyword_matching(self):
        """An issue whose title names the topic is placed in some bucket."""
        open_issues = [
            {
                "title": "Database connection timeout",
                "labels": ["db"],
                "number": 1,
                "state": "open",
                "comments": 7,
            },
        ]

        buckets = categorize_issues_by_topic(open_issues, [], ["database"])

        # The title contains "Database" while the only label is "db"; this
        # check accepts either the topic bucket or the "other" fallback.
        assert "database" in buckets or "other" in buckets

    def test_categorize_issues_multi_keyword_topic(self):
        """A topic made of several words still collects a matching issue."""
        open_issues = [
            {
                "title": "Async API call fails",
                "labels": ["async", "api"],
                "number": 1,
                "state": "open",
                "comments": 8,
            },
        ]

        buckets = categorize_issues_by_topic(open_issues, [], ["async api"])

        # Both words of the topic appear in the issue's labels (and title).
        assert "async api" in buckets
        assert len(buckets["async api"]) == 1

    def test_categorize_issues_no_match_goes_to_other(self):
        """An issue matching none of the topics ends up under 'other'."""
        unrelated = {"title": "Random issue", "labels": ["misc"], "number": 1, "state": "open", "comments": 5}

        buckets = categorize_issues_by_topic([unrelated], [], ["oauth", "testing"])

        assert "other" in buckets
        assert len(buckets["other"]) == 1

    def test_categorize_issues_empty_lists(self):
        """With no issues at all, no category is reported."""
        buckets = categorize_issues_by_topic([], [], ["oauth"])

        # Even the requested topic must be absent when nothing was categorized.
        assert len(buckets) == 0
class TestHybridContent:
    """Checks the dictionary produced by ``generate_hybrid_content``."""

    def test_generate_hybrid_content_basic(self):
        """GitHub docs and insights are summarised under 'github_context'."""
        api_payload = {
            "apis": {"oauth_login": {"name": "oauth_login", "status": "matched"}},
            "summary": {"total_apis": 1},
        }
        docs_layer = {
            "readme": "# Project README",
            "contributing": None,
            "docs_files": [{"path": "docs/oauth.md", "content": "OAuth guide"}],
        }
        insights_layer = {
            "metadata": {"stars": 1234, "forks": 56, "language": "Python", "description": "Test project"},
            "common_problems": [
                {"title": "OAuth fails", "number": 42, "state": "open", "comments": 10, "labels": ["bug"]},
            ],
            "known_solutions": [
                {"title": "Fixed OAuth", "number": 35, "state": "closed", "comments": 5, "labels": ["bug"]},
            ],
            "top_labels": [{"label": "bug", "count": 10}, {"label": "enhancement", "count": 5}],
        }

        hybrid = generate_hybrid_content(api_payload, docs_layer, insights_layer, [])

        # Top-level structure.
        for section in ("api_reference", "github_context", "conflict_summary", "issue_links"):
            assert section in hybrid

        context = hybrid["github_context"]

        # GitHub docs layer.
        assert context["docs"]["readme"] == "# Project README"
        assert context["docs"]["docs_files_count"] == 1

        # GitHub insights layer.
        assert context["metadata"]["stars"] == 1234
        assert context["metadata"]["language"] == "Python"
        assert context["issues"]["common_problems_count"] == 1
        assert context["issues"]["known_solutions_count"] == 1
        assert len(context["issues"]["top_problems"]) == 1
        assert len(context["top_labels"]) == 2

    def test_generate_hybrid_content_with_conflicts(self):
        """Conflicts are tallied by type and by severity in the summary."""
        signature_conflict = Conflict(
            api_name="test_api",
            type="signature_mismatch",
            severity="medium",
            difference="Parameter count differs",
            docs_info={"parameters": ["a", "b"]},
            code_info={"parameters": ["a", "b", "c"]},
        )
        undocumented_conflict = Conflict(
            api_name="test_api_2",
            type="missing_in_docs",
            severity="low",
            difference="API not documented",
            docs_info=None,
            code_info={"name": "test_api_2"},
        )

        # No GitHub docs or insights are supplied here; only conflicts matter.
        hybrid = generate_hybrid_content(
            {"apis": {}, "summary": {}},
            None,
            None,
            [signature_conflict, undocumented_conflict],
        )

        assert hybrid["conflict_summary"]["total_conflicts"] == 2
        assert hybrid["conflict_summary"]["by_type"]["signature_mismatch"] == 1
        assert hybrid["conflict_summary"]["by_type"]["missing_in_docs"] == 1
        assert hybrid["conflict_summary"]["by_severity"]["medium"] == 1
        assert hybrid["conflict_summary"]["by_severity"]["low"] == 1

    def test_generate_hybrid_content_no_github_data(self):
        """Without GitHub input the structure is kept but the context is empty."""
        hybrid = generate_hybrid_content({"apis": {}, "summary": {}}, None, None, [])

        assert "api_reference" in hybrid
        assert "github_context" in hybrid
        assert hybrid["github_context"] == {}
        assert hybrid["conflict_summary"]["total_conflicts"] == 0
class TestIssueToAPIMatching:
    """Checks that ``_match_issues_to_apis`` links issues to API names."""

    def test_match_issues_to_apis_basic(self):
        """Each issue is linked to the API whose name it mentions."""
        api_table = {
            "oauth_login": {"name": "oauth_login"},
            "async_fetch": {"name": "async_fetch"},
        }
        oauth_problem = {
            "title": "OAuth login fails",
            "number": 42,
            "state": "open",
            "comments": 10,
            "labels": ["bug", "oauth"],
        }
        async_solution = {
            "title": "Fixed async fetch timeout",
            "number": 35,
            "state": "closed",
            "comments": 5,
            "labels": ["async"],
        }

        links = _match_issues_to_apis(api_table, [oauth_problem], [async_solution])

        # The OAuth problem is attached to oauth_login.
        assert "oauth_login" in links
        assert len(links["oauth_login"]) == 1
        assert links["oauth_login"][0]["number"] == 42

        # The async solution is attached to async_fetch.
        assert "async_fetch" in links
        assert len(links["async_fetch"]) == 1
        assert links["async_fetch"][0]["number"] == 35

    def test_match_issues_to_apis_no_matches(self):
        """An unrelated issue produces no links at all."""
        api_table = {"database_connect": {"name": "database_connect"}}
        unrelated = {
            "title": "Random unrelated issue",
            "number": 1,
            "state": "open",
            "comments": 5,
            "labels": ["misc"],
        }

        links = _match_issues_to_apis(api_table, [unrelated], [])

        assert len(links) == 0

    def test_match_issues_to_apis_dotted_names(self):
        """A dotted API name is matched through one of its components."""
        api_table = {"module.oauth.login": {"name": "module.oauth.login"}}
        oauth_problem = {
            "title": "OAuth module fails",
            "number": 42,
            "state": "open",
            "comments": 10,
            "labels": ["oauth"],
        }

        links = _match_issues_to_apis(api_table, [oauth_problem], [])

        # "oauth" appears both in the API name and in the issue.
        assert "module.oauth.login" in links
        assert len(links["module.oauth.login"]) == 1
class TestRuleBasedMergerWithGitHubStreams:
    """Checks ``RuleBasedMerger`` when it is given (or not given) GitHub streams."""

    def test_merger_with_github_streams(self, tmp_path):
        """The merger exposes the docs and insights taken from the streams."""
        streams = ThreeStreamData(
            CodeStream(directory=tmp_path, files=[]),
            DocsStream(
                readme="# README",
                contributing="# Contributing",
                docs_files=[{"path": "docs/guide.md", "content": "Guide content"}],
            ),
            InsightsStream(
                metadata={"stars": 1234, "forks": 56, "language": "Python"},
                common_problems=[{"title": "Bug 1", "number": 1, "state": "open", "comments": 10, "labels": ["bug"]}],
                known_solutions=[{"title": "Fix 1", "number": 2, "state": "closed", "comments": 5, "labels": ["bug"]}],
                top_labels=[{"label": "bug", "count": 10}],
            ),
        )

        merger = RuleBasedMerger({"pages": []}, {"apis": {}}, [], streams)

        assert merger.github_streams is not None
        assert merger.github_docs is not None
        assert merger.github_insights is not None
        assert merger.github_docs["readme"] == "# README"
        assert merger.github_insights["metadata"]["stars"] == 1234

    def test_merger_merge_all_with_streams(self, tmp_path):
        """``merge_all()`` output carries the GitHub sections when streams are set."""
        streams = ThreeStreamData(
            CodeStream(directory=tmp_path, files=[]),
            DocsStream(readme="# README", contributing=None, docs_files=[]),
            InsightsStream(metadata={"stars": 500}, common_problems=[], known_solutions=[], top_labels=[]),
        )

        merged = RuleBasedMerger({"pages": []}, {"apis": {}}, [], streams).merge_all()

        for section in ("github_context", "conflict_summary", "issue_links"):
            assert section in merged
        assert merged["github_context"]["metadata"]["stars"] == 500

    def test_merger_without_streams_backward_compat(self):
        """The three-argument constructor still works and adds no GitHub context."""
        # Old-style call: no streams argument at all.
        merger = RuleBasedMerger({"pages": []}, {"apis": {}}, [])

        assert merger.github_streams is None
        assert merger.github_docs is None
        assert merger.github_insights is None

        merged = merger.merge_all()

        # The classic result keys are still produced ...
        assert "apis" in merged
        assert "summary" in merged
        # ... and nothing GitHub-specific is added.
        assert "github_context" not in merged
class TestIntegration:
    """End-to-end check of merging with three-stream GitHub data."""

    def test_full_pipeline_with_streams(self, tmp_path):
        """Run ``merge_all()`` on populated streams and inspect every layer."""
        readme_text = "# Test Project\n\nA test project."
        contributing_text = "# Contributing\n\nPull requests welcome."

        open_problems = [
            {
                "title": "Installation fails on Windows",
                "number": 150,
                "state": "open",
                "comments": 25,
                "labels": ["bug", "windows"],
            },
            {
                "title": "Memory leak in async mode",
                "number": 142,
                "state": "open",
                "comments": 18,
                "labels": ["bug", "async"],
            },
        ]
        closed_solutions = [
            {"title": "Fixed config loading", "number": 130, "state": "closed", "comments": 8, "labels": ["bug"]},
            {
                "title": "Resolved OAuth timeout",
                "number": 125,
                "state": "closed",
                "comments": 12,
                "labels": ["oauth"],
            },
        ]
        label_counts = [
            {"label": "bug", "count": 45},
            {"label": "enhancement", "count": 20},
            {"label": "question", "count": 15},
        ]

        streams = ThreeStreamData(
            CodeStream(directory=tmp_path, files=[]),
            DocsStream(
                readme=readme_text,
                contributing=contributing_text,
                docs_files=[
                    {"path": "docs/quickstart.md", "content": "# Quick Start"},
                    {"path": "docs/api.md", "content": "# API Reference"},
                ],
            ),
            InsightsStream(
                metadata={"stars": 2500, "forks": 123, "language": "Python", "description": "Test framework"},
                common_problems=open_problems,
                known_solutions=closed_solutions,
                top_labels=label_counts,
            ),
        )

        # Minimal docs/code inputs and no conflicts; the streams carry the data.
        merged = RuleBasedMerger({"pages": []}, {"apis": {}}, [], streams).merge_all()

        # Layers 1 & 2 (code + docs) and layers 3 & 4 (GitHub docs + insights).
        assert "apis" in merged
        assert "github_context" in merged

        context = merged["github_context"]

        # Layer 3: GitHub docs.
        docs_section = context["docs"]
        assert docs_section["readme"] == readme_text
        assert docs_section["contributing"] == contributing_text
        assert docs_section["docs_files_count"] == 2

        # Layer 4: GitHub insights.
        assert context["metadata"]["stars"] == 2500
        assert context["metadata"]["language"] == "Python"
        assert context["issues"]["common_problems_count"] == 2
        assert context["issues"]["known_solutions_count"] == 2
        assert len(context["issues"]["top_problems"]) == 2
        assert len(context["issues"]["top_solutions"]) == 2
        assert len(context["top_labels"]) == 3

        # Conflict summary: none were passed in.
        assert "conflict_summary" in merged
        assert merged["conflict_summary"]["total_conflicts"] == 0