""" End-to-End Tests for Three-Stream GitHub Architecture Pipeline (Phase 5) Tests the complete workflow: 1. Fetch GitHub repo with three streams (code, docs, insights) 2. Analyze with unified codebase analyzer (basic or c3x) 3. Merge sources with GitHub streams 4. Generate router with GitHub integration 5. Validate output structure and quality """ import json from unittest.mock import Mock, patch import pytest from skill_seekers.cli.generate_router import RouterGenerator from skill_seekers.cli.github_fetcher import ( CodeStream, DocsStream, InsightsStream, ThreeStreamData, ) from skill_seekers.cli.merge_sources import categorize_issues_by_topic from skill_seekers.cli.unified_codebase_analyzer import UnifiedCodebaseAnalyzer class TestE2EBasicWorkflow: """Test E2E workflow with basic analysis (fast).""" @patch("skill_seekers.cli.unified_codebase_analyzer.GitHubThreeStreamFetcher") def test_github_url_to_basic_analysis(self, mock_fetcher_class, tmp_path): """ Test complete pipeline: GitHub URL → Basic analysis → Merged output This tests the fast path (1-2 minutes) without C3.x analysis. """ # Step 1: Mock GitHub three-stream fetcher mock_fetcher = Mock() mock_fetcher_class.return_value = mock_fetcher # Create test code files (tmp_path / "main.py").write_text(""" import os import sys def hello(): print("Hello, World!") """) (tmp_path / "utils.js").write_text(""" function greet(name) { console.log(`Hello, ${name}!`); } """) # Create mock three-stream data code_stream = CodeStream( directory=tmp_path, files=[tmp_path / "main.py", tmp_path / "utils.js"] ) docs_stream = DocsStream( readme="""# Test Project A simple test project for demonstrating the three-stream architecture. ## Installation ```bash pip install test-project ``` ## Quick Start ```python from test_project import hello hello() ``` """, contributing="# Contributing\n\nPull requests welcome!", docs_files=[ {"path": "docs/guide.md", "content": "# User Guide\n\nHow to use this project."} ], ) insights_stream = InsightsStream( metadata={ "stars": 1234, "forks": 56, "language": "Python", "description": "A test project", }, common_problems=[ { "title": "Installation fails on Windows", "number": 42, "state": "open", "comments": 15, "labels": ["bug", "windows"], }, { "title": "Import error with Python 3.6", "number": 38, "state": "open", "comments": 10, "labels": ["bug", "python"], }, ], known_solutions=[ { "title": "Fixed: Module not found", "number": 35, "state": "closed", "comments": 8, "labels": ["bug"], } ], top_labels=[ {"label": "bug", "count": 25}, {"label": "enhancement", "count": 15}, {"label": "documentation", "count": 10}, ], ) three_streams = ThreeStreamData(code_stream, docs_stream, insights_stream) mock_fetcher.fetch.return_value = three_streams # Step 2: Run unified analyzer with basic depth analyzer = UnifiedCodebaseAnalyzer() result = analyzer.analyze( source="https://github.com/test/project", depth="basic", fetch_github_metadata=True ) # Step 3: Validate all three streams present assert result.source_type == "github" assert result.analysis_depth == "basic" # Validate code stream results assert result.code_analysis is not None assert result.code_analysis["analysis_type"] == "basic" assert "files" in result.code_analysis assert "structure" in result.code_analysis assert "imports" in result.code_analysis # Validate docs stream results assert result.github_docs is not None assert result.github_docs["readme"].startswith("# Test Project") assert "pip install test-project" in result.github_docs["readme"] # Validate insights stream results assert result.github_insights is not None assert result.github_insights["metadata"]["stars"] == 1234 assert result.github_insights["metadata"]["language"] == "Python" assert len(result.github_insights["common_problems"]) == 2 assert len(result.github_insights["known_solutions"]) == 1 assert len(result.github_insights["top_labels"]) == 3 def test_issue_categorization_by_topic(self): """Test that issues are correctly categorized by topic keywords.""" problems = [ { "title": "OAuth fails on redirect", "number": 50, "state": "open", "comments": 20, "labels": ["oauth", "bug"], }, { "title": "Token refresh issue", "number": 45, "state": "open", "comments": 15, "labels": ["oauth", "token"], }, { "title": "Async deadlock", "number": 40, "state": "open", "comments": 12, "labels": ["async", "bug"], }, { "title": "Database connection lost", "number": 35, "state": "open", "comments": 10, "labels": ["database"], }, ] solutions = [ { "title": "Fixed OAuth flow", "number": 30, "state": "closed", "comments": 8, "labels": ["oauth"], }, { "title": "Resolved async race", "number": 25, "state": "closed", "comments": 6, "labels": ["async"], }, ] topics = ["oauth", "auth", "authentication"] # Categorize issues categorized = categorize_issues_by_topic(problems, solutions, topics) # Validate categorization assert "oauth" in categorized or "auth" in categorized or "authentication" in categorized oauth_issues = ( categorized.get("oauth", []) + categorized.get("auth", []) + categorized.get("authentication", []) ) # Should have 3 OAuth-related issues (2 problems + 1 solution) assert len(oauth_issues) >= 2 # At least the problems # OAuth issues should be in the categorized output oauth_titles = [issue["title"] for issue in oauth_issues] assert any("OAuth" in title for title in oauth_titles) class TestE2ERouterGeneration: """Test E2E router generation with GitHub integration.""" def test_router_generation_with_github_streams(self, tmp_path): """ Test complete router generation workflow with GitHub streams. Validates: 1. Router config created 2. Router SKILL.md includes GitHub metadata 3. Router SKILL.md includes README quick start 4. Router SKILL.md includes common issues 5. Routing keywords include GitHub labels (2x weight) """ # Create sub-skill configs config1 = { "name": "testproject-oauth", "description": "OAuth authentication in Test Project", "base_url": "https://github.com/test/project", "categories": {"oauth": ["oauth", "auth"]}, } config2 = { "name": "testproject-async", "description": "Async operations in Test Project", "base_url": "https://github.com/test/project", "categories": {"async": ["async", "await"]}, } config_path1 = tmp_path / "config1.json" config_path2 = tmp_path / "config2.json" with open(config_path1, "w") as f: json.dump(config1, f) with open(config_path2, "w") as f: json.dump(config2, f) # Create GitHub streams code_stream = CodeStream(directory=tmp_path, files=[]) docs_stream = DocsStream( readme="""# Test Project Fast and simple test framework. ## Installation ```bash pip install test-project ``` ## Quick Start ```python import testproject testproject.run() ``` """, contributing="# Contributing\n\nWelcome!", docs_files=[], ) insights_stream = InsightsStream( metadata={ "stars": 5000, "forks": 250, "language": "Python", "description": "Fast test framework", }, common_problems=[ { "title": "OAuth setup fails", "number": 150, "state": "open", "comments": 30, "labels": ["bug", "oauth"], }, { "title": "Async deadlock", "number": 142, "state": "open", "comments": 25, "labels": ["async", "bug"], }, { "title": "Token refresh issue", "number": 130, "state": "open", "comments": 20, "labels": ["oauth"], }, ], known_solutions=[ { "title": "Fixed OAuth redirect", "number": 120, "state": "closed", "comments": 15, "labels": ["oauth"], }, { "title": "Resolved async race", "number": 110, "state": "closed", "comments": 12, "labels": ["async"], }, ], top_labels=[ {"label": "oauth", "count": 45}, {"label": "async", "count": 38}, {"label": "bug", "count": 30}, ], ) github_streams = ThreeStreamData(code_stream, docs_stream, insights_stream) # Generate router generator = RouterGenerator( [str(config_path1), str(config_path2)], github_streams=github_streams ) # Step 1: Validate GitHub metadata extracted assert generator.github_metadata is not None assert generator.github_metadata["stars"] == 5000 assert generator.github_metadata["language"] == "Python" # Step 2: Validate GitHub docs extracted assert generator.github_docs is not None assert "pip install test-project" in generator.github_docs["readme"] # Step 3: Validate GitHub issues extracted assert generator.github_issues is not None assert len(generator.github_issues["common_problems"]) == 3 assert len(generator.github_issues["known_solutions"]) == 2 assert len(generator.github_issues["top_labels"]) == 3 # Step 4: Generate and validate router SKILL.md skill_md = generator.generate_skill_md() # Validate repository metadata section assert "⭐ 5,000" in skill_md assert "Python" in skill_md assert "Fast test framework" in skill_md # Validate README quick start section assert "## Quick Start" in skill_md assert "pip install test-project" in skill_md # Validate examples section with converted questions (Fix 1) assert "## Examples" in skill_md # Issues converted to natural questions assert ( "how do i fix oauth setup" in skill_md.lower() or "how do i handle oauth setup" in skill_md.lower() ) assert ( "how do i handle async deadlock" in skill_md.lower() or "how do i fix async deadlock" in skill_md.lower() ) # Common Issues section may still exist with other issues # Note: Issue numbers may appear in Common Issues or Common Patterns sections # Step 5: Validate routing keywords include GitHub labels (2x weight) routing = generator.extract_routing_keywords() oauth_keywords = routing["testproject-oauth"] async_keywords = routing["testproject-async"] # Labels should be included with 2x weight assert oauth_keywords.count("oauth") >= 2 # Base + name + 2x from label assert async_keywords.count("async") >= 2 # Base + name + 2x from label # Step 6: Generate router config router_config = generator.create_router_config() assert router_config["name"] == "testproject" assert router_config["_router"] is True assert len(router_config["_sub_skills"]) == 2 assert "testproject-oauth" in router_config["_sub_skills"] assert "testproject-async" in router_config["_sub_skills"] class TestE2EQualityMetrics: """Test quality metrics as specified in Phase 5.""" def test_github_overhead_within_limits(self, tmp_path): """ Test that GitHub integration adds ~30-50 lines per skill (not more). Quality metric: GitHub overhead should be minimal. """ # Create minimal config config = { "name": "test-skill", "description": "Test skill", "base_url": "https://github.com/test/repo", "categories": {"api": ["api"]}, } config_path = tmp_path / "config.json" with open(config_path, "w") as f: json.dump(config, f) # Create GitHub streams with realistic data code_stream = CodeStream(directory=tmp_path, files=[]) docs_stream = DocsStream( readme="# Test\n\nA short README.", contributing=None, docs_files=[] ) insights_stream = InsightsStream( metadata={"stars": 100, "forks": 10, "language": "Python", "description": "Test"}, common_problems=[ { "title": "Issue 1", "number": 1, "state": "open", "comments": 5, "labels": ["bug"], }, { "title": "Issue 2", "number": 2, "state": "open", "comments": 3, "labels": ["bug"], }, ], known_solutions=[], top_labels=[{"label": "bug", "count": 10}], ) github_streams = ThreeStreamData(code_stream, docs_stream, insights_stream) # Generate router without GitHub generator_no_github = RouterGenerator([str(config_path)]) skill_md_no_github = generator_no_github.generate_skill_md() lines_no_github = len(skill_md_no_github.split("\n")) # Generate router with GitHub generator_with_github = RouterGenerator([str(config_path)], github_streams=github_streams) skill_md_with_github = generator_with_github.generate_skill_md() lines_with_github = len(skill_md_with_github.split("\n")) # Calculate GitHub overhead github_overhead = lines_with_github - lines_no_github # Validate overhead is within acceptable range (30-50 lines) assert 20 <= github_overhead <= 60, ( f"GitHub overhead is {github_overhead} lines, expected 20-60" ) def test_router_size_within_limits(self, tmp_path): """ Test that router SKILL.md is ~150 lines (±20). Quality metric: Router should be concise overview, not exhaustive. """ # Create multiple sub-skill configs configs = [] for i in range(4): config = { "name": f"test-skill-{i}", "description": f"Test skill {i}", "base_url": "https://github.com/test/repo", "categories": {f"topic{i}": [f"topic{i}"]}, } config_path = tmp_path / f"config{i}.json" with open(config_path, "w") as f: json.dump(config, f) configs.append(str(config_path)) # Generate router generator = RouterGenerator(configs) skill_md = generator.generate_skill_md() lines = len(skill_md.split("\n")) # Validate router size is reasonable (60-250 lines for 4 sub-skills) # Actual size depends on whether GitHub streams included - can be as small as 60 lines assert 60 <= lines <= 250, f"Router is {lines} lines, expected 60-250 for 4 sub-skills" class TestE2EBackwardCompatibility: """Test that old code still works without GitHub streams.""" def test_router_without_github_streams(self, tmp_path): """Test that router generation works without GitHub streams (backward compat).""" config = { "name": "test-skill", "description": "Test skill", "base_url": "https://example.com", "categories": {"api": ["api"]}, } config_path = tmp_path / "config.json" with open(config_path, "w") as f: json.dump(config, f) # Generate router WITHOUT GitHub streams generator = RouterGenerator([str(config_path)]) assert generator.github_metadata is None assert generator.github_docs is None assert generator.github_issues is None # Should still generate valid SKILL.md skill_md = generator.generate_skill_md() assert "When to Use This Skill" in skill_md assert "How It Works" in skill_md # Should NOT have GitHub-specific sections assert "⭐" not in skill_md assert "Repository Info" not in skill_md assert "Quick Start (from README)" not in skill_md assert "Common Issues (from GitHub)" not in skill_md @patch("skill_seekers.cli.unified_codebase_analyzer.GitHubThreeStreamFetcher") def test_analyzer_without_github_metadata(self, mock_fetcher_class, tmp_path): """Test analyzer with fetch_github_metadata=False.""" mock_fetcher = Mock() mock_fetcher_class.return_value = mock_fetcher code_stream = CodeStream(directory=tmp_path, files=[]) docs_stream = DocsStream(readme=None, contributing=None, docs_files=[]) insights_stream = InsightsStream( metadata={}, common_problems=[], known_solutions=[], top_labels=[] ) three_streams = ThreeStreamData(code_stream, docs_stream, insights_stream) mock_fetcher.fetch.return_value = three_streams (tmp_path / "main.py").write_text("print('hello')") analyzer = UnifiedCodebaseAnalyzer() result = analyzer.analyze( source="https://github.com/test/repo", depth="basic", fetch_github_metadata=False, # Explicitly disable ) # Should not include GitHub docs/insights assert result.github_docs is None assert result.github_insights is None class TestE2ETokenEfficiency: """Test token efficiency metrics.""" def test_three_stream_produces_compact_output(self, tmp_path): """ Test that three-stream architecture produces compact, efficient output. This is a qualitative test - we verify that output is structured and not duplicated across streams. """ # Create test files (tmp_path / "main.py").write_text("import os\nprint('test')") # Create GitHub streams code_stream = CodeStream(directory=tmp_path, files=[tmp_path / "main.py"]) docs_stream = DocsStream( readme="# Test\n\nQuick start guide.", contributing=None, docs_files=[] ) insights_stream = InsightsStream( metadata={"stars": 100}, common_problems=[], known_solutions=[], top_labels=[] ) _three_streams = ThreeStreamData(code_stream, docs_stream, insights_stream) # Verify streams are separate (no duplication) assert code_stream.directory == tmp_path assert docs_stream.readme is not None assert insights_stream.metadata is not None # Verify no cross-contamination assert "Quick start guide" not in str(code_stream.files) assert str(tmp_path) not in docs_stream.readme if __name__ == "__main__": pytest.main([__file__, "-v"])