Files
skill-seekers-reference/tests/test_e2e_three_stream_pipeline.py
yusyus 596b219599 fix: Resolve remaining 188 linting errors (249 total fixed)
Second batch of comprehensive linting fixes:

Unused Arguments/Variables (136 errors):
- ARG002/ARG001 (91 errors): Prefixed unused method/function arguments with '_'
  - Interface methods in adaptors (base.py, gemini.py, markdown.py)
  - AST analyzer methods maintaining signatures (code_analyzer.py)
  - Test fixtures and hooks (conftest.py)
  - Added noqa: ARG001/ARG002 for pytest hooks requiring exact names
- F841 (45 errors): Prefixed unused local variables with '_'
  - Tuple unpacking where some values aren't needed
  - Variables assigned but not referenced

Loop & Boolean Quality (28 errors):
- B007 (18 errors): Prefixed unused loop control variables with '_'
  - enumerate() loops where index not used
  - for-in loops where loop variable not referenced
- E712 (10 errors): Simplified boolean comparisons
  - Changed '== True' to direct boolean check
  - Changed '== False' to 'not' expression
  - Improved test readability

Code Quality (24 errors):
- SIM201 (4 errors): Already fixed in previous commit
- SIM118 (2 errors): Already fixed in previous commit
- E741 (4 errors): Already fixed in previous commit
- Config manager loop variable fix (1 error)

All Tests Passing:
- test_scraper_features.py: 42 passed
- test_integration.py: 51 passed
- test_architecture_scenarios.py: 11 passed
- test_real_world_fastmcp.py: 19 passed, 1 skipped

Note: Some SIM errors (nested if, multiple with) remain unfixed as they
would require non-trivial refactoring. Focus was on functional correctness.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 23:02:11 +03:00

599 lines
21 KiB
Python

"""
End-to-End Tests for Three-Stream GitHub Architecture Pipeline (Phase 5)
Tests the complete workflow:
1. Fetch GitHub repo with three streams (code, docs, insights)
2. Analyze with unified codebase analyzer (basic or c3x)
3. Merge sources with GitHub streams
4. Generate router with GitHub integration
5. Validate output structure and quality
"""
import json
from unittest.mock import Mock, patch
import pytest
from skill_seekers.cli.generate_router import RouterGenerator
from skill_seekers.cli.github_fetcher import (
CodeStream,
DocsStream,
InsightsStream,
ThreeStreamData,
)
from skill_seekers.cli.merge_sources import categorize_issues_by_topic
from skill_seekers.cli.unified_codebase_analyzer import UnifiedCodebaseAnalyzer
class TestE2EBasicWorkflow:
    """Test E2E workflow with basic analysis (fast)."""

    @patch("skill_seekers.cli.unified_codebase_analyzer.GitHubThreeStreamFetcher")
    def test_github_url_to_basic_analysis(self, mock_fetcher_class, tmp_path):
        """
        Test complete pipeline: GitHub URL → Basic analysis → Merged output
        This tests the fast path (1-2 minutes) without C3.x analysis.
        """
        # Step 1: stub out the network-facing three-stream fetcher.
        fetcher = Mock()
        mock_fetcher_class.return_value = fetcher

        # Real files on disk give the basic analyzer concrete input to parse.
        (tmp_path / "main.py").write_text("""
import os
import sys
def hello():
    print("Hello, World!")
""")
        (tmp_path / "utils.js").write_text("""
function greet(name) {
    console.log(`Hello, ${name}!`);
}
""")

        # Assemble the mocked three-stream payload (code / docs / insights).
        code = CodeStream(
            directory=tmp_path, files=[tmp_path / "main.py", tmp_path / "utils.js"]
        )
        docs = DocsStream(
            readme="""# Test Project
A simple test project for demonstrating the three-stream architecture.
## Installation
```bash
pip install test-project
```
## Quick Start
```python
from test_project import hello
hello()
```
""",
            contributing="# Contributing\n\nPull requests welcome!",
            docs_files=[
                {"path": "docs/guide.md", "content": "# User Guide\n\nHow to use this project."}
            ],
        )
        insights = InsightsStream(
            metadata={
                "stars": 1234,
                "forks": 56,
                "language": "Python",
                "description": "A test project",
            },
            common_problems=[
                {
                    "title": "Installation fails on Windows",
                    "number": 42,
                    "state": "open",
                    "comments": 15,
                    "labels": ["bug", "windows"],
                },
                {
                    "title": "Import error with Python 3.6",
                    "number": 38,
                    "state": "open",
                    "comments": 10,
                    "labels": ["bug", "python"],
                },
            ],
            known_solutions=[
                {
                    "title": "Fixed: Module not found",
                    "number": 35,
                    "state": "closed",
                    "comments": 8,
                    "labels": ["bug"],
                }
            ],
            top_labels=[
                {"label": "bug", "count": 25},
                {"label": "enhancement", "count": 15},
                {"label": "documentation", "count": 10},
            ],
        )
        fetcher.fetch.return_value = ThreeStreamData(code, docs, insights)

        # Step 2: run the unified analyzer at "basic" depth.
        analyzer = UnifiedCodebaseAnalyzer()
        result = analyzer.analyze(
            source="https://github.com/test/project", depth="basic", fetch_github_metadata=True
        )

        # Step 3: all three streams must survive into the result.
        assert result.source_type == "github"
        assert result.analysis_depth == "basic"

        # Code stream: basic-analysis structure is present.
        assert result.code_analysis is not None
        assert result.code_analysis["analysis_type"] == "basic"
        assert "files" in result.code_analysis
        assert "structure" in result.code_analysis
        assert "imports" in result.code_analysis

        # Docs stream: README content carried through verbatim.
        assert result.github_docs is not None
        assert result.github_docs["readme"].startswith("# Test Project")
        assert "pip install test-project" in result.github_docs["readme"]

        # Insights stream: metadata and issue buckets intact.
        assert result.github_insights is not None
        assert result.github_insights["metadata"]["stars"] == 1234
        assert result.github_insights["metadata"]["language"] == "Python"
        assert len(result.github_insights["common_problems"]) == 2
        assert len(result.github_insights["known_solutions"]) == 1
        assert len(result.github_insights["top_labels"]) == 3

    def test_issue_categorization_by_topic(self):
        """Test that issues are correctly categorized by topic keywords."""
        open_issues = [
            {
                "title": "OAuth fails on redirect",
                "number": 50,
                "state": "open",
                "comments": 20,
                "labels": ["oauth", "bug"],
            },
            {
                "title": "Token refresh issue",
                "number": 45,
                "state": "open",
                "comments": 15,
                "labels": ["oauth", "token"],
            },
            {
                "title": "Async deadlock",
                "number": 40,
                "state": "open",
                "comments": 12,
                "labels": ["async", "bug"],
            },
            {
                "title": "Database connection lost",
                "number": 35,
                "state": "open",
                "comments": 10,
                "labels": ["database"],
            },
        ]
        closed_issues = [
            {
                "title": "Fixed OAuth flow",
                "number": 30,
                "state": "closed",
                "comments": 8,
                "labels": ["oauth"],
            },
            {
                "title": "Resolved async race",
                "number": 25,
                "state": "closed",
                "comments": 6,
                "labels": ["async"],
            },
        ]
        topics = ["oauth", "auth", "authentication"]

        categorized = categorize_issues_by_topic(open_issues, closed_issues, topics)

        # At least one of the requested topic buckets must exist.
        assert "oauth" in categorized or "auth" in categorized or "authentication" in categorized

        # Pool all auth-flavored buckets together before counting.
        auth_related = (
            categorized.get("oauth", [])
            + categorized.get("auth", [])
            + categorized.get("authentication", [])
        )
        # Should have 3 OAuth-related issues (2 problems + 1 solution);
        # require at least the two open problems.
        assert len(auth_related) >= 2
        # The OAuth issues must actually appear in the categorized output.
        titles = [issue["title"] for issue in auth_related]
        assert any("OAuth" in title for title in titles)
class TestE2ERouterGeneration:
    """Test E2E router generation with GitHub integration."""

    def test_router_generation_with_github_streams(self, tmp_path):
        """
        Test complete router generation workflow with GitHub streams.
        Validates:
        1. Router config created
        2. Router SKILL.md includes GitHub metadata
        3. Router SKILL.md includes README quick start
        4. Router SKILL.md includes common issues
        5. Routing keywords include GitHub labels (2x weight)
        """
        # Two sub-skill configs written to disk, as the CLI would see them.
        oauth_config = {
            "name": "testproject-oauth",
            "description": "OAuth authentication in Test Project",
            "base_url": "https://github.com/test/project",
            "categories": {"oauth": ["oauth", "auth"]},
        }
        async_config = {
            "name": "testproject-async",
            "description": "Async operations in Test Project",
            "base_url": "https://github.com/test/project",
            "categories": {"async": ["async", "await"]},
        }
        oauth_path = tmp_path / "config1.json"
        async_path = tmp_path / "config2.json"
        oauth_path.write_text(json.dumps(oauth_config))
        async_path.write_text(json.dumps(async_config))

        # GitHub streams: empty code stream, README with a quick start,
        # and insights carrying problems/solutions/labels.
        code = CodeStream(directory=tmp_path, files=[])
        docs = DocsStream(
            readme="""# Test Project
Fast and simple test framework.
## Installation
```bash
pip install test-project
```
## Quick Start
```python
import testproject
testproject.run()
```
""",
            contributing="# Contributing\n\nWelcome!",
            docs_files=[],
        )
        insights = InsightsStream(
            metadata={
                "stars": 5000,
                "forks": 250,
                "language": "Python",
                "description": "Fast test framework",
            },
            common_problems=[
                {
                    "title": "OAuth setup fails",
                    "number": 150,
                    "state": "open",
                    "comments": 30,
                    "labels": ["bug", "oauth"],
                },
                {
                    "title": "Async deadlock",
                    "number": 142,
                    "state": "open",
                    "comments": 25,
                    "labels": ["async", "bug"],
                },
                {
                    "title": "Token refresh issue",
                    "number": 130,
                    "state": "open",
                    "comments": 20,
                    "labels": ["oauth"],
                },
            ],
            known_solutions=[
                {
                    "title": "Fixed OAuth redirect",
                    "number": 120,
                    "state": "closed",
                    "comments": 15,
                    "labels": ["oauth"],
                },
                {
                    "title": "Resolved async race",
                    "number": 110,
                    "state": "closed",
                    "comments": 12,
                    "labels": ["async"],
                },
            ],
            top_labels=[
                {"label": "oauth", "count": 45},
                {"label": "async", "count": 38},
                {"label": "bug", "count": 30},
            ],
        )
        streams = ThreeStreamData(code, docs, insights)

        generator = RouterGenerator(
            [str(oauth_path), str(async_path)], github_streams=streams
        )

        # Step 1: GitHub metadata extracted.
        assert generator.github_metadata is not None
        assert generator.github_metadata["stars"] == 5000
        assert generator.github_metadata["language"] == "Python"

        # Step 2: GitHub docs extracted.
        assert generator.github_docs is not None
        assert "pip install test-project" in generator.github_docs["readme"]

        # Step 3: GitHub issues extracted.
        assert generator.github_issues is not None
        assert len(generator.github_issues["common_problems"]) == 3
        assert len(generator.github_issues["known_solutions"]) == 2
        assert len(generator.github_issues["top_labels"]) == 3

        # Step 4: generate and validate the router SKILL.md.
        skill_md = generator.generate_skill_md()

        # Repository metadata section.
        assert "⭐ 5,000" in skill_md
        assert "Python" in skill_md
        assert "Fast test framework" in skill_md

        # README quick-start section.
        assert "## Quick Start" in skill_md
        assert "pip install test-project" in skill_md

        # Examples section: issues rewritten as natural questions (Fix 1).
        assert "## Examples" in skill_md
        lowered = skill_md.lower()
        assert (
            "how do i fix oauth setup" in lowered
            or "how do i handle oauth setup" in lowered
        )
        assert (
            "how do i handle async deadlock" in lowered
            or "how do i fix async deadlock" in lowered
        )
        # Common Issues section may still exist with other issues.
        # Note: issue numbers may appear in Common Issues or Common Patterns sections.

        # Step 5: routing keywords include GitHub labels at 2x weight.
        routing = generator.extract_routing_keywords()
        oauth_keywords = routing["testproject-oauth"]
        async_keywords = routing["testproject-async"]
        # Base + name + 2x from label => at least two occurrences each.
        assert oauth_keywords.count("oauth") >= 2
        assert async_keywords.count("async") >= 2

        # Step 6: router config lists both sub-skills.
        router_config = generator.create_router_config()
        assert router_config["name"] == "testproject"
        assert router_config["_router"] is True
        assert len(router_config["_sub_skills"]) == 2
        assert "testproject-oauth" in router_config["_sub_skills"]
        assert "testproject-async" in router_config["_sub_skills"]
class TestE2EQualityMetrics:
    """Test quality metrics as specified in Phase 5."""

    def test_github_overhead_within_limits(self, tmp_path):
        """
        Test that GitHub integration adds ~30-50 lines per skill (not more).
        Quality metric: GitHub overhead should be minimal.
        """
        # One minimal sub-skill config on disk.
        config_path = tmp_path / "config.json"
        config_path.write_text(json.dumps({
            "name": "test-skill",
            "description": "Test skill",
            "base_url": "https://github.com/test/repo",
            "categories": {"api": ["api"]},
        }))

        # Realistic but small GitHub streams.
        code = CodeStream(directory=tmp_path, files=[])
        docs = DocsStream(
            readme="# Test\n\nA short README.", contributing=None, docs_files=[]
        )
        insights = InsightsStream(
            metadata={"stars": 100, "forks": 10, "language": "Python", "description": "Test"},
            common_problems=[
                {
                    "title": "Issue 1",
                    "number": 1,
                    "state": "open",
                    "comments": 5,
                    "labels": ["bug"],
                },
                {
                    "title": "Issue 2",
                    "number": 2,
                    "state": "open",
                    "comments": 3,
                    "labels": ["bug"],
                },
            ],
            known_solutions=[],
            top_labels=[{"label": "bug", "count": 10}],
        )
        streams = ThreeStreamData(code, docs, insights)

        # Generate the same router with and without GitHub streams and
        # diff the line counts.
        baseline_md = RouterGenerator([str(config_path)]).generate_skill_md()
        enriched_md = RouterGenerator(
            [str(config_path)], github_streams=streams
        ).generate_skill_md()

        overhead = len(enriched_md.split("\n")) - len(baseline_md.split("\n"))

        # Overhead must stay within the acceptable band (with some slack).
        assert 20 <= overhead <= 60, (
            f"GitHub overhead is {overhead} lines, expected 20-60"
        )

    def test_router_size_within_limits(self, tmp_path):
        """
        Test that router SKILL.md is ~150 lines (±20).
        Quality metric: Router should be concise overview, not exhaustive.
        """
        # Four sub-skill configs, each with its own topic.
        config_paths = []
        for i in range(4):
            path = tmp_path / f"config{i}.json"
            path.write_text(json.dumps({
                "name": f"test-skill-{i}",
                "description": f"Test skill {i}",
                "base_url": "https://github.com/test/repo",
                "categories": {f"topic{i}": [f"topic{i}"]},
            }))
            config_paths.append(str(path))

        skill_md = RouterGenerator(config_paths).generate_skill_md()
        line_count = len(skill_md.split("\n"))

        # Router size is reasonable (60-250 lines for 4 sub-skills).
        # Actual size depends on whether GitHub streams are included -
        # can be as small as 60 lines.
        assert 60 <= line_count <= 250, (
            f"Router is {line_count} lines, expected 60-250 for 4 sub-skills"
        )
class TestE2EBackwardCompatibility:
    """Test that old code still works without GitHub streams."""

    def test_router_without_github_streams(self, tmp_path):
        """Test that router generation works without GitHub streams (backward compat)."""
        config = {
            "name": "test-skill",
            "description": "Test skill",
            "base_url": "https://example.com",
            "categories": {"api": ["api"]},
        }
        config_path = tmp_path / "config.json"
        with open(config_path, "w") as f:
            json.dump(config, f)

        # Generate router WITHOUT GitHub streams.
        generator = RouterGenerator([str(config_path)])
        assert generator.github_metadata is None
        assert generator.github_docs is None
        assert generator.github_issues is None

        # Should still generate a valid SKILL.md.
        skill_md = generator.generate_skill_md()
        assert "When to Use This Skill" in skill_md
        assert "How It Works" in skill_md

        # Should NOT have GitHub-specific sections.
        # BUG FIX: this previously read `assert "" not in skill_md`, which is
        # always False (the empty string is a substring of every string), so
        # the assertion could never pass. The intended check is the star
        # marker ("⭐") that only the GitHub metadata section emits (see the
        # positive check for "⭐ 5,000" in the GitHub-streams test above).
        assert "⭐" not in skill_md
        assert "Repository Info" not in skill_md
        assert "Quick Start (from README)" not in skill_md
        assert "Common Issues (from GitHub)" not in skill_md

    @patch("skill_seekers.cli.unified_codebase_analyzer.GitHubThreeStreamFetcher")
    def test_analyzer_without_github_metadata(self, mock_fetcher_class, tmp_path):
        """Test analyzer with fetch_github_metadata=False."""
        mock_fetcher = Mock()
        mock_fetcher_class.return_value = mock_fetcher

        # Empty streams: we only care that docs/insights are skipped.
        code_stream = CodeStream(directory=tmp_path, files=[])
        docs_stream = DocsStream(readme=None, contributing=None, docs_files=[])
        insights_stream = InsightsStream(
            metadata={}, common_problems=[], known_solutions=[], top_labels=[]
        )
        three_streams = ThreeStreamData(code_stream, docs_stream, insights_stream)
        mock_fetcher.fetch.return_value = three_streams

        (tmp_path / "main.py").write_text("print('hello')")

        analyzer = UnifiedCodebaseAnalyzer()
        result = analyzer.analyze(
            source="https://github.com/test/repo",
            depth="basic",
            fetch_github_metadata=False,  # Explicitly disable
        )

        # Should not include GitHub docs/insights.
        assert result.github_docs is None
        assert result.github_insights is None
class TestE2ETokenEfficiency:
    """Test token efficiency metrics."""

    def test_three_stream_produces_compact_output(self, tmp_path):
        """
        Test that three-stream architecture produces compact, efficient output.
        This is a qualitative test - we verify that output is structured and
        not duplicated across streams.
        """
        (tmp_path / "main.py").write_text("import os\nprint('test')")

        code = CodeStream(directory=tmp_path, files=[tmp_path / "main.py"])
        docs = DocsStream(
            readme="# Test\n\nQuick start guide.", contributing=None, docs_files=[]
        )
        insights = InsightsStream(
            metadata={"stars": 100}, common_problems=[], known_solutions=[], top_labels=[]
        )
        # Constructed to prove the three streams compose; intentionally unused.
        _bundle = ThreeStreamData(code, docs, insights)

        # Each stream holds only its own data (streams are separate) ...
        assert code.directory == tmp_path
        assert docs.readme is not None
        assert insights.metadata is not None

        # ... and no stream leaks content into another (no cross-contamination).
        assert "Quick start guide" not in str(code.files)
        assert str(tmp_path) not in docs.readme
if __name__ == "__main__":
pytest.main([__file__, "-v"])