""" Tests for GitHub Three-Stream Fetcher Tests the three-stream architecture that splits GitHub repositories into: - Code stream (for C3.x) - Docs stream (README, docs/*.md) - Insights stream (issues, metadata) """ from pathlib import Path from unittest.mock import Mock, patch import pytest from skill_seekers.cli.github_fetcher import ( CodeStream, DocsStream, GitHubThreeStreamFetcher, InsightsStream, ThreeStreamData, ) class TestDataClasses: """Test data class definitions.""" def test_code_stream(self): """Test CodeStream data class.""" code_stream = CodeStream(directory=Path("/tmp/repo"), files=[Path("/tmp/repo/src/main.py")]) assert code_stream.directory == Path("/tmp/repo") assert len(code_stream.files) == 1 def test_docs_stream(self): """Test DocsStream data class.""" docs_stream = DocsStream( readme="# README", contributing="# Contributing", docs_files=[{"path": "docs/guide.md", "content": "# Guide"}], ) assert docs_stream.readme == "# README" assert docs_stream.contributing == "# Contributing" assert len(docs_stream.docs_files) == 1 def test_insights_stream(self): """Test InsightsStream data class.""" insights_stream = InsightsStream( metadata={"stars": 1234, "forks": 56}, common_problems=[{"title": "Bug", "number": 42}], known_solutions=[{"title": "Fix", "number": 35}], top_labels=[{"label": "bug", "count": 10}], ) assert insights_stream.metadata["stars"] == 1234 assert len(insights_stream.common_problems) == 1 assert len(insights_stream.known_solutions) == 1 assert len(insights_stream.top_labels) == 1 def test_three_stream_data(self): """Test ThreeStreamData combination.""" three_streams = ThreeStreamData( code_stream=CodeStream(Path("/tmp"), []), docs_stream=DocsStream(None, None, []), insights_stream=InsightsStream({}, [], [], []), ) assert isinstance(three_streams.code_stream, CodeStream) assert isinstance(three_streams.docs_stream, DocsStream) assert isinstance(three_streams.insights_stream, InsightsStream) class TestGitHubFetcherInit: """Test GitHubThreeStreamFetcher initialization.""" def test_parse_https_url(self): """Test parsing HTTPS GitHub URLs.""" fetcher = GitHubThreeStreamFetcher("https://github.com/facebook/react") assert fetcher.owner == "facebook" assert fetcher.repo == "react" def test_parse_https_url_with_git(self): """Test parsing HTTPS URLs with .git suffix.""" fetcher = GitHubThreeStreamFetcher("https://github.com/facebook/react.git") assert fetcher.owner == "facebook" assert fetcher.repo == "react" def test_parse_git_url(self): """Test parsing git@ URLs.""" fetcher = GitHubThreeStreamFetcher("git@github.com:facebook/react.git") assert fetcher.owner == "facebook" assert fetcher.repo == "react" def test_invalid_url(self): """Test invalid URL raises error.""" with pytest.raises(ValueError): GitHubThreeStreamFetcher("https://invalid.com/repo") @patch.dict("os.environ", {"GITHUB_TOKEN": "test_token"}) def test_github_token_from_env(self): """Test GitHub token loaded from environment.""" fetcher = GitHubThreeStreamFetcher("https://github.com/facebook/react") assert fetcher.github_token == "test_token" class TestFileClassification: """Test file classification into code vs docs.""" def test_classify_files(self, tmp_path): """Test classify_files separates code and docs correctly.""" # Create test directory structure (tmp_path / "src").mkdir() (tmp_path / "src" / "main.py").write_text("print('hello')") (tmp_path / "src" / "utils.js").write_text("function(){}") (tmp_path / "docs").mkdir() (tmp_path / "README.md").write_text("# README") (tmp_path / "docs" / "guide.md").write_text("# Guide") (tmp_path / "docs" / "api.rst").write_text("API") (tmp_path / "node_modules").mkdir() (tmp_path / "node_modules" / "lib.js").write_text("// should be excluded") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") code_files, doc_files = fetcher.classify_files(tmp_path) # Check code files code_paths = [f.name for f in code_files] assert "main.py" in code_paths assert "utils.js" in code_paths assert "lib.js" not in code_paths # Excluded # Check doc files doc_paths = [f.name for f in doc_files] assert "README.md" in doc_paths assert "guide.md" in doc_paths assert "api.rst" in doc_paths def test_classify_excludes_hidden_files(self, tmp_path): """Test that hidden files are excluded (except in docs/).""" (tmp_path / ".hidden.py").write_text("hidden") (tmp_path / "visible.py").write_text("visible") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") code_files, doc_files = fetcher.classify_files(tmp_path) code_names = [f.name for f in code_files] assert ".hidden.py" not in code_names assert "visible.py" in code_names def test_classify_various_code_extensions(self, tmp_path): """Test classification of various code file extensions.""" extensions = [".py", ".js", ".ts", ".go", ".rs", ".java", ".kt", ".rb", ".php"] for ext in extensions: (tmp_path / f"file{ext}").write_text("code") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") code_files, doc_files = fetcher.classify_files(tmp_path) assert len(code_files) == len(extensions) class TestIssueAnalysis: """Test GitHub issue analysis.""" def test_analyze_issues_common_problems(self): """Test extraction of common problems (open issues with 5+ comments).""" issues = [ { "title": "OAuth fails", "number": 42, "state": "open", "comments": 10, "labels": [{"name": "bug"}, {"name": "oauth"}], }, { "title": "Minor issue", "number": 43, "state": "open", "comments": 2, # Too few comments "labels": [], }, ] fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") insights = fetcher.analyze_issues(issues) assert len(insights["common_problems"]) == 1 assert insights["common_problems"][0]["number"] == 42 assert insights["common_problems"][0]["comments"] == 10 def test_analyze_issues_known_solutions(self): """Test extraction of known solutions (closed issues with comments).""" issues = [ { "title": "Fixed OAuth", "number": 35, "state": "closed", "comments": 5, "labels": [{"name": "bug"}], }, { "title": "Closed without comments", "number": 36, "state": "closed", "comments": 0, # No comments "labels": [], }, ] fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") insights = fetcher.analyze_issues(issues) assert len(insights["known_solutions"]) == 1 assert insights["known_solutions"][0]["number"] == 35 def test_analyze_issues_top_labels(self): """Test counting of top issue labels.""" issues = [ {"state": "open", "comments": 5, "labels": [{"name": "bug"}, {"name": "oauth"}]}, {"state": "open", "comments": 5, "labels": [{"name": "bug"}]}, {"state": "closed", "comments": 3, "labels": [{"name": "enhancement"}]}, ] fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") insights = fetcher.analyze_issues(issues) # Bug should be top label (appears twice) assert insights["top_labels"][0]["label"] == "bug" assert insights["top_labels"][0]["count"] == 2 def test_analyze_issues_limits_to_10(self): """Test that analysis limits results to top 10.""" issues = [ { "title": f"Issue {i}", "number": i, "state": "open", "comments": 20 - i, # Descending comment count "labels": [], } for i in range(20) ] fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") insights = fetcher.analyze_issues(issues) assert len(insights["common_problems"]) <= 10 # Should be sorted by comment count (descending) if len(insights["common_problems"]) > 1: assert ( insights["common_problems"][0]["comments"] >= insights["common_problems"][1]["comments"] ) class TestGitHubAPI: """Test GitHub API interactions.""" @patch("requests.get") def test_fetch_github_metadata(self, mock_get): """Test fetching repository metadata via GitHub API.""" mock_response = Mock() mock_response.json.return_value = { "stargazers_count": 1234, "forks_count": 56, "open_issues_count": 12, "language": "Python", "description": "Test repo", "homepage": "https://example.com", "created_at": "2020-01-01", "updated_at": "2024-01-01", } mock_response.raise_for_status = Mock() mock_get.return_value = mock_response fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") metadata = fetcher.fetch_github_metadata() assert metadata["stars"] == 1234 assert metadata["forks"] == 56 assert metadata["language"] == "Python" @patch("requests.get") def test_fetch_github_metadata_failure(self, mock_get): """Test graceful handling of metadata fetch failure.""" mock_get.side_effect = Exception("API error") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") metadata = fetcher.fetch_github_metadata() # Should return default values instead of crashing assert metadata["stars"] == 0 assert metadata["language"] == "Unknown" @patch("requests.get") def test_fetch_issues(self, mock_get): """Test fetching issues via GitHub API.""" mock_response = Mock() mock_response.json.return_value = [ { "title": "Bug", "number": 42, "state": "open", "comments": 10, "labels": [{"name": "bug"}], } ] mock_response.raise_for_status = Mock() mock_get.return_value = mock_response fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") issues = fetcher.fetch_issues(max_issues=100) assert len(issues) > 0 # Should be called twice (open + closed) assert mock_get.call_count == 2 @patch("requests.get") def test_fetch_issues_filters_pull_requests(self, mock_get): """Test that pull requests are filtered out of issues.""" mock_response = Mock() mock_response.json.return_value = [ {"title": "Issue", "number": 42, "state": "open", "comments": 5, "labels": []}, { "title": "PR", "number": 43, "state": "open", "comments": 3, "labels": [], "pull_request": {}, }, ] mock_response.raise_for_status = Mock() mock_get.return_value = mock_response fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") issues = fetcher.fetch_issues(max_issues=100) # Should only include the issue, not the PR assert all("pull_request" not in issue for issue in issues) class TestReadFile: """Test file reading utilities.""" def test_read_file_success(self, tmp_path): """Test successful file reading.""" test_file = tmp_path / "test.txt" test_file.write_text("Hello, world!") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") content = fetcher.read_file(test_file) assert content == "Hello, world!" def test_read_file_not_found(self, tmp_path): """Test reading non-existent file returns None.""" fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") content = fetcher.read_file(tmp_path / "missing.txt") assert content is None def test_read_file_encoding_fallback(self, tmp_path): """Test fallback to latin-1 encoding if UTF-8 fails.""" test_file = tmp_path / "test.txt" # Write bytes that are invalid UTF-8 but valid latin-1 test_file.write_bytes(b"\xff\xfe") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo") content = fetcher.read_file(test_file) # Should still read successfully with latin-1 assert content is not None class TestIntegration: """Integration tests for complete three-stream fetching.""" @patch("subprocess.run") @patch("requests.get") def test_fetch_integration(self, mock_get, mock_run, tmp_path): """Test complete fetch() integration.""" # Mock git clone mock_run.return_value = Mock(returncode=0, stderr="") # Mock GitHub API calls def api_side_effect(*args, **_kwargs): url = args[0] mock_response = Mock() mock_response.raise_for_status = Mock() if "repos/" in url and "/issues" not in url: # Metadata call mock_response.json.return_value = { "stargazers_count": 1234, "forks_count": 56, "open_issues_count": 12, "language": "Python", } else: # Issues call mock_response.json.return_value = [ { "title": "Test Issue", "number": 42, "state": "open", "comments": 10, "labels": [{"name": "bug"}], } ] return mock_response mock_get.side_effect = api_side_effect # Create test repo structure repo_dir = tmp_path / "repo" repo_dir.mkdir() (repo_dir / "src").mkdir() (repo_dir / "src" / "main.py").write_text("print('hello')") (repo_dir / "README.md").write_text("# README") fetcher = GitHubThreeStreamFetcher("https://github.com/test/repo", interactive=False) # Mock clone to use our tmp_path with patch.object(fetcher, "clone_repo", return_value=repo_dir): three_streams = fetcher.fetch() # Verify all 3 streams present assert three_streams.code_stream is not None assert three_streams.docs_stream is not None assert three_streams.insights_stream is not None # Verify code stream assert len(three_streams.code_stream.files) > 0 # Verify docs stream assert three_streams.docs_stream.readme is not None assert "# README" in three_streams.docs_stream.readme # Verify insights stream assert three_streams.insights_stream.metadata["stars"] == 1234 assert len(three_streams.insights_stream.common_problems) > 0