CI uses 'ruff format' not 'black' - applied proper formatting: Files reformatted by ruff: - config_extractor.py - doc_scraper.py - how_to_guide_builder.py - llms_txt_parser.py - pattern_recognizer.py - test_example_extractor.py - unified_codebase_analyzer.py - test_architecture_scenarios.py - test_async_scraping.py - test_github_scraper.py - test_guide_enhancer.py - test_install_agent.py - test_issue_219_e2e.py - test_llms_txt_downloader.py - test_skip_llms_txt.py Fixes CI formatting check failure. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1016 lines
38 KiB
Python
1016 lines
38 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Tests for GitHub Scraper (cli/github_scraper.py)
|
|
|
|
Tests cover:
|
|
- GitHubScraper initialization and configuration (C1.1)
|
|
- README extraction (C1.2)
|
|
- Language detection (C1.4)
|
|
- GitHub Issues extraction (C1.7)
|
|
- CHANGELOG extraction (C1.8)
|
|
- GitHub Releases extraction (C1.9)
|
|
- GitHubToSkillConverter and skill building (C1.10)
|
|
- Authentication handling
|
|
- Error handling and edge cases
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
|
|
try:
|
|
from github import Github, GithubException # noqa: F401
|
|
|
|
PYGITHUB_AVAILABLE = True
|
|
except ImportError:
|
|
PYGITHUB_AVAILABLE = False
|
|
|
|
|
|
class TestGitHubScraperInitialization(unittest.TestCase):
    """Test GitHubScraper initialization and configuration (C1.1)"""

    def setUp(self):
        """Skip when PyGithub is missing; otherwise bind the scraper class and a temp dir."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

        # Create temporary directory for test output
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir)

    def tearDown(self):
        """Remove the temporary directory created by setUp, if it was created."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_init_with_repo_name(self):
        """Test initialization with repository name"""
        config = {"repo": "facebook/react", "name": "react", "github_token": None}

        scraper = self.GitHubScraper(config)

        self.assertEqual(scraper.repo_name, "facebook/react")
        self.assertEqual(scraper.name, "react")
        self.assertIsNotNone(scraper.github)

    def test_init_with_token_from_config(self):
        """Test initialization with token from config"""
        config = {
            "repo": "facebook/react",
            "name": "react",
            "github_token": "test_token_123",
        }

        # A GITHUB_TOKEN exported in the ambient environment (common on CI)
        # outranks the config value -- see test_token_priority_env_over_config --
        # so it must be removed for this test to be deterministic.
        env_without_token = {
            key: value for key, value in os.environ.items() if key != "GITHUB_TOKEN"
        }

        with (
            patch.dict(os.environ, env_without_token, clear=True),
            patch("skill_seekers.cli.github_scraper.Github") as mock_github,
        ):
            self.GitHubScraper(config)
            mock_github.assert_called_once_with("test_token_123")

    def test_init_with_token_from_env(self):
        """Test initialization with token from environment variable"""
        config = {"repo": "facebook/react", "name": "react", "github_token": None}

        with (
            patch.dict(os.environ, {"GITHUB_TOKEN": "env_token_456"}),
            patch("skill_seekers.cli.github_scraper.Github") as mock_github,
        ):
            self.GitHubScraper(config)
            mock_github.assert_called_once_with("env_token_456")

    def test_init_without_token(self):
        """Test initialization without authentication"""
        config = {"repo": "facebook/react", "name": "react", "github_token": None}

        with (
            patch("skill_seekers.cli.github_scraper.Github"),
            patch.dict(os.environ, {}, clear=True),
        ):
            scraper = self.GitHubScraper(config)
            # Should create unauthenticated client
            self.assertIsNotNone(scraper.github)

    def test_token_priority_env_over_config(self):
        """Test that GITHUB_TOKEN env var takes priority over config"""
        config = {
            "repo": "facebook/react",
            "name": "react",
            "github_token": "config_token",
        }

        with patch.dict(os.environ, {"GITHUB_TOKEN": "env_token"}):
            scraper = self.GitHubScraper(config)
            token = scraper._get_token()
            self.assertEqual(token, "env_token")
|
|
|
|
|
|
class TestREADMEExtraction(unittest.TestCase):
    """Test README extraction (C1.2)"""

    # Dotted path of the PyGithub client as referenced inside the scraper module.
    GITHUB_TARGET = "skill_seekers.cli.github_scraper.Github"

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    def test_extract_readme_success(self):
        """A README served by the repo is decoded into extracted_data['readme']."""
        settings = dict(repo="facebook/react", name="react", github_token=None)
        readme_blob = Mock(decoded_content=b"# React\n\nA JavaScript library")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = readme_blob

            scraper._extract_readme()

            extracted = scraper.extracted_data
            self.assertIn("readme", extracted)
            self.assertEqual(extracted["readme"], "# React\n\nA JavaScript library")

    def test_extract_readme_tries_multiple_locations(self):
        """README extraction keeps going when the first candidate paths 404."""
        settings = dict(repo="facebook/react", name="react", github_token=None)
        missing_paths = ["README.md", "README.rst"]

        def fake_get_contents(path):
            # The first candidates are reported missing; any other path succeeds.
            if path not in missing_paths:
                return Mock(decoded_content=b"# README")
            raise GithubException(404, "Not found")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = fake_get_contents

            scraper._extract_readme()

            # NOTE(review): this only proves at least one lookup happened.
            lookups = scraper.repo.get_contents.call_count
            self.assertGreaterEqual(lookups, 1)

    def test_extract_readme_not_found(self):
        """With every lookup failing, 'readme' stays the empty string."""
        settings = dict(repo="test/norepo", name="norepo", github_token=None)
        not_found = GithubException(404, "Not found")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = not_found

            scraper._extract_readme()

            # No crash expected; the readme slot keeps its initial empty value.
            self.assertEqual(scraper.extracted_data["readme"], "")
|
|
|
|
|
|
class TestLanguageDetection(unittest.TestCase):
    """Test language detection (C1.4)"""

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    def test_extract_languages_success(self):
        """Byte counts reported by the repo are stored along with percentages."""
        settings = dict(repo="facebook/react", name="react", github_token=None)
        byte_counts = {"JavaScript": 8000, "TypeScript": 2000}
        # language -> (bytes, percentage) expected in extracted_data
        expected = {"JavaScript": (8000, 80.0), "TypeScript": (2000, 20.0)}

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_languages.return_value = byte_counts

            scraper._extract_languages()

            self.assertIn("languages", scraper.extracted_data)
            for language in expected:
                self.assertIn(language, scraper.extracted_data["languages"])

            # Check percentages
            for language, (size, share) in expected.items():
                entry = scraper.extracted_data["languages"][language]
                self.assertEqual(entry["bytes"], size)
                self.assertEqual(entry["percentage"], share)

    def test_extract_languages_empty(self):
        """An empty language report yields an empty 'languages' mapping."""
        settings = dict(repo="test/norepo", name="norepo", github_token=None)

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_languages.return_value = {}

            scraper._extract_languages()

            result = scraper.extracted_data
            self.assertIn("languages", result)
            self.assertEqual(result["languages"], {})
|
|
|
|
|
|
class TestIssuesExtraction(unittest.TestCase):
    """Test GitHub Issues extraction (C1.7)"""

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    @staticmethod
    def _settings(max_issues):
        """Return a fresh scraper config capped at ``max_issues`` issues."""
        return {
            "repo": "facebook/react",
            "name": "react",
            "github_token": None,
            "max_issues": max_issues,
        }

    @staticmethod
    def _label(text):
        """Return a label mock whose ``name`` attribute is ``text``.

        ``name`` is assigned after construction because ``Mock(name=...)``
        names the mock itself instead of creating an attribute.
        """
        label = Mock()
        label.name = text
        return label

    @staticmethod
    def _plain_issue(number, title, url, body):
        """Return an open issue mock without labels, milestone or linked PR."""
        return Mock(
            number=number,
            title=title,
            state="open",
            labels=[],
            milestone=None,
            created_at=datetime(2023, 1, 1),
            updated_at=datetime(2023, 1, 2),
            closed_at=None,
            html_url=url,
            body=body,
            pull_request=None,
        )

    def test_extract_issues_success(self):
        """Two issues are extracted with their labels, state and milestone."""
        settings = self._settings(10)

        # Create mock issues
        open_issue = Mock(
            number=123,
            title="Bug in useState",
            state="open",
            labels=[self._label("bug"), self._label("high-priority")],
            milestone=Mock(title="v18.0"),
            created_at=datetime(2023, 1, 1),
            updated_at=datetime(2023, 1, 2),
            closed_at=None,
            html_url="https://github.com/facebook/react/issues/123",
            body="Issue description",
            pull_request=None,
        )
        closed_issue = Mock(
            number=124,
            title="Feature request",
            state="closed",
            labels=[self._label("enhancement")],
            milestone=None,
            created_at=datetime(2023, 1, 3),
            updated_at=datetime(2023, 1, 4),
            closed_at=datetime(2023, 1, 5),
            html_url="https://github.com/facebook/react/issues/124",
            body="Feature description",
            pull_request=None,
        )

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_issues.return_value = [open_issue, closed_issue]

            scraper._extract_issues()

            self.assertIn("issues", scraper.extracted_data)
            extracted = scraper.extracted_data["issues"]
            self.assertEqual(len(extracted), 2)

            # Check first issue
            first, second = extracted[0], extracted[1]
            self.assertEqual(first["number"], 123)
            self.assertEqual(first["title"], "Bug in useState")
            self.assertEqual(first["state"], "open")
            self.assertEqual(first["labels"], ["bug", "high-priority"])
            self.assertEqual(first["milestone"], "v18.0")

            # Check second issue
            self.assertEqual(second["number"], 124)
            self.assertEqual(second["state"], "closed")
            self.assertIsNone(second["milestone"])

    def test_extract_issues_filters_pull_requests(self):
        """Entries carrying a pull_request are dropped from the issue list."""
        settings = self._settings(10)

        # Create mock issue (need all required attributes)
        real_issue = self._plain_issue(
            123,
            "Real issue",
            "https://github.com/test/repo/issues/123",
            "Issue body",
        )
        # Only the fields below are set; pull_request being non-None marks a PR.
        pull_request = Mock(number=124, title="Pull request", pull_request=Mock())

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_issues.return_value = [real_issue, pull_request]

            scraper._extract_issues()

            extracted = scraper.extracted_data["issues"]
            # Should only have the real issue, not the PR
            self.assertEqual(len(extracted), 1)
            self.assertEqual(extracted[0]["number"], 123)

    def test_extract_issues_respects_max_limit(self):
        """No more than max_issues entries are extracted."""
        settings = self._settings(2)

        # Create 5 mock issues
        five_issues = [
            self._plain_issue(
                idx,
                f"Issue {idx}",
                f"https://github.com/test/repo/issues/{idx}",
                None,
            )
            for idx in range(5)
        ]

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_issues.return_value = five_issues

            scraper._extract_issues()

            # Should only extract first 2 issues
            self.assertEqual(len(scraper.extracted_data["issues"]), 2)
|
|
|
|
|
|
class TestChangelogExtraction(unittest.TestCase):
    """Test CHANGELOG extraction (C1.8)"""

    # Dotted path of the PyGithub client as referenced inside the scraper module.
    GITHUB_TARGET = "skill_seekers.cli.github_scraper.Github"

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    def test_extract_changelog_success(self):
        """A changelog served by the repo is decoded into extracted_data."""
        settings = dict(repo="facebook/react", name="react", github_token=None)
        changelog_blob = Mock(
            decoded_content=b"# Changelog\n\n## v1.0.0\n- Initial release"
        )

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = changelog_blob

            scraper._extract_changelog()

            extracted = scraper.extracted_data
            self.assertIn("changelog", extracted)
            self.assertIn("Initial release", extracted["changelog"])

    def test_extract_changelog_tries_multiple_locations(self):
        """CHANGELOG extraction keeps going when the first candidate paths 404."""
        settings = dict(repo="facebook/react", name="react", github_token=None)
        attempted = []  # every path the scraper asked for, in order

        def fake_get_contents(path):
            attempted.append(path)
            # The first candidates are reported missing; any other path succeeds.
            if path in ("CHANGELOG.md", "CHANGES.md"):
                raise GithubException(404, "Not found")
            return Mock(decoded_content=b"# History")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = fake_get_contents

            scraper._extract_changelog()

            # NOTE(review): this only proves at least one lookup happened.
            self.assertGreaterEqual(len(attempted), 1)

    def test_extract_changelog_not_found(self):
        """With every lookup failing, 'changelog' stays the empty string."""
        settings = dict(repo="test/norepo", name="norepo", github_token=None)
        not_found = GithubException(404, "Not found")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = not_found

            scraper._extract_changelog()

            # No crash expected; the changelog slot keeps its initial empty value.
            self.assertEqual(scraper.extracted_data["changelog"], "")
|
|
|
|
|
|
class TestReleasesExtraction(unittest.TestCase):
    """Test GitHub Releases extraction (C1.9)"""

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    @staticmethod
    def _release(tag, title, notes, prerelease, when):
        """Return a non-draft facebook/react release mock for ``tag``.

        ``when`` is used for both the creation and the publication timestamp,
        and the three URLs are derived from the tag.
        """
        base = "https://github.com/facebook/react"
        return Mock(
            tag_name=tag,
            title=title,
            body=notes,
            draft=False,
            prerelease=prerelease,
            created_at=when,
            published_at=when,
            html_url=f"{base}/releases/tag/{tag}",
            tarball_url=f"{base}/archive/{tag}.tar.gz",
            zipball_url=f"{base}/archive/{tag}.zip",
        )

    def test_extract_releases_success(self):
        """A stable release and a prerelease are both extracted, in order."""
        settings = dict(repo="facebook/react", name="react", github_token=None)

        # Create mock releases
        stable = self._release(
            "v18.0.0",
            "React 18.0.0",
            "New features:\n- Concurrent rendering",
            False,
            datetime(2023, 3, 1),
        )
        candidate = self._release(
            "v18.0.0-rc.0",
            "React 18.0.0 RC",
            "Release candidate",
            True,
            datetime(2023, 2, 1),
        )

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_releases.return_value = [stable, candidate]

            scraper._extract_releases()

            self.assertIn("releases", scraper.extracted_data)
            extracted = scraper.extracted_data["releases"]
            self.assertEqual(len(extracted), 2)

            # Check first release
            first, second = extracted[0], extracted[1]
            self.assertEqual(first["tag_name"], "v18.0.0")
            self.assertEqual(first["name"], "React 18.0.0")
            self.assertFalse(first["draft"])
            self.assertFalse(first["prerelease"])
            self.assertIn("Concurrent rendering", first["body"])

            # Check second release (prerelease)
            self.assertEqual(second["tag_name"], "v18.0.0-rc.0")
            self.assertTrue(second["prerelease"])

    def test_extract_releases_empty(self):
        """A repo without releases yields an empty 'releases' list."""
        settings = dict(repo="test/norepo", name="norepo", github_token=None)

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_releases.return_value = []

            scraper._extract_releases()

            result = scraper.extracted_data
            self.assertIn("releases", result)
            self.assertEqual(result["releases"], [])
|
|
|
|
|
|
class TestGitHubToSkillConverter(unittest.TestCase):
    """Test GitHubToSkillConverter and skill building (C1.10)"""

    def setUp(self):
        """Bind the converter class and write a scraped-data fixture to a temp dir."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubToSkillConverter

        self.GitHubToSkillConverter = GitHubToSkillConverter

        # Create temporary directory for test output
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir)

        # Create mock data file
        self.data_file = self.output_dir / "test_github_data.json"

        repo_info = {
            "name": "react",
            "full_name": "facebook/react",
            "description": "A JavaScript library",
            "stars": 200000,
            "language": "JavaScript",
        }
        languages = {
            "JavaScript": {"bytes": 8000, "percentage": 80.0},
            "TypeScript": {"bytes": 2000, "percentage": 20.0},
        }
        sample_issue = {
            "number": 123,
            "title": "Bug in useState",
            "state": "open",
            "labels": ["bug"],
            "milestone": "v18.0",
            "created_at": "2023-01-01T10:00:00",
            "updated_at": "2023-01-02T10:00:00",
            "closed_at": None,
            "url": "https://github.com/facebook/react/issues/123",
            "body": "Issue description",
        }
        sample_release = {
            "tag_name": "v18.0.0",
            "name": "React 18.0.0",
            "body": "Release notes",
            "published_at": "2023-03-01T10:00:00",
            "prerelease": False,
            "draft": False,
            "url": "https://github.com/facebook/react/releases/tag/v18.0.0",
        }
        self.mock_data = {
            "repo_info": repo_info,
            "readme": "# React\n\nA JavaScript library for building user interfaces.",
            "languages": languages,
            "issues": [sample_issue],
            "changelog": "# Changelog\n\n## v18.0.0\n- New features",
            "releases": [sample_release],
        }

        self.data_file.write_text(json.dumps(self.mock_data))

    def tearDown(self):
        """Remove the temporary directory created by setUp, if it was created."""
        scratch = getattr(self, "temp_dir", None)
        if scratch is not None:
            shutil.rmtree(scratch, ignore_errors=True)

    def test_init_loads_data(self):
        """_load_data reads the JSON fixture that data_file points at."""
        settings = {"repo": "facebook/react", "name": "test", "description": "Test skill"}
        init_target = "skill_seekers.cli.github_scraper.GitHubToSkillConverter.__init__"

        # __init__ is stubbed out so the data file path can be set by hand.
        with patch(init_target) as stub_init:
            stub_init.return_value = None
            converter = self.GitHubToSkillConverter(settings)
            converter.data_file = str(self.data_file)
            converter.data = converter._load_data()

            loaded = converter.data
            self.assertIn("repo_info", loaded)
            self.assertEqual(loaded["repo_info"]["name"], "react")

    def test_build_skill_creates_directory_structure(self):
        """build_skill produces the skill directory, SKILL.md and references/."""
        # Create data file in expected location
        fixture_path = self.output_dir / "test_github_data.json"
        fixture_path.write_text(json.dumps(self.mock_data))

        settings = {"repo": "facebook/react", "name": "test", "description": "Test skill"}
        load_target = "skill_seekers.cli.github_scraper.GitHubToSkillConverter._load_data"

        # Patch the loader and redirect skill_dir so output lands in the temp dir.
        with patch(load_target) as stub_load:
            stub_load.return_value = self.mock_data
            converter = self.GitHubToSkillConverter(settings)
            converter.skill_dir = str(self.output_dir / "test_skill")
            converter.data = self.mock_data

            converter.build_skill()

            built = Path(converter.skill_dir)
            self.assertTrue(built.exists())
            for child in ("SKILL.md", "references"):
                self.assertTrue((built / child).exists())
|
|
|
|
|
|
class TestSymlinkHandling(unittest.TestCase):
    """Test symlink handling (Issue #225)"""

    # Dotted path of the PyGithub client as referenced inside the scraper module.
    GITHUB_TARGET = "skill_seekers.cli.github_scraper.Github"
    # Raw download location used by the large-file tests (Issue #219).
    CCXT_CHANGELOG_URL = "https://raw.githubusercontent.com/ccxt/ccxt/master/CHANGELOG.md"

    def setUp(self):
        """Bind the scraper class, skipping the case when PyGithub is unavailable."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    @staticmethod
    def _file_entry(payload):
        """Return a mock regular file (base64 encoding) holding ``payload`` bytes."""
        entry = Mock()
        entry.type = "file"
        entry.encoding = "base64"
        entry.decoded_content = payload
        return entry

    @staticmethod
    def _symlink_entry(target):
        """Return a mock symlink entry (no encoding) pointing at ``target``."""
        entry = Mock()
        entry.type = "symlink"
        entry.encoding = None
        entry.target = target
        return entry

    @classmethod
    def _large_file_entry(cls):
        """Return a mock 1.4MB file whose encoding is reported as "none"."""
        entry = Mock()
        entry.type = "file"
        entry.encoding = "none"  # Large files have encoding="none"
        entry.size = 1388271  # 1.4MB CHANGELOG
        entry.download_url = cls.CCXT_CHANGELOG_URL
        return entry

    def test_get_file_content_regular_file(self):
        """A regular file is fetched once and returned as decoded text."""
        settings = dict(repo="facebook/react", name="react", github_token=None)

        # Create mock regular file
        regular = self._file_entry(b"# React\n\nA JavaScript library")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = regular

            text = scraper._get_file_content("README.md")

            self.assertEqual(text, "# React\n\nA JavaScript library")
            scraper.repo.get_contents.assert_called_once_with("README.md")

    def test_get_file_content_symlink(self):
        """A symlink is followed and the target file's content is returned."""
        settings = dict(repo="vercel/ai", name="ai", github_token=None)

        link = self._symlink_entry("packages/ai/README.md")
        destination = self._file_entry(b"# AI SDK\n\nReal content from symlink target")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()

            # First call returns symlink, second call returns target
            scraper.repo.get_contents.side_effect = [link, destination]

            text = scraper._get_file_content("README.md")

            self.assertEqual(text, "# AI SDK\n\nReal content from symlink target")
            # Should have called get_contents twice: once for symlink, once for target
            fetch = scraper.repo.get_contents
            self.assertEqual(fetch.call_count, 2)
            fetch.assert_any_call("README.md")
            fetch.assert_any_call("packages/ai/README.md")

    def test_get_file_content_broken_symlink(self):
        """A symlink whose target 404s yields None instead of raising."""
        settings = dict(repo="test/repo", name="test", github_token=None)

        # Create mock symlink with broken target
        dangling = self._symlink_entry("nonexistent/file.md")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()

            # First call returns symlink, second call raises 404
            scraper.repo.get_contents.side_effect = [
                dangling,
                GithubException(404, "Not found"),
            ]

            text = scraper._get_file_content("README.md")

            # Should return None gracefully
            self.assertIsNone(text)

    def test_get_file_content_symlink_no_target(self):
        """A symlink entry whose target is None yields None."""
        settings = dict(repo="test/repo", name="test", github_token=None)

        # Create mock symlink without target
        targetless = self._symlink_entry(None)

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = targetless

            text = scraper._get_file_content("README.md")

            # Should return None gracefully
            self.assertIsNone(text)

    def test_extract_readme_with_symlink(self):
        """README extraction follows a symlinked README.md (integration, Issue #225)."""
        settings = dict(repo="vercel/ai", name="ai", github_token=None)

        link = self._symlink_entry("packages/ai/README.md")
        destination = self._file_entry(b"# AI SDK\n\nThe AI SDK is a TypeScript toolkit")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, destination]

            scraper._extract_readme()

            # Should successfully extract README content
            extracted = scraper.extracted_data
            self.assertIn("readme", extracted)
            self.assertEqual(
                extracted["readme"],
                "# AI SDK\n\nThe AI SDK is a TypeScript toolkit",
            )

    def test_extract_changelog_with_symlink(self):
        """CHANGELOG extraction follows a symlinked CHANGELOG.md."""
        settings = dict(repo="test/repo", name="test", github_token=None)

        link = self._symlink_entry("docs/CHANGELOG.md")
        destination = self._file_entry(b"# Changelog\n\n## v1.0.0\n- Initial release")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, destination]

            scraper._extract_changelog()

            # Should successfully extract CHANGELOG content
            extracted = scraper.extracted_data
            self.assertIn("changelog", extracted)
            self.assertIn("Initial release", extracted["changelog"])

    def test_get_file_content_encoding_error(self):
        """Bytes that are not valid UTF-8 still produce a non-None result."""
        settings = dict(repo="test/repo", name="test", github_token=None)

        # decoded_content that can't be decoded as UTF-8
        undecodable = self._file_entry(b"\xff\xfe Invalid UTF-8")

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = undecodable

            # Should try latin-1 fallback
            text = scraper._get_file_content("README.md")

            # Should not crash (will try latin-1 fallback)
            self.assertIsNotNone(text)

    def test_get_file_content_large_file(self):
        """Files reported with encoding='none' are fetched via download_url (Issue #219)."""
        settings = dict(repo="ccxt/ccxt", name="ccxt", github_token=None)

        # Create mock large file (encoding="none")
        oversized = self._large_file_entry()

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = oversized

            # Mock requests.get
            with patch("requests.get") as fake_get:
                fake_get.return_value = Mock(
                    text="# Changelog\n\n## v1.0.0\n- Initial release",
                    raise_for_status=Mock(),
                )

                text = scraper._get_file_content("CHANGELOG.md")

                # Should download via download_url
                self.assertEqual(text, "# Changelog\n\n## v1.0.0\n- Initial release")
                fake_get.assert_called_once_with(
                    "https://raw.githubusercontent.com/ccxt/ccxt/master/CHANGELOG.md",
                    timeout=30,
                )

    def test_extract_changelog_large_file(self):
        """CHANGELOG extraction handles a large file (integration, Issue #219)."""
        settings = dict(repo="ccxt/ccxt", name="ccxt", github_token=None)

        # Create mock large CHANGELOG
        oversized = self._large_file_entry()

        with patch(self.GITHUB_TARGET):
            scraper = self.GitHubScraper(settings)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = oversized

            # Mock requests.get
            with patch("requests.get") as fake_get:
                fake_get.return_value = Mock(
                    text="# CCXT Changelog\n\n## v4.0.0\n- Major update",
                    raise_for_status=Mock(),
                )

                scraper._extract_changelog()

                # Should successfully extract CHANGELOG content
                extracted = scraper.extracted_data
                self.assertIn("changelog", extracted)
                self.assertIn("Major update", extracted["changelog"])
|
|
|
|
|
|
class TestGitignoreSupport(unittest.TestCase):
    """Test .gitignore support in github_scraper (C2.1)

    Each test points the scraper at a fresh temporary directory through the
    ``local_repo_path`` config key and patches the ``Github`` client so that
    constructing the scraper does not use the real client class.
    """

    def setUp(self):
        """Skip without PyGithub; otherwise create a temporary repo directory."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

        self.temp_dir = tempfile.mkdtemp()
        self.repo_path = Path(self.temp_dir)

    def tearDown(self):
        """Remove the temporary repo directory created in setUp."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_load_gitignore_exists(self):
        """Test loading existing .gitignore file.

        The expected outcome depends on whether the optional ``pathspec``
        package can be imported: a spec object when it can, ``None`` when it
        cannot.
        """
        # Local import keeps this block self-contained.
        import importlib.util

        # Create .gitignore
        gitignore_path = self.repo_path / ".gitignore"
        gitignore_path.write_text("*.log\ntemp/\n__pycache__/")

        config = {"repo": "test/repo", "local_repo_path": str(self.repo_path)}

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(config)

            # Decide on pathspec availability itself. Checking for the
            # attribute on the scraper says nothing about pathspec, and the
            # old fallback branch read the very attribute it had just found
            # to be missing.
            pathspec_installed = importlib.util.find_spec("pathspec") is not None
            loaded_spec = getattr(scraper, "gitignore_spec", None)

            if pathspec_installed:
                # pathspec is installed
                self.assertIsNotNone(loaded_spec)
            else:
                # pathspec not installed
                self.assertIsNone(loaded_spec)

    def test_load_gitignore_missing(self):
        """Test behavior when no .gitignore exists."""
        config = {"repo": "test/repo", "local_repo_path": str(self.repo_path)}

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(config)

            # Should be None when no .gitignore found
            self.assertIsNone(scraper.gitignore_spec)

    def test_should_exclude_dir_with_gitignore(self):
        """Test directory exclusion with .gitignore rules."""
        # Create .gitignore
        gitignore_path = self.repo_path / ".gitignore"
        gitignore_path.write_text("temp/\nbuild/\n*.egg-info")

        config = {"repo": "test/repo", "local_repo_path": str(self.repo_path)}

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(config)

            # Test .gitignore exclusion (only when a spec was actually loaded)
            if scraper.gitignore_spec:
                self.assertTrue(scraper.should_exclude_dir("temp", "temp"))
                self.assertTrue(scraper.should_exclude_dir("build", "build"))

                # Non-excluded dir should pass
                self.assertFalse(scraper.should_exclude_dir("src", "src"))

    def test_should_exclude_dir_default_exclusions(self):
        """Test that default exclusions still work."""
        config = {"repo": "test/repo", "local_repo_path": str(self.repo_path)}

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(config)

            # Default exclusions should still work
            for dir_name in ("node_modules", "venv", "__pycache__"):
                with self.subTest(excluded=dir_name):
                    self.assertTrue(scraper.should_exclude_dir(dir_name))

            # Normal directories should not be excluded
            for dir_name in ("src", "tests"):
                with self.subTest(kept=dir_name):
                    self.assertFalse(scraper.should_exclude_dir(dir_name))
|
|
|
|
|
|
class TestErrorHandling(unittest.TestCase):
    """Error-path and edge-case tests for GitHubScraper."""

    def setUp(self):
        """Skip without PyGithub; otherwise import the scraper class lazily."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper

        self.GitHubScraper = GitHubScraper

    def test_invalid_repo_name(self):
        """A 404 from ``get_repo`` must surface as a ValueError.

        The error message is expected to contain "Repository not found".
        """
        scraper_config = dict(repo="invalid_repo_format", name="test", github_token=None)
        not_found = GithubException(404, "Not found")

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(scraper_config)
            scraper.repo = None
            scraper.github.get_repo = Mock(side_effect=not_found)

            # The lookup failure should be translated into a ValueError.
            with self.assertRaises(ValueError) as raised:
                scraper._fetch_repository()

            self.assertIn("Repository not found", str(raised.exception))

    def test_rate_limit_error(self):
        """A 403 raised by ``get_issues`` must not escape ``_extract_issues``.

        There is no explicit assertion: the test passes when the call
        returns without raising.
        """
        scraper_config = dict(
            repo="facebook/react",
            name="react",
            github_token=None,
            max_issues=10,
        )
        rate_limited = GithubException(403, "Rate limit exceeded")

        with patch("skill_seekers.cli.github_scraper.Github"):
            scraper = self.GitHubScraper(scraper_config)
            scraper.repo = Mock()
            scraper.repo.get_issues.side_effect = rate_limited

            # Expected to be handled gracefully (warning logged, no crash).
            scraper._extract_issues()
|
|
|
|
|
|
# Run the unittest command-line runner when this module is executed directly.
if __name__ == "__main__":
    unittest.main()
|