fix: Handle symlinked README.md and CHANGELOG.md in GitHub scraper

- Add _get_file_content() helper method to detect and follow symlinks
- Update _extract_readme() to use new helper
- Update _extract_changelog() to use new helper
- Add 7 comprehensive tests for symlink handling
- All 29 GitHub scraper tests passing

Fixes #225

When README.md or CHANGELOG.md are symlinks (like in vercel/ai repo),
PyGithub returns ContentFile with type='symlink' and encoding=None.
Accessing decoded_content directly raises AssertionError.

Solution: Detect symlink type, follow target path, then decode actual file.
Handles edge cases: broken symlinks, missing targets, encoding errors.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-01-01 20:41:28 +03:00
parent 8a111eb526
commit 58286f454a
2 changed files with 283 additions and 45 deletions

View File

@@ -325,6 +325,58 @@ class GitHubScraper:
raise ValueError(f"Repository not found: {self.repo_name}")
raise
def _get_file_content(self, file_path: str) -> Optional[str]:
    """
    Safely get file content, handling symlinks and encoding issues.

    Symlinked files (e.g. README.md in vercel/ai) come back from PyGithub
    with type='symlink' and encoding=None; reading decoded_content on such
    an entry raises AssertionError.  Instead of touching decoded_content,
    we follow the symlink's target — including chains of symlinks, up to a
    small depth limit so a cycle cannot loop forever.

    Args:
        file_path: Path to file in repository

    Returns:
        File content as string, or None if file not found/error
    """
    try:
        content = self.repo.get_contents(file_path)
        if not content:
            return None
        # get_contents() returns a list when the path is a directory;
        # a directory has no decodable content.
        if isinstance(content, list):
            logger.warning(f"Path {file_path} is a directory, not a file")
            return None

        # Follow symlinks to the real file.  Bounded so that a symlink
        # cycle (a -> b -> a) terminates instead of hanging.
        max_symlink_depth = 5
        for _ in range(max_symlink_depth):
            if getattr(content, 'type', None) != 'symlink':
                break  # reached a regular file
            target = getattr(content, 'target', None)
            target = target.strip() if target else None
            if not target:
                logger.warning(f"Symlink {file_path} has no target")
                return None
            logger.debug(f"File {file_path} is a symlink to {target}, following...")
            try:
                content = self.repo.get_contents(target)
            except GithubException as e:
                logger.warning(f"Failed to follow symlink {file_path} -> {target}: {e}")
                return None
        else:
            # Still a symlink after max_symlink_depth hops: give up.
            logger.warning(f"Too many symlink levels for {file_path}")
            return None

        # Handle regular files - decode content
        try:
            raw = content.decoded_content
            if isinstance(raw, bytes):
                return raw.decode('utf-8')
            return str(raw)
        except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e:
            logger.warning(f"Encoding issue with {file_path}: {e}")
            # Fallback: latin-1 maps every byte value, so this only fails
            # if decoded_content itself is inaccessible.
            try:
                raw = content.decoded_content
                if isinstance(raw, bytes):
                    return raw.decode('latin-1')
            except Exception:
                return None
            return None
    except GithubException:
        # 404 and other API errors: callers treat None as "not found".
        return None
    except Exception as e:
        logger.warning(f"Error reading {file_path}: {e}")
        return None
# NOTE(review): this span is a rendered diff hunk — it interleaves the
# removed (old) and added (new) sides without +/- markers, so it is not
# runnable as-is.  The side annotations below are inferred from the commit
# message; confirm against the raw patch.
def _extract_readme(self):
"""C1.2: Extract README.md files."""
logger.info("Extracting README...")
@@ -334,24 +386,21 @@ class GitHubScraper:
'docs/README.md', '.github/README.md']
for readme_path in readme_files:
try:
# old side (removed): direct get_contents + utf-8 decode, which
# breaks on symlinked READMEs (encoding=None -> AssertionError)
content = self.repo.get_contents(readme_path)
if content:
self.extracted_data['readme'] = content.decoded_content.decode('utf-8')
logger.info(f"README found: {readme_path}")
# new side (added): route through the symlink-aware helper
readme_content = self._get_file_content(readme_path)
if readme_content:
self.extracted_data['readme'] = readme_content
logger.info(f"README found: {readme_path}")
# Update description if not explicitly set in config
if 'description' not in self.config:
smart_description = extract_description_from_readme(
self.extracted_data['readme'],
self.repo_name
)
self.description = smart_description
logger.debug(f"Generated description: {self.description}")
# NOTE(review): the block below duplicates the block above — one copy
# per diff side (only its indentation changed in the commit).
# Update description if not explicitly set in config
if 'description' not in self.config:
smart_description = extract_description_from_readme(
self.extracted_data['readme'],
self.repo_name
)
self.description = smart_description
logger.debug(f"Generated description: {self.description}")
return
except GithubException:
# Path absent in the repo: try the next candidate location.
continue
return
logger.warning("No README found in repository")
# NOTE(review): rendered diff hunk for _extract_changelog — old and new
# sides are interleaved without +/- markers; side annotations inferred.
@@ -666,35 +715,11 @@ class GitHubScraper:
'docs/CHANGELOG.md', '.github/CHANGELOG.md']
for changelog_path in changelog_files:
try:
# old side (removed): inline decode with utf-8 -> latin-1 fallback,
# now superseded by the shared _get_file_content() helper
content = self.repo.get_contents(changelog_path)
if content:
# decoded_content is already bytes, decode to string
# Handle potential encoding issues gracefully
try:
if isinstance(content.decoded_content, bytes):
changelog_text = content.decoded_content.decode('utf-8')
else:
# Already a string
changelog_text = str(content.decoded_content)
except (UnicodeDecodeError, AttributeError, LookupError) as e:
# Try alternative encodings or skip this file
logger.warning(f"Encoding issue with {changelog_path}: {e}, trying latin-1")
try:
changelog_text = content.decoded_content.decode('latin-1')
except Exception:
logger.warning(f"Could not decode {changelog_path}, skipping")
continue
self.extracted_data['changelog'] = changelog_text
logger.info(f"CHANGELOG found: {changelog_path}")
return
except GithubException:
continue
except Exception as e:
# Catch any other errors (like "unsupported encoding: none")
logger.warning(f"Error reading {changelog_path}: {e}")
continue
# new side (added): symlink-aware helper does the decoding
changelog_content = self._get_file_content(changelog_path)
if changelog_content:
self.extracted_data['changelog'] = changelog_content
logger.info(f"CHANGELOG found: {changelog_path}")
return
logger.warning("No CHANGELOG found in repository")

View File

@@ -680,6 +680,219 @@ class TestGitHubToSkillConverter(unittest.TestCase):
self.assertTrue((skill_dir / 'references').exists())
class TestSymlinkHandling(unittest.TestCase):
    """Test symlink handling (Issue #225)"""

    def setUp(self):
        # The scraper needs PyGithub; skip every test here when it is absent.
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper
        self.GitHubScraper = GitHubScraper

    # -- small factories for the repeated ContentFile mocks ----------------

    @staticmethod
    def _file_mock(payload):
        """Mock a regular base64-encoded ContentFile holding *payload* bytes."""
        entry = Mock()
        entry.type = 'file'
        entry.encoding = 'base64'
        entry.decoded_content = payload
        return entry

    @staticmethod
    def _symlink_mock(target):
        """Mock a symlink ContentFile (encoding=None) pointing at *target*."""
        entry = Mock()
        entry.type = 'symlink'
        entry.encoding = None
        entry.target = target
        return entry

    # -- _get_file_content unit tests --------------------------------------

    def test_get_file_content_regular_file(self):
        """Test _get_file_content with regular file"""
        config = {
            'repo': 'facebook/react',
            'name': 'react',
            'github_token': None
        }
        regular = self._file_mock(b'# React\n\nA JavaScript library')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = regular
            self.assertEqual(
                scraper._get_file_content('README.md'),
                '# React\n\nA JavaScript library'
            )
            scraper.repo.get_contents.assert_called_once_with('README.md')

    def test_get_file_content_symlink(self):
        """Test _get_file_content with symlink file"""
        config = {
            'repo': 'vercel/ai',
            'name': 'ai',
            'github_token': None
        }
        link = self._symlink_mock('packages/ai/README.md')
        real = self._file_mock(b'# AI SDK\n\nReal content from symlink target')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            # get_contents yields the symlink first, then its target file.
            scraper.repo.get_contents.side_effect = [link, real]
            self.assertEqual(
                scraper._get_file_content('README.md'),
                '# AI SDK\n\nReal content from symlink target'
            )
            # Exactly two lookups: the symlink itself, then the resolved target.
            self.assertEqual(scraper.repo.get_contents.call_count, 2)
            scraper.repo.get_contents.assert_any_call('README.md')
            scraper.repo.get_contents.assert_any_call('packages/ai/README.md')

    def test_get_file_content_broken_symlink(self):
        """Test _get_file_content with broken symlink"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock('nonexistent/file.md')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            # Symlink resolves, but the target lookup 404s.
            scraper.repo.get_contents.side_effect = [
                link,
                GithubException(404, 'Not found')
            ]
            # Must degrade to None instead of raising.
            self.assertIsNone(scraper._get_file_content('README.md'))

    def test_get_file_content_symlink_no_target(self):
        """Test _get_file_content with symlink that has no target attribute"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock(None)
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = link
            # A target-less symlink cannot be followed; expect None.
            self.assertIsNone(scraper._get_file_content('README.md'))

    # -- integration through the extraction entry points -------------------

    def test_extract_readme_with_symlink(self):
        """Test README extraction with symlinked README.md (Integration test for Issue #225)"""
        config = {
            'repo': 'vercel/ai',
            'name': 'ai',
            'github_token': None
        }
        link = self._symlink_mock('packages/ai/README.md')
        real = self._file_mock(b'# AI SDK\n\nThe AI SDK is a TypeScript toolkit')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, real]
            scraper._extract_readme()
            # The symlink target's content lands in extracted_data['readme'].
            self.assertIn('readme', scraper.extracted_data)
            self.assertEqual(
                scraper.extracted_data['readme'],
                '# AI SDK\n\nThe AI SDK is a TypeScript toolkit'
            )

    def test_extract_changelog_with_symlink(self):
        """Test CHANGELOG extraction with symlinked CHANGELOG.md"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock('docs/CHANGELOG.md')
        real = self._file_mock(b'# Changelog\n\n## v1.0.0\n- Initial release')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, real]
            scraper._extract_changelog()
            self.assertIn('changelog', scraper.extracted_data)
            self.assertIn('Initial release', scraper.extracted_data['changelog'])

    def test_get_file_content_encoding_error(self):
        """Test _get_file_content handles encoding errors gracefully"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        # Bytes that are not valid UTF-8, forcing the fallback path.
        undecodable = self._file_mock(b'\xff\xfe Invalid UTF-8')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = undecodable
            # utf-8 fails; the latin-1 fallback should still yield a string.
            self.assertIsNotNone(scraper._get_file_content('README.md'))
class TestErrorHandling(unittest.TestCase):
"""Test error handling and edge cases"""