fix: Handle symlinked README.md and CHANGELOG.md in GitHub scraper

- Add _get_file_content() helper method to detect and follow symlinks
- Update _extract_readme() to use new helper
- Update _extract_changelog() to use new helper
- Add 7 comprehensive tests for symlink handling
- All 29 GitHub scraper tests passing

Fixes #225

When README.md or CHANGELOG.md are symlinks (like in vercel/ai repo),
PyGithub returns ContentFile with type='symlink' and encoding=None.
Accessing decoded_content directly raises AssertionError.

Solution: Detect symlink type, follow target path, then decode actual file.
Handles edge cases: broken symlinks, missing targets, encoding errors.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-01-01 20:41:28 +03:00
parent 8a111eb526
commit 58286f454a
2 changed files with 283 additions and 45 deletions

View File

@@ -325,6 +325,58 @@ class GitHubScraper:
raise ValueError(f"Repository not found: {self.repo_name}")
raise
def _get_file_content(self, file_path: str) -> Optional[str]:
    """
    Safely get file content, handling symlinks and encoding issues.

    Symlinked files (e.g. README.md in vercel/ai) come back from PyGithub
    with type='symlink' and encoding=None; reading decoded_content on such
    an entry raises AssertionError.  Instead of touching decoded_content,
    we follow the symlink's target — including chains of symlinks, up to a
    small depth limit so a cycle cannot loop forever.

    Args:
        file_path: Path to file in repository

    Returns:
        File content as string, or None if file not found/error
    """
    try:
        content = self.repo.get_contents(file_path)
        if not content:
            return None
        # get_contents() returns a list when the path is a directory;
        # a directory has no decodable content.
        if isinstance(content, list):
            logger.warning(f"Path {file_path} is a directory, not a file")
            return None

        # Follow symlinks to the real file.  Bounded so that a symlink
        # cycle (a -> b -> a) terminates instead of hanging.
        max_symlink_depth = 5
        for _ in range(max_symlink_depth):
            if getattr(content, 'type', None) != 'symlink':
                break  # reached a regular file
            target = getattr(content, 'target', None)
            target = target.strip() if target else None
            if not target:
                logger.warning(f"Symlink {file_path} has no target")
                return None
            logger.debug(f"File {file_path} is a symlink to {target}, following...")
            try:
                content = self.repo.get_contents(target)
            except GithubException as e:
                logger.warning(f"Failed to follow symlink {file_path} -> {target}: {e}")
                return None
        else:
            # Still a symlink after max_symlink_depth hops: give up.
            logger.warning(f"Too many symlink levels for {file_path}")
            return None

        # Handle regular files - decode content
        try:
            raw = content.decoded_content
            if isinstance(raw, bytes):
                return raw.decode('utf-8')
            return str(raw)
        except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e:
            logger.warning(f"Encoding issue with {file_path}: {e}")
            # Fallback: latin-1 maps every byte value, so this only fails
            # if decoded_content itself is inaccessible.
            try:
                raw = content.decoded_content
                if isinstance(raw, bytes):
                    return raw.decode('latin-1')
            except Exception:
                return None
            return None
    except GithubException:
        # 404 and other API errors: callers treat None as "not found".
        return None
    except Exception as e:
        logger.warning(f"Error reading {file_path}: {e}")
        return None
# NOTE(review): this span is a rendered diff hunk — it interleaves the
# removed (old) and added (new) sides without +/- markers, so it is not
# runnable as-is.  The side annotations below are inferred from the commit
# message; confirm against the raw patch.
def _extract_readme(self):
"""C1.2: Extract README.md files."""
logger.info("Extracting README...")
@@ -334,24 +386,21 @@ class GitHubScraper:
'docs/README.md', '.github/README.md']
for readme_path in readme_files:
try:
# old side (removed): direct get_contents + utf-8 decode, which
# breaks on symlinked READMEs (encoding=None -> AssertionError)
content = self.repo.get_contents(readme_path)
if content:
self.extracted_data['readme'] = content.decoded_content.decode('utf-8')
logger.info(f"README found: {readme_path}")
# new side (added): route through the symlink-aware helper
readme_content = self._get_file_content(readme_path)
if readme_content:
self.extracted_data['readme'] = readme_content
logger.info(f"README found: {readme_path}")
# Update description if not explicitly set in config
if 'description' not in self.config:
smart_description = extract_description_from_readme(
self.extracted_data['readme'],
self.repo_name
)
self.description = smart_description
logger.debug(f"Generated description: {self.description}")
# NOTE(review): the block below duplicates the block above — one copy
# per diff side (only its indentation changed in the commit).
# Update description if not explicitly set in config
if 'description' not in self.config:
smart_description = extract_description_from_readme(
self.extracted_data['readme'],
self.repo_name
)
self.description = smart_description
logger.debug(f"Generated description: {self.description}")
return
except GithubException:
# Path absent in the repo: try the next candidate location.
continue
return
logger.warning("No README found in repository")
# NOTE(review): rendered diff hunk for _extract_changelog — old and new
# sides are interleaved without +/- markers; side annotations inferred.
@@ -666,35 +715,11 @@ class GitHubScraper:
'docs/CHANGELOG.md', '.github/CHANGELOG.md']
for changelog_path in changelog_files:
try:
# old side (removed): inline decode with utf-8 -> latin-1 fallback,
# now superseded by the shared _get_file_content() helper
content = self.repo.get_contents(changelog_path)
if content:
# decoded_content is already bytes, decode to string
# Handle potential encoding issues gracefully
try:
if isinstance(content.decoded_content, bytes):
changelog_text = content.decoded_content.decode('utf-8')
else:
# Already a string
changelog_text = str(content.decoded_content)
except (UnicodeDecodeError, AttributeError, LookupError) as e:
# Try alternative encodings or skip this file
logger.warning(f"Encoding issue with {changelog_path}: {e}, trying latin-1")
try:
changelog_text = content.decoded_content.decode('latin-1')
except Exception:
logger.warning(f"Could not decode {changelog_path}, skipping")
continue
self.extracted_data['changelog'] = changelog_text
logger.info(f"CHANGELOG found: {changelog_path}")
return
except GithubException:
continue
except Exception as e:
# Catch any other errors (like "unsupported encoding: none")
logger.warning(f"Error reading {changelog_path}: {e}")
continue
# new side (added): symlink-aware helper does the decoding
changelog_content = self._get_file_content(changelog_path)
if changelog_content:
self.extracted_data['changelog'] = changelog_content
logger.info(f"CHANGELOG found: {changelog_path}")
return
logger.warning("No CHANGELOG found in repository")

View File

@@ -680,6 +680,219 @@ class TestGitHubToSkillConverter(unittest.TestCase):
self.assertTrue((skill_dir / 'references').exists())
class TestSymlinkHandling(unittest.TestCase):
    """Test symlink handling (Issue #225)"""

    def setUp(self):
        # The scraper needs PyGithub; skip every test here when it is absent.
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import GitHubScraper
        self.GitHubScraper = GitHubScraper

    # -- small factories for the repeated ContentFile mocks ----------------

    @staticmethod
    def _file_mock(payload):
        """Mock a regular base64-encoded ContentFile holding *payload* bytes."""
        entry = Mock()
        entry.type = 'file'
        entry.encoding = 'base64'
        entry.decoded_content = payload
        return entry

    @staticmethod
    def _symlink_mock(target):
        """Mock a symlink ContentFile (encoding=None) pointing at *target*."""
        entry = Mock()
        entry.type = 'symlink'
        entry.encoding = None
        entry.target = target
        return entry

    # -- _get_file_content unit tests --------------------------------------

    def test_get_file_content_regular_file(self):
        """Test _get_file_content with regular file"""
        config = {
            'repo': 'facebook/react',
            'name': 'react',
            'github_token': None
        }
        regular = self._file_mock(b'# React\n\nA JavaScript library')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = regular
            self.assertEqual(
                scraper._get_file_content('README.md'),
                '# React\n\nA JavaScript library'
            )
            scraper.repo.get_contents.assert_called_once_with('README.md')

    def test_get_file_content_symlink(self):
        """Test _get_file_content with symlink file"""
        config = {
            'repo': 'vercel/ai',
            'name': 'ai',
            'github_token': None
        }
        link = self._symlink_mock('packages/ai/README.md')
        real = self._file_mock(b'# AI SDK\n\nReal content from symlink target')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            # get_contents yields the symlink first, then its target file.
            scraper.repo.get_contents.side_effect = [link, real]
            self.assertEqual(
                scraper._get_file_content('README.md'),
                '# AI SDK\n\nReal content from symlink target'
            )
            # Exactly two lookups: the symlink itself, then the resolved target.
            self.assertEqual(scraper.repo.get_contents.call_count, 2)
            scraper.repo.get_contents.assert_any_call('README.md')
            scraper.repo.get_contents.assert_any_call('packages/ai/README.md')

    def test_get_file_content_broken_symlink(self):
        """Test _get_file_content with broken symlink"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock('nonexistent/file.md')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            # Symlink resolves, but the target lookup 404s.
            scraper.repo.get_contents.side_effect = [
                link,
                GithubException(404, 'Not found')
            ]
            # Must degrade to None instead of raising.
            self.assertIsNone(scraper._get_file_content('README.md'))

    def test_get_file_content_symlink_no_target(self):
        """Test _get_file_content with symlink that has no target attribute"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock(None)
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = link
            # A target-less symlink cannot be followed; expect None.
            self.assertIsNone(scraper._get_file_content('README.md'))

    # -- integration through the extraction entry points -------------------

    def test_extract_readme_with_symlink(self):
        """Test README extraction with symlinked README.md (Integration test for Issue #225)"""
        config = {
            'repo': 'vercel/ai',
            'name': 'ai',
            'github_token': None
        }
        link = self._symlink_mock('packages/ai/README.md')
        real = self._file_mock(b'# AI SDK\n\nThe AI SDK is a TypeScript toolkit')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, real]
            scraper._extract_readme()
            # The symlink target's content lands in extracted_data['readme'].
            self.assertIn('readme', scraper.extracted_data)
            self.assertEqual(
                scraper.extracted_data['readme'],
                '# AI SDK\n\nThe AI SDK is a TypeScript toolkit'
            )

    def test_extract_changelog_with_symlink(self):
        """Test CHANGELOG extraction with symlinked CHANGELOG.md"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        link = self._symlink_mock('docs/CHANGELOG.md')
        real = self._file_mock(b'# Changelog\n\n## v1.0.0\n- Initial release')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.side_effect = [link, real]
            scraper._extract_changelog()
            self.assertIn('changelog', scraper.extracted_data)
            self.assertIn('Initial release', scraper.extracted_data['changelog'])

    def test_get_file_content_encoding_error(self):
        """Test _get_file_content handles encoding errors gracefully"""
        config = {
            'repo': 'test/repo',
            'name': 'test',
            'github_token': None
        }
        # Bytes that are not valid UTF-8, forcing the fallback path.
        undecodable = self._file_mock(b'\xff\xfe Invalid UTF-8')
        with patch('skill_seekers.cli.github_scraper.Github'):
            scraper = self.GitHubScraper(config)
            scraper.repo = Mock()
            scraper.repo.get_contents.return_value = undecodable
            # utf-8 fails; the latin-1 fallback should still yield a string.
            self.assertIsNotNone(scraper._get_file_content('README.md'))
class TestErrorHandling(unittest.TestCase):
"""Test error handling and edge cases"""