fix: filter non-integer metadata from GitHub languages API response (#322)
PyGithub's get_languages() returns the raw API JSON, which in some environments includes metadata entries whose values are not integers (e.g., a "url" key); passing those values to sum() raised a TypeError. The scraper now keeps only integer-valued entries before calculating percentages. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
- **GitHub language detection crashes with `TypeError`** when the API response contains metadata entries with non-integer values (e.g., a `"url"` key) — the scraper now keeps only integer-valued entries (#322)
|
||||
|
||||
## [3.4.0] - 2026-03-21
|
||||
|
||||
### Added
|
||||
|
||||
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
||||
|
||||
**Skill Seekers** converts documentation from 17 source types into production-ready formats for 24+ AI platforms (LLM platforms, RAG frameworks, vector databases, AI coding assistants). Published on PyPI as `skill-seekers`.
|
||||
|
||||
**Version:** 3.3.0 | **Python:** 3.10+ | **Website:** https://skillseekersweb.com/
|
||||
**Version:** 3.4.0 | **Python:** 3.10+ | **Website:** https://skillseekersweb.com/
|
||||
|
||||
**Architecture:** See `docs/UML_ARCHITECTURE.md` for UML diagrams and module overview. StarUML project at `docs/UML/skill_seekers.mdj`.
|
||||
|
||||
|
||||
@@ -519,6 +519,13 @@ class GitHubScraper:
|
||||
|
||||
try:
|
||||
languages = self.repo.get_languages()
|
||||
# Filter out non-integer metadata (e.g., "url" key from some API configurations)
|
||||
non_lang_keys = {k for k, v in languages.items() if not isinstance(v, int)}
|
||||
if non_lang_keys:
|
||||
logger.debug(
|
||||
f"Filtered non-language keys from API response: {non_lang_keys}"
|
||||
)
|
||||
languages = {k: v for k, v in languages.items() if isinstance(v, int)}
|
||||
total_bytes = sum(languages.values())
|
||||
|
||||
self.extracted_data["languages"] = {
|
||||
|
||||
@@ -230,6 +230,31 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
self.assertIn("languages", scraper.extracted_data)
|
||||
self.assertEqual(scraper.extracted_data["languages"], {})
|
||||
|
||||
def test_extract_languages_filters_non_integer_metadata(self):
    """Entries with non-integer values (e.g. 'url') are dropped from the language stats (#322)."""
    typescript_bytes = 707330
    svelte_bytes = 95784
    config = {"repo": "xyflow/xyflow", "name": "xyflow", "github_token": None}

    with patch("skill_seekers.cli.github_scraper.Github"):
        scraper = self.GitHubScraper(config)
        scraper.repo = Mock()
        # Two real language byte counts plus one string-valued metadata entry.
        scraper.repo.get_languages.return_value = {
            "TypeScript": typescript_bytes,
            "Svelte": svelte_bytes,
            "url": "https://api.github.com/repos/xyflow/xyflow/languages",
        }

        scraper._extract_languages()

        self.assertIn("languages", scraper.extracted_data)
        self.assertIn("TypeScript", scraper.extracted_data["languages"])
        self.assertIn("Svelte", scraper.extracted_data["languages"])
        self.assertNotIn("url", scraper.extracted_data["languages"])

        # The expected percentage uses only the two integer byte counts as the total.
        expected_total = typescript_bytes + svelte_bytes
        typescript_entry = scraper.extracted_data["languages"]["TypeScript"]
        self.assertEqual(
            typescript_entry["percentage"],
            round(typescript_bytes / expected_total * 100, 2),
        )
|
||||
|
||||
|
||||
class TestIssuesExtraction(unittest.TestCase):
|
||||
"""Test GitHub Issues extraction (C1.7)"""
|
||||
|
||||
Reference in New Issue
Block a user