From d71c1d3aa37dd17de3664132f3f4cab065a88829 Mon Sep 17 00:00:00 2001 From: yusyus Date: Thu, 26 Mar 2026 23:44:52 +0300 Subject: [PATCH] fix: filter non-integer metadata from GitHub languages API response (#322) PyGithub's get_languages() returns raw API JSON which in some environments includes metadata keys with non-integer values (e.g., "url"), causing a TypeError in sum(). Now filters to integer values only before calculating percentages. Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 3 +++ CLAUDE.md | 2 +- src/skill_seekers/cli/github_scraper.py | 7 +++++++ tests/test_github_scraper.py | 25 +++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6d5fe3..e8fd7ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- **GitHub language detection crashes with `TypeError`** when API response contains metadata keys with non-integer values (e.g., `"url"`) — now filters to integer values only (#322) + ## [3.4.0] - 2026-03-21 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index b0c1cdf..ee62935 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co **Skill Seekers** converts documentation from 17 source types into production-ready formats for 24+ AI platforms (LLM platforms, RAG frameworks, vector databases, AI coding assistants). Published on PyPI as `skill-seekers`. -**Version:** 3.3.0 | **Python:** 3.10+ | **Website:** https://skillseekersweb.com/ +**Version:** 3.4.0 | **Python:** 3.10+ | **Website:** https://skillseekersweb.com/ **Architecture:** See `docs/UML_ARCHITECTURE.md` for UML diagrams and module overview. StarUML project at `docs/UML/skill_seekers.mdj`. 
diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index 4e8b4d9..e027930 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -519,6 +519,13 @@ class GitHubScraper: try: languages = self.repo.get_languages() + # Filter out non-integer metadata (e.g., "url" key from some API configurations) + non_lang_keys = {k for k, v in languages.items() if not isinstance(v, int)} + if non_lang_keys: + logger.debug( + f"Filtered non-language keys from API response: {non_lang_keys}" + ) + languages = {k: v for k, v in languages.items() if isinstance(v, int)} total_bytes = sum(languages.values()) self.extracted_data["languages"] = { diff --git a/tests/test_github_scraper.py b/tests/test_github_scraper.py index 9909233..0341364 100644 --- a/tests/test_github_scraper.py +++ b/tests/test_github_scraper.py @@ -230,6 +230,31 @@ class TestLanguageDetection(unittest.TestCase): self.assertIn("languages", scraper.extracted_data) self.assertEqual(scraper.extracted_data["languages"], {}) + def test_extract_languages_filters_non_integer_metadata(self): + """Test that non-integer metadata keys (e.g., 'url') are filtered out (#322)""" + config = {"repo": "xyflow/xyflow", "name": "xyflow", "github_token": None} + + with patch("skill_seekers.cli.github_scraper.Github"): + scraper = self.GitHubScraper(config) + scraper.repo = Mock() + scraper.repo.get_languages.return_value = { + "TypeScript": 707330, + "Svelte": 95784, + "url": "https://api.github.com/repos/xyflow/xyflow/languages", + } + + scraper._extract_languages() + + self.assertIn("languages", scraper.extracted_data) + self.assertIn("TypeScript", scraper.extracted_data["languages"]) + self.assertIn("Svelte", scraper.extracted_data["languages"]) + self.assertNotIn("url", scraper.extracted_data["languages"]) + + # Percentages should be calculated only from real languages + ts_data = scraper.extracted_data["languages"]["TypeScript"] + total = 707330 + 
95784 + self.assertEqual(ts_data["percentage"], round(707330 / total * 100, 2)) + class TestIssuesExtraction(unittest.TestCase): """Test GitHub Issues extraction (C1.7)"""