feat: Add unified language detector for code analysis

- Created LanguageDetector class supporting 20+ programming languages
- Confidence-based detection with customizable thresholds (min_confidence parameter)
- Replaces duplicate language detection code in doc_scraper.py and pdf_extractor_poc.py
- Comprehensive test suite with 100+ test cases

Changes:
- NEW: src/skill_seekers/cli/language_detector.py (17 KB)
  - Unified detector with pattern matching for 20+ languages
  - Confidence scoring (0.0-1.0 scale)
  - Supports: Python, JavaScript, TypeScript, Java, C++, C#, Go, Rust, PHP, Ruby, Swift, Kotlin, Shell, SQL, HTML, CSS, JSON, YAML, XML, and more

- NEW: tests/test_language_detector.py (20 KB)
  - 100+ test cases covering all supported languages
  - Edge case testing (mixed code, low confidence, etc.)

- MODIFIED: src/skill_seekers/cli/doc_scraper.py
  - Removed 80+ lines of duplicate detection code
  - Now uses shared LanguageDetector instance

- MODIFIED: src/skill_seekers/cli/pdf_extractor_poc.py
  - Removed 130+ lines of duplicate detection code
  - Now uses shared LanguageDetector instance

- MODIFIED: tests/test_pdf_extractor.py
  - Fixed imports to use proper package paths
  - Added manual detector initialization in test setup

Benefits:
- DRY: Single source of truth for language detection
- Maintainability: Add new languages in one place
- Consistency: Same detection logic across all scrapers
- Testability: Comprehensive test coverage
- Extensibility: Easy to add new languages or improve patterns

Addresses technical debt from having duplicate detection logic in multiple files.
This commit is contained in:
yusyus
2025-12-21 22:53:05 +03:00
parent 8eb8cd2940
commit 785fff087e
5 changed files with 1310 additions and 211 deletions

View File

@@ -32,6 +32,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.language_detector import LanguageDetector
from skill_seekers.cli.constants import (
DEFAULT_RATE_LIMIT,
DEFAULT_MAX_PAGES,
@@ -111,6 +112,9 @@ class DocToSkillConverter:
self.pages: List[Dict[str, Any]] = []
self.pages_scraped = 0
# Language detection
self.language_detector = LanguageDetector(min_confidence=0.15)
# Thread-safe lock for parallel scraping
if self.workers > 1:
import threading
@@ -278,81 +282,18 @@ class DocToSkillConverter:
return page
def _extract_language_from_classes(self, classes):
"""Extract language from class list
Supports multiple patterns:
- language-{lang} (e.g., "language-python")
- lang-{lang} (e.g., "lang-javascript")
- brush: {lang} (e.g., "brush: java")
- bare language name (e.g., "python", "java")
"""
# Define common programming languages
known_languages = [
"javascript", "java", "xml", "html", "python", "bash", "cpp", "typescript",
"go", "rust", "php", "ruby", "swift", "kotlin", "csharp", "c", "sql",
"yaml", "json", "markdown", "css", "scss", "sass", "jsx", "tsx", "vue",
"shell", "powershell", "r", "scala", "dart", "perl", "lua", "elixir"
]
for cls in classes:
# Clean special characters (except word chars and hyphens)
cls = re.sub(r'[^\w-]', '', cls)
if 'language-' in cls:
return cls.replace('language-', '')
if 'lang-' in cls:
return cls.replace('lang-', '')
# Check for brush: pattern (e.g., "brush: java")
if 'brush' in cls.lower():
lang = cls.lower().replace('brush', '').strip()
if lang in known_languages:
return lang
# Check for bare language name
if cls in known_languages:
return cls
return None
def detect_language(self, elem, code):
"""Detect programming language from code block"""
"""Detect programming language from code block
# Check element classes
lang = self._extract_language_from_classes(elem.get('class', []))
if lang:
return lang
UPDATED: Now uses confidence-based detection with 20+ languages
"""
lang, confidence = self.language_detector.detect_from_html(elem, code)
# Check parent pre element
parent = elem.parent
if parent and parent.name == 'pre':
lang = self._extract_language_from_classes(parent.get('class', []))
if lang:
return lang
# Log low-confidence detections for debugging
if confidence < 0.5:
logger.debug(f"Low confidence language detection: {lang} ({confidence:.2f})")
# Heuristic detection
if 'import ' in code and 'from ' in code:
return 'python'
if 'const ' in code or 'let ' in code or '=>' in code:
return 'javascript'
if 'func ' in code and 'var ' in code:
return 'gdscript'
if 'def ' in code and ':' in code:
return 'python'
if '#include' in code or 'int main' in code:
return 'cpp'
# C# detection
if 'using System' in code or 'namespace ' in code:
return 'csharp'
if '{ get; set; }' in code:
return 'csharp'
if any(keyword in code for keyword in ['public class ', 'private class ', 'internal class ', 'public static void ']):
return 'csharp'
return 'unknown'
return lang # Return string for backward compatibility
def extract_patterns(self, main: Any, code_samples: List[Dict[str, Any]]) -> List[Dict[str, str]]:
"""Extract common coding patterns (NEW FEATURE)"""