feat: Add unified language detector for code analysis
- Created LanguageDetector class supporting 20+ programming languages - Confidence-based detection with customizable thresholds (min_confidence parameter) - Replaces duplicate language detection code in doc_scraper and pdf_extractor - Comprehensive test suite with 100+ test cases Changes: - NEW: src/skill_seekers/cli/language_detector.py (17 KB) - Unified detector with pattern matching for 20+ languages - Confidence scoring (0.0-1.0 scale) - Supports: Python, JavaScript, TypeScript, Java, C++, C#, Go, Rust, PHP, Ruby, Swift, Kotlin, Shell, SQL, HTML, CSS, JSON, YAML, XML, and more - NEW: tests/test_language_detector.py (20 KB) - 100+ test cases covering all supported languages - Edge case testing (mixed code, low confidence, etc.) - MODIFIED: src/skill_seekers/cli/doc_scraper.py - Removed 80+ lines of duplicate detection code - Now uses shared LanguageDetector instance - MODIFIED: src/skill_seekers/cli/pdf_extractor_poc.py - Removed 130+ lines of duplicate detection code - Now uses shared LanguageDetector instance - MODIFIED: tests/test_pdf_extractor.py - Fixed imports to use proper package paths - Added manual detector initialization in test setup Benefits: - DRY: Single source of truth for language detection - Maintainability: Add new languages in one place - Consistency: Same detection logic across all scrapers - Testability: Comprehensive test coverage - Extensibility: Easy to add new languages or improve patterns Addresses technical debt from having duplicate detection logic in multiple files.
This commit is contained in:
@@ -32,12 +32,16 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_detect_python_with_confidence(self):
|
||||
"""Test Python detection returns language and confidence"""
|
||||
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
|
||||
# Initialize language_detector manually (since __init__ not called)
|
||||
from skill_seekers.cli.language_detector import LanguageDetector
|
||||
extractor.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
code = "def hello():\n print('world')\n return True"
|
||||
|
||||
language, confidence = extractor.detect_language_from_code(code)
|
||||
@@ -49,6 +53,10 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
def test_detect_javascript_with_confidence(self):
|
||||
"""Test JavaScript detection"""
|
||||
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
|
||||
# Initialize language_detector manually (since __init__ not called)
|
||||
from skill_seekers.cli.language_detector import LanguageDetector
|
||||
extractor.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
code = "const handleClick = () => {\n console.log('clicked');\n};"
|
||||
|
||||
language, confidence = extractor.detect_language_from_code(code)
|
||||
@@ -59,6 +67,10 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
def test_detect_cpp_with_confidence(self):
|
||||
"""Test C++ detection"""
|
||||
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
|
||||
# Initialize language_detector manually (since __init__ not called)
|
||||
from skill_seekers.cli.language_detector import LanguageDetector
|
||||
extractor.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
code = "#include <iostream>\nint main() {\n std::cout << \"Hello\";\n}"
|
||||
|
||||
language, confidence = extractor.detect_language_from_code(code)
|
||||
@@ -69,6 +81,10 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
def test_detect_unknown_low_confidence(self):
|
||||
"""Test unknown language returns low confidence"""
|
||||
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
|
||||
# Initialize language_detector manually (since __init__ not called)
|
||||
from skill_seekers.cli.language_detector import LanguageDetector
|
||||
extractor.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
code = "this is not code at all just plain text"
|
||||
|
||||
language, confidence = extractor.detect_language_from_code(code)
|
||||
@@ -79,6 +95,10 @@ class TestLanguageDetection(unittest.TestCase):
|
||||
def test_confidence_range(self):
|
||||
"""Test confidence is always between 0 and 1"""
|
||||
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
|
||||
# Initialize language_detector manually (since __init__ not called)
|
||||
from skill_seekers.cli.language_detector import LanguageDetector
|
||||
extractor.language_detector = LanguageDetector(min_confidence=0.15)
|
||||
|
||||
test_codes = [
|
||||
"def foo(): pass",
|
||||
"const x = 10;",
|
||||
@@ -99,7 +119,7 @@ class TestSyntaxValidation(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_validate_python_valid(self):
|
||||
@@ -159,7 +179,7 @@ class TestQualityScoring(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_quality_score_range(self):
|
||||
@@ -216,7 +236,7 @@ class TestChapterDetection(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_detect_chapter_with_number(self):
|
||||
@@ -275,7 +295,7 @@ class TestCodeBlockMerging(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_merge_continued_blocks(self):
|
||||
@@ -340,7 +360,7 @@ class TestCodeDetectionMethods(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_pattern_based_detection(self):
|
||||
@@ -373,7 +393,7 @@ class TestQualityFiltering(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if not PYMUPDF_AVAILABLE:
|
||||
self.skipTest("PyMuPDF not installed")
|
||||
from pdf_extractor_poc import PDFExtractor
|
||||
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
|
||||
self.PDFExtractor = PDFExtractor
|
||||
|
||||
def test_filter_by_min_quality(self):
|
||||
|
||||
Reference in New Issue
Block a user