This commit is contained in:
Pablo Estevez
2026-01-17 17:29:21 +00:00
parent c89f059712
commit 5ed767ff9a
144 changed files with 14142 additions and 16488 deletions

View File

@@ -12,8 +12,8 @@ Tests cover:
- Code block merging
"""
import unittest
import sys
import unittest
from pathlib import Path
# Add parent directory to path for imports
@@ -21,6 +21,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))
try:
import fitz # PyMuPDF
PYMUPDF_AVAILABLE = True
except ImportError:
PYMUPDF_AVAILABLE = False
@@ -33,6 +34,7 @@ class TestLanguageDetection(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_detect_python_with_confidence(self):
@@ -40,6 +42,7 @@ class TestLanguageDetection(unittest.TestCase):
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "def hello():\n print('world')\n return True"
@@ -55,6 +58,7 @@ class TestLanguageDetection(unittest.TestCase):
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "const handleClick = () => {\n console.log('clicked');\n};"
@@ -69,9 +73,10 @@ class TestLanguageDetection(unittest.TestCase):
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "#include <iostream>\nint main() {\n std::cout << \"Hello\";\n}"
code = '#include <iostream>\nint main() {\n std::cout << "Hello";\n}'
language, confidence = extractor.detect_language_from_code(code)
@@ -83,6 +88,7 @@ class TestLanguageDetection(unittest.TestCase):
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
code = "this is not code at all just plain text"
@@ -97,15 +103,10 @@ class TestLanguageDetection(unittest.TestCase):
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
# Initialize language_detector manually (since __init__ not called)
from skill_seekers.cli.language_detector import LanguageDetector
extractor.language_detector = LanguageDetector(min_confidence=0.15)
test_codes = [
"def foo(): pass",
"const x = 10;",
"#include <stdio.h>",
"random text here",
""
]
test_codes = ["def foo(): pass", "const x = 10;", "#include <stdio.h>", "random text here", ""]
for code in test_codes:
_, confidence = extractor.detect_language_from_code(code)
@@ -120,6 +121,7 @@ class TestSyntaxValidation(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_validate_python_valid(self):
@@ -170,7 +172,7 @@ class TestSyntaxValidation(unittest.TestCase):
is_valid, issues = extractor.validate_code_syntax(code, "python")
self.assertFalse(is_valid)
self.assertIn('May be natural language', ' '.join(issues))
self.assertIn("May be natural language", " ".join(issues))
class TestQualityScoring(unittest.TestCase):
@@ -180,6 +182,7 @@ class TestQualityScoring(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_quality_score_range(self):
@@ -237,15 +240,13 @@ class TestChapterDetection(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_detect_chapter_with_number(self):
"""Test chapter detection with number"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
'text': 'Chapter 1: Introduction to Python\nThis is the first chapter.',
'headings': []
}
page_data = {"text": "Chapter 1: Introduction to Python\nThis is the first chapter.", "headings": []}
is_chapter, title = extractor.detect_chapter_start(page_data)
@@ -256,8 +257,8 @@ class TestChapterDetection(unittest.TestCase):
"""Test chapter detection with uppercase"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
'text': 'Chapter 1\nThis is the introduction', # Pattern requires Chapter + digit
'headings': []
"text": "Chapter 1\nThis is the introduction", # Pattern requires Chapter + digit
"headings": [],
}
is_chapter, title = extractor.detect_chapter_start(page_data)
@@ -267,10 +268,7 @@ class TestChapterDetection(unittest.TestCase):
def test_detect_section_heading(self):
"""Test section heading detection"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
'text': '2. Getting Started\nThis is a section.',
'headings': []
}
page_data = {"text": "2. Getting Started\nThis is a section.", "headings": []}
is_chapter, title = extractor.detect_chapter_start(page_data)
@@ -279,10 +277,7 @@ class TestChapterDetection(unittest.TestCase):
def test_not_chapter(self):
"""Test normal text is not detected as chapter"""
extractor = self.PDFExtractor.__new__(self.PDFExtractor)
page_data = {
'text': 'This is just normal paragraph text without any chapter markers.',
'headings': []
}
page_data = {"text": "This is just normal paragraph text without any chapter markers.", "headings": []}
is_chapter, title = extractor.detect_chapter_start(page_data)
@@ -296,6 +291,7 @@ class TestCodeBlockMerging(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_merge_continued_blocks(self):
@@ -305,26 +301,22 @@ class TestCodeBlockMerging(unittest.TestCase):
pages = [
{
'page_number': 1,
'code_samples': [
{'code': 'def hello():', 'language': 'python', 'detection_method': 'pattern'}
],
'code_blocks_count': 1
"page_number": 1,
"code_samples": [{"code": "def hello():", "language": "python", "detection_method": "pattern"}],
"code_blocks_count": 1,
},
{
'page_number': 2,
'code_samples': [
{'code': ' print("world")', 'language': 'python', 'detection_method': 'pattern'}
],
'code_blocks_count': 1
}
"page_number": 2,
"code_samples": [{"code": ' print("world")', "language": "python", "detection_method": "pattern"}],
"code_blocks_count": 1,
},
]
merged = extractor.merge_continued_code_blocks(pages)
# Should have merged the two blocks
self.assertIn('def hello():', merged[0]['code_samples'][0]['code'])
self.assertIn('print("world")', merged[0]['code_samples'][0]['code'])
self.assertIn("def hello():", merged[0]["code_samples"][0]["code"])
self.assertIn('print("world")', merged[0]["code_samples"][0]["code"])
def test_no_merge_different_languages(self):
"""Test blocks with different languages are not merged"""
@@ -332,26 +324,22 @@ class TestCodeBlockMerging(unittest.TestCase):
pages = [
{
'page_number': 1,
'code_samples': [
{'code': 'def foo():', 'language': 'python', 'detection_method': 'pattern'}
],
'code_blocks_count': 1
"page_number": 1,
"code_samples": [{"code": "def foo():", "language": "python", "detection_method": "pattern"}],
"code_blocks_count": 1,
},
{
'page_number': 2,
'code_samples': [
{'code': 'const x = 10;', 'language': 'javascript', 'detection_method': 'pattern'}
],
'code_blocks_count': 1
}
"page_number": 2,
"code_samples": [{"code": "const x = 10;", "language": "javascript", "detection_method": "pattern"}],
"code_blocks_count": 1,
},
]
merged = extractor.merge_continued_code_blocks(pages)
# Should NOT merge different languages
self.assertEqual(len(merged[0]['code_samples']), 1)
self.assertEqual(len(merged[1]['code_samples']), 1)
self.assertEqual(len(merged[0]["code_samples"]), 1)
self.assertEqual(len(merged[1]["code_samples"]), 1)
class TestCodeDetectionMethods(unittest.TestCase):
@@ -361,6 +349,7 @@ class TestCodeDetectionMethods(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_pattern_based_detection(self):
@@ -394,6 +383,7 @@ class TestQualityFiltering(unittest.TestCase):
if not PYMUPDF_AVAILABLE:
self.skipTest("PyMuPDF not installed")
from skill_seekers.cli.pdf_extractor_poc import PDFExtractor
self.PDFExtractor = PDFExtractor
def test_filter_by_min_quality(self):
@@ -402,23 +392,15 @@ class TestQualityFiltering(unittest.TestCase):
extractor.min_quality = 5.0
# High quality block
high_quality = {
'code': 'def calculate():\n return 42',
'language': 'python',
'quality': 8.0
}
high_quality = {"code": "def calculate():\n return 42", "language": "python", "quality": 8.0}
# Low quality block
low_quality = {
'code': 'x',
'language': 'unknown',
'quality': 2.0
}
low_quality = {"code": "x", "language": "unknown", "quality": 2.0}
# Only high quality should pass
self.assertGreaterEqual(high_quality['quality'], extractor.min_quality)
self.assertLess(low_quality['quality'], extractor.min_quality)
self.assertGreaterEqual(high_quality["quality"], extractor.min_quality)
self.assertLess(low_quality["quality"], extractor.min_quality)
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()