Files
skill-seekers-reference/tests/test_pdf_extractor.py
yusyus ec3e0bf491 fix: Resolve 61 critical linting errors
Fixed priority linting errors to improve code quality:

Critical Fixes:
- F821 (2 errors): Fixed undefined name 'original_result' in config_enhancer.py
- UP035 (2 errors): Removed deprecated typing.Dict and typing.Type imports
- F401 (27 errors): Removed unused imports and added noqa for availability checks
- E722 (19 errors): Replaced bare 'except:' with 'except Exception:'

Code Quality Improvements:
- SIM201 (4 errors): Simplified 'not x == y' to 'x != y'
- SIM118 (2 errors): Removed unnecessary .keys() in dict iterations
- E741 (4 errors): Renamed ambiguous variable 'l' to 'line'
- I001 (1 error): Sorted imports in test_bootstrap_skill.py

All modified areas tested and passing:
- test_scraper_features.py: 42 passed
- test_integration.py: 51 passed
- test_architecture_scenarios.py: 11 passed
- test_real_world_fastmcp.py: 19 passed (1 skipped)

Remaining linting errors: 249 (mostly code style suggestions like ARG002, F841, SIM102)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 22:54:40 +03:00

439 lines
15 KiB
Python

#!/usr/bin/env python3
"""
Tests for PDF Extractor (skill_seekers.cli.pdf_extractor_poc)
Tests cover:
- Language detection with confidence scoring
- Code block detection (font, indent, pattern)
- Syntax validation
- Quality scoring
- Chapter detection
- Page chunking
- Code block merging
"""
import sys
import unittest
from pathlib import Path
# Put "<parent of this file's directory>/cli" at the front of the import path.
sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))

# PyMuPDF (imported as "fitz") is optional: record whether the import works so
# each test class can skip itself instead of erroring out.
try:
    import fitz  # noqa: F401 PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
else:
    PYMUPDF_AVAILABLE = True
class TestLanguageDetection(unittest.TestCase):
    """Test language detection with confidence scoring.

    Every test calls ``detect_language_from_code`` on an extractor that was
    created without running ``__init__`` and given a ``LanguageDetector``
    by hand.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise import the classes under test."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.language_detector import LanguageDetector
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.LanguageDetector = LanguageDetector

    def _make_extractor(self):
        """Return an extractor built via ``__new__`` with a detector attached.

        ``__init__`` is not called, so ``language_detector`` has to be
        initialized manually before ``detect_language_from_code`` is used.
        """
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.language_detector = self.LanguageDetector(min_confidence=0.15)
        return extractor

    def test_detect_python_with_confidence(self):
        """Test Python detection returns language and confidence"""
        extractor = self._make_extractor()
        code = "def hello():\n print('world')\n return True"
        language, confidence = extractor.detect_language_from_code(code)
        self.assertEqual(language, "python")
        self.assertGreater(confidence, 0.4)  # Should have reasonable confidence
        self.assertLessEqual(confidence, 1.0)

    def test_detect_javascript_with_confidence(self):
        """Test JavaScript detection"""
        extractor = self._make_extractor()
        code = "const handleClick = () => {\n console.log('clicked');\n};"
        language, confidence = extractor.detect_language_from_code(code)
        self.assertEqual(language, "javascript")
        self.assertGreater(confidence, 0.5)

    def test_detect_cpp_with_confidence(self):
        """Test C++ detection"""
        extractor = self._make_extractor()
        code = '#include <iostream>\nint main() {\n std::cout << "Hello";\n}'
        language, confidence = extractor.detect_language_from_code(code)
        self.assertEqual(language, "cpp")
        self.assertGreater(confidence, 0.5)

    def test_detect_unknown_low_confidence(self):
        """Test unknown language returns low confidence"""
        extractor = self._make_extractor()
        code = "this is not code at all just plain text"
        language, confidence = extractor.detect_language_from_code(code)
        self.assertEqual(language, "unknown")
        self.assertLess(confidence, 0.3)  # Should be low confidence

    def test_confidence_range(self):
        """Test confidence is always between 0 and 1"""
        extractor = self._make_extractor()
        test_codes = [
            "def foo(): pass",
            "const x = 10;",
            "#include <stdio.h>",
            "random text here",
            "",
        ]
        for code in test_codes:
            # subTest labels a failure with the offending sample and lets the
            # remaining samples still be checked under unittest's runner.
            with self.subTest(code=code):
                _, confidence = extractor.detect_language_from_code(code)
                self.assertGreaterEqual(confidence, 0.0)
                self.assertLessEqual(confidence, 1.0)
class TestSyntaxValidation(unittest.TestCase):
    """Checks of ``validate_code_syntax`` for several languages."""

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def _validate(self, snippet, language):
        """Validate ``snippet`` on a fresh extractor created without ``__init__``.

        Returns whatever ``validate_code_syntax`` returns; the tests unpack it
        as ``(is_valid, issues)``.
        """
        bare_extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        return bare_extractor.validate_code_syntax(snippet, language)

    def test_validate_python_valid(self):
        """Well-formed Python is accepted with no issues reported."""
        ok, problems = self._validate(
            "def hello():\n print('world')\n return True", "python"
        )
        self.assertTrue(ok)
        self.assertEqual(len(problems), 0)

    def test_validate_python_invalid_indentation(self):
        """Python mixing tabs and spaces is rejected with at least one issue."""
        # Second body line is indented with a tab, the first with a space.
        mixed = "def hello():\n print('world')\n\tprint('mixed')"
        ok, problems = self._validate(mixed, "python")
        self.assertFalse(ok)
        self.assertGreater(len(problems), 0)

    def test_validate_python_unbalanced_brackets(self):
        """Severely unbalanced brackets are rejected with at least one issue."""
        ok, problems = self._validate("x = [[[1, 2, 3", "python")
        self.assertFalse(ok)
        self.assertGreater(len(problems), 0)

    def test_validate_javascript_valid(self):
        """Well-formed JavaScript is accepted with no issues reported."""
        ok, problems = self._validate("const x = () => { return 42; };", "javascript")
        self.assertTrue(ok)
        self.assertEqual(len(problems), 0)

    def test_validate_natural_language_fails(self):
        """Plain English prose is rejected and flagged as natural language."""
        prose = "This is just a regular sentence with the and for and with and that and have and from words."
        ok, problems = self._validate(prose, "python")
        self.assertFalse(ok)
        self.assertIn("May be natural language", " ".join(problems))
class TestQualityScoring(unittest.TestCase):
    """Test code quality scoring (0-10 scale)"""

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def _score(self, code, language, confidence):
        """Score ``code`` on a fresh extractor created without ``__init__``."""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        return extractor.score_code_quality(code, language, confidence)

    def test_quality_score_range(self):
        """Test quality score is between 0 and 10"""
        quality = self._score("def hello():\n print('world')", "python", 0.8)
        self.assertGreaterEqual(quality, 0.0)
        self.assertLessEqual(quality, 10.0)

    def test_high_quality_code(self):
        """Test high-quality code gets good score"""
        # Written with explicit "\n" and leading spaces so the sample keeps the
        # indentation that makes it valid, well-structured Python.
        code = (
            "def calculate_sum(numbers):\n"
            "    '''Calculate sum of numbers'''\n"
            "    total = 0\n"
            "    for num in numbers:\n"
            "        total += num\n"
            "    return total"
        )
        quality = self._score(code, "python", 0.9)
        self.assertGreater(quality, 6.0)  # Should be good quality

    def test_low_quality_code(self):
        """Test low-quality code gets low score"""
        code = "x"  # Too short, no structure
        quality = self._score(code, "unknown", 0.1)
        self.assertLess(quality, 6.0)  # Should be low quality

    def test_quality_factors(self):
        """Test that quality considers multiple factors"""
        # Good: proper structure, indentation, confidence
        good_quality = self._score("def foo():\n return bar()", "python", 0.9)
        # Bad: no structure, low confidence
        bad_quality = self._score("some text", "unknown", 0.1)
        self.assertGreater(good_quality, bad_quality)
class TestChapterDetection(unittest.TestCase):
    """Test chapter/section detection"""

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def _detect(self, text):
        """Run ``detect_chapter_start`` on a page with ``text`` and no headings.

        The extractor is created without ``__init__``. The result is returned
        unchanged; the tests unpack it as ``(is_chapter, title)``.
        """
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        page_data = {"text": text, "headings": []}
        return extractor.detect_chapter_start(page_data)

    def test_detect_chapter_with_number(self):
        """Test chapter detection with number"""
        is_chapter, title = self._detect(
            "Chapter 1: Introduction to Python\nThis is the first chapter."
        )
        self.assertTrue(is_chapter)
        self.assertIsNotNone(title)

    def test_detect_chapter_uppercase(self):
        """Test a bare "Chapter N" line (no title after the number) is detected.

        The method name is historical: the input is not uppercase, because the
        pattern requires "Chapter" followed by a digit.
        """
        is_chapter, _ = self._detect("Chapter 1\nThis is the introduction")
        self.assertTrue(is_chapter)

    def test_detect_section_heading(self):
        """Test section heading detection"""
        is_chapter, _ = self._detect("2. Getting Started\nThis is a section.")
        self.assertTrue(is_chapter)

    def test_not_chapter(self):
        """Test normal text is not detected as chapter"""
        is_chapter, _ = self._detect(
            "This is just normal paragraph text without any chapter markers."
        )
        self.assertFalse(is_chapter)
class TestCodeBlockMerging(unittest.TestCase):
    """Test code block merging across pages"""

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def _make_extractor(self):
        """Return an extractor built via ``__new__`` with ``verbose`` disabled.

        ``__init__`` is bypassed, so ``verbose`` is initialised by hand. Every
        test gets it, so both reach ``merge_continued_code_blocks`` with the
        same instance state.
        """
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.verbose = False
        return extractor

    @staticmethod
    def _page(page_number, code, language):
        """Build a page dict holding one pattern-detected code sample."""
        return {
            "page_number": page_number,
            "code_samples": [
                {"code": code, "language": language, "detection_method": "pattern"}
            ],
            "code_blocks_count": 1,
        }

    def test_merge_continued_blocks(self):
        """Test merging code blocks split across pages"""
        extractor = self._make_extractor()
        pages = [
            self._page(1, "def hello():", "python"),
            self._page(2, ' print("world")', "python"),
        ]
        merged = extractor.merge_continued_code_blocks(pages)
        # Should have merged the two blocks
        first_sample = merged[0]["code_samples"][0]["code"]
        self.assertIn("def hello():", first_sample)
        self.assertIn('print("world")', first_sample)

    def test_no_merge_different_languages(self):
        """Test blocks with different languages are not merged"""
        extractor = self._make_extractor()
        pages = [
            self._page(1, "def foo():", "python"),
            self._page(2, "const x = 10;", "javascript"),
        ]
        merged = extractor.merge_continued_code_blocks(pages)
        # Should NOT merge different languages
        self.assertEqual(len(merged[0]["code_samples"]), 1)
        self.assertEqual(len(merged[1]["code_samples"]), 1)
class TestCodeDetectionMethods(unittest.TestCase):
    """Test different code detection methods

    NOTE(review): both tests are placeholders. They assert properties of their
    own sample text and never call a detection method on ``PDFExtractor``.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def test_pattern_based_detection(self):
        """Test pattern-based code detection"""
        # Should detect function definitions
        text = "Here is an example:\ndef calculate(x, y):\n return x + y"
        # Pattern-based detection should find this
        # (implementation details depend on pdf_extractor_poc.py)
        # TODO: feed ``text`` to the extractor's pattern detector and assert on
        # its result instead of on the literal itself.
        self.assertIn("def ", text)
        self.assertIn("return", text)

    def test_indent_based_detection(self):
        """Test indent-based code detection"""
        # Code with consistent indentation. Written with explicit spaces and
        # "\n" so the four-space prefix checked below cannot be lost.
        indented_text = "    def foo():\n        return bar()"
        # Should detect as code due to indentation
        # TODO: feed ``indented_text`` to the extractor's indent detector and
        # assert on its result instead of on the literal itself.
        self.assertTrue(indented_text.startswith(" " * 4))
class TestQualityFiltering(unittest.TestCase):
    """Quality-threshold filtering checks."""

    def setUp(self):
        """Skip when PyMuPDF is missing; otherwise load ``PDFExtractor``."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor

    def test_filter_by_min_quality(self):
        """A block at or above ``min_quality`` passes; one below it does not.

        NOTE(review): this only compares hand-written ``quality`` values with a
        threshold stored on the extractor; no extractor filtering code is called.
        """
        threshold_holder = self.PDFExtractor.__new__(self.PDFExtractor)
        threshold_holder.min_quality = 5.0

        # Sample expected to clear the threshold.
        good_block = {
            "code": "def calculate():\n return 42",
            "language": "python",
            "quality": 8.0,
        }
        # Sample expected to fall below the threshold.
        poor_block = {"code": "x", "language": "unknown", "quality": 2.0}

        self.assertGreaterEqual(good_block["quality"], threshold_holder.min_quality)
        self.assertLess(poor_block["quality"], threshold_holder.min_quality)
# Run the tests with unittest's own runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()