Files
skill-seekers-reference/tests/test_pdf_advanced_features.py
Pablo Estevez 5ed767ff9a run ruff
2026-01-17 17:29:21 +00:00

539 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Tests for PDF Advanced Features (Priority 2 & 3)
Tests cover:
- OCR support for scanned PDFs
- Password-protected PDFs
- Table extraction
- Parallel processing
- Caching
"""
import io
import shutil
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch
# Add the sibling ``cli`` directory (<repo>/cli) to the import path so the
# test classes below can import ``pdf_extractor_poc`` from it.
sys.path.insert(0, str(Path(__file__).parent.parent / "cli"))

# Optional-dependency probes. Each flag records whether the import succeeded
# so tests can skip themselves instead of failing at import time.
try:
    import fitz  # PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
else:
    PYMUPDF_AVAILABLE = True

try:
    # Imported only to detect availability; the tests patch pytesseract by
    # dotted name rather than using these names directly.
    import pytesseract
    from PIL import Image
except ImportError:
    TESSERACT_AVAILABLE = False
else:
    TESSERACT_AVAILABLE = True
class TestOCRSupport(unittest.TestCase):
    """Test OCR support for scanned PDFs (Priority 2)

    Extractors are created with ``__new__`` so ``__init__`` never runs; each
    test assigns only the attributes it needs and feeds the extractor a mocked
    page object.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_ocr_initialization(self):
        """Test OCR flag initialization"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.use_ocr = True

        self.assertTrue(extractor.use_ocr)

    def test_extract_text_with_ocr_disabled(self):
        """Test that OCR can be disabled"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.use_ocr = False
        extractor.verbose = False

        # Create mock page with normal text
        mock_page = Mock()
        mock_page.get_text.return_value = "This is regular text"

        text = extractor.extract_text_with_ocr(mock_page)

        self.assertEqual(text, "This is regular text")
        mock_page.get_text.assert_called_once_with("text")

    def test_extract_text_with_ocr_sufficient_text(self):
        """Test OCR not triggered when sufficient text exists"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.use_ocr = True
        extractor.verbose = False

        # Create mock page with enough text (more than 50 characters)
        sample = "This is a long paragraph with more than 50 characters"
        mock_page = Mock()
        mock_page.get_text.return_value = sample

        text = extractor.extract_text_with_ocr(mock_page)

        # The sample has no surrounding whitespace, so its length is the
        # expected length after .strip() (53 characters).
        self.assertEqual(len(text), len(sample))
        # OCR should not be triggered
        mock_page.get_pixmap.assert_not_called()

    @patch("pdf_extractor_poc.TESSERACT_AVAILABLE", False)
    def test_ocr_unavailable_warning(self):
        """Test warning when OCR requested but pytesseract not available"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.use_ocr = True
        extractor.verbose = True

        mock_page = Mock()
        mock_page.get_text.return_value = "Short"  # Less than 50 chars

        # Capture output
        with patch("sys.stdout", new=io.StringIO()) as fake_out:
            text = extractor.extract_text_with_ocr(mock_page)
            output = fake_out.getvalue()

        self.assertIn("OCR requested but pytesseract not installed", output)
        self.assertEqual(text, "Short")

    @unittest.skipUnless(TESSERACT_AVAILABLE, "pytesseract not installed")
    def test_ocr_extraction_triggered(self):
        """Test OCR extraction when text is minimal"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.use_ocr = True
        extractor.verbose = False

        # Create mock page with minimal text
        mock_page = Mock()
        mock_page.get_text.return_value = "X"  # Less than 50 chars

        # Mock pixmap: a 100x100 buffer with three zero bytes per pixel
        mock_pix = Mock()
        mock_pix.width = 100
        mock_pix.height = 100
        mock_pix.samples = b"\x00" * (100 * 100 * 3)
        mock_page.get_pixmap.return_value = mock_pix

        with patch("pytesseract.image_to_string", return_value="OCR extracted text here"):
            text = extractor.extract_text_with_ocr(mock_page)

        # Should use OCR text since it's longer
        self.assertEqual(text, "OCR extracted text here")
        mock_page.get_pixmap.assert_called_once()
class TestPasswordProtection(unittest.TestCase):
    """Test password-protected PDF support (Priority 2)

    These tests replace ``fitz.open`` with a mock document and drive the
    open/authenticate calls directly from the test body; they do not call the
    extractor's own extraction entry point.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_password_initialization(self):
        """Test password parameter initialization"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.password = "test_password"

        self.assertEqual(extractor.password, "test_password")

    def test_encrypted_pdf_detection(self):
        """Test detection of encrypted PDF"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.pdf_path = "test.pdf"
        extractor.password = "mypassword"
        extractor.verbose = False

        # Mock encrypted document (use MagicMock for __len__)
        mock_doc = MagicMock()
        mock_doc.is_encrypted = True
        mock_doc.authenticate.return_value = True
        mock_doc.metadata = {}
        mock_doc.__len__.return_value = 10

        with patch("fitz.open", return_value=mock_doc) as mock_open:
            # This would be called in extract_all()
            doc = fitz.open(extractor.pdf_path)
            self.assertTrue(doc.is_encrypted)
            result = doc.authenticate(extractor.password)

        self.assertTrue(result)
        # Pin the arguments so the configured path and password are verified.
        mock_open.assert_called_once_with("test.pdf")
        mock_doc.authenticate.assert_called_once_with("mypassword")

    def test_wrong_password_handling(self):
        """Test handling of wrong password"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.pdf_path = "test.pdf"
        extractor.password = "wrong_password"

        mock_doc = Mock()
        mock_doc.is_encrypted = True
        mock_doc.authenticate.return_value = False

        with patch("fitz.open", return_value=mock_doc) as mock_open:
            doc = fitz.open(extractor.pdf_path)
            result = doc.authenticate(extractor.password)

        self.assertFalse(result)
        mock_open.assert_called_once_with("test.pdf")
        mock_doc.authenticate.assert_called_once_with("wrong_password")

    def test_missing_password_for_encrypted_pdf(self):
        """Test error when password is missing for encrypted PDF"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.pdf_path = "test.pdf"
        extractor.password = None

        mock_doc = Mock()
        mock_doc.is_encrypted = True

        with patch("fitz.open", return_value=mock_doc) as mock_open:
            doc = fitz.open(extractor.pdf_path)
            self.assertTrue(doc.is_encrypted)
            self.assertIsNone(extractor.password)

        mock_open.assert_called_once_with("test.pdf")
        # No password was configured, so the test never attempts to authenticate.
        mock_doc.authenticate.assert_not_called()
class TestTableExtraction(unittest.TestCase):
    """Test table extraction (Priority 2)

    Pages and tables are ``Mock`` objects: ``page.find_tables()`` returns an
    object whose ``tables`` attribute lists mock tables, each exposing
    ``extract()`` (rows of cells) and ``bbox``.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_table_extraction_initialization(self):
        """Test table extraction flag initialization"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.extract_tables = True

        self.assertTrue(extractor.extract_tables)

    def test_table_extraction_disabled(self):
        """Test no tables extracted when disabled"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.extract_tables = False
        extractor.verbose = False

        mock_page = Mock()

        tables = extractor.extract_tables_from_page(mock_page)

        self.assertEqual(tables, [])
        # find_tables should not be called
        mock_page.find_tables.assert_not_called()

    def test_table_extraction_basic(self):
        """Test basic table extraction"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.extract_tables = True
        extractor.verbose = False

        # Create mock table: one header row and one data row, three columns
        mock_table = Mock()
        mock_table.extract.return_value = [["Header 1", "Header 2", "Header 3"], ["Data 1", "Data 2", "Data 3"]]
        mock_table.bbox = (0, 0, 100, 100)

        # Create mock tables result
        mock_tables = Mock()
        mock_tables.tables = [mock_table]

        mock_page = Mock()
        mock_page.find_tables.return_value = mock_tables

        tables = extractor.extract_tables_from_page(mock_page)

        self.assertEqual(len(tables), 1)
        self.assertEqual(tables[0]["row_count"], 2)
        self.assertEqual(tables[0]["col_count"], 3)
        self.assertEqual(tables[0]["table_index"], 0)

    def test_multiple_tables_extraction(self):
        """Test extraction of multiple tables from one page"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.extract_tables = True
        extractor.verbose = False

        # Create two mock tables
        mock_table1 = Mock()
        mock_table1.extract.return_value = [["A", "B"], ["1", "2"]]
        mock_table1.bbox = (0, 0, 50, 50)

        mock_table2 = Mock()
        mock_table2.extract.return_value = [["X", "Y", "Z"], ["10", "20", "30"]]
        mock_table2.bbox = (0, 60, 50, 110)

        mock_tables = Mock()
        mock_tables.tables = [mock_table1, mock_table2]

        mock_page = Mock()
        mock_page.find_tables.return_value = mock_tables

        tables = extractor.extract_tables_from_page(mock_page)

        self.assertEqual(len(tables), 2)
        # Each result must carry its position in the page's table list.
        for position, table in enumerate(tables):
            self.assertEqual(table["table_index"], position)

    def test_table_extraction_error_handling(self):
        """Test error handling during table extraction"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.extract_tables = True
        extractor.verbose = False

        mock_page = Mock()
        mock_page.find_tables.side_effect = Exception("Table extraction failed")

        # Should not raise, should return empty list
        tables = extractor.extract_tables_from_page(mock_page)

        self.assertEqual(tables, [])
class TestCaching(unittest.TestCase):
    """Test caching of expensive operations (Priority 3)

    Each test builds an extractor with ``__new__`` and assigns ``_cache`` and
    ``use_cache`` by hand before calling ``set_cached`` / ``get_cached``.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_cache_initialization(self):
        """Test cache is initialized"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor._cache = {}
        extractor.use_cache = True

        self.assertIsInstance(extractor._cache, dict)
        self.assertTrue(extractor.use_cache)

    def test_cache_set_and_get(self):
        """Test setting and getting cached values"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor._cache = {}
        extractor.use_cache = True

        # Set cache
        test_data = {"page": 1, "text": "cached content"}
        extractor.set_cached("page_1", test_data)

        # Get cache
        cached = extractor.get_cached("page_1")

        self.assertEqual(cached, test_data)

    def test_cache_miss(self):
        """Test cache miss returns None"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor._cache = {}
        extractor.use_cache = True

        cached = extractor.get_cached("nonexistent_key")

        self.assertIsNone(cached)

    def test_cache_disabled(self):
        """Test caching can be disabled"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor._cache = {}
        extractor.use_cache = False

        # Try to set cache
        extractor.set_cached("page_1", {"data": "test"})

        # Cache should be empty
        self.assertEqual(len(extractor._cache), 0)

        # Try to get cache
        cached = extractor.get_cached("page_1")
        self.assertIsNone(cached)

    def test_cache_overwrite(self):
        """Test cache can be overwritten"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor._cache = {}
        extractor.use_cache = True

        # Set initial value
        extractor.set_cached("page_1", {"version": 1})

        # Overwrite
        extractor.set_cached("page_1", {"version": 2})

        # Get cached value
        cached = extractor.get_cached("page_1")

        self.assertEqual(cached["version"], 2)
class TestParallelProcessing(unittest.TestCase):
    """Test parallel page processing (Priority 3)

    These tests only assign and read back the ``parallel`` / ``max_workers``
    attributes on an extractor created with ``__new__``; no pages are
    processed.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_parallel_initialization(self):
        """Test parallel processing flag initialization"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.parallel = True
        extractor.max_workers = 4

        self.assertTrue(extractor.parallel)
        self.assertEqual(extractor.max_workers, 4)

    def test_parallel_disabled_by_default(self):
        """Test parallel processing is disabled by default"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.parallel = False

        self.assertFalse(extractor.parallel)

    def test_worker_count_auto_detect(self):
        """Test worker count auto-detection"""
        import os

        # os.cpu_count() returns None when the CPU count cannot be determined;
        # fall back to a single worker so the assertions below stay meaningful
        # instead of failing on such hosts.
        cpu_count = os.cpu_count() or 1

        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.max_workers = cpu_count

        self.assertIsNotNone(extractor.max_workers)
        self.assertGreater(extractor.max_workers, 0)

    def test_custom_worker_count(self):
        """Test custom worker count"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)
        extractor.max_workers = 8

        self.assertEqual(extractor.max_workers, 8)
class TestIntegration(unittest.TestCase):
    """Integration tests for advanced features

    The tests set feature attributes on an extractor created with ``__new__``
    and read them back, and check the expected key set of a page-data dict.
    """

    def setUp(self):
        """Skip when PyMuPDF is missing, then load the extractor class."""
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        # Imported only after the skip check above.
        from pdf_extractor_poc import PDFExtractor

        self.PDFExtractor = PDFExtractor
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``, if any."""
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_full_initialization_with_all_features(self):
        """Test initialization with all advanced features enabled"""
        extractor = self.PDFExtractor.__new__(self.PDFExtractor)

        # Set all advanced features
        extractor.use_ocr = True
        extractor.password = "test_password"
        extractor.extract_tables = True
        extractor.parallel = True
        extractor.max_workers = 4
        extractor.use_cache = True
        extractor._cache = {}

        # Verify all features are set
        self.assertTrue(extractor.use_ocr)
        self.assertEqual(extractor.password, "test_password")
        self.assertTrue(extractor.extract_tables)
        self.assertTrue(extractor.parallel)
        self.assertEqual(extractor.max_workers, 4)
        self.assertTrue(extractor.use_cache)

    def test_feature_combinations(self):
        """Test various feature combinations"""
        combinations = [
            {"use_ocr": True, "extract_tables": True},
            {"password": "test", "parallel": True},
            {"use_cache": True, "extract_tables": True, "parallel": True},
            {"use_ocr": True, "password": "test", "extract_tables": True, "parallel": True},
        ]

        for combo in combinations:
            # subTest reports which combination failed and lets the remaining
            # combinations run instead of stopping at the first failure.
            with self.subTest(combo=combo):
                extractor = self.PDFExtractor.__new__(self.PDFExtractor)
                for key, value in combo.items():
                    setattr(extractor, key, value)

                # Verify all attributes are set correctly
                for key, value in combo.items():
                    self.assertEqual(getattr(extractor, key), value)

    def test_page_data_includes_tables(self):
        """Test that page data includes table count"""
        # This tests that the page_data structure includes tables
        expected_keys = [
            "page_number",
            "text",
            "markdown",
            "headings",
            "code_samples",
            "images_count",
            "extracted_images",
            "tables",
            "char_count",
            "code_blocks_count",
            "tables_count",
        ]

        # Just verify the structure is correct
        # Actual extraction is tested in other test classes
        page_data = {
            "page_number": 1,
            "text": "test",
            "markdown": "test",
            "headings": [],
            "code_samples": [],
            "images_count": 0,
            "extracted_images": [],
            "tables": [],
            "char_count": 4,
            "code_blocks_count": 0,
            "tables_count": 0,
        }

        for key in expected_keys:
            # One sub-test per key so every missing key is reported.
            with self.subTest(key=key):
                self.assertIn(key, page_data)
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()