#!/usr/bin/env python3
"""
Tests for PDF Scraper (cli/pdf_scraper.py)

Tests cover:
- Config-based PDF extraction
- Direct PDF path conversion
- JSON-based workflow
- Skill structure generation
- Categorization
- Error handling
"""

import json
import shutil
import tempfile
import unittest
from pathlib import Path

try:
    import fitz  # noqa: F401 PyMuPDF

    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False

# Mock image data (1x1 white PNG), shared by the image-handling tests.
_MOCK_PNG_BYTES = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82"


class _PDFScraperTestCase(unittest.TestCase):
    """Shared fixture for every PDF scraper test class in this module.

    Skips the test when PyMuPDF is not importable, imports
    ``PDFToSkillConverter`` lazily (so this module can be imported without
    the dependency), and provides a per-test temporary directory that is
    removed again in ``tearDown``.
    """

    def setUp(self):
        if not PYMUPDF_AVAILABLE:
            self.skipTest("PyMuPDF not installed")
        from skill_seekers.cli.pdf_scraper import PDFToSkillConverter

        self.PDFToSkillConverter = PDFToSkillConverter

        # Create temporary directory for test output
        self.temp_dir = tempfile.mkdtemp()
        self.output_dir = Path(self.temp_dir)

    def tearDown(self):
        # Clean up temporary directory; the guard keeps tearDown safe to call
        # even if setUp never got as far as creating it.
        if hasattr(self, "temp_dir"):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _converter_in_temp_dir(self, config):
        """Build a converter from *config* whose skill_dir is in the temp dir."""
        converter = self.PDFToSkillConverter(config)
        # Override skill_dir to use temp directory
        converter.skill_dir = str(self.output_dir / "test_skill")
        return converter

    def _reference_file(self):
        """Return the path of the reference file the tests expect for test.pdf."""
        # With single PDF source, uses single-file strategy (named after PDF basename)
        return self.output_dir / "test_skill" / "references" / "test.md"


class TestPDFToSkillConverter(_PDFScraperTestCase):
    """Test PDFToSkillConverter initialization and basic functionality"""

    def test_init_with_name_and_pdf_path(self):
        """Test initialization with name and PDF path"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self.PDFToSkillConverter(config)

        self.assertEqual(converter.name, "test_skill")
        self.assertEqual(converter.pdf_path, "test.pdf")

    def test_init_with_config(self):
        """Test initialization with config file"""
        # Create test config
        config = {
            "name": "config_skill",
            "description": "Test skill",
            "pdf_path": "docs/test.pdf",
            "extract_options": {"chunk_size": 10, "min_quality": 5.0},
        }
        converter = self.PDFToSkillConverter(config)

        self.assertEqual(converter.name, "config_skill")
        self.assertEqual(converter.config.get("description"), "Test skill")

    def test_init_requires_name_or_config(self):
        """Test that initialization requires config dict with 'name' field"""
        with self.assertRaises((ValueError, TypeError, KeyError)):
            self.PDFToSkillConverter({})


class TestCategorization(_PDFScraperTestCase):
    """Test content categorization functionality"""

    def test_categorize_by_keywords(self):
        """Test categorization using keyword matching"""
        config = {
            "name": "test",
            "pdf_path": "test.pdf",
            "categories": {
                "getting_started": ["introduction", "getting started"],
                "api": ["api", "reference", "function"],
            },
        }
        converter = self.PDFToSkillConverter(config)

        # Mock extracted data with different content
        converter.extracted_data = {
            "pages": [
                {
                    "page_number": 1,
                    "text": "Introduction to the API",
                    "chapter": "Chapter 1: Getting Started",
                },
                {"page_number": 2, "text": "API reference for functions", "chapter": None},
            ]
        }

        categories = converter.categorize_content()

        # With single PDF source, should use single-file strategy
        # Category named after PDF basename (test.pdf -> test)
        self.assertIn("test", categories)
        self.assertEqual(len(categories), 1)
        self.assertEqual(len(categories["test"]["pages"]), 2)

    def test_categorize_by_chapters(self):
        """Test categorization using chapter information"""
        config = {"name": "test", "pdf_path": "test.pdf"}
        converter = self.PDFToSkillConverter(config)

        # Mock data with chapters
        converter.extracted_data = {
            "pages": [
                {"page_number": 1, "text": "Content here", "chapter": "Chapter 1: Introduction"},
                {"page_number": 2, "text": "More content", "chapter": "Chapter 1: Introduction"},
                {"page_number": 3, "text": "New chapter", "chapter": "Chapter 2: Advanced Topics"},
            ]
        }

        categories = converter.categorize_content()

        # Should create categories based on chapters
        self.assertIsInstance(categories, dict)
        self.assertGreater(len(categories), 0)

    def test_categorize_handles_no_chapters(self):
        """Test categorization when no chapters are detected"""
        config = {"name": "test", "pdf_path": "test.pdf"}
        converter = self.PDFToSkillConverter(config)

        # Mock data without chapters
        converter.extracted_data = {
            "pages": [{"page_number": 1, "text": "Some content", "chapter": None}]
        }

        categories = converter.categorize_content()

        # Should still create categories (fallback to "other")
        self.assertIsInstance(categories, dict)


class TestSkillBuilding(_PDFScraperTestCase):
    """Test skill structure generation"""

    def test_build_skill_creates_structure(self):
        """Test that build_skill creates required directory structure"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        # Mock extracted data
        converter.extracted_data = {
            "pages": [{"page_number": 1, "text": "Test content", "code_blocks": [], "images": []}],
            "total_pages": 1,
        }

        # Mock categorization
        converter.categories = {"getting_started": [converter.extracted_data["pages"][0]]}

        converter.build_skill()

        # Check directory structure
        skill_dir = self.output_dir / "test_skill"
        self.assertTrue(skill_dir.exists())
        self.assertTrue((skill_dir / "references").exists())
        self.assertTrue((skill_dir / "scripts").exists())
        self.assertTrue((skill_dir / "assets").exists())

    def test_build_skill_creates_skill_md(self):
        """Test that SKILL.md is created"""
        config = {"name": "test_skill", "pdf_path": "test.pdf", "description": "Test description"}
        converter = self._converter_in_temp_dir(config)

        converter.extracted_data = {
            "pages": [{"page_number": 1, "text": "Test", "code_blocks": [], "images": []}],
            "total_pages": 1,
        }
        converter.categories = {"test": [converter.extracted_data["pages"][0]]}

        converter.build_skill()

        skill_md = self.output_dir / "test_skill" / "SKILL.md"
        self.assertTrue(skill_md.exists())

        # Check content
        # NOTE(review): assumes the converter writes UTF-8 - confirm against
        # pdf_scraper; an explicit encoding avoids locale-dependent decoding.
        content = skill_md.read_text(encoding="utf-8")
        self.assertIn("test_skill", content)
        self.assertIn("Test description", content)

    def test_build_skill_creates_reference_files(self):
        """Test that reference files are created for categories"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        converter.extracted_data = {
            "pages": [
                {"page_number": 1, "text": "Getting started", "code_blocks": [], "images": []},
                {"page_number": 2, "text": "API reference", "code_blocks": [], "images": []},
            ],
            "total_pages": 2,
        }

        converter.build_skill()

        # Check reference files exist
        refs_dir = self.output_dir / "test_skill" / "references"
        self.assertTrue(self._reference_file().exists())
        self.assertTrue((refs_dir / "index.md").exists())


class TestCodeBlockHandling(_PDFScraperTestCase):
    """Test code block extraction and inclusion in references"""

    def test_code_blocks_included_in_references(self):
        """Test that code blocks are included in reference files"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        # Mock data with code blocks
        converter.extracted_data = {
            "pages": [
                {
                    "page_number": 1,
                    "text": "Example code",
                    "code_blocks": [
                        {
                            "code": "def hello():\n print('world')",
                            "language": "python",
                            "quality": 8.0,
                        }
                    ],
                    "images": [],
                }
            ],
            "total_pages": 1,
        }

        converter.build_skill()

        # Check code block in reference file
        # NOTE(review): assumes the converter writes UTF-8 - confirm.
        content = self._reference_file().read_text(encoding="utf-8")

        self.assertIn("```python", content)
        self.assertIn("def hello()", content)
        self.assertIn("print('world')", content)

    def test_high_quality_code_preferred(self):
        """Test that high-quality code blocks are prioritized"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        # Mock data with varying quality
        converter.extracted_data = {
            "pages": [
                {
                    "page_number": 1,
                    "text": "Code examples",
                    "code_blocks": [
                        {"code": "x = 1", "language": "python", "quality": 2.0},
                        {
                            "code": "def process():\n return result",
                            "language": "python",
                            "quality": 9.0,
                        },
                    ],
                    "images": [],
                }
            ],
            "total_pages": 1,
        }

        converter.build_skill()

        # NOTE(review): assumes the converter writes UTF-8 - confirm.
        content = self._reference_file().read_text(encoding="utf-8")

        # High quality code should be included
        self.assertIn("def process()", content)


class TestImageHandling(_PDFScraperTestCase):
    """Test image extraction and handling"""

    def test_images_saved_to_assets(self):
        """Test that images are saved to assets directory"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        converter.extracted_data = {
            "pages": [
                {
                    "page_number": 1,
                    "text": "See diagram",
                    "code_blocks": [],
                    "images": [
                        {
                            "page": 1,
                            "index": 0,
                            "width": 100,
                            "height": 100,
                            "data": _MOCK_PNG_BYTES,
                        }
                    ],
                }
            ],
            "total_pages": 1,
        }
        converter.categories = {"diagrams": [converter.extracted_data["pages"][0]]}

        converter.build_skill()

        # Check assets directory has image
        assets_dir = self.output_dir / "test_skill" / "assets"
        image_files = list(assets_dir.glob("*.png"))
        self.assertGreater(len(image_files), 0)

    def test_image_references_in_markdown(self):
        """Test that images are referenced in markdown files"""
        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self._converter_in_temp_dir(config)

        converter.extracted_data = {
            "pages": [
                {
                    "page_number": 1,
                    "text": "Architecture diagram",
                    "code_blocks": [],
                    "images": [
                        {
                            "page": 1,
                            "index": 0,
                            "width": 200,
                            "height": 150,
                            "data": _MOCK_PNG_BYTES,
                        }
                    ],
                }
            ],
            "total_pages": 1,
        }

        converter.build_skill()

        # Check markdown has image reference
        # NOTE(review): assumes the converter writes UTF-8 - confirm.
        content = self._reference_file().read_text(encoding="utf-8")

        self.assertIn("![", content)  # Markdown image syntax
        self.assertIn("../assets/", content)  # Relative path to assets


class TestErrorHandling(_PDFScraperTestCase):
    """Test error handling for invalid inputs"""

    def test_missing_pdf_file(self):
        """Test error when PDF file doesn't exist"""
        config = {"name": "test", "pdf_path": "nonexistent.pdf"}
        converter = self.PDFToSkillConverter(config)

        with self.assertRaises((FileNotFoundError, RuntimeError)):
            converter.extract_pdf()

    def test_invalid_config_file(self):
        """Test error when config dict is invalid"""
        invalid_config = "invalid string not a dict"

        with self.assertRaises((ValueError, TypeError, AttributeError)):
            self.PDFToSkillConverter(invalid_config)

    def test_missing_required_config_fields(self):
        """Test error when config is missing required fields"""
        config = {"description": "Missing name and pdf_path"}

        # Either construction or extraction may be the step that rejects the
        # config, so both stay inside the assertRaises block.
        with self.assertRaises((ValueError, KeyError)):
            converter = self.PDFToSkillConverter(config)
            converter.extract_pdf()


class TestJSONWorkflow(_PDFScraperTestCase):
    """Test building skills from extracted JSON"""

    def test_load_from_json(self):
        """Test loading extracted data from JSON file"""
        # Create mock extracted JSON
        extracted_data = {
            "pages": [{"page_number": 1, "text": "Test content", "code_blocks": [], "images": []}],
            "total_pages": 1,
            "metadata": {"title": "Test PDF"},
        }

        json_path = self.output_dir / "extracted.json"
        json_path.write_text(json.dumps(extracted_data, indent=2), encoding="utf-8")

        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self.PDFToSkillConverter(config)
        converter.load_extracted_data(str(json_path))

        self.assertEqual(converter.extracted_data["total_pages"], 1)
        self.assertEqual(len(converter.extracted_data["pages"]), 1)

    def test_build_from_json_without_extraction(self):
        """Test that from_json workflow skips PDF extraction"""
        extracted_data = {
            "pages": [{"page_number": 1, "text": "Content", "code_blocks": [], "images": []}],
            "total_pages": 1,
        }

        json_path = self.output_dir / "extracted.json"
        json_path.write_text(json.dumps(extracted_data), encoding="utf-8")

        config = {"name": "test_skill", "pdf_path": "test.pdf"}
        converter = self.PDFToSkillConverter(config)
        converter.load_extracted_data(str(json_path))

        # Should have data loaded without calling extract_pdf()
        self.assertIsNotNone(converter.extracted_data)
        self.assertEqual(converter.extracted_data["total_pages"], 1)


if __name__ == "__main__":
    unittest.main()