""" Tests for EPUB scraper (epub_scraper.py). Covers: initialization, extraction, categorization, skill building, code blocks, tables, images, error handling, JSON workflow, CLI arguments, helper functions, source detection, DRM detection, and edge cases. Tests use mock data and do not require actual EPUB files or ebooklib installed. """ import json import os import shutil import tempfile import unittest from pathlib import Path from unittest.mock import MagicMock, patch # Conditional import (same pattern as test_word_scraper.py) try: import ebooklib EPUB_AVAILABLE = True except ImportError: EPUB_AVAILABLE = False try: from skill_seekers.cli.epub_scraper import ( EpubToSkillConverter, _build_section, _extract_table_from_html, _score_code_quality, infer_description_from_epub, ) IMPORT_OK = True except ImportError: IMPORT_OK = False def _make_sample_extracted_data( num_sections=2, include_code=False, include_tables=False, include_images=False, ) -> dict: """Create minimal extracted_data dict for testing.""" sections = [] total_code = 0 total_images = 0 languages = {} for i in range(1, num_sections + 1): section = { "section_number": i, "heading": f"Chapter {i}", "heading_level": "h1", "text": f"Content of chapter {i}. This is sample text.", "headings": [{"level": "h2", "text": f"Section {i}.1"}], "code_samples": [], "tables": [], "images": [], } if include_code: section["code_samples"] = [ { "code": f"def func_{i}():\n return {i}", "language": "python", "quality_score": 7.5, }, { "code": f"console.log({i})", "language": "javascript", "quality_score": 4.0, }, ] total_code += 2 languages["python"] = languages.get("python", 0) + 1 languages["javascript"] = languages.get("javascript", 0) + 1 if include_tables: section["tables"] = [{"headers": ["Name", "Value"], "rows": [["key", "val"]]}] if include_images: section["images"] = [ {"index": 0, "data": b"\x89PNG\r\n\x1a\n", "width": 100, "height": 100} ] total_images += 1 sections.append(section) return { "source_file": "test.epub", "metadata": { "title": "Test Book", "author": "Test Author", "language": "en", "publisher": "Test Publisher", "date": "2024-01-01", "description": "A test book for unit testing", "subject": "Testing, Unit Tests", "rights": "Copyright 2024", "identifier": "urn:uuid:12345", }, "total_sections": num_sections, "total_code_blocks": total_code, "total_images": total_images, "languages_detected": languages, "pages": sections, } # ============================================================================ # Class 1: TestEpubToSkillConverterInit # ============================================================================ class TestEpubToSkillConverterInit(unittest.TestCase): """Test EpubToSkillConverter initialization.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_init_with_name_and_epub_path(self): config = {"name": "test_skill", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "test_skill") self.assertEqual(converter.epub_path, "test.epub") def test_init_with_full_config(self): config = { "name": "mybook", "epub_path": "/path/to/book.epub", "description": "Custom description", "categories": {"ch1": ["intro"]}, } converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "mybook") self.assertEqual(converter.epub_path, "/path/to/book.epub") self.assertEqual(converter.description, "Custom description") self.assertEqual(converter.categories, {"ch1": ["intro"]}) def test_default_description_uses_name(self): config = {"name": "test_skill"} converter = EpubToSkillConverter(config) self.assertIn("test_skill", converter.description) self.assertTrue(converter.description.startswith("Use when referencing")) def test_skill_dir_uses_name(self): config = {"name": "mybook"} converter = EpubToSkillConverter(config) self.assertEqual(converter.skill_dir, "output/mybook") def test_data_file_uses_name(self): config = {"name": "mybook"} converter = EpubToSkillConverter(config) self.assertEqual(converter.data_file, "output/mybook_extracted.json") def test_init_requires_name(self): with self.assertRaises(KeyError): EpubToSkillConverter({}) def test_init_empty_name(self): config = {"name": ""} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "") def test_init_with_special_characters_in_name(self): config = {"name": "my-book name_2024"} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "my-book name_2024") self.assertIn("my-book name_2024", converter.skill_dir) # ============================================================================ # Class 2: TestEpubExtraction # ============================================================================ class TestEpubExtraction(unittest.TestCase): """Test EPUB content extraction.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") if not EPUB_AVAILABLE: self.skipTest("ebooklib not installed") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def _make_mock_book(self, spine_content=None, metadata=None, images=None): """Create a mock ebooklib EpubBook.""" book = MagicMock() if metadata is None: metadata = { "title": [("Test Book", {})], "creator": [("Test Author", {})], "language": [("en", {})], "publisher": [("Test Publisher", {})], "date": [("2024-01-01", {})], "description": [("A test book", {})], "subject": [("Testing", {})], "rights": [("Copyright 2024", {})], "identifier": [("urn:uuid:12345", {})], } def get_metadata(ns, key): if ns == "DC": return metadata.get(key, []) return [] book.get_metadata = get_metadata # Spine items if spine_content is None: spine_content = [ ( "ch1", "

Chapter 1

Content 1

", ), ] spine_items = [] items_dict = {} for item_id, content in spine_content: item = MagicMock() item.get_type.return_value = ebooklib.ITEM_DOCUMENT item.get_content.return_value = content.encode("utf-8") items_dict[item_id] = item spine_items.append((item_id, "yes")) book.spine = spine_items book.get_item_with_id = lambda x: items_dict.get(x) # Images if images is None: images = [] img_items = [] for img in images: img_item = MagicMock() img_item.media_type = img.get("media_type", "image/png") img_item.get_content.return_value = img.get("data", b"\x89PNG") img_item.file_name = img.get("file_name", "image.png") img_items.append(img_item) book.get_items_of_type = lambda t: img_items if t == ebooklib.ITEM_IMAGE else [] # All items (for DRM detection, SVG counting) all_items = list(items_dict.values()) + img_items book.get_items = lambda: all_items return book @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_basic_epub(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book() mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") result = converter.extract_epub() self.assertTrue(result) self.assertIsNotNone(converter.extracted_data) self.assertGreaterEqual(len(converter.extracted_data["pages"]), 1) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_metadata(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book() mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() metadata = converter.extracted_data["metadata"] self.assertEqual(metadata["title"], "Test Book") self.assertEqual(metadata["author"], "Test Author") self.assertEqual(metadata["language"], "en") @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_multiple_chapters(self, mock_isfile, mock_exists, mock_epub): spine = [ ("ch1", "

Chapter 1

Text 1

"), ("ch2", "

Chapter 2

Text 2

"), ("ch3", "

Chapter 3

Text 3

"), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() self.assertEqual(len(converter.extracted_data["pages"]), 3) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_code_blocks(self, mock_isfile, mock_exists, mock_epub): spine = [ ( "ch1", "

Code Chapter

" '
def hello():\n    print("hi")
' "", ), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() code_samples = converter.extracted_data["pages"][0]["code_samples"] self.assertGreaterEqual(len(code_samples), 1) self.assertEqual(code_samples[0]["language"], "python") @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_images(self, mock_isfile, mock_exists, mock_epub): images = [{"media_type": "image/png", "data": b"\x89PNG", "file_name": "fig1.png"}] mock_book = self._make_mock_book(images=images) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() self.assertGreaterEqual(converter.extracted_data["total_images"], 1) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_heading_boundary_splitting(self, mock_isfile, mock_exists, mock_epub): spine = [ ( "ch1", "" "

First Heading

First content

" "

Second Heading

Second content

" "", ), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() pages = converter.extracted_data["pages"] self.assertEqual(len(pages), 2) self.assertEqual(pages[0]["heading"], "First Heading") self.assertEqual(pages[1]["heading"], "Second Heading") def test_extract_missing_file_raises_error(self): config = {"name": "test", "epub_path": "/nonexistent/book.epub"} converter = EpubToSkillConverter(config) with self.assertRaises(FileNotFoundError): converter.extract_epub() def test_extract_invalid_extension_raises_error(self): # Create a real file with wrong extension bad_file = os.path.join(self.temp_dir, "test.txt") Path(bad_file).write_text("not an epub") config = {"name": "test", "epub_path": bad_file} converter = EpubToSkillConverter(config) with self.assertRaises(ValueError): converter.extract_epub() def test_extract_deps_not_installed(self): from skill_seekers.cli.epub_scraper import _check_epub_deps with patch("skill_seekers.cli.epub_scraper.EPUB_AVAILABLE", False): with self.assertRaises(RuntimeError) as ctx: _check_epub_deps() self.assertIn("ebooklib", str(ctx.exception)) self.assertIn("pip install", str(ctx.exception)) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_empty_spine(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book(spine_content=[]) mock_book.spine = [] mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() self.assertEqual(len(converter.extracted_data["pages"]), 0) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_spine_item_no_body(self, mock_isfile, mock_exists, mock_epub): spine = [ ("ch1", "No Body"), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") # Should not crash — body fallback to soup converter.extract_epub() self.assertIsNotNone(converter.extracted_data) # ============================================================================ # Class 3: TestEpubDrmDetection # ============================================================================ class TestEpubDrmDetection(unittest.TestCase): """Test DRM detection logic.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def _make_converter(self): config = {"name": "test", "epub_path": "test.epub"} return EpubToSkillConverter(config) def _make_book_with_encryption(self, encryption_xml_content): """Create a mock book with META-INF/encryption.xml.""" book = MagicMock() enc_item = MagicMock() enc_item.file_name = "META-INF/encryption.xml" enc_item.get_content.return_value = encryption_xml_content.encode("utf-8") book.get_items.return_value = [enc_item] return book def test_no_drm_detected(self): converter = self._make_converter() book = MagicMock() book.get_items.return_value = [] self.assertFalse(converter._detect_drm(book)) def test_drm_detected_adobe_adept(self): converter = self._make_converter() xml = '' book = self._make_book_with_encryption(xml) self.assertTrue(converter._detect_drm(book)) def test_drm_detected_apple_fairplay(self): converter = self._make_converter() xml = '' book = self._make_book_with_encryption(xml) self.assertTrue(converter._detect_drm(book)) def test_drm_detected_readium_lcp(self): converter = self._make_converter() xml = '' book = self._make_book_with_encryption(xml) self.assertTrue(converter._detect_drm(book)) def test_font_obfuscation_not_drm(self): converter = self._make_converter() xml = ( "" '' "" ) book = self._make_book_with_encryption(xml) self.assertFalse(converter._detect_drm(book)) def test_drm_error_message_is_clear(self): converter = self._make_converter() xml = '' book = self._make_book_with_encryption(xml) self.assertTrue(converter._detect_drm(book)) # The error message is raised in extract_epub, not _detect_drm # Just confirm detection works # ============================================================================ # Class 4: TestEpubCategorization # ============================================================================ class TestEpubCategorization(unittest.TestCase): """Test content categorization.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_single_source_creates_one_category(self): config = {"name": "test", "epub_path": "mybook.epub"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=3) categories = converter.categorize_content() self.assertEqual(len(categories), 1) self.assertIn("mybook", categories) def test_keyword_categorization(self): config = { "name": "test", "categories": { "intro": ["introduction", "getting started"], "advanced": ["advanced", "deep dive"], }, } converter = EpubToSkillConverter(config) data = _make_sample_extracted_data(num_sections=2) data["pages"][0]["heading"] = "Introduction to Testing" data["pages"][1]["heading"] = "Advanced Techniques" converter.extracted_data = data categories = converter.categorize_content() self.assertIn("intro", categories) self.assertIn("advanced", categories) def test_no_categories_uses_default(self): config = {"name": "test"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=2) categories = converter.categorize_content() self.assertIn("content", categories) self.assertEqual(categories["content"]["title"], "Content") def test_categorize_empty_sections(self): config = {"name": "test"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=0) categories = converter.categorize_content() self.assertIn("content", categories) self.assertEqual(len(categories["content"]["pages"]), 0) def test_categorize_no_keyword_matches(self): config = { "name": "test", "categories": {"intro": ["zzzzz_no_match"]}, } converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=2) categories = converter.categorize_content() self.assertIn("other", categories) self.assertEqual(len(categories["other"]["pages"]), 2) def test_categorize_single_section(self): config = {"name": "test", "epub_path": "book.epub"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=1) categories = converter.categorize_content() total_pages = sum(len(c["pages"]) for c in categories.values()) self.assertEqual(total_pages, 1) def test_categorize_many_sections(self): config = {"name": "test", "epub_path": "book.epub"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=50) categories = converter.categorize_content() total_pages = sum(len(c["pages"]) for c in categories.values()) self.assertEqual(total_pages, 50) def test_categorize_preserves_section_order(self): config = {"name": "test", "epub_path": "book.epub"} converter = EpubToSkillConverter(config) converter.extracted_data = _make_sample_extracted_data(num_sections=5) categories = converter.categorize_content() for cat_data in categories.values(): section_nums = [s["section_number"] for s in cat_data["pages"]] self.assertEqual(section_nums, sorted(section_nums)) # ============================================================================ # Class 5: TestEpubSkillBuilding # ============================================================================ class TestEpubSkillBuilding(unittest.TestCase): """Test skill building (directory structure, SKILL.md, reference files).""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def _make_converter(self, name="test_book", epub_path="test.epub"): config = {"name": name, "epub_path": epub_path} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, name) converter.data_file = os.path.join(self.temp_dir, f"{name}_extracted.json") return converter def test_build_creates_directory_structure(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_dir = Path(self.temp_dir) / "test_book" self.assertTrue(skill_dir.exists()) self.assertTrue((skill_dir / "references").exists()) self.assertTrue((skill_dir / "scripts").exists()) self.assertTrue((skill_dir / "assets").exists()) def test_build_generates_skill_md(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_md = Path(self.temp_dir) / "test_book" / "SKILL.md" self.assertTrue(skill_md.exists()) content = skill_md.read_text() self.assertIn("---", content) self.assertIn("name:", content) self.assertIn("description:", content) def test_build_generates_reference_files(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() refs_dir = Path(self.temp_dir) / "test_book" / "references" md_files = list(refs_dir.glob("*.md")) # At least index.md + one reference file self.assertGreaterEqual(len(md_files), 2) def test_build_generates_index(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() index_path = Path(self.temp_dir) / "test_book" / "references" / "index.md" self.assertTrue(index_path.exists()) content = index_path.read_text() self.assertIn("Categories", content) self.assertIn("Statistics", content) def test_skill_md_contains_metadata(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_md = Path(self.temp_dir) / "test_book" / "SKILL.md" content = skill_md.read_text() self.assertIn("Test Book", content) self.assertIn("Test Author", content) def test_skill_md_yaml_frontmatter(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_md = Path(self.temp_dir) / "test_book" / "SKILL.md" content = skill_md.read_text() # YAML frontmatter starts and ends with --- lines = content.split("\n") self.assertEqual(lines[0], "---") # Find closing --- closing_idx = None for i, line in enumerate(lines[1:], 1): if line == "---": closing_idx = i break self.assertIsNotNone(closing_idx) def test_build_without_extracted_data_fails(self): converter = self._make_converter() converter.extracted_data = None with self.assertRaises((AttributeError, TypeError)): converter.build_skill() def test_build_overwrites_existing_output(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data() # Build once converter.build_skill() skill_md_1 = (Path(self.temp_dir) / "test_book" / "SKILL.md").read_text() # Build again converter.build_skill() skill_md_2 = (Path(self.temp_dir) / "test_book" / "SKILL.md").read_text() self.assertEqual(skill_md_1, skill_md_2) def test_build_with_long_name(self): long_name = "a" * 100 converter = self._make_converter(name=long_name) converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_md = Path(converter.skill_dir) / "SKILL.md" content = skill_md.read_text() # Name in frontmatter is truncated to 64 chars lines = content.split("\n") for line in lines: if line.startswith("name:"): name_val = line.split(":", 1)[1].strip() self.assertLessEqual(len(name_val), 64) break def test_build_with_unicode_content(self): converter = self._make_converter() data = _make_sample_extracted_data() data["pages"][0]["heading"] = ( "Unicode: \u4e2d\u6587 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 \U0001f600" ) data["pages"][0]["text"] = ( "Content with CJK: \u4f60\u597d, Arabic: \u0645\u0631\u062d\u0628\u0627, Emoji: \U0001f680" ) converter.extracted_data = data converter.build_skill() refs_dir = Path(self.temp_dir) / "test_book" / "references" md_files = list(refs_dir.glob("*.md")) # Should have reference files self.assertGreaterEqual(len(md_files), 1) # Unicode should be preserved in at least one file found_unicode = False for f in md_files: content = f.read_text(encoding="utf-8") if "\u4e2d\u6587" in content or "\u4f60\u597d" in content: found_unicode = True break self.assertTrue(found_unicode) # ============================================================================ # Class 6: TestEpubCodeBlocks # ============================================================================ class TestEpubCodeBlocks(unittest.TestCase): """Test code block extraction and rendering.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def _make_converter(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") return converter def test_code_blocks_included_in_reference_files(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data(include_code=True) converter.build_skill() refs_dir = Path(self.temp_dir) / "test" / "references" found_code = False for f in refs_dir.glob("*.md"): if f.name == "index.md": continue content = f.read_text() if "```python" in content or "def func_" in content: found_code = True break self.assertTrue(found_code) def test_code_blocks_in_skill_md_top_15(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data(num_sections=10, include_code=True) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" content = skill_md.read_text() self.assertIn("Code Examples", content) def test_code_language_grouped(self): converter = self._make_converter() converter.extracted_data = _make_sample_extracted_data(num_sections=3, include_code=True) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" content = skill_md.read_text() self.assertIn("Python Examples", content) self.assertIn("Javascript Examples", content) def test_empty_code_block(self): from bs4 import BeautifulSoup html = "
" soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Test", "h1", elements) self.assertEqual(len(section["code_samples"]), 0) def test_code_block_with_html_entities(self): from bs4 import BeautifulSoup html = "
if (x < 10 && y > 5) {}
" soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Test", "h1", elements) self.assertEqual(len(section["code_samples"]), 1) code = section["code_samples"][0]["code"] self.assertIn("<", code) self.assertIn(">", code) self.assertIn("&&", code) def test_code_block_with_syntax_highlighting_spans(self): from bs4 import BeautifulSoup html = ( '
def '
            'foo():
' ) soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Test", "h1", elements) self.assertEqual(len(section["code_samples"]), 1) code = section["code_samples"][0]["code"] self.assertIn("def", code) self.assertIn("foo", code) self.assertNotIn("NameAge" "Alice30" ) soup = BeautifulSoup(html, "html.parser") table = soup.find("table") result = _extract_table_from_html(table) self.assertIsNotNone(result) self.assertEqual(result["headers"], ["Name", "Age"]) self.assertEqual(result["rows"], [["Alice", "30"]]) def test_table_no_thead(self): from bs4 import BeautifulSoup html = ( "" "
Header1Header2
Val1Val2
" ) soup = BeautifulSoup(html, "html.parser") table = soup.find("table") result = _extract_table_from_html(table) self.assertIsNotNone(result) self.assertEqual(result["headers"], ["Header1", "Header2"]) self.assertEqual(result["rows"], [["Val1", "Val2"]]) def test_empty_table(self): from bs4 import BeautifulSoup html = "
" soup = BeautifulSoup(html, "html.parser") table = soup.find("table") result = _extract_table_from_html(table) self.assertIsNone(result) def test_table_with_colspan_rowspan(self): from bs4 import BeautifulSoup html = ( "" "" "
H1H2
ABC
DE
" ) soup = BeautifulSoup(html, "html.parser") table = soup.find("table") # Should not crash result = _extract_table_from_html(table) self.assertIsNotNone(result) # ============================================================================ # Class 8: TestEpubImages # ============================================================================ class TestEpubImages(unittest.TestCase): """Test image extraction and handling.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_images_saved_to_assets(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data(include_images=True) converter.extracted_data = data converter.build_skill() assets_dir = Path(self.temp_dir) / "test" / "assets" self.assertTrue(assets_dir.exists()) def test_image_references_in_markdown(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data(include_images=True) converter.extracted_data = data converter.build_skill() refs_dir = Path(self.temp_dir) / "test" / "references" found_img_ref = False for f in refs_dir.glob("*.md"): if f.name == "index.md": continue content = f.read_text() if "![Image" in content and "../assets/" in content: found_img_ref = True break self.assertTrue(found_img_ref) def test_image_with_zero_bytes(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data() # Add image with empty data data["pages"][0]["images"] = [{"index": 0, "data": b"", "width": 0, "height": 0}] converter.extracted_data = data # Should not crash converter.build_skill() def test_svg_images_handled(self): from bs4 import BeautifulSoup html = '' soup = BeautifulSoup(f"
{html}
", "html.parser") elements = list(soup.find("div").children) section = _build_section(1, "Test", "h1", elements) self.assertEqual(len(section["images"]), 1) def test_image_filename_conflicts(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data() # Multiple images with unique indexes data["pages"][0]["images"] = [ {"index": 0, "data": b"\x89PNG\r\n\x1a\n", "width": 50, "height": 50}, {"index": 1, "data": b"\x89PNG\r\n\x1a\n", "width": 50, "height": 50}, ] converter.extracted_data = data converter.build_skill() assets_dir = Path(self.temp_dir) / "test" / "assets" png_files = list(assets_dir.glob("*.png")) self.assertGreaterEqual(len(png_files), 2) def test_cover_image_identified(self): from bs4 import BeautifulSoup html = '' soup = BeautifulSoup(f"
{html}
", "html.parser") elements = list(soup.find("div").children) section = _build_section(1, "Cover", "h1", elements) self.assertEqual(len(section["images"]), 1) def test_many_images(self): config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data() data["pages"][0]["images"] = [ {"index": i, "data": b"\x89PNG\r\n\x1a\n", "width": 10, "height": 10} for i in range(100) ] converter.extracted_data = data # Should handle 100+ images without error converter.build_skill() # ============================================================================ # Class 9: TestEpubErrorHandling # ============================================================================ class TestEpubErrorHandling(unittest.TestCase): """Test error handling for various failure scenarios.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") if not EPUB_AVAILABLE: self.skipTest("ebooklib not installed") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_missing_epub_file_raises_error(self): config = {"name": "test", "epub_path": "/nonexistent/path/test.epub"} converter = EpubToSkillConverter(config) with self.assertRaises(FileNotFoundError): converter.extract_epub() def test_not_a_file_raises_error(self): config = {"name": "test", "epub_path": self.temp_dir} converter = EpubToSkillConverter(config) with self.assertRaises((ValueError, FileNotFoundError)): converter.extract_epub() def test_not_epub_extension_raises_error(self): txt_file = os.path.join(self.temp_dir, "test.txt") Path(txt_file).write_text("not an epub") config = {"name": "test", "epub_path": txt_file} converter = EpubToSkillConverter(config) with self.assertRaises(ValueError): converter.extract_epub() @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_corrupted_epub_raises_error(self, mock_isfile, mock_exists, mock_epub): mock_epub.read_epub.side_effect = Exception("Bad ZIP file") config = {"name": "test", "epub_path": "corrupted.epub"} converter = EpubToSkillConverter(config) with self.assertRaises(ValueError): converter.extract_epub() @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_drm_protected_raises_error(self, mock_isfile, mock_exists, mock_epub): book = MagicMock() enc_item = MagicMock() enc_item.file_name = "META-INF/encryption.xml" enc_item.get_content.return_value = ( b'' ) book.get_items.return_value = [enc_item] book.get_metadata.return_value = [] mock_epub.read_epub.return_value = book config = {"name": "test", "epub_path": "drm.epub"} converter = EpubToSkillConverter(config) with self.assertRaises(RuntimeError) as ctx: converter.extract_epub() self.assertIn("DRM", str(ctx.exception)) def test_ebooklib_not_installed_error(self): from skill_seekers.cli.epub_scraper import _check_epub_deps with patch("skill_seekers.cli.epub_scraper.EPUB_AVAILABLE", False): with self.assertRaises(RuntimeError) as ctx: _check_epub_deps() self.assertIn("ebooklib", str(ctx.exception)) self.assertIn("pip install", str(ctx.exception)) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_malformed_xhtml_handled_gracefully(self, mock_isfile, mock_exists, mock_epub): """Malformed XHTML should not crash thanks to BeautifulSoup tolerant parsing.""" book = MagicMock() item = MagicMock() item.get_type.return_value = ebooklib.ITEM_DOCUMENT item.get_content.return_value = b"

Test

Unclosed tags

and more" book.spine = [("ch1", "yes")] book.get_item_with_id = lambda _x: item book.get_metadata.return_value = [] book.get_items_of_type = lambda _t: [] book.get_items = lambda: [item] mock_epub.read_epub.return_value = book config = {"name": "test", "epub_path": "malformed.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") # Should not crash result = converter.extract_epub() self.assertTrue(result) # ============================================================================ # Class 10: TestEpubJSONWorkflow # ============================================================================ class TestEpubJSONWorkflow(unittest.TestCase): """Test JSON-based workflow (load/save extracted data).""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_load_extracted_json(self): config = {"name": "test"} converter = EpubToSkillConverter(config) data = _make_sample_extracted_data() json_path = os.path.join(self.temp_dir, "test_extracted.json") with open(json_path, "w") as f: json.dump(data, f) result = converter.load_extracted_data(json_path) self.assertTrue(result) self.assertIsNotNone(converter.extracted_data) self.assertEqual(converter.extracted_data["total_sections"], 2) def test_build_from_json(self): config = {"name": "test"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data() json_path = os.path.join(self.temp_dir, "test_extracted.json") with open(json_path, "w") as f: json.dump(data, f) converter.load_extracted_data(json_path) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" self.assertTrue(skill_md.exists()) def test_json_round_trip(self): config = {"name": "test"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") original_data = _make_sample_extracted_data(include_code=True, include_tables=True) # Save json_path = os.path.join(self.temp_dir, "test_extracted.json") with open(json_path, "w") as f: json.dump(original_data, f, default=str) # Load converter.load_extracted_data(json_path) self.assertEqual( converter.extracted_data["total_sections"], original_data["total_sections"], ) self.assertEqual( converter.extracted_data["total_code_blocks"], original_data["total_code_blocks"], ) def test_load_invalid_json(self): config = {"name": "test"} converter = EpubToSkillConverter(config) bad_json = os.path.join(self.temp_dir, "bad.json") Path(bad_json).write_text("{invalid json content") with self.assertRaises(json.JSONDecodeError): converter.load_extracted_data(bad_json) def test_load_nonexistent_json(self): config = {"name": "test"} converter = EpubToSkillConverter(config) with self.assertRaises(FileNotFoundError): converter.load_extracted_data("/nonexistent/path/data.json") def test_json_with_missing_fields(self): config = {"name": "test"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") # Minimal JSON — missing optional fields minimal_data = { "pages": [ { "section_number": 1, "heading": "Test", "heading_level": "h1", "text": "Content", "headings": [], "code_samples": [], "tables": [], "images": [], } ], "metadata": {"title": "Test"}, } json_path = os.path.join(self.temp_dir, "minimal.json") with open(json_path, "w") as f: json.dump(minimal_data, f) converter.load_extracted_data(json_path) # Should not crash when building converter.build_skill() # ============================================================================ # Class 11: TestEpubCLIArguments # ============================================================================ class TestEpubCLIArguments(unittest.TestCase): """Test CLI argument parsing.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") def _parse_args(self, args_list): import argparse from skill_seekers.cli.arguments.epub import add_epub_arguments parser = argparse.ArgumentParser() add_epub_arguments(parser) return parser.parse_args(args_list) def test_epub_flag_accepted(self): args = self._parse_args(["--epub", "book.epub"]) self.assertEqual(args.epub, "book.epub") def test_from_json_flag_accepted(self): args = self._parse_args(["--from-json", "data.json"]) self.assertEqual(args.from_json, "data.json") def test_name_flag_accepted(self): args = self._parse_args(["--epub", "book.epub", "--name", "mybook"]) self.assertEqual(args.name, "mybook") def test_enhance_level_default_zero(self): args = self._parse_args(["--epub", "book.epub"]) self.assertEqual(args.enhance_level, 0) def test_dry_run_flag(self): args = self._parse_args(["--epub", "book.epub", "--dry-run"]) self.assertTrue(args.dry_run) def test_no_args_accepted(self): # Parser itself doesn't enforce --epub or --from-json — main() does args = self._parse_args([]) self.assertIsNone(getattr(args, "epub", None)) def test_verbose_flag(self): args = self._parse_args(["--epub", "book.epub", "--verbose"]) self.assertTrue(args.verbose) def test_quiet_flag(self): args = self._parse_args(["--epub", "book.epub", "--quiet"]) self.assertTrue(args.quiet) # ============================================================================ # Class 12: TestEpubHelperFunctions # ============================================================================ class TestEpubHelperFunctions(unittest.TestCase): """Test module-level helper functions.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") def test_infer_description_from_metadata_description(self): metadata = {"description": "A comprehensive guide to testing software"} result = infer_description_from_epub(metadata) self.assertTrue(result.startswith("Use when")) self.assertIn("testing", result.lower()) def test_infer_description_from_metadata_title(self): metadata = {"title": "Programming Rust, 2nd Edition"} result = infer_description_from_epub(metadata) self.assertIn("programming rust", result.lower()) def test_infer_description_fallback(self): result = infer_description_from_epub(name="mybook") self.assertIn("mybook", result) def test_infer_description_empty_metadata(self): result = infer_description_from_epub({}) self.assertEqual(result, "Use when referencing this documentation") def test_score_code_quality_ranges(self): self.assertEqual(_score_code_quality(""), 0.0) score = _score_code_quality("x = 1") self.assertGreaterEqual(score, 0.0) self.assertLessEqual(score, 10.0) # Long code with functions scores higher long_code = "\n".join([f"def func_{i}():" for i in range(15)] + [" return True"]) score_long = _score_code_quality(long_code) self.assertGreater(score_long, score) def test_sanitize_filename(self): config = {"name": "test"} converter = EpubToSkillConverter(config) self.assertEqual(converter._sanitize_filename("Hello World!"), "hello_world") self.assertEqual(converter._sanitize_filename("my-file_name"), "my_file_name") self.assertEqual( converter._sanitize_filename("Test: Special & Chars"), "test_special_chars" ) # ============================================================================ # Class 13: TestEpubSourceDetection # ============================================================================ class TestEpubSourceDetection(unittest.TestCase): """Test source detection for EPUB files.""" def setUp(self): try: from skill_seekers.cli.source_detector import SourceDetector self.SourceDetector = SourceDetector except ImportError: self.skipTest("source_detector not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_epub_detected_as_epub_type(self): result = self.SourceDetector.detect("test.epub") self.assertEqual(result.type, "epub") def test_epub_suggested_name(self): result = self.SourceDetector.detect("my-ebook.epub") self.assertEqual(result.suggested_name, "my-ebook") def test_epub_validation_missing_file(self): result = self.SourceDetector.detect("/nonexistent/book.epub") with self.assertRaises(ValueError): self.SourceDetector.validate_source(result) def test_epub_validation_not_a_file(self): result = self.SourceDetector.detect(self.temp_dir + ".epub") # Path doesn't end with .epub but let's test a directory that would be detected dir_path = os.path.join(self.temp_dir, "test.epub") os.makedirs(dir_path) # Create a directory with .epub name result = self.SourceDetector.detect(dir_path) with self.assertRaises(ValueError): self.SourceDetector.validate_source(result) def test_epub_with_path(self): result = self.SourceDetector.detect("./books/test.epub") self.assertEqual(result.type, "epub") self.assertEqual(result.parsed["file_path"], "./books/test.epub") def test_pdf_still_detected(self): """Regression test: .pdf files still detected as pdf type.""" result = self.SourceDetector.detect("document.pdf") self.assertEqual(result.type, "pdf") # ============================================================================ # Class 14: TestEpubEdgeCases # ============================================================================ class TestEpubEdgeCases(unittest.TestCase): """Test edge cases per W3C EPUB 3.3 spec.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_epub_no_toc(self): """EPUB without TOC should still extract using spine order.""" config = {"name": "test"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extracted_data = _make_sample_extracted_data() converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" self.assertTrue(skill_md.exists()) def test_epub_empty_chapters(self): """Chapters with no text content handled gracefully.""" # Empty body — no elements to process section = _build_section(1, "Empty", "h1", []) self.assertEqual(section["text"], "") self.assertEqual(section["code_samples"], []) def test_epub_single_chapter(self): """Single chapter produces valid output.""" config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extracted_data = _make_sample_extracted_data(num_sections=1) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" self.assertTrue(skill_md.exists()) content = skill_md.read_text() self.assertIn("Chapter 1", content) def test_epub_unicode_content(self): """CJK, Arabic, Cyrillic, emoji text preserved.""" from bs4 import BeautifulSoup html = "

\u4f60\u597d\u4e16\u754c \u041f\u0440\u0438\u0432\u0435\u0442 \U0001f600

" soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Unicode", "h1", elements) self.assertIn("\u4f60\u597d", section["text"]) self.assertIn("\U0001f600", section["text"]) def test_epub_large_section_count(self): """100+ sections processed without error.""" config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extracted_data = _make_sample_extracted_data(num_sections=100) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" self.assertTrue(skill_md.exists()) def test_epub_nested_headings(self): """h3/h4/h5/h6 become sub-headings within sections.""" from bs4 import BeautifulSoup html = ( "

Sub-section A

" "

Content A

" "

Sub-sub-section B

" "

Content B

" "
Deep heading
" "
Deepest heading
" ) soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Main", "h1", elements) self.assertEqual(len(section["headings"]), 4) self.assertEqual(section["headings"][0]["level"], "h3") self.assertEqual(section["headings"][0]["text"], "Sub-section A") self.assertEqual(section["headings"][3]["level"], "h6") def test_fixed_layout_detected(self): """Fixed-layout EPUB — we extract whatever text exists.""" config = {"name": "test"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") data = _make_sample_extracted_data(num_sections=1) data["pages"][0]["text"] = "Some text from fixed-layout EPUB" converter.extracted_data = data converter.build_skill() refs_dir = Path(self.temp_dir) / "test" / "references" found = False for f in refs_dir.glob("*.md"): if "fixed-layout" in f.read_text(): found = True break self.assertTrue(found) def test_epub2_vs_epub3(self): """Both EPUB 2 and EPUB 3 use the same code path — verify section building works.""" from bs4 import BeautifulSoup # EPUB 2 style (simpler XHTML) html2 = "

EPUB 2 content

" soup2 = BeautifulSoup(html2, "html.parser") section2 = _build_section(1, "EPUB 2 Chapter", "h1", list(soup2.children)) self.assertIn("EPUB 2 content", section2["text"]) # EPUB 3 style (HTML5-ish XHTML) html3 = "

EPUB 3 content

" soup3 = BeautifulSoup(html3, "html.parser") section3 = _build_section(1, "EPUB 3 Chapter", "h1", list(soup3.children)) self.assertIn("EPUB 3 content", section3["text"]) if __name__ == "__main__": unittest.main()