""" Tests for EPUB scraper (epub_scraper.py). Covers: initialization, extraction, categorization, skill building, code blocks, tables, images, error handling, JSON workflow, CLI arguments, helper functions, source detection, DRM detection, and edge cases. Tests use mock data and do not require actual EPUB files or ebooklib installed. """ import json import os import shutil import tempfile import unittest from pathlib import Path from unittest.mock import MagicMock, patch # Conditional import (same pattern as test_word_scraper.py) try: import ebooklib EPUB_AVAILABLE = True except ImportError: EPUB_AVAILABLE = False try: from skill_seekers.cli.epub_scraper import ( EpubToSkillConverter, _build_section, _extract_table_from_html, _score_code_quality, infer_description_from_epub, ) IMPORT_OK = True except ImportError: IMPORT_OK = False def _make_sample_extracted_data( num_sections=2, include_code=False, include_tables=False, include_images=False, ) -> dict: """Create minimal extracted_data dict for testing.""" sections = [] total_code = 0 total_images = 0 languages = {} for i in range(1, num_sections + 1): section = { "section_number": i, "heading": f"Chapter {i}", "heading_level": "h1", "text": f"Content of chapter {i}. 
This is sample text.", "headings": [{"level": "h2", "text": f"Section {i}.1"}], "code_samples": [], "tables": [], "images": [], } if include_code: section["code_samples"] = [ { "code": f"def func_{i}():\n return {i}", "language": "python", "quality_score": 7.5, }, { "code": f"console.log({i})", "language": "javascript", "quality_score": 4.0, }, ] total_code += 2 languages["python"] = languages.get("python", 0) + 1 languages["javascript"] = languages.get("javascript", 0) + 1 if include_tables: section["tables"] = [{"headers": ["Name", "Value"], "rows": [["key", "val"]]}] if include_images: section["images"] = [ {"index": 0, "data": b"\x89PNG\r\n\x1a\n", "width": 100, "height": 100} ] total_images += 1 sections.append(section) return { "source_file": "test.epub", "metadata": { "title": "Test Book", "author": "Test Author", "language": "en", "publisher": "Test Publisher", "date": "2024-01-01", "description": "A test book for unit testing", "subject": "Testing, Unit Tests", "rights": "Copyright 2024", "identifier": "urn:uuid:12345", }, "total_sections": num_sections, "total_code_blocks": total_code, "total_images": total_images, "languages_detected": languages, "pages": sections, } # ============================================================================ # Class 1: TestEpubToSkillConverterInit # ============================================================================ class TestEpubToSkillConverterInit(unittest.TestCase): """Test EpubToSkillConverter initialization.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def test_init_with_name_and_epub_path(self): config = {"name": "test_skill", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "test_skill") self.assertEqual(converter.epub_path, "test.epub") def test_init_with_full_config(self): config = { "name": "mybook", 
"epub_path": "/path/to/book.epub", "description": "Custom description", "categories": {"ch1": ["intro"]}, } converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "mybook") self.assertEqual(converter.epub_path, "/path/to/book.epub") self.assertEqual(converter.description, "Custom description") self.assertEqual(converter.categories, {"ch1": ["intro"]}) def test_default_description_uses_name(self): config = {"name": "test_skill"} converter = EpubToSkillConverter(config) self.assertIn("test_skill", converter.description) self.assertTrue(converter.description.startswith("Use when referencing")) def test_skill_dir_uses_name(self): config = {"name": "mybook"} converter = EpubToSkillConverter(config) self.assertEqual(converter.skill_dir, "output/mybook") def test_data_file_uses_name(self): config = {"name": "mybook"} converter = EpubToSkillConverter(config) self.assertEqual(converter.data_file, "output/mybook_extracted.json") def test_init_requires_name(self): with self.assertRaises(KeyError): EpubToSkillConverter({}) def test_init_empty_name(self): config = {"name": ""} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "") def test_init_with_special_characters_in_name(self): config = {"name": "my-book name_2024"} converter = EpubToSkillConverter(config) self.assertEqual(converter.name, "my-book name_2024") self.assertIn("my-book name_2024", converter.skill_dir) # ============================================================================ # Class 2: TestEpubExtraction # ============================================================================ class TestEpubExtraction(unittest.TestCase): """Test EPUB content extraction.""" def setUp(self): if not IMPORT_OK: self.skipTest("epub_scraper not importable") if not EPUB_AVAILABLE: self.skipTest("ebooklib not installed") self.temp_dir = tempfile.mkdtemp() def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) def _make_mock_book(self, spine_content=None, metadata=None, 
images=None): """Create a mock ebooklib EpubBook.""" book = MagicMock() if metadata is None: metadata = { "title": [("Test Book", {})], "creator": [("Test Author", {})], "language": [("en", {})], "publisher": [("Test Publisher", {})], "date": [("2024-01-01", {})], "description": [("A test book", {})], "subject": [("Testing", {})], "rights": [("Copyright 2024", {})], "identifier": [("urn:uuid:12345", {})], } def get_metadata(ns, key): if ns == "DC": return metadata.get(key, []) return [] book.get_metadata = get_metadata # Spine items if spine_content is None: spine_content = [ ( "ch1", "
Content 1
", ), ] spine_items = [] items_dict = {} for item_id, content in spine_content: item = MagicMock() item.get_type.return_value = ebooklib.ITEM_DOCUMENT item.get_content.return_value = content.encode("utf-8") items_dict[item_id] = item spine_items.append((item_id, "yes")) book.spine = spine_items book.get_item_with_id = lambda x: items_dict.get(x) # Images if images is None: images = [] img_items = [] for img in images: img_item = MagicMock() img_item.media_type = img.get("media_type", "image/png") img_item.get_content.return_value = img.get("data", b"\x89PNG") img_item.file_name = img.get("file_name", "image.png") img_items.append(img_item) book.get_items_of_type = lambda t: img_items if t == ebooklib.ITEM_IMAGE else [] # All items (for DRM detection, SVG counting) all_items = list(items_dict.values()) + img_items book.get_items = lambda: all_items return book @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_basic_epub(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book() mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") result = converter.extract_epub() self.assertTrue(result) self.assertIsNotNone(converter.extracted_data) self.assertGreaterEqual(len(converter.extracted_data["pages"]), 1) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_metadata(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book() mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) 
converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() metadata = converter.extracted_data["metadata"] self.assertEqual(metadata["title"], "Test Book") self.assertEqual(metadata["author"], "Test Author") self.assertEqual(metadata["language"], "en") @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_multiple_chapters(self, mock_isfile, mock_exists, mock_epub): spine = [ ("ch1", "Text 1
"), ("ch2", "Text 2
"), ("ch3", "Text 3
"), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() self.assertEqual(len(converter.extracted_data["pages"]), 3) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_code_blocks(self, mock_isfile, mock_exists, mock_epub): spine = [ ( "ch1", "def hello():\n print("hi")'
"",
),
]
mock_book = self._make_mock_book(spine_content=spine)
mock_epub.read_epub.return_value = mock_book
config = {"name": "test", "epub_path": "test.epub"}
converter = EpubToSkillConverter(config)
converter.data_file = os.path.join(self.temp_dir, "test_extracted.json")
converter.extract_epub()
code_samples = converter.extracted_data["pages"][0]["code_samples"]
self.assertGreaterEqual(len(code_samples), 1)
self.assertEqual(code_samples[0]["language"], "python")
@patch("skill_seekers.cli.epub_scraper.epub")
@patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
@patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
def test_extract_images(self, mock_isfile, mock_exists, mock_epub):
    """A book carrying one PNG item yields a ``total_images`` count of at least 1."""
    # Patches are applied bottom-up, so the mock arguments arrive as
    # isfile, exists, epub; only the epub module mock is configured here.
    png_entry = {"media_type": "image/png", "data": b"\x89PNG", "file_name": "fig1.png"}
    mock_epub.read_epub.return_value = self._make_mock_book(images=[png_entry])

    converter = EpubToSkillConverter({"name": "test", "epub_path": "test.epub"})
    # Point the converter's data file into this test's temporary directory.
    converter.data_file = os.path.join(self.temp_dir, "test_extracted.json")
    converter.extract_epub()

    image_total = converter.extracted_data["total_images"]
    self.assertGreaterEqual(image_total, 1)
@patch("skill_seekers.cli.epub_scraper.epub")
@patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
@patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
def test_heading_boundary_splitting(self, mock_isfile, mock_exists, mock_epub):
spine = [
(
"ch1",
""
"First content
" "Second content
" "", ), ] mock_book = self._make_mock_book(spine_content=spine) mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() pages = converter.extracted_data["pages"] self.assertEqual(len(pages), 2) self.assertEqual(pages[0]["heading"], "First Heading") self.assertEqual(pages[1]["heading"], "Second Heading") def test_extract_missing_file_raises_error(self): config = {"name": "test", "epub_path": "/nonexistent/book.epub"} converter = EpubToSkillConverter(config) with self.assertRaises(FileNotFoundError): converter.extract_epub() def test_extract_invalid_extension_raises_error(self): # Create a real file with wrong extension bad_file = os.path.join(self.temp_dir, "test.txt") Path(bad_file).write_text("not an epub") config = {"name": "test", "epub_path": bad_file} converter = EpubToSkillConverter(config) with self.assertRaises(ValueError): converter.extract_epub() def test_extract_deps_not_installed(self): from skill_seekers.cli.epub_scraper import _check_epub_deps with patch("skill_seekers.cli.epub_scraper.EPUB_AVAILABLE", False): with self.assertRaises(RuntimeError) as ctx: _check_epub_deps() self.assertIn("ebooklib", str(ctx.exception)) self.assertIn("pip install", str(ctx.exception)) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_empty_spine(self, mock_isfile, mock_exists, mock_epub): mock_book = self._make_mock_book(spine_content=[]) mock_book.spine = [] mock_epub.read_epub.return_value = mock_book config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extract_epub() 
self.assertEqual(len(converter.extracted_data["pages"]), 0) @patch("skill_seekers.cli.epub_scraper.epub") @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True) @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True) def test_extract_spine_item_no_body(self, mock_isfile, mock_exists, mock_epub): spine = [ ("ch1", ""
soup = BeautifulSoup(html, "html.parser")
elements = list(soup.children)
section = _build_section(1, "Test", "h1", elements)
self.assertEqual(len(section["code_samples"]), 0)
def test_code_block_with_html_entities(self):
    """Escaped ``<``, ``>`` and ``&`` in a code block are extracted as literal characters.

    The fixture is a single code block whose content uses HTML entities;
    the assertions check that exactly one code sample is produced and that
    its text contains the decoded characters.
    """
    from bs4 import BeautifulSoup

    # NOTE(review): the literal previously read "if (x < 10 && y > 5) {}" --
    # bare text with no code element and no entities, so the test could not
    # exercise what its name describes. The <pre><code> wrapper and the
    # escapes below are a reconstruction; confirm that _build_section treats
    # <pre><code> as a code sample.
    html = "<pre><code>if (x &lt; 10 &amp;&amp; y &gt; 5) {}</code></pre>"
    soup = BeautifulSoup(html, "html.parser")
    elements = list(soup.children)
    section = _build_section(1, "Test", "h1", elements)

    self.assertEqual(len(section["code_samples"]), 1)
    code = section["code_samples"][0]["code"]
    # The entity forms themselves contain no "<", ">" or "&&", so these
    # checks only pass when the entities have been decoded.
    self.assertIn("<", code)
    self.assertIn(">", code)
    self.assertIn("&&", code)
def test_code_block_with_syntax_highlighting_spans(self):
from bs4 import BeautifulSoup
html = (
'def '
'foo():'
)
soup = BeautifulSoup(html, "html.parser")
elements = list(soup.children)
section = _build_section(1, "Test", "h1", elements)
self.assertEqual(len(section["code_samples"]), 1)
code = section["code_samples"][0]["code"]
self.assertIn("def", code)
self.assertIn("foo", code)
self.assertNotIn("| Header1 | Header2 |
| Val1 | Val2 |
| H1 | H2 | |
|---|---|---|
| A | B | C |
| D | E | |
'
soup = BeautifulSoup(f"Unclosed tags
\u4f60\u597d\u4e16\u754c \u041f\u0440\u0438\u0432\u0435\u0442 \U0001f600
" soup = BeautifulSoup(html, "html.parser") elements = list(soup.children) section = _build_section(1, "Unicode", "h1", elements) self.assertIn("\u4f60\u597d", section["text"]) self.assertIn("\U0001f600", section["text"]) def test_epub_large_section_count(self): """100+ sections processed without error.""" config = {"name": "test", "epub_path": "test.epub"} converter = EpubToSkillConverter(config) converter.skill_dir = os.path.join(self.temp_dir, "test") converter.data_file = os.path.join(self.temp_dir, "test_extracted.json") converter.extracted_data = _make_sample_extracted_data(num_sections=100) converter.build_skill() skill_md = Path(self.temp_dir) / "test" / "SKILL.md" self.assertTrue(skill_md.exists()) def test_epub_nested_headings(self): """h3/h4/h5/h6 become sub-headings within sections.""" from bs4 import BeautifulSoup html = ( "Content A
" "Content B
" "EPUB 2 content
" soup2 = BeautifulSoup(html2, "html.parser") section2 = _build_section(1, "EPUB 2 Chapter", "h1", list(soup2.children)) self.assertIn("EPUB 2 content", section2["text"]) # EPUB 3 style (HTML5-ish XHTML) html3 = "EPUB 3 content