# Adds EPUB as a first-class input source for skill generation.
#
# - EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
# - Dublin Core metadata, spine items, code blocks, tables, images extraction
# - DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
# - EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
# - ebooklib as optional dep: pip install skill-seekers[epub]
# - Wired into create command with .epub auto-detection
# - 104 tests, all passing
#
# Review fixes: removed 3 empty test stubs, fixed SVG double-counting in
# _extract_images(), added logger.debug to bare except pass.
#
# Based on PR #310 by @christianbaumann.
# Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
#
# 1627 lines, 61 KiB, Python
"""
Tests for EPUB scraper (epub_scraper.py).

Covers: initialization, extraction, categorization, skill building,
code blocks, tables, images, error handling, JSON workflow, CLI arguments,
helper functions, source detection, DRM detection, and edge cases.

Tests use mock data and do not require actual EPUB files or ebooklib installed.
"""

import json
import os
import shutil
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch


# Conditional import (same pattern as test_word_scraper.py)
# EPUB_AVAILABLE records whether the optional ebooklib dependency imports;
# test classes that need the real ebooklib constants skip themselves when
# it is False.
try:
    import ebooklib

    EPUB_AVAILABLE = True
except ImportError:
    EPUB_AVAILABLE = False

# IMPORT_OK records whether the module under test imports at all; every
# test class checks it in setUp and skips instead of erroring out.
try:
    from skill_seekers.cli.epub_scraper import (
        EpubToSkillConverter,
        _build_section,
        _extract_table_from_html,
        _score_code_quality,
        infer_description_from_epub,
    )

    IMPORT_OK = True
except ImportError:
    IMPORT_OK = False


def _make_sample_extracted_data(
    num_sections: int = 2,
    include_code: bool = False,
    include_tables: bool = False,
    include_images: bool = False,
) -> dict:
    """Create a minimal ``extracted_data`` dict for testing.

    Args:
        num_sections: Number of sections to generate, numbered from 1.
            Zero or a negative value produces no sections.
        include_code: Attach one Python and one JavaScript code sample to
            every section and count them in ``total_code_blocks`` and
            ``languages_detected``.
        include_tables: Attach one two-column table to every section.
        include_images: Attach one image stub (the 8-byte PNG signature) to
            every section and count it in ``total_images``.

    Returns:
        A dict with the keys ``source_file``, ``metadata``,
        ``total_sections``, ``total_code_blocks``, ``total_images``,
        ``languages_detected`` and ``pages`` (the list of section dicts).
    """
    sections = []
    total_code = 0
    total_images = 0
    languages = {}

    for i in range(1, num_sections + 1):
        section = {
            "section_number": i,
            "heading": f"Chapter {i}",
            "heading_level": "h1",
            "text": f"Content of chapter {i}. This is sample text.",
            "headings": [{"level": "h2", "text": f"Section {i}.1"}],
            "code_samples": [],
            "tables": [],
            "images": [],
        }

        if include_code:
            section["code_samples"] = [
                {
                    "code": f"def func_{i}():\n return {i}",
                    "language": "python",
                    "quality_score": 7.5,
                },
                {
                    "code": f"console.log({i})",
                    "language": "javascript",
                    "quality_score": 4.0,
                },
            ]
            total_code += 2
            languages["python"] = languages.get("python", 0) + 1
            languages["javascript"] = languages.get("javascript", 0) + 1

        if include_tables:
            section["tables"] = [{"headers": ["Name", "Value"], "rows": [["key", "val"]]}]

        if include_images:
            section["images"] = [
                {"index": 0, "data": b"\x89PNG\r\n\x1a\n", "width": 100, "height": 100}
            ]
            total_images += 1

        sections.append(section)

    return {
        "source_file": "test.epub",
        "metadata": {
            "title": "Test Book",
            "author": "Test Author",
            "language": "en",
            "publisher": "Test Publisher",
            "date": "2024-01-01",
            "description": "A test book for unit testing",
            "subject": "Testing, Unit Tests",
            "rights": "Copyright 2024",
            "identifier": "urn:uuid:12345",
        },
        # Count what was actually generated so the total always agrees with
        # ``pages`` (a negative num_sections yields 0, not a negative count).
        "total_sections": len(sections),
        "total_code_blocks": total_code,
        "total_images": total_images,
        "languages_detected": languages,
        "pages": sections,
    }


# ============================================================================
# Class 1: TestEpubToSkillConverterInit
# ============================================================================


class TestEpubToSkillConverterInit(unittest.TestCase):
    """Test EpubToSkillConverter initialization."""

    def setUp(self):
        # No temp directory is created here: these tests only construct
        # converters and inspect their attributes.
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")

    def test_init_with_name_and_epub_path(self):
        """name and epub_path from the config are exposed as attributes."""
        config = {"name": "test_skill", "epub_path": "test.epub"}
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.name, "test_skill")
        self.assertEqual(converter.epub_path, "test.epub")

    def test_init_with_full_config(self):
        """description and categories from the config are kept as given."""
        config = {
            "name": "mybook",
            "epub_path": "/path/to/book.epub",
            "description": "Custom description",
            "categories": {"ch1": ["intro"]},
        }
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.name, "mybook")
        self.assertEqual(converter.epub_path, "/path/to/book.epub")
        self.assertEqual(converter.description, "Custom description")
        self.assertEqual(converter.categories, {"ch1": ["intro"]})

    def test_default_description_uses_name(self):
        """Without a description, a default mentioning the name is used."""
        config = {"name": "test_skill"}
        converter = EpubToSkillConverter(config)
        self.assertIn("test_skill", converter.description)
        self.assertTrue(converter.description.startswith("Use when referencing"))

    def test_skill_dir_uses_name(self):
        """skill_dir is derived from the skill name."""
        config = {"name": "mybook"}
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.skill_dir, "output/mybook")

    def test_data_file_uses_name(self):
        """data_file is derived from the skill name."""
        config = {"name": "mybook"}
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.data_file, "output/mybook_extracted.json")

    def test_init_requires_name(self):
        """A config without "name" raises KeyError."""
        with self.assertRaises(KeyError):
            EpubToSkillConverter({})

    def test_init_empty_name(self):
        """An empty name is accepted and stored unchanged."""
        config = {"name": ""}
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.name, "")

    def test_init_with_special_characters_in_name(self):
        """Dashes, spaces and underscores in the name are preserved."""
        config = {"name": "my-book name_2024"}
        converter = EpubToSkillConverter(config)
        self.assertEqual(converter.name, "my-book name_2024")
        self.assertIn("my-book name_2024", converter.skill_dir)


# ============================================================================
# Class 2: TestEpubExtraction
# ============================================================================


class TestEpubExtraction(unittest.TestCase):
    """Test EPUB content extraction."""

    def setUp(self):
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        if not EPUB_AVAILABLE:
            # _make_mock_book needs the real ebooklib.ITEM_* constants.
            self.skipTest("ebooklib not installed")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _make_converter(self, epub_path="test.epub"):
        """Return a converter whose data_file points into the temp directory."""
        converter = EpubToSkillConverter({"name": "test", "epub_path": epub_path})
        converter.data_file = os.path.join(self.temp_dir, "test_extracted.json")
        return converter

    def _make_mock_book(self, spine_content=None, metadata=None, images=None):
        """Create a mock ebooklib EpubBook.

        Args:
            spine_content: List of ``(item_id, html)`` pairs, one per spine
                document. Defaults to a single chapter.
            metadata: Mapping of metadata key to the list of
                ``(value, attrs)`` tuples returned by
                ``get_metadata("DC", key)``. Defaults to a full sample set.
            images: List of dicts with optional ``media_type``, ``data`` and
                ``file_name`` keys, one per image item. Defaults to none.

        Returns:
            A MagicMock exposing ``get_metadata``, ``spine``,
            ``get_item_with_id``, ``get_items_of_type`` and ``get_items``.
        """
        book = MagicMock()

        if metadata is None:
            metadata = {
                "title": [("Test Book", {})],
                "creator": [("Test Author", {})],
                "language": [("en", {})],
                "publisher": [("Test Publisher", {})],
                "date": [("2024-01-01", {})],
                "description": [("A test book", {})],
                "subject": [("Testing", {})],
                "rights": [("Copyright 2024", {})],
                "identifier": [("urn:uuid:12345", {})],
            }

        def get_metadata(ns, key):
            # Only the "DC" namespace carries data in this mock.
            if ns == "DC":
                return metadata.get(key, [])
            return []

        book.get_metadata = get_metadata

        # Spine items
        if spine_content is None:
            spine_content = [
                (
                    "ch1",
                    "<html><body><h1>Chapter 1</h1><p>Content 1</p></body></html>",
                ),
            ]

        spine_items = []
        items_dict = {}
        for item_id, content in spine_content:
            item = MagicMock()
            item.get_type.return_value = ebooklib.ITEM_DOCUMENT
            item.get_content.return_value = content.encode("utf-8")
            items_dict[item_id] = item
            spine_items.append((item_id, "yes"))

        book.spine = spine_items
        book.get_item_with_id = items_dict.get

        # Images
        if images is None:
            images = []
        img_items = []
        for img in images:
            img_item = MagicMock()
            img_item.media_type = img.get("media_type", "image/png")
            img_item.get_content.return_value = img.get("data", b"\x89PNG")
            img_item.file_name = img.get("file_name", "image.png")
            img_items.append(img_item)

        def get_items_of_type(item_type):
            return img_items if item_type == ebooklib.ITEM_IMAGE else []

        book.get_items_of_type = get_items_of_type

        # All items (for DRM detection, SVG counting)
        all_items = list(items_dict.values()) + img_items
        book.get_items = lambda: all_items

        return book

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_basic_epub(self, mock_isfile, mock_exists, mock_epub):
        """A single-chapter book extracts successfully into at least one page."""
        mock_epub.read_epub.return_value = self._make_mock_book()
        converter = self._make_converter()

        result = converter.extract_epub()
        self.assertTrue(result)
        self.assertIsNotNone(converter.extracted_data)
        self.assertGreaterEqual(len(converter.extracted_data["pages"]), 1)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_metadata(self, mock_isfile, mock_exists, mock_epub):
        """Title, creator and language end up in extracted_data["metadata"]."""
        mock_epub.read_epub.return_value = self._make_mock_book()
        converter = self._make_converter()

        converter.extract_epub()
        metadata = converter.extracted_data["metadata"]
        self.assertEqual(metadata["title"], "Test Book")
        self.assertEqual(metadata["author"], "Test Author")
        self.assertEqual(metadata["language"], "en")

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_multiple_chapters(self, mock_isfile, mock_exists, mock_epub):
        """Three spine documents with one h1 each produce three pages."""
        spine = [
            ("ch1", "<html><body><h1>Chapter 1</h1><p>Text 1</p></body></html>"),
            ("ch2", "<html><body><h1>Chapter 2</h1><p>Text 2</p></body></html>"),
            ("ch3", "<html><body><h1>Chapter 3</h1><p>Text 3</p></body></html>"),
        ]
        mock_epub.read_epub.return_value = self._make_mock_book(spine_content=spine)
        converter = self._make_converter()

        converter.extract_epub()
        self.assertEqual(len(converter.extracted_data["pages"]), 3)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_code_blocks(self, mock_isfile, mock_exists, mock_epub):
        """A <pre><code class="language-python"> block becomes a python sample."""
        spine = [
            (
                "ch1",
                "<html><body><h1>Code Chapter</h1>"
                '<pre><code class="language-python">def hello():\n print("hi")</code></pre>'
                "</body></html>",
            ),
        ]
        mock_epub.read_epub.return_value = self._make_mock_book(spine_content=spine)
        converter = self._make_converter()

        converter.extract_epub()
        code_samples = converter.extracted_data["pages"][0]["code_samples"]
        self.assertGreaterEqual(len(code_samples), 1)
        self.assertEqual(code_samples[0]["language"], "python")

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_images(self, mock_isfile, mock_exists, mock_epub):
        """An image item in the book is counted in total_images."""
        images = [{"media_type": "image/png", "data": b"\x89PNG", "file_name": "fig1.png"}]
        mock_epub.read_epub.return_value = self._make_mock_book(images=images)
        converter = self._make_converter()

        converter.extract_epub()
        self.assertGreaterEqual(converter.extracted_data["total_images"], 1)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_heading_boundary_splitting(self, mock_isfile, mock_exists, mock_epub):
        """An h1 followed by an h2 in one document yields two pages."""
        spine = [
            (
                "ch1",
                "<html><body>"
                "<h1>First Heading</h1><p>First content</p>"
                "<h2>Second Heading</h2><p>Second content</p>"
                "</body></html>",
            ),
        ]
        mock_epub.read_epub.return_value = self._make_mock_book(spine_content=spine)
        converter = self._make_converter()

        converter.extract_epub()
        pages = converter.extracted_data["pages"]
        self.assertEqual(len(pages), 2)
        self.assertEqual(pages[0]["heading"], "First Heading")
        self.assertEqual(pages[1]["heading"], "Second Heading")

    def test_extract_missing_file_raises_error(self):
        """A path that does not exist raises FileNotFoundError."""
        config = {"name": "test", "epub_path": "/nonexistent/book.epub"}
        converter = EpubToSkillConverter(config)
        with self.assertRaises(FileNotFoundError):
            converter.extract_epub()

    def test_extract_invalid_extension_raises_error(self):
        """An existing file without the .epub extension raises ValueError."""
        # Create a real file with wrong extension
        bad_file = os.path.join(self.temp_dir, "test.txt")
        Path(bad_file).write_text("not an epub")

        config = {"name": "test", "epub_path": bad_file}
        converter = EpubToSkillConverter(config)
        with self.assertRaises(ValueError):
            converter.extract_epub()

    def test_extract_deps_not_installed(self):
        """With EPUB_AVAILABLE False, _check_epub_deps explains how to install."""
        from skill_seekers.cli.epub_scraper import _check_epub_deps

        with patch("skill_seekers.cli.epub_scraper.EPUB_AVAILABLE", False):
            with self.assertRaises(RuntimeError) as ctx:
                _check_epub_deps()
            self.assertIn("ebooklib", str(ctx.exception))
            self.assertIn("pip install", str(ctx.exception))

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_empty_spine(self, mock_isfile, mock_exists, mock_epub):
        """A book with an empty spine extracts to zero pages."""
        # spine_content=[] already leaves book.spine empty.
        mock_epub.read_epub.return_value = self._make_mock_book(spine_content=[])
        converter = self._make_converter()

        converter.extract_epub()
        self.assertEqual(len(converter.extracted_data["pages"]), 0)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_extract_spine_item_no_body(self, mock_isfile, mock_exists, mock_epub):
        """A spine document without a <body> element does not break extraction."""
        spine = [
            ("ch1", "<html><head><title>No Body</title></head></html>"),
        ]
        mock_epub.read_epub.return_value = self._make_mock_book(spine_content=spine)
        converter = self._make_converter()

        # Should not crash — body fallback to soup
        converter.extract_epub()
        self.assertIsNotNone(converter.extracted_data)


# ============================================================================
# Class 3: TestEpubDrmDetection
# ============================================================================


class TestEpubDrmDetection(unittest.TestCase):
    """Test DRM detection logic."""

    def setUp(self):
        # No temp directory is created here: every test works on in-memory
        # mock books only.
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")

    def _make_converter(self):
        """Return a converter with a minimal config."""
        config = {"name": "test", "epub_path": "test.epub"}
        return EpubToSkillConverter(config)

    def _make_book_with_encryption(self, encryption_xml_content):
        """Create a mock book with META-INF/encryption.xml.

        Args:
            encryption_xml_content: XML text returned (UTF-8 encoded) by the
                encryption item's ``get_content()``.

        Returns:
            A MagicMock whose ``get_items()`` yields only that item.
        """
        book = MagicMock()
        enc_item = MagicMock()
        enc_item.file_name = "META-INF/encryption.xml"
        enc_item.get_content.return_value = encryption_xml_content.encode("utf-8")
        book.get_items.return_value = [enc_item]
        return book

    def test_no_drm_detected(self):
        """A book with no items at all is not reported as DRM-protected."""
        converter = self._make_converter()
        book = MagicMock()
        book.get_items.return_value = []
        self.assertFalse(converter._detect_drm(book))

    def test_drm_detected_adobe_adept(self):
        """The Adobe ADEPT namespace in encryption.xml is reported as DRM."""
        converter = self._make_converter()
        xml = '<encryption xmlns="http://ns.adobe.com/adept"><EncryptedData/></encryption>'
        book = self._make_book_with_encryption(xml)
        self.assertTrue(converter._detect_drm(book))

    def test_drm_detected_apple_fairplay(self):
        """The Apple dataenc namespace in encryption.xml is reported as DRM."""
        converter = self._make_converter()
        xml = '<encryption><EncryptedData xmlns="http://itunes.apple.com/dataenc"/></encryption>'
        book = self._make_book_with_encryption(xml)
        self.assertTrue(converter._detect_drm(book))

    def test_drm_detected_readium_lcp(self):
        """The Readium LCP namespace in encryption.xml is reported as DRM."""
        converter = self._make_converter()
        xml = '<encryption xmlns="http://readium.org/2014/01/lcp"><EncryptedData/></encryption>'
        book = self._make_book_with_encryption(xml)
        self.assertTrue(converter._detect_drm(book))

    def test_font_obfuscation_not_drm(self):
        """An encryption.xml that only names the IDPF embedding algorithm is not DRM."""
        converter = self._make_converter()
        xml = (
            "<encryption>"
            '<EncryptionMethod Algorithm="http://www.idpf.org/2008/embedding"/>'
            "</encryption>"
        )
        book = self._make_book_with_encryption(xml)
        self.assertFalse(converter._detect_drm(book))

    def test_drm_error_message_is_clear(self):
        """Confirm detection only; the message itself is not checked here.

        NOTE(review): despite its name this test never inspects an error
        message, so it currently repeats the Adobe ADEPT detection check.
        """
        converter = self._make_converter()
        xml = '<encryption xmlns="http://ns.adobe.com/adept"><EncryptedData/></encryption>'
        book = self._make_book_with_encryption(xml)
        self.assertTrue(converter._detect_drm(book))
        # The error message is raised in extract_epub, not _detect_drm
        # Just confirm detection works


# ============================================================================
# Class 4: TestEpubCategorization
# ============================================================================


class TestEpubCategorization(unittest.TestCase):
    """Test content categorization."""

    def setUp(self):
        # No temp directory is created here: categorize_content() is called
        # on in-memory sample data and no test writes output.
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")

    def test_single_source_creates_one_category(self):
        """Without configured categories, one category named after the file is used."""
        config = {"name": "test", "epub_path": "mybook.epub"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=3)

        categories = converter.categorize_content()
        self.assertEqual(len(categories), 1)
        self.assertIn("mybook", categories)

    def test_keyword_categorization(self):
        """Headings containing a configured keyword land in that category."""
        config = {
            "name": "test",
            "categories": {
                "intro": ["introduction", "getting started"],
                "advanced": ["advanced", "deep dive"],
            },
        }
        converter = EpubToSkillConverter(config)
        data = _make_sample_extracted_data(num_sections=2)
        data["pages"][0]["heading"] = "Introduction to Testing"
        data["pages"][1]["heading"] = "Advanced Techniques"
        converter.extracted_data = data

        categories = converter.categorize_content()
        self.assertIn("intro", categories)
        self.assertIn("advanced", categories)

    def test_no_categories_uses_default(self):
        """With neither categories nor epub_path, a "content" category is used."""
        config = {"name": "test"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=2)

        categories = converter.categorize_content()
        self.assertIn("content", categories)
        self.assertEqual(categories["content"]["title"], "Content")

    def test_categorize_empty_sections(self):
        """Zero sections still produce the "content" category, with no pages."""
        config = {"name": "test"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=0)

        categories = converter.categorize_content()
        self.assertIn("content", categories)
        self.assertEqual(len(categories["content"]["pages"]), 0)

    def test_categorize_no_keyword_matches(self):
        """Sections matching no keyword are collected under "other"."""
        config = {
            "name": "test",
            "categories": {"intro": ["zzzzz_no_match"]},
        }
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=2)

        categories = converter.categorize_content()
        self.assertIn("other", categories)
        self.assertEqual(len(categories["other"]["pages"]), 2)

    def test_categorize_single_section(self):
        """A single section is categorized exactly once."""
        config = {"name": "test", "epub_path": "book.epub"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=1)

        categories = converter.categorize_content()
        total_pages = sum(len(c["pages"]) for c in categories.values())
        self.assertEqual(total_pages, 1)

    def test_categorize_many_sections(self):
        """Fifty sections are all categorized, none lost or duplicated."""
        config = {"name": "test", "epub_path": "book.epub"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=50)

        categories = converter.categorize_content()
        total_pages = sum(len(c["pages"]) for c in categories.values())
        self.assertEqual(total_pages, 50)

    def test_categorize_preserves_section_order(self):
        """Within every category, sections stay in ascending section_number order."""
        config = {"name": "test", "epub_path": "book.epub"}
        converter = EpubToSkillConverter(config)
        converter.extracted_data = _make_sample_extracted_data(num_sections=5)

        categories = converter.categorize_content()
        for cat_data in categories.values():
            section_nums = [s["section_number"] for s in cat_data["pages"]]
            self.assertEqual(section_nums, sorted(section_nums))


# ============================================================================
# Class 5: TestEpubSkillBuilding
# ============================================================================


class TestEpubSkillBuilding(unittest.TestCase):
    """Test skill building (directory structure, SKILL.md, reference files)."""

    def setUp(self):
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _make_converter(self, name="test_book", epub_path="test.epub"):
        """Return a converter whose skill_dir and data_file live in the temp dir."""
        config = {"name": name, "epub_path": epub_path}
        converter = EpubToSkillConverter(config)
        converter.skill_dir = os.path.join(self.temp_dir, name)
        converter.data_file = os.path.join(self.temp_dir, f"{name}_extracted.json")
        return converter

    def _build_sample_skill(self, data=None, name="test_book"):
        """Build a skill from ``data`` (default sample data) and return the converter."""
        converter = self._make_converter(name=name)
        converter.extracted_data = _make_sample_extracted_data() if data is None else data
        converter.build_skill()
        return converter

    def _skill_path(self, *parts, name="test_book"):
        """Return a path below the built skill directory for ``name``."""
        return Path(self.temp_dir, name, *parts)

    def test_build_creates_directory_structure(self):
        """build_skill creates the skill dir plus references/, scripts/ and assets/."""
        self._build_sample_skill()

        skill_dir = self._skill_path()
        self.assertTrue(skill_dir.exists())
        self.assertTrue((skill_dir / "references").exists())
        self.assertTrue((skill_dir / "scripts").exists())
        self.assertTrue((skill_dir / "assets").exists())

    def test_build_generates_skill_md(self):
        """SKILL.md exists and contains frontmatter markers, name and description."""
        self._build_sample_skill()

        skill_md = self._skill_path("SKILL.md")
        self.assertTrue(skill_md.exists())
        content = skill_md.read_text(encoding="utf-8")
        self.assertIn("---", content)
        self.assertIn("name:", content)
        self.assertIn("description:", content)

    def test_build_generates_reference_files(self):
        """references/ holds at least two Markdown files."""
        self._build_sample_skill()

        md_files = list(self._skill_path("references").glob("*.md"))
        # At least index.md + one reference file
        self.assertGreaterEqual(len(md_files), 2)

    def test_build_generates_index(self):
        """references/index.md exists and has Categories and Statistics sections."""
        self._build_sample_skill()

        index_path = self._skill_path("references", "index.md")
        self.assertTrue(index_path.exists())
        content = index_path.read_text(encoding="utf-8")
        self.assertIn("Categories", content)
        self.assertIn("Statistics", content)

    def test_skill_md_contains_metadata(self):
        """Book title and author from the metadata appear in SKILL.md."""
        self._build_sample_skill()

        content = self._skill_path("SKILL.md").read_text(encoding="utf-8")
        self.assertIn("Test Book", content)
        self.assertIn("Test Author", content)

    def test_skill_md_yaml_frontmatter(self):
        """SKILL.md opens with a --- line and has a closing --- line."""
        self._build_sample_skill()

        content = self._skill_path("SKILL.md").read_text(encoding="utf-8")
        # YAML frontmatter starts and ends with ---
        lines = content.split("\n")
        self.assertEqual(lines[0], "---")
        # Find closing ---
        closing_idx = next(
            (i for i, line in enumerate(lines[1:], 1) if line == "---"),
            None,
        )
        self.assertIsNotNone(closing_idx)

    def test_build_without_extracted_data_fails(self):
        """build_skill with extracted_data=None raises AttributeError or TypeError."""
        converter = self._make_converter()
        converter.extracted_data = None
        with self.assertRaises((AttributeError, TypeError)):
            converter.build_skill()

    def test_build_overwrites_existing_output(self):
        """Building twice with the same data yields an identical SKILL.md."""
        # Build once
        converter = self._build_sample_skill()
        skill_md = self._skill_path("SKILL.md")
        skill_md_1 = skill_md.read_text(encoding="utf-8")

        # Build again
        converter.build_skill()
        skill_md_2 = skill_md.read_text(encoding="utf-8")

        self.assertEqual(skill_md_1, skill_md_2)

    def test_build_with_long_name(self):
        """A 100-character name is cut to at most 64 characters in the frontmatter."""
        long_name = "a" * 100
        converter = self._build_sample_skill(name=long_name)

        skill_md = Path(converter.skill_dir) / "SKILL.md"
        content = skill_md.read_text(encoding="utf-8")
        # Name in frontmatter is truncated to 64 chars
        name_lines = [line for line in content.split("\n") if line.startswith("name:")]
        # Fail loudly instead of passing without checking anything when the
        # frontmatter has no name line.
        self.assertTrue(name_lines, "SKILL.md has no line starting with 'name:'")
        name_val = name_lines[0].split(":", 1)[1].strip()
        self.assertLessEqual(len(name_val), 64)

    def test_build_with_unicode_content(self):
        """CJK text from headings or body survives into a reference file."""
        data = _make_sample_extracted_data()
        data["pages"][0]["heading"] = (
            "Unicode: \u4e2d\u6587 \u0627\u0644\u0639\u0631\u0628\u064a\u0629 \U0001f600"
        )
        data["pages"][0]["text"] = (
            "Content with CJK: \u4f60\u597d, Arabic: \u0645\u0631\u062d\u0628\u0627, Emoji: \U0001f680"
        )
        self._build_sample_skill(data=data)

        md_files = list(self._skill_path("references").glob("*.md"))
        # Should have reference files
        self.assertGreaterEqual(len(md_files), 1)
        # Unicode should be preserved in at least one file
        found_unicode = False
        for md_file in md_files:
            content = md_file.read_text(encoding="utf-8")
            if "\u4e2d\u6587" in content or "\u4f60\u597d" in content:
                found_unicode = True
                break
        self.assertTrue(found_unicode)


# ============================================================================
# Class 6: TestEpubCodeBlocks
# ============================================================================


class TestEpubCodeBlocks(unittest.TestCase):
|
|
"""Test code block extraction and rendering."""
|
|
|
|
def setUp(self):
|
|
if not IMPORT_OK:
|
|
self.skipTest("epub_scraper not importable")
|
|
self.temp_dir = tempfile.mkdtemp()
|
|
|
|
def tearDown(self):
|
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
|
|
|
def _make_converter(self):
|
|
config = {"name": "test", "epub_path": "test.epub"}
|
|
converter = EpubToSkillConverter(config)
|
|
converter.skill_dir = os.path.join(self.temp_dir, "test")
|
|
converter.data_file = os.path.join(self.temp_dir, "test_extracted.json")
|
|
return converter
|
|
|
|
def test_code_blocks_included_in_reference_files(self):
|
|
converter = self._make_converter()
|
|
converter.extracted_data = _make_sample_extracted_data(include_code=True)
|
|
|
|
converter.build_skill()
|
|
|
|
refs_dir = Path(self.temp_dir) / "test" / "references"
|
|
found_code = False
|
|
for f in refs_dir.glob("*.md"):
|
|
if f.name == "index.md":
|
|
continue
|
|
content = f.read_text()
|
|
if "```python" in content or "def func_" in content:
|
|
found_code = True
|
|
break
|
|
self.assertTrue(found_code)
|
|
|
|
def test_code_blocks_in_skill_md_top_15(self):
|
|
converter = self._make_converter()
|
|
converter.extracted_data = _make_sample_extracted_data(num_sections=10, include_code=True)
|
|
|
|
converter.build_skill()
|
|
|
|
skill_md = Path(self.temp_dir) / "test" / "SKILL.md"
|
|
content = skill_md.read_text()
|
|
self.assertIn("Code Examples", content)
|
|
|
|
def test_code_language_grouped(self):
|
|
converter = self._make_converter()
|
|
converter.extracted_data = _make_sample_extracted_data(num_sections=3, include_code=True)
|
|
|
|
converter.build_skill()
|
|
|
|
skill_md = Path(self.temp_dir) / "test" / "SKILL.md"
|
|
content = skill_md.read_text()
|
|
self.assertIn("Python Examples", content)
|
|
self.assertIn("Javascript Examples", content)
|
|
|
|
def test_empty_code_block(self):
|
|
from bs4 import BeautifulSoup
|
|
|
|
html = "<pre><code></code></pre>"
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
elements = list(soup.children)
|
|
section = _build_section(1, "Test", "h1", elements)
|
|
self.assertEqual(len(section["code_samples"]), 0)
|
|
|
|
def test_code_block_with_html_entities(self):
|
|
from bs4 import BeautifulSoup
|
|
|
|
html = "<pre><code>if (x < 10 && y > 5) {}</code></pre>"
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
elements = list(soup.children)
|
|
section = _build_section(1, "Test", "h1", elements)
|
|
self.assertEqual(len(section["code_samples"]), 1)
|
|
code = section["code_samples"][0]["code"]
|
|
self.assertIn("<", code)
|
|
self.assertIn(">", code)
|
|
self.assertIn("&&", code)
|
|
|
|
def test_code_block_with_syntax_highlighting_spans(self):
|
|
from bs4 import BeautifulSoup
|
|
|
|
html = (
|
|
'<pre><code><span class="keyword">def</span> '
|
|
'<span class="name">foo</span>():</code></pre>'
|
|
)
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
elements = list(soup.children)
|
|
section = _build_section(1, "Test", "h1", elements)
|
|
self.assertEqual(len(section["code_samples"]), 1)
|
|
code = section["code_samples"][0]["code"]
|
|
self.assertIn("def", code)
|
|
self.assertIn("foo", code)
|
|
self.assertNotIn("<span", code)
|
|
|
|
def test_code_block_language_from_class(self):
    """A ``language-rust`` class on <code> is reported as language ``rust``."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(
        '<pre><code class="language-rust">fn main() {}</code></pre>',
        "html.parser",
    )
    section = _build_section(1, "Test", "h1", list(parsed.children))

    first_sample = section["code_samples"][0]
    self.assertEqual(first_sample["language"], "rust")
|
|
|
|
def test_code_quality_scoring(self):
    """A multi-line function outscores a one-character snippet and stays in 0-10."""
    # Short snippet
    score_short = _score_code_quality("x")
    self.assertLessEqual(score_short, 5.0)

    # Substantial code. The indentation is spelled out in full so the sample is
    # syntactically valid Python; with collapsed indentation the loop body would
    # sit at the same level as its ``for`` header and the snippet would not parse.
    code = (
        "def calculate_sum(numbers):\n"
        "    total = 0\n"
        "    for n in numbers:\n"
        "        total += n\n"
        "    return total\n"
        "\n"
        "result = calculate_sum([1, 2, 3])\n"
    )
    score_good = _score_code_quality(code)

    self.assertGreater(score_good, score_short)
    # The score must remain inside the 0-10 range asserted here.
    self.assertGreaterEqual(score_good, 0.0)
    self.assertLessEqual(score_good, 10.0)
|
|
|
|
|
|
# ============================================================================
|
|
# Class 7: TestEpubTables
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubTables(unittest.TestCase):
    """Table extraction from HTML and table rendering in reference files."""

    def setUp(self):
        """Skip when the scraper cannot be imported; otherwise make a scratch dir."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    @staticmethod
    def _extract_first_table(markup):
        """Parse *markup* and pass its first <table> to _extract_table_from_html."""
        from bs4 import BeautifulSoup

        table_tag = BeautifulSoup(markup, "html.parser").find("table")
        return _extract_table_from_html(table_tag)

    def test_tables_in_reference_files(self):
        """At least one non-index reference file contains the sample table header row."""
        conv = EpubToSkillConverter({"name": "test", "epub_path": "test.epub"})
        conv.skill_dir = os.path.join(self.temp_dir, "test")
        conv.data_file = os.path.join(self.temp_dir, "test_extracted.json")
        conv.extracted_data = _make_sample_extracted_data(include_tables=True)

        conv.build_skill()

        references = Path(self.temp_dir, "test", "references")
        # any() stops at the first matching file, like the explicit break did.
        has_table = any(
            "| Name | Value |" in ref.read_text()
            for ref in references.glob("*.md")
            if ref.name != "index.md"
        )
        self.assertTrue(has_table)

    def test_table_with_headers(self):
        """<thead> cells become headers and <tbody> cells become rows."""
        extracted = self._extract_first_table(
            "<table><thead><tr><th>Name</th><th>Age</th></tr></thead>"
            "<tbody><tr><td>Alice</td><td>30</td></tr></tbody></table>"
        )
        self.assertIsNotNone(extracted)
        self.assertEqual(extracted["headers"], ["Name", "Age"])
        self.assertEqual(extracted["rows"], [["Alice", "30"]])

    def test_table_no_thead(self):
        """Without <thead>, the first row is reported as the headers."""
        extracted = self._extract_first_table(
            "<table><tr><td>Header1</td><td>Header2</td></tr>"
            "<tr><td>Val1</td><td>Val2</td></tr></table>"
        )
        self.assertIsNotNone(extracted)
        self.assertEqual(extracted["headers"], ["Header1", "Header2"])
        self.assertEqual(extracted["rows"], [["Val1", "Val2"]])

    def test_empty_table(self):
        """A <table> with no rows yields None."""
        self.assertIsNone(self._extract_first_table("<table></table>"))

    def test_table_with_colspan_rowspan(self):
        """Tables using colspan/rowspan are extracted without raising."""
        # Should not crash
        extracted = self._extract_first_table(
            "<table><tr><th>H1</th><th colspan='2'>H2</th></tr>"
            "<tr><td>A</td><td rowspan='2'>B</td><td>C</td></tr>"
            "<tr><td>D</td><td>E</td></tr></table>"
        )
        self.assertIsNotNone(extracted)
|
|
|
|
|
|
# ============================================================================
|
|
# Class 8: TestEpubImages
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubImages(unittest.TestCase):
    """Image extraction from HTML and image handling during skill building."""

    def setUp(self):
        """Skip when the scraper cannot be imported; otherwise make a scratch dir."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _converter_for(self, data):
        """Return a converter rooted in the scratch dir with *data* preloaded."""
        conv = EpubToSkillConverter({"name": "test", "epub_path": "test.epub"})
        conv.skill_dir = os.path.join(self.temp_dir, "test")
        conv.data_file = os.path.join(self.temp_dir, "test_extracted.json")
        conv.extracted_data = data
        return conv

    @staticmethod
    def _section_from_img(img_markup, heading):
        """Wrap *img_markup* in a <div> and build a section from the div's children."""
        from bs4 import BeautifulSoup

        wrapper = BeautifulSoup(f"<div>{img_markup}</div>", "html.parser").find("div")
        return _build_section(1, heading, "h1", list(wrapper.children))

    def test_images_saved_to_assets(self):
        """Building a skill from image-bearing data creates the assets directory."""
        self._converter_for(_make_sample_extracted_data(include_images=True)).build_skill()

        self.assertTrue(Path(self.temp_dir, "test", "assets").exists())

    def test_image_references_in_markdown(self):
        """Some non-index reference file links an image under ../assets/."""
        self._converter_for(_make_sample_extracted_data(include_images=True)).build_skill()

        references = Path(self.temp_dir, "test", "references")
        texts = (
            ref.read_text() for ref in references.glob("*.md") if ref.name != "index.md"
        )
        # Lazy generator + any() reads files only until the first match.
        has_image_link = any("![Image" in text and "../assets/" in text for text in texts)
        self.assertTrue(has_image_link)

    def test_image_with_zero_bytes(self):
        """An image entry with empty data does not make build_skill raise."""
        payload = _make_sample_extracted_data()
        # Add image with empty data
        payload["pages"][0]["images"] = [{"index": 0, "data": b"", "width": 0, "height": 0}]

        # Should not crash
        self._converter_for(payload).build_skill()

    def test_svg_images_handled(self):
        """An <img> pointing at an .svg file is recorded as one image."""
        section = self._section_from_img(
            '<img src="diagram.svg" width="200" height="100"/>', "Test"
        )
        self.assertEqual(len(section["images"]), 1)

    def test_image_filename_conflicts(self):
        """Two images with distinct indexes result in at least two PNG assets."""
        payload = _make_sample_extracted_data()
        # Multiple images with unique indexes
        payload["pages"][0]["images"] = [
            {"index": idx, "data": b"\x89PNG\r\n\x1a\n", "width": 50, "height": 50}
            for idx in (0, 1)
        ]

        self._converter_for(payload).build_skill()

        written = list(Path(self.temp_dir, "test", "assets").glob("*.png"))
        self.assertGreaterEqual(len(written), 2)

    def test_cover_image_identified(self):
        """A cover-sized <img> in a "Cover" section is recorded as one image."""
        section = self._section_from_img(
            '<img src="cover.jpg" width="600" height="900"/>', "Cover"
        )
        self.assertEqual(len(section["images"]), 1)

    def test_many_images(self):
        """One hundred images on a single page do not make build_skill raise."""
        payload = _make_sample_extracted_data()
        bulk = []
        for idx in range(100):
            bulk.append({"index": idx, "data": b"\x89PNG\r\n\x1a\n", "width": 10, "height": 10})
        payload["pages"][0]["images"] = bulk

        # Should handle 100+ images without error
        self._converter_for(payload).build_skill()
|
|
|
|
|
|
# ============================================================================
|
|
# Class 9: TestEpubErrorHandling
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubErrorHandling(unittest.TestCase):
    """Failure scenarios: bad paths, corrupt files, DRM, missing dependency."""

    def setUp(self):
        """Skip unless both the scraper and ebooklib are available."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        if not EPUB_AVAILABLE:
            self.skipTest("ebooklib not installed")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_missing_epub_file_raises_error(self):
        """A nonexistent path makes extract_epub raise FileNotFoundError."""
        conv = EpubToSkillConverter({"name": "test", "epub_path": "/nonexistent/path/test.epub"})
        self.assertRaises(FileNotFoundError, conv.extract_epub)

    def test_not_a_file_raises_error(self):
        """A directory path makes extract_epub raise ValueError or FileNotFoundError."""
        conv = EpubToSkillConverter({"name": "test", "epub_path": self.temp_dir})
        self.assertRaises((ValueError, FileNotFoundError), conv.extract_epub)

    def test_not_epub_extension_raises_error(self):
        """An existing .txt file makes extract_epub raise ValueError."""
        text_path = os.path.join(self.temp_dir, "test.txt")
        Path(text_path).write_text("not an epub")

        conv = EpubToSkillConverter({"name": "test", "epub_path": text_path})
        self.assertRaises(ValueError, conv.extract_epub)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_corrupted_epub_raises_error(self, mock_isfile, mock_exists, mock_epub):
        """An exception from read_epub surfaces from extract_epub as ValueError."""
        mock_epub.read_epub.side_effect = Exception("Bad ZIP file")

        conv = EpubToSkillConverter({"name": "test", "epub_path": "corrupted.epub"})
        self.assertRaises(ValueError, conv.extract_epub)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_drm_protected_raises_error(self, mock_isfile, mock_exists, mock_epub):
        """An Adobe-namespaced encryption.xml item triggers a RuntimeError mentioning DRM."""
        encryption_entry = MagicMock()
        encryption_entry.file_name = "META-INF/encryption.xml"
        encryption_entry.get_content.return_value = (
            b'<encryption xmlns="http://ns.adobe.com/adept"><EncryptedData/></encryption>'
        )

        fake_book = MagicMock()
        fake_book.get_items.return_value = [encryption_entry]
        fake_book.get_metadata.return_value = []
        mock_epub.read_epub.return_value = fake_book

        conv = EpubToSkillConverter({"name": "test", "epub_path": "drm.epub"})
        with self.assertRaises(RuntimeError) as caught:
            conv.extract_epub()
        self.assertIn("DRM", str(caught.exception))

    def test_ebooklib_not_installed_error(self):
        """With EPUB_AVAILABLE patched to False, the dependency check explains the fix."""
        from skill_seekers.cli.epub_scraper import _check_epub_deps

        flag_off = patch("skill_seekers.cli.epub_scraper.EPUB_AVAILABLE", False)
        with flag_off, self.assertRaises(RuntimeError) as caught:
            _check_epub_deps()

        message = str(caught.exception)
        self.assertIn("ebooklib", message)
        self.assertIn("pip install", message)

    @patch("skill_seekers.cli.epub_scraper.epub")
    @patch("skill_seekers.cli.epub_scraper.os.path.exists", return_value=True)
    @patch("skill_seekers.cli.epub_scraper.os.path.isfile", return_value=True)
    def test_malformed_xhtml_handled_gracefully(self, mock_isfile, mock_exists, mock_epub):
        """Malformed XHTML should not crash thanks to BeautifulSoup tolerant parsing."""
        chapter = MagicMock()
        chapter.get_type.return_value = ebooklib.ITEM_DOCUMENT
        chapter.get_content.return_value = (
            b"<html><body><h1>Test<p>Unclosed tags <div>and more</body>"
        )

        fake_book = MagicMock()
        fake_book.spine = [("ch1", "yes")]
        fake_book.get_metadata.return_value = []
        # Plain lambdas (not mocks) so the exact call arity is still enforced.
        fake_book.get_item_with_id = lambda _x: chapter
        fake_book.get_items_of_type = lambda _t: []
        fake_book.get_items = lambda: [chapter]
        mock_epub.read_epub.return_value = fake_book

        conv = EpubToSkillConverter({"name": "test", "epub_path": "malformed.epub"})
        conv.data_file = os.path.join(self.temp_dir, "test_extracted.json")

        # Should not crash
        outcome = conv.extract_epub()
        self.assertTrue(outcome)
|
|
|
|
|
|
# ============================================================================
|
|
# Class 10: TestEpubJSONWorkflow
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubJSONWorkflow(unittest.TestCase):
    """Loading extracted data from JSON files and building skills from it."""

    def setUp(self):
        """Skip when the scraper cannot be imported; otherwise make a scratch dir."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _scratch_converter(self):
        """Return a converter whose skill_dir and data_file live in the scratch dir."""
        conv = EpubToSkillConverter({"name": "test"})
        conv.skill_dir = os.path.join(self.temp_dir, "test")
        conv.data_file = os.path.join(self.temp_dir, "test_extracted.json")
        return conv

    def _write_json(self, filename, payload, **dump_options):
        """Serialise *payload* to *filename* inside the scratch dir; return the path."""
        target = os.path.join(self.temp_dir, filename)
        with open(target, "w") as handle:
            json.dump(payload, handle, **dump_options)
        return target

    def test_load_extracted_json(self):
        """load_extracted_data returns truthy and populates extracted_data."""
        conv = EpubToSkillConverter({"name": "test"})
        source = self._write_json("test_extracted.json", _make_sample_extracted_data())

        loaded = conv.load_extracted_data(source)

        self.assertTrue(loaded)
        self.assertIsNotNone(conv.extracted_data)
        self.assertEqual(conv.extracted_data["total_sections"], 2)

    def test_build_from_json(self):
        """A skill built from loaded JSON has a SKILL.md file."""
        conv = self._scratch_converter()
        source = self._write_json("test_extracted.json", _make_sample_extracted_data())

        conv.load_extracted_data(source)
        conv.build_skill()

        self.assertTrue(Path(self.temp_dir, "test", "SKILL.md").exists())

    def test_json_round_trip(self):
        """Section and code-block totals are unchanged by a dump/load cycle."""
        conv = self._scratch_converter()
        original_data = _make_sample_extracted_data(include_code=True, include_tables=True)

        # Save
        source = self._write_json("test_extracted.json", original_data, default=str)

        # Load
        conv.load_extracted_data(source)

        for key in ("total_sections", "total_code_blocks"):
            self.assertEqual(conv.extracted_data[key], original_data[key])

    def test_load_invalid_json(self):
        """Malformed JSON makes load_extracted_data raise JSONDecodeError."""
        conv = EpubToSkillConverter({"name": "test"})
        broken = os.path.join(self.temp_dir, "bad.json")
        Path(broken).write_text("{invalid json content")

        self.assertRaises(json.JSONDecodeError, conv.load_extracted_data, broken)

    def test_load_nonexistent_json(self):
        """A missing JSON file makes load_extracted_data raise FileNotFoundError."""
        conv = EpubToSkillConverter({"name": "test"})

        self.assertRaises(
            FileNotFoundError, conv.load_extracted_data, "/nonexistent/path/data.json"
        )

    def test_json_with_missing_fields(self):
        """A payload with only ``pages`` and ``metadata`` can still be built."""
        conv = self._scratch_converter()

        # Minimal JSON — missing optional fields
        only_page = {
            "section_number": 1,
            "heading": "Test",
            "heading_level": "h1",
            "text": "Content",
            "headings": [],
            "code_samples": [],
            "tables": [],
            "images": [],
        }
        minimal_data = {"pages": [only_page], "metadata": {"title": "Test"}}
        source = self._write_json("minimal.json", minimal_data)

        conv.load_extracted_data(source)
        # Should not crash when building
        conv.build_skill()
|
|
|
|
|
|
# ============================================================================
|
|
# Class 11: TestEpubCLIArguments
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubCLIArguments(unittest.TestCase):
    """Argument parsing for the EPUB command-line options."""

    def setUp(self):
        """Skip the whole class when the scraper cannot be imported."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")

    def _parse_args(self, args_list):
        """Parse *args_list* with a fresh parser carrying the EPUB arguments."""
        import argparse

        from skill_seekers.cli.arguments.epub import add_epub_arguments

        cli = argparse.ArgumentParser()
        add_epub_arguments(cli)
        return cli.parse_args(args_list)

    def test_epub_flag_accepted(self):
        """--epub stores the given path."""
        parsed = self._parse_args(["--epub", "book.epub"])
        self.assertEqual(parsed.epub, "book.epub")

    def test_from_json_flag_accepted(self):
        """--from-json stores the given path."""
        parsed = self._parse_args(["--from-json", "data.json"])
        self.assertEqual(parsed.from_json, "data.json")

    def test_name_flag_accepted(self):
        """--name stores the given skill name."""
        parsed = self._parse_args(["--epub", "book.epub", "--name", "mybook"])
        self.assertEqual(parsed.name, "mybook")

    def test_enhance_level_default_zero(self):
        """enhance_level is 0 when the option is not supplied."""
        parsed = self._parse_args(["--epub", "book.epub"])
        self.assertEqual(parsed.enhance_level, 0)

    def test_dry_run_flag(self):
        """--dry-run sets a truthy dry_run attribute."""
        parsed = self._parse_args(["--epub", "book.epub", "--dry-run"])
        self.assertTrue(parsed.dry_run)

    def test_no_args_accepted(self):
        """An empty argument list parses, leaving epub unset or None."""
        # Parser itself doesn't enforce --epub or --from-json — main() does
        parsed = self._parse_args([])
        self.assertIsNone(getattr(parsed, "epub", None))

    def test_verbose_flag(self):
        """--verbose sets a truthy verbose attribute."""
        parsed = self._parse_args(["--epub", "book.epub", "--verbose"])
        self.assertTrue(parsed.verbose)

    def test_quiet_flag(self):
        """--quiet sets a truthy quiet attribute."""
        parsed = self._parse_args(["--epub", "book.epub", "--quiet"])
        self.assertTrue(parsed.quiet)
|
|
|
|
|
|
# ============================================================================
|
|
# Class 12: TestEpubHelperFunctions
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubHelperFunctions(unittest.TestCase):
    """Module-level helpers: description inference, code scoring, filename cleanup."""

    def setUp(self):
        """Skip the whole class when the scraper cannot be imported."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")

    def test_infer_description_from_metadata_description(self):
        """A metadata description yields a "Use when…" sentence mentioning its topic."""
        inferred = infer_description_from_epub(
            {"description": "A comprehensive guide to testing software"}
        )
        self.assertTrue(inferred.startswith("Use when"))
        self.assertIn("testing", inferred.lower())

    def test_infer_description_from_metadata_title(self):
        """With only a title, the lower-cased result contains the book name."""
        inferred = infer_description_from_epub({"title": "Programming Rust, 2nd Edition"})
        self.assertIn("programming rust", inferred.lower())

    def test_infer_description_fallback(self):
        """With no metadata, the supplied name appears in the result."""
        inferred = infer_description_from_epub(name="mybook")
        self.assertIn("mybook", inferred)

    def test_infer_description_empty_metadata(self):
        """Empty metadata and no name produce the generic description."""
        self.assertEqual(
            infer_description_from_epub({}),
            "Use when referencing this documentation",
        )

    def test_score_code_quality_ranges(self):
        """Empty code scores 0.0, scores stay in 0-10, and longer code scores higher."""
        self.assertEqual(_score_code_quality(""), 0.0)

        baseline = _score_code_quality("x = 1")
        self.assertGreaterEqual(baseline, 0.0)
        self.assertLessEqual(baseline, 10.0)

        # Long code with functions scores higher
        source_lines = [f"def func_{i}():" for i in range(15)]
        source_lines.append(" return True")
        longer = _score_code_quality("\n".join(source_lines))
        self.assertGreater(longer, baseline)

    def test_sanitize_filename(self):
        """_sanitize_filename maps each sample name to its expected form."""
        conv = EpubToSkillConverter({"name": "test"})
        expectations = (
            ("Hello World!", "hello_world"),
            ("my-file_name", "my_file_name"),
            ("Test: Special & Chars", "test_special_chars"),
        )
        # Plain loop (no subTest) so the first mismatch aborts, as before.
        for raw, cleaned in expectations:
            self.assertEqual(conv._sanitize_filename(raw), cleaned)
|
|
|
|
|
|
# ============================================================================
|
|
# Class 13: TestEpubSourceDetection
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubSourceDetection(unittest.TestCase):
    """Test source detection for EPUB files."""

    def setUp(self):
        """Import SourceDetector (skipping if unavailable) and create a scratch dir."""
        try:
            from skill_seekers.cli.source_detector import SourceDetector
        except ImportError:
            self.skipTest("source_detector not importable")
        self.SourceDetector = SourceDetector
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_epub_detected_as_epub_type(self):
        """A bare .epub filename is detected as type ``epub``."""
        result = self.SourceDetector.detect("test.epub")
        self.assertEqual(result.type, "epub")

    def test_epub_suggested_name(self):
        """The suggested name is the filename without its .epub suffix."""
        result = self.SourceDetector.detect("my-ebook.epub")
        self.assertEqual(result.suggested_name, "my-ebook")

    def test_epub_validation_missing_file(self):
        """Validating a detected but nonexistent .epub path raises ValueError."""
        result = self.SourceDetector.detect("/nonexistent/book.epub")
        with self.assertRaises(ValueError):
            self.SourceDetector.validate_source(result)

    def test_epub_validation_not_a_file(self):
        """Validating a directory whose name ends in .epub raises ValueError."""
        # A real directory named like an EPUB: it is detected by its extension,
        # but validation must reject it because it is not a regular file.
        dir_path = os.path.join(self.temp_dir, "test.epub")
        os.makedirs(dir_path)

        result = self.SourceDetector.detect(dir_path)
        with self.assertRaises(ValueError):
            self.SourceDetector.validate_source(result)

    def test_epub_with_path(self):
        """A relative path is detected as ``epub`` and kept verbatim in ``parsed``."""
        result = self.SourceDetector.detect("./books/test.epub")
        self.assertEqual(result.type, "epub")
        self.assertEqual(result.parsed["file_path"], "./books/test.epub")

    def test_pdf_still_detected(self):
        """Regression test: .pdf files still detected as pdf type."""
        result = self.SourceDetector.detect("document.pdf")
        self.assertEqual(result.type, "pdf")
|
|
|
|
|
|
# ============================================================================
|
|
# Class 14: TestEpubEdgeCases
|
|
# ============================================================================
|
|
|
|
|
|
class TestEpubEdgeCases(unittest.TestCase):
    """Test edge cases per W3C EPUB 3.3 spec."""

    def setUp(self):
        """Skip when the scraper cannot be imported; otherwise make a scratch dir."""
        if not IMPORT_OK:
            self.skipTest("epub_scraper not importable")
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Delete the scratch directory, ignoring removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _build_skill_from(self, config, data):
        """Build a skill from *data* in the scratch dir and return the skill root path."""
        conv = EpubToSkillConverter(config)
        conv.skill_dir = os.path.join(self.temp_dir, "test")
        conv.data_file = os.path.join(self.temp_dir, "test_extracted.json")
        conv.extracted_data = data
        conv.build_skill()
        return Path(self.temp_dir) / "test"

    @staticmethod
    def _section_from_html(markup, heading, number=1):
        """Parse *markup* and build an h1 section from its top-level nodes."""
        from bs4 import BeautifulSoup

        nodes = list(BeautifulSoup(markup, "html.parser").children)
        return _build_section(number, heading, "h1", nodes)

    def test_epub_no_toc(self):
        """A converter configured without an epub_path still writes SKILL.md."""
        skill_root = self._build_skill_from({"name": "test"}, _make_sample_extracted_data())
        self.assertTrue((skill_root / "SKILL.md").exists())

    def test_epub_empty_chapters(self):
        """Chapters with no text content handled gracefully."""
        # Empty body — no elements to process
        section = _build_section(1, "Empty", "h1", [])
        self.assertEqual(section["text"], "")
        self.assertEqual(section["code_samples"], [])

    def test_epub_single_chapter(self):
        """Single chapter produces valid output."""
        skill_root = self._build_skill_from(
            {"name": "test", "epub_path": "test.epub"},
            _make_sample_extracted_data(num_sections=1),
        )

        skill_md = skill_root / "SKILL.md"
        self.assertTrue(skill_md.exists())
        self.assertIn("Chapter 1", skill_md.read_text())

    def test_epub_unicode_content(self):
        """CJK and emoji text survive section building (sample also has Cyrillic)."""
        section = self._section_from_html(
            "<p>\u4f60\u597d\u4e16\u754c \u041f\u0440\u0438\u0432\u0435\u0442 \U0001f600</p>",
            "Unicode",
        )
        for fragment in ("\u4f60\u597d", "\U0001f600"):
            self.assertIn(fragment, section["text"])

    def test_epub_large_section_count(self):
        """100+ sections processed without error."""
        skill_root = self._build_skill_from(
            {"name": "test", "epub_path": "test.epub"},
            _make_sample_extracted_data(num_sections=100),
        )
        self.assertTrue((skill_root / "SKILL.md").exists())

    def test_epub_nested_headings(self):
        """h3/h4/h5/h6 become sub-headings within sections."""
        section = self._section_from_html(
            "<h3>Sub-section A</h3>"
            "<p>Content A</p>"
            "<h4>Sub-sub-section B</h4>"
            "<p>Content B</p>"
            "<h5>Deep heading</h5>"
            "<h6>Deepest heading</h6>",
            "Main",
        )

        sub_headings = section["headings"]
        self.assertEqual(len(sub_headings), 4)
        self.assertEqual(sub_headings[0]["level"], "h3")
        self.assertEqual(sub_headings[0]["text"], "Sub-section A")
        self.assertEqual(sub_headings[3]["level"], "h6")

    def test_fixed_layout_detected(self):
        """Fixed-layout EPUB — we extract whatever text exists."""
        payload = _make_sample_extracted_data(num_sections=1)
        payload["pages"][0]["text"] = "Some text from fixed-layout EPUB"

        skill_root = self._build_skill_from({"name": "test"}, payload)

        # any() stops reading reference files at the first one with the marker.
        mentions_marker = any(
            "fixed-layout" in ref.read_text()
            for ref in (skill_root / "references").glob("*.md")
        )
        self.assertTrue(mentions_marker)

    def test_epub2_vs_epub3(self):
        """Both EPUB 2 and EPUB 3 use the same code path — verify section building works."""
        cases = (
            # EPUB 2 style (simpler XHTML)
            ("<p>EPUB 2 content</p>", "EPUB 2 Chapter", "EPUB 2 content"),
            # EPUB 3 style (HTML5-ish XHTML)
            ("<section><p>EPUB 3 content</p></section>", "EPUB 3 Chapter", "EPUB 3 content"),
        )
        for markup, heading, expected_text in cases:
            section = self._section_from_html(markup, heading)
            self.assertIn(expected_text, section["text"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Run every test case in this module when the file is executed as a script.
    unittest.main()
|