feat: add EPUB input support (#310)

Adds EPUB as a first-class input source for skill generation.

- EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
- Dublin Core metadata, spine items, code blocks, tables, images extraction
- DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
- EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
- ebooklib as optional dep: pip install skill-seekers[epub]
- Wired into create command with .epub auto-detection
- 104 tests, all passing

Review fixes: removed 3 empty test stubs, fixed SVG double-counting in _extract_images(), added logger.debug to bare except pass.

Based on PR #310 by @christianbaumann.

Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
2026-03-15 02:34:41 +03:00
parent 83b9a695ba
commit 2e30970dfb
16 changed files with 4502 additions and 9 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -114,6 +114,11 @@ docx = [
    "python-docx>=1.1.0",
 ]

+# EPUB (.epub) support
+epub = [
+    "ebooklib>=0.18",
+]
+
 # Video processing (lightweight: YouTube transcripts + metadata)
 video = [
    "yt-dlp>=2024.12.0",
@@ -178,6 +183,7 @@ embedding = [
 all = [
    "mammoth>=1.6.0",
    "python-docx>=1.1.0",
+    "ebooklib>=0.18",
    "yt-dlp>=2024.12.0",
    "youtube-transcript-api>=1.2.0",
    "mcp>=1.25,<2",
@@ -222,6 +228,7 @@ skill-seekers-scrape = "skill_seekers.cli.doc_scraper:main"
 skill-seekers-github = "skill_seekers.cli.github_scraper:main"
 skill-seekers-pdf = "skill_seekers.cli.pdf_scraper:main"
 skill-seekers-word = "skill_seekers.cli.word_scraper:main"
+skill-seekers-epub = "skill_seekers.cli.epub_scraper:main"
 skill-seekers-video = "skill_seekers.cli.video_scraper:main"
 skill-seekers-unified = "skill_seekers.cli.unified_scraper:main"
 skill-seekers-enhance = "skill_seekers.cli.enhance_command:main"