feat: add EPUB input support (#310)

Adds EPUB as a first-class input source for skill generation.

- EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
- Dublin Core metadata, spine items, code blocks, tables, images extraction
- DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
- EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
- ebooklib as optional dep: pip install skill-seekers[epub]
- Wired into create command with .epub auto-detection
- 104 tests, all passing

Review fixes: removed 3 empty test stubs, fixed SVG double-counting in _extract_images(), added logger.debug to bare except pass.

Based on PR #310 by @christianbaumann.

Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
2026-03-15 02:34:41 +03:00
parent 83b9a695ba
commit 2e30970dfb
16 changed files with 4502 additions and 9 deletions
--- a/src/skill_seekers/cli/arguments/create.py
+++ b/src/skill_seekers/cli/arguments/create.py
@@ -410,6 +410,18 @@ WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    },
 }

+# EPUB specific (from epub.py)
+EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "epub": {
+        "flags": ("--epub",),
+        "kwargs": {
+            "type": str,
+            "help": "EPUB file path",
+            "metavar": "PATH",
+        },
+    },
+}
+
 # Video specific (from video.py)
 VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
    "video_url": {
@@ -598,6 +610,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
        "local": LOCAL_ARGUMENTS,
        "pdf": PDF_ARGUMENTS,
        "word": WORD_ARGUMENTS,
+        "epub": EPUB_ARGUMENTS,
        "video": VIDEO_ARGUMENTS,
        "config": CONFIG_ARGUMENTS,
    }
@@ -636,6 +649,7 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
    - 'local': Universal + local-specific
    - 'pdf': Universal + pdf-specific
    - 'word': Universal + word-specific
+    - 'epub': Universal + epub-specific
    - 'video': Universal + video-specific
    - 'advanced': Advanced/rare arguments
    - 'all': All 120+ arguments
@@ -677,6 +691,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
        for arg_name, arg_def in WORD_ARGUMENTS.items():
            parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

+    if mode in ["epub", "all"]:
+        for arg_name, arg_def in EPUB_ARGUMENTS.items():
+            parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
+
    if mode in ["video", "all"]:
        for arg_name, arg_def in VIDEO_ARGUMENTS.items():
            parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
--- a/src/skill_seekers/cli/arguments/epub.py
+++ b/src/skill_seekers/cli/arguments/epub.py
@@ -0,0 +1,66 @@
+"""EPUB command argument definitions.
+
+This module defines ALL arguments for the epub command in ONE place.
+Both epub_scraper.py (standalone) and parsers/epub_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# EPUB-specific argument definitions as data structure
+# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
+#       verbose, quiet, workflow args) are registered by add_all_standard_arguments().
+EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "epub": {
+        "flags": ("--epub",),
+        "kwargs": {
+            "type": str,
+            "help": "Direct EPUB file path",
+            "metavar": "PATH",
+        },
+    },
+    "from_json": {
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_epub_arguments(parser: argparse.ArgumentParser) -> None:
+    """Add all epub command arguments to a parser.
+
+    Registers shared args (name, description, output, enhance-level, api-key,
+    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
+    then adds EPUB-specific args on top.
+
+    The default for --enhance-level is overridden to 0 (disabled) for EPUB.
+    """
+    # Shared universal args first
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for EPUB
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for EPUB), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
+            )
+
+    # EPUB-specific args
+    for arg_name, arg_def in EPUB_ARGUMENTS.items():
+        flags = arg_def["flags"]
+        kwargs = arg_def["kwargs"]
+        parser.add_argument(*flags, **kwargs)
--- a/src/skill_seekers/cli/create_command.py
+++ b/src/skill_seekers/cli/create_command.py
@@ -134,6 +134,8 @@ class CreateCommand:
            return self._route_pdf()
        elif self.source_info.type == "word":
            return self._route_word()
+        elif self.source_info.type == "epub":
+            return self._route_epub()
        elif self.source_info.type == "video":
            return self._route_video()
        elif self.source_info.type == "config":
@@ -351,6 +353,29 @@ class CreateCommand:
        finally:
            sys.argv = original_argv

+    def _route_epub(self) -> int:
+        """Route to EPUB scraper (epub_scraper.py)."""
+        from skill_seekers.cli import epub_scraper
+
+        # Reconstruct argv for epub_scraper
+        argv = ["epub_scraper"]
+
+        # Add EPUB file
+        file_path = self.source_info.parsed["file_path"]
+        argv.extend(["--epub", file_path])
+
+        # Add universal arguments
+        self._add_common_args(argv)
+
+        # Call epub_scraper with modified argv
+        logger.debug(f"Calling epub_scraper with argv: {argv}")
+        original_argv = sys.argv
+        try:
+            sys.argv = argv
+            return epub_scraper.main()
+        finally:
+            sys.argv = original_argv
+
    def _route_video(self) -> int:
        """Route to video scraper (video_scraper.py)."""
        from skill_seekers.cli import video_scraper
@@ -541,6 +566,7 @@ Examples:
  Local:    skill-seekers create ./my-project -p comprehensive
  PDF:      skill-seekers create tutorial.pdf --ocr
  DOCX:     skill-seekers create document.docx
+  EPUB:     skill-seekers create ebook.epub
  Video:    skill-seekers create https://youtube.com/watch?v=...
  Video:    skill-seekers create recording.mp4
  Config:   skill-seekers create configs/react.json
@@ -551,6 +577,7 @@ Source Auto-Detection:
  • ./path → local codebase
  • file.pdf → PDF extraction
  • file.docx → Word document extraction
+  • file.epub → EPUB extraction
  • youtube.com/... → Video transcript extraction
  • file.mp4 → Video file extraction
  • file.json → multi-source config
@@ -560,6 +587,7 @@ Progressive Help (13 → 120+ flags):
  --help-github    GitHub repository options
  --help-local     Local codebase analysis
  --help-pdf       PDF extraction options
+  --help-epub      EPUB extraction options
  --help-video     Video extraction options
  --help-advanced  Rare/advanced options
  --help-all       All options + compatibility
@@ -591,6 +619,9 @@ Common Workflows:
    parser.add_argument(
        "--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
    )
+    parser.add_argument(
+        "--help-epub", action="store_true", help=argparse.SUPPRESS, dest="_help_epub"
+    )
    parser.add_argument(
        "--help-video", action="store_true", help=argparse.SUPPRESS, dest="_help_video"
    )
@@ -652,6 +683,15 @@ Common Workflows:
        add_create_arguments(parser_word, mode="word")
        parser_word.print_help()
        return 0
+    elif args._help_epub:
+        parser_epub = argparse.ArgumentParser(
+            prog="skill-seekers create",
+            description="Create skill from EPUB e-book (.epub)",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+        )
+        add_create_arguments(parser_epub, mode="epub")
+        parser_epub.print_help()
+        return 0
    elif args._help_video:
        parser_video = argparse.ArgumentParser(
            prog="skill-seekers create",
--- a/src/skill_seekers/cli/epub_scraper.py
+++ b/src/skill_seekers/cli/epub_scraper.py
--- a/src/skill_seekers/cli/main.py
+++ b/src/skill_seekers/cli/main.py
@@ -13,6 +13,7 @@ Commands:
    github               Scrape GitHub repository
    pdf                  Extract from PDF file
    word                 Extract from Word (.docx) file
+    epub                 Extract from EPUB e-book (.epub)
    video                Extract from video (YouTube or local)
    unified              Multi-source scraping (docs + GitHub + PDF)
    analyze              Analyze local codebase and extract code knowledge
@@ -50,6 +51,7 @@ COMMAND_MODULES = {
    "github": "skill_seekers.cli.github_scraper",
    "pdf": "skill_seekers.cli.pdf_scraper",
    "word": "skill_seekers.cli.word_scraper",
+    "epub": "skill_seekers.cli.epub_scraper",
    "video": "skill_seekers.cli.video_scraper",
    "unified": "skill_seekers.cli.unified_scraper",
    "enhance": "skill_seekers.cli.enhance_command",
--- a/src/skill_seekers/cli/parsers/__init__.py
+++ b/src/skill_seekers/cli/parsers/__init__.py
@@ -13,6 +13,7 @@ from .scrape_parser import ScrapeParser
 from .github_parser import GitHubParser
 from .pdf_parser import PDFParser
 from .word_parser import WordParser
+from .epub_parser import EpubParser
 from .video_parser import VideoParser
 from .unified_parser import UnifiedParser
 from .enhance_parser import EnhanceParser
@@ -45,6 +46,7 @@ PARSERS = [
    EnhanceStatusParser(),
    PDFParser(),
    WordParser(),
+    EpubParser(),
    VideoParser(),
    UnifiedParser(),
    EstimateParser(),
--- a/src/skill_seekers/cli/parsers/epub_parser.py
+++ b/src/skill_seekers/cli/parsers/epub_parser.py
@@ -0,0 +1,32 @@
+"""EPUB subcommand parser.
+
+Uses shared argument definitions from arguments.epub to ensure
+consistency with the standalone epub_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.epub import add_epub_arguments
+
+
+class EpubParser(SubcommandParser):
+    """Parser for epub subcommand."""
+
+    @property
+    def name(self) -> str:
+        return "epub"
+
+    @property
+    def help(self) -> str:
+        return "Extract from EPUB e-book (.epub)"
+
+    @property
+    def description(self) -> str:
+        return "Extract content from EPUB e-book (.epub) and generate skill"
+
+    def add_arguments(self, parser):
+        """Add epub-specific arguments.
+
+        Uses shared argument definitions to ensure consistency
+        with epub_scraper.py (standalone scraper).
+        """
+        add_epub_arguments(parser)
--- a/src/skill_seekers/cli/source_detector.py
+++ b/src/skill_seekers/cli/source_detector.py
@@ -63,6 +63,9 @@ class SourceDetector:
        if source.endswith(".docx"):
            return cls._detect_word(source)

+        if source.endswith(".epub"):
+            return cls._detect_epub(source)
+
        # Video file extensions
        VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
        if source.lower().endswith(VIDEO_EXTENSIONS):
@@ -99,6 +102,7 @@ class SourceDetector:
            "  Local:  skill-seekers create ./my-project\n"
            "  PDF:    skill-seekers create tutorial.pdf\n"
            "  DOCX:   skill-seekers create document.docx\n"
+            "  EPUB:   skill-seekers create ebook.epub\n"
            "  Video:  skill-seekers create https://youtube.com/watch?v=...\n"
            "  Video:  skill-seekers create recording.mp4\n"
            "  Config: skill-seekers create configs/react.json"
@@ -128,6 +132,14 @@ class SourceDetector:
            type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
        )

+    @classmethod
+    def _detect_epub(cls, source: str) -> SourceInfo:
+        """Detect EPUB file source."""
+        name = os.path.splitext(os.path.basename(source))[0]
+        return SourceInfo(
+            type="epub", parsed={"file_path": source}, suggested_name=name, raw_input=source
+        )
+
    @classmethod
    def _detect_video_file(cls, source: str) -> SourceInfo:
        """Detect local video file source."""
@@ -277,6 +289,13 @@ class SourceDetector:
            if not os.path.isfile(file_path):
                raise ValueError(f"Path is not a file: {file_path}")

+        elif source_info.type == "epub":
+            file_path = source_info.parsed["file_path"]
+            if not os.path.exists(file_path):
+                raise ValueError(f"EPUB file does not exist: {file_path}")
+            if not os.path.isfile(file_path):
+                raise ValueError(f"Path is not a file: {file_path}")
+
        elif source_info.type == "video":
            if source_info.parsed.get("source_kind") == "file":
                file_path = source_info.parsed["file_path"]