feat: add EPUB input support (#310)

Adds EPUB as a first-class input source for skill generation.

- EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
- Dublin Core metadata, spine items, code blocks, tables, images extraction
- DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
- EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
- ebooklib as optional dep: pip install skill-seekers[epub]
- Wired into create command with .epub auto-detection
- 104 tests, all passing

Review fixes: removed 3 empty test stubs, fixed SVG double-counting in
_extract_images(), and added a logger.debug call to an except block that previously only contained `pass`.

Based on PR #310 by @christianbaumann.
Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
This commit is contained in:
yusyus
2026-03-15 02:34:41 +03:00
committed by GitHub
parent 83b9a695ba
commit 2e30970dfb
16 changed files with 4502 additions and 9 deletions

View File

@@ -410,6 +410,18 @@ WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
},
}
# EPUB specific (from epub.py)
EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Each entry maps an argument name to the positional ``flags`` and the
    # keyword ``kwargs`` that are forwarded to ``parser.add_argument()``.
    "epub": {
        "flags": ("--epub",),
        "kwargs": {"type": str, "help": "EPUB file path", "metavar": "PATH"},
    },
}
# Video specific (from video.py)
VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
"video_url": {
@@ -598,6 +610,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
"local": LOCAL_ARGUMENTS,
"pdf": PDF_ARGUMENTS,
"word": WORD_ARGUMENTS,
"epub": EPUB_ARGUMENTS,
"video": VIDEO_ARGUMENTS,
"config": CONFIG_ARGUMENTS,
}
@@ -636,6 +649,7 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
- 'local': Universal + local-specific
- 'pdf': Universal + pdf-specific
- 'word': Universal + word-specific
- 'epub': Universal + epub-specific
- 'video': Universal + video-specific
- 'advanced': Advanced/rare arguments
- 'all': All 120+ arguments
@@ -677,6 +691,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
for arg_name, arg_def in WORD_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["epub", "all"]:
for arg_name, arg_def in EPUB_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
if mode in ["video", "all"]:
for arg_name, arg_def in VIDEO_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,66 @@
"""EPUB command argument definitions.
This module defines ALL arguments for the epub command in ONE place.
Both epub_scraper.py (standalone) and parsers/epub_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# EPUB-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Each entry maps an argument name to the positional ``flags`` and the
    # keyword ``kwargs`` that are forwarded to ``parser.add_argument()``.
    "epub": {
        "flags": ("--epub",),
        "kwargs": {"type": str, "help": "Direct EPUB file path", "metavar": "PATH"},
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_epub_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the epub command on ``parser``.

    The shared arguments are registered first by calling
    ``add_all_standard_arguments()``. The action whose ``dest`` is
    ``enhance_level`` is then patched in place so that its default becomes 0
    and its help text reflects that. Finally each entry of
    ``EPUB_ARGUMENTS`` is added to the parser.

    Args:
        parser: The argparse parser (or subparser) to populate.
    """
    add_all_standard_arguments(parser)

    # Help text that replaces the shared one once the default is changed.
    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for EPUB), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
    )

    # The already-registered action is located through the parser's private
    # ``_actions`` list and mutated in place.
    for registered in parser._actions:
        if getattr(registered, "dest", None) != "enhance_level":
            continue
        registered.default = 0
        registered.help = enhance_help

    # EPUB-only flags go on top of the shared ones.
    for spec in EPUB_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])

View File

@@ -134,6 +134,8 @@ class CreateCommand:
return self._route_pdf()
elif self.source_info.type == "word":
return self._route_word()
elif self.source_info.type == "epub":
return self._route_epub()
elif self.source_info.type == "video":
return self._route_video()
elif self.source_info.type == "config":
@@ -351,6 +353,29 @@ class CreateCommand:
finally:
sys.argv = original_argv
def _route_epub(self) -> int:
    """Hand the request over to ``epub_scraper.main()``.

    An argv list is assembled from the detected EPUB path plus whatever
    ``self._add_common_args()`` appends, temporarily installed as
    ``sys.argv`` for the duration of the call, and restored afterwards.

    Returns:
        Whatever ``epub_scraper.main()`` returns.
    """
    from skill_seekers.cli import epub_scraper

    # Program name followed by the EPUB file taken from the detected source.
    epub_path = self.source_info.parsed["file_path"]
    scraper_argv = ["epub_scraper", "--epub", epub_path]

    # Universal arguments are appended in place by the shared helper.
    self._add_common_args(scraper_argv)

    logger.debug(f"Calling epub_scraper with argv: {scraper_argv}")

    saved_argv = sys.argv
    sys.argv = scraper_argv
    try:
        return epub_scraper.main()
    finally:
        # Always put the caller's argv back, even if the scraper raises.
        sys.argv = saved_argv
def _route_video(self) -> int:
"""Route to video scraper (video_scraper.py)."""
from skill_seekers.cli import video_scraper
@@ -541,6 +566,7 @@ Examples:
Local: skill-seekers create ./my-project -p comprehensive
PDF: skill-seekers create tutorial.pdf --ocr
DOCX: skill-seekers create document.docx
EPUB: skill-seekers create ebook.epub
Video: skill-seekers create https://youtube.com/watch?v=...
Video: skill-seekers create recording.mp4
Config: skill-seekers create configs/react.json
@@ -551,6 +577,7 @@ Source Auto-Detection:
• ./path → local codebase
• file.pdf → PDF extraction
• file.docx → Word document extraction
• file.epub → EPUB extraction
• youtube.com/... → Video transcript extraction
• file.mp4 → Video file extraction
• file.json → multi-source config
@@ -560,6 +587,7 @@ Progressive Help (13 → 120+ flags):
--help-github GitHub repository options
--help-local Local codebase analysis
--help-pdf PDF extraction options
--help-epub EPUB extraction options
--help-video Video extraction options
--help-advanced Rare/advanced options
--help-all All options + compatibility
@@ -591,6 +619,9 @@ Common Workflows:
parser.add_argument(
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
)
parser.add_argument(
"--help-epub", action="store_true", help=argparse.SUPPRESS, dest="_help_epub"
)
parser.add_argument(
"--help-video", action="store_true", help=argparse.SUPPRESS, dest="_help_video"
)
@@ -652,6 +683,15 @@ Common Workflows:
add_create_arguments(parser_word, mode="word")
parser_word.print_help()
return 0
elif args._help_epub:
parser_epub = argparse.ArgumentParser(
prog="skill-seekers create",
description="Create skill from EPUB e-book (.epub)",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
add_create_arguments(parser_epub, mode="epub")
parser_epub.print_help()
return 0
elif args._help_video:
parser_video = argparse.ArgumentParser(
prog="skill-seekers create",

File diff suppressed because it is too large Load Diff

View File

@@ -13,6 +13,7 @@ Commands:
github Scrape GitHub repository
pdf Extract from PDF file
word Extract from Word (.docx) file
epub Extract from EPUB e-book (.epub)
video Extract from video (YouTube or local)
unified Multi-source scraping (docs + GitHub + PDF)
analyze Analyze local codebase and extract code knowledge
@@ -50,6 +51,7 @@ COMMAND_MODULES = {
"github": "skill_seekers.cli.github_scraper",
"pdf": "skill_seekers.cli.pdf_scraper",
"word": "skill_seekers.cli.word_scraper",
"epub": "skill_seekers.cli.epub_scraper",
"video": "skill_seekers.cli.video_scraper",
"unified": "skill_seekers.cli.unified_scraper",
"enhance": "skill_seekers.cli.enhance_command",

View File

@@ -13,6 +13,7 @@ from .scrape_parser import ScrapeParser
from .github_parser import GitHubParser
from .pdf_parser import PDFParser
from .word_parser import WordParser
from .epub_parser import EpubParser
from .video_parser import VideoParser
from .unified_parser import UnifiedParser
from .enhance_parser import EnhanceParser
@@ -45,6 +46,7 @@ PARSERS = [
EnhanceStatusParser(),
PDFParser(),
WordParser(),
EpubParser(),
VideoParser(),
UnifiedParser(),
EstimateParser(),

View File

@@ -0,0 +1,32 @@
"""EPUB subcommand parser.
Uses shared argument definitions from arguments.epub to ensure
consistency with the standalone epub_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.epub import add_epub_arguments
class EpubParser(SubcommandParser):
    """Subcommand parser definition for ``epub``."""

    @property
    def name(self) -> str:
        """Subcommand name."""
        return "epub"

    @property
    def help(self) -> str:
        """Short help string for the subcommand."""
        return "Extract from EPUB e-book (.epub)"

    @property
    def description(self) -> str:
        """Longer description string for the subcommand."""
        return "Extract content from EPUB e-book (.epub) and generate skill"

    def add_arguments(self, parser):
        """Populate ``parser`` with the epub arguments.

        All registration is delegated to ``add_epub_arguments()`` from
        ``skill_seekers.cli.arguments.epub`` so the definitions live in a
        single shared place.
        """
        add_epub_arguments(parser)

View File

@@ -63,6 +63,9 @@ class SourceDetector:
if source.endswith(".docx"):
return cls._detect_word(source)
if source.endswith(".epub"):
return cls._detect_epub(source)
# Video file extensions
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
if source.lower().endswith(VIDEO_EXTENSIONS):
@@ -99,6 +102,7 @@ class SourceDetector:
" Local: skill-seekers create ./my-project\n"
" PDF: skill-seekers create tutorial.pdf\n"
" DOCX: skill-seekers create document.docx\n"
" EPUB: skill-seekers create ebook.epub\n"
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
" Video: skill-seekers create recording.mp4\n"
" Config: skill-seekers create configs/react.json"
@@ -128,6 +132,14 @@ class SourceDetector:
type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
)
@classmethod
def _detect_epub(cls, source: str) -> SourceInfo:
    """Build the ``SourceInfo`` describing an EPUB file source.

    The suggested name is the file's base name with its final extension
    removed; ``source`` itself is stored unchanged as both the parsed
    ``file_path`` and the raw input.
    """
    stem, _extension = os.path.splitext(os.path.basename(source))
    return SourceInfo(
        type="epub",
        parsed={"file_path": source},
        suggested_name=stem,
        raw_input=source,
    )
@classmethod
def _detect_video_file(cls, source: str) -> SourceInfo:
"""Detect local video file source."""
@@ -277,6 +289,13 @@ class SourceDetector:
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "epub":
file_path = source_info.parsed["file_path"]
if not os.path.exists(file_path):
raise ValueError(f"EPUB file does not exist: {file_path}")
if not os.path.isfile(file_path):
raise ValueError(f"Path is not a file: {file_path}")
elif source_info.type == "video":
if source_info.parsed.get("source_kind") == "file":
file_path = source_info.parsed["file_path"]