feat: add EPUB input support (#310)
Adds EPUB as a first-class input source for skill generation.

- EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
- Dublin Core metadata, spine items, code blocks, tables, images extraction
- DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
- EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
- ebooklib as optional dep: pip install skill-seekers[epub]
- Wired into create command with .epub auto-detection
- 104 tests, all passing

Review fixes: removed 3 empty test stubs, fixed SVG double-counting in _extract_images(), added logger.debug to bare except pass.

Based on PR #310 by @christianbaumann.

Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
This commit is contained in:
@@ -410,6 +410,18 @@ WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
},
|
||||
}
|
||||
|
||||
# EPUB specific (from epub.py)
|
||||
EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Each entry pairs the option strings ("flags") with the keyword
    # arguments ("kwargs") that are unpacked into parser.add_argument().
    "epub": {
        "flags": ("--epub",),
        "kwargs": {
            "type": str,
            "help": "EPUB file path",
            "metavar": "PATH",
        },
    },
}
|
||||
|
||||
# Video specific (from video.py)
|
||||
VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"video_url": {
|
||||
@@ -598,6 +610,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
|
||||
"local": LOCAL_ARGUMENTS,
|
||||
"pdf": PDF_ARGUMENTS,
|
||||
"word": WORD_ARGUMENTS,
|
||||
"epub": EPUB_ARGUMENTS,
|
||||
"video": VIDEO_ARGUMENTS,
|
||||
"config": CONFIG_ARGUMENTS,
|
||||
}
|
||||
@@ -636,6 +649,7 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
- 'local': Universal + local-specific
|
||||
- 'pdf': Universal + pdf-specific
|
||||
- 'word': Universal + word-specific
|
||||
- 'epub': Universal + epub-specific
|
||||
- 'video': Universal + video-specific
|
||||
- 'advanced': Advanced/rare arguments
|
||||
- 'all': All 120+ arguments
|
||||
@@ -677,6 +691,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
for arg_name, arg_def in WORD_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["epub", "all"]:
|
||||
for arg_name, arg_def in EPUB_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["video", "all"]:
|
||||
for arg_name, arg_def in VIDEO_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
66
src/skill_seekers/cli/arguments/epub.py
Normal file
66
src/skill_seekers/cli/arguments/epub.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""EPUB command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the epub command in ONE place.
|
||||
Both epub_scraper.py (standalone) and parsers/epub_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# EPUB-specific argument definitions as data structure
|
||||
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
|
||||
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
|
||||
EPUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Each entry pairs the option strings ("flags") with the keyword
    # arguments ("kwargs") that are unpacked into parser.add_argument().
    # Insertion order is the order in which the options get registered.
    "epub": {
        "flags": ("--epub",),
        "kwargs": {
            "type": str,
            "help": "Direct EPUB file path",
            "metavar": "PATH",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
|
||||
|
||||
|
||||
def add_epub_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the epub command on ``parser``.

    The shared arguments (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) are added first through
    ``add_all_standard_arguments()``. Any registered action whose ``dest`` is
    ``enhance_level`` is then given a default of 0 and EPUB-specific help
    text, and finally each entry of ``EPUB_ARGUMENTS`` is added.

    Args:
        parser: Parser that receives the arguments; it is modified in place.
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for EPUB), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
    )

    # The enhance-level action already exists at this point, so it is patched
    # on the parser's action list instead of being registered a second time.
    for registered in parser._actions:
        if getattr(registered, "dest", None) != "enhance_level":
            continue
        registered.default = 0
        registered.help = enhance_help

    # EPUB-only options go on top of the shared ones.
    for spec in EPUB_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
|
||||
@@ -134,6 +134,8 @@ class CreateCommand:
|
||||
return self._route_pdf()
|
||||
elif self.source_info.type == "word":
|
||||
return self._route_word()
|
||||
elif self.source_info.type == "epub":
|
||||
return self._route_epub()
|
||||
elif self.source_info.type == "video":
|
||||
return self._route_video()
|
||||
elif self.source_info.type == "config":
|
||||
@@ -351,6 +353,29 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_epub(self) -> int:
    """Hand the detected EPUB source over to ``epub_scraper.main()``.

    A fresh argument vector is assembled from the detected file path plus
    whatever ``_add_common_args()`` appends, installed as ``sys.argv`` for
    the duration of the call, and the previous ``sys.argv`` is put back
    afterwards, even when the scraper raises.

    Returns:
        The value returned by ``epub_scraper.main()``.
    """
    from skill_seekers.cli import epub_scraper

    # Program name followed by the EPUB file taken from the detected source.
    argv = ["epub_scraper", "--epub", self.source_info.parsed["file_path"]]

    # Universal arguments are appended in place by the shared helper.
    self._add_common_args(argv)

    logger.debug(f"Calling epub_scraper with argv: {argv}")

    saved_argv = sys.argv
    sys.argv = argv
    try:
        return epub_scraper.main()
    finally:
        sys.argv = saved_argv
|
||||
|
||||
def _route_video(self) -> int:
|
||||
"""Route to video scraper (video_scraper.py)."""
|
||||
from skill_seekers.cli import video_scraper
|
||||
@@ -541,6 +566,7 @@ Examples:
|
||||
Local: skill-seekers create ./my-project -p comprehensive
|
||||
PDF: skill-seekers create tutorial.pdf --ocr
|
||||
DOCX: skill-seekers create document.docx
|
||||
EPUB: skill-seekers create ebook.epub
|
||||
Video: skill-seekers create https://youtube.com/watch?v=...
|
||||
Video: skill-seekers create recording.mp4
|
||||
Config: skill-seekers create configs/react.json
|
||||
@@ -551,6 +577,7 @@ Source Auto-Detection:
|
||||
• ./path → local codebase
|
||||
• file.pdf → PDF extraction
|
||||
• file.docx → Word document extraction
|
||||
• file.epub → EPUB extraction
|
||||
• youtube.com/... → Video transcript extraction
|
||||
• file.mp4 → Video file extraction
|
||||
• file.json → multi-source config
|
||||
@@ -560,6 +587,7 @@ Progressive Help (13 → 120+ flags):
|
||||
--help-github GitHub repository options
|
||||
--help-local Local codebase analysis
|
||||
--help-pdf PDF extraction options
|
||||
--help-epub EPUB extraction options
|
||||
--help-video Video extraction options
|
||||
--help-advanced Rare/advanced options
|
||||
--help-all All options + compatibility
|
||||
@@ -591,6 +619,9 @@ Common Workflows:
|
||||
parser.add_argument(
|
||||
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-epub", action="store_true", help=argparse.SUPPRESS, dest="_help_epub"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-video", action="store_true", help=argparse.SUPPRESS, dest="_help_video"
|
||||
)
|
||||
@@ -652,6 +683,15 @@ Common Workflows:
|
||||
add_create_arguments(parser_word, mode="word")
|
||||
parser_word.print_help()
|
||||
return 0
|
||||
elif args._help_epub:
|
||||
parser_epub = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
description="Create skill from EPUB e-book (.epub)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
add_create_arguments(parser_epub, mode="epub")
|
||||
parser_epub.print_help()
|
||||
return 0
|
||||
elif args._help_video:
|
||||
parser_video = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
|
||||
1206
src/skill_seekers/cli/epub_scraper.py
Normal file
1206
src/skill_seekers/cli/epub_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -13,6 +13,7 @@ Commands:
|
||||
github Scrape GitHub repository
|
||||
pdf Extract from PDF file
|
||||
word Extract from Word (.docx) file
|
||||
epub Extract from EPUB e-book (.epub)
|
||||
video Extract from video (YouTube or local)
|
||||
unified Multi-source scraping (docs + GitHub + PDF)
|
||||
analyze Analyze local codebase and extract code knowledge
|
||||
@@ -50,6 +51,7 @@ COMMAND_MODULES = {
|
||||
"github": "skill_seekers.cli.github_scraper",
|
||||
"pdf": "skill_seekers.cli.pdf_scraper",
|
||||
"word": "skill_seekers.cli.word_scraper",
|
||||
"epub": "skill_seekers.cli.epub_scraper",
|
||||
"video": "skill_seekers.cli.video_scraper",
|
||||
"unified": "skill_seekers.cli.unified_scraper",
|
||||
"enhance": "skill_seekers.cli.enhance_command",
|
||||
|
||||
@@ -13,6 +13,7 @@ from .scrape_parser import ScrapeParser
|
||||
from .github_parser import GitHubParser
|
||||
from .pdf_parser import PDFParser
|
||||
from .word_parser import WordParser
|
||||
from .epub_parser import EpubParser
|
||||
from .video_parser import VideoParser
|
||||
from .unified_parser import UnifiedParser
|
||||
from .enhance_parser import EnhanceParser
|
||||
@@ -45,6 +46,7 @@ PARSERS = [
|
||||
EnhanceStatusParser(),
|
||||
PDFParser(),
|
||||
WordParser(),
|
||||
EpubParser(),
|
||||
VideoParser(),
|
||||
UnifiedParser(),
|
||||
EstimateParser(),
|
||||
|
||||
32
src/skill_seekers/cli/parsers/epub_parser.py
Normal file
32
src/skill_seekers/cli/parsers/epub_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""EPUB subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.epub to ensure
|
||||
consistency with the standalone epub_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.epub import add_epub_arguments
|
||||
|
||||
|
||||
class EpubParser(SubcommandParser):
    """Subcommand parser for ``epub``."""

    @property
    def name(self) -> str:
        """Subcommand name."""
        return "epub"

    @property
    def help(self) -> str:
        """Short help string for the subcommand."""
        return "Extract from EPUB e-book (.epub)"

    @property
    def description(self) -> str:
        """Longer description of the subcommand."""
        return "Extract content from EPUB e-book (.epub) and generate skill"

    def add_arguments(self, parser):
        """Populate ``parser`` with the epub command's arguments.

        Delegates to ``add_epub_arguments()`` so the argument set comes from
        the shared definitions in ``skill_seekers.cli.arguments.epub``.
        """
        add_epub_arguments(parser)
|
||||
@@ -63,6 +63,9 @@ class SourceDetector:
|
||||
if source.endswith(".docx"):
|
||||
return cls._detect_word(source)
|
||||
|
||||
if source.endswith(".epub"):
|
||||
return cls._detect_epub(source)
|
||||
|
||||
# Video file extensions
|
||||
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
|
||||
if source.lower().endswith(VIDEO_EXTENSIONS):
|
||||
@@ -99,6 +102,7 @@ class SourceDetector:
|
||||
" Local: skill-seekers create ./my-project\n"
|
||||
" PDF: skill-seekers create tutorial.pdf\n"
|
||||
" DOCX: skill-seekers create document.docx\n"
|
||||
" EPUB: skill-seekers create ebook.epub\n"
|
||||
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
|
||||
" Video: skill-seekers create recording.mp4\n"
|
||||
" Config: skill-seekers create configs/react.json"
|
||||
@@ -128,6 +132,14 @@ class SourceDetector:
|
||||
type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
def _detect_epub(cls, source: str) -> SourceInfo:
    """Build the ``SourceInfo`` describing an EPUB file source.

    The suggested name is the file's base name with its final extension
    removed; the path itself is stored unchanged as both the parsed
    ``file_path`` and the raw input.
    """
    filename = os.path.basename(source)
    stem, _extension = os.path.splitext(filename)
    return SourceInfo(
        type="epub",
        parsed={"file_path": source},
        suggested_name=stem,
        raw_input=source,
    )
|
||||
|
||||
@classmethod
|
||||
def _detect_video_file(cls, source: str) -> SourceInfo:
|
||||
"""Detect local video file source."""
|
||||
@@ -277,6 +289,13 @@ class SourceDetector:
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
|
||||
elif source_info.type == "epub":
|
||||
file_path = source_info.parsed["file_path"]
|
||||
if not os.path.exists(file_path):
|
||||
raise ValueError(f"EPUB file does not exist: {file_path}")
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
|
||||
elif source_info.type == "video":
|
||||
if source_info.parsed.get("source_kind") == "file":
|
||||
file_path = source_info.parsed["file_path"]
|
||||
|
||||
Reference in New Issue
Block a user