Files
skill-seekers-reference/src/skill_seekers/cli/source_detector.py
yusyus 2e30970dfb feat: add EPUB input support (#310)
Adds EPUB as a first-class input source for skill generation.

- EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern
- Dublin Core metadata, spine items, code blocks, tables, images extraction
- DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast
- EPUB 3 NCX TOC bug workaround (ignore_ncx=True)
- ebooklib as optional dep: pip install skill-seekers[epub]
- Wired into create command with .epub auto-detection
- 104 tests, all passing

Review fixes: removed 3 empty test stubs, fixed SVG double-counting in
_extract_images(), added logger.debug to bare except pass.

Based on PR #310 by @christianbaumann.
Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
2026-03-15 02:34:41 +03:00

317 lines
11 KiB
Python

"""Source type detection for unified create command.

Auto-detects whether a source is a web URL, GitHub repository,
local directory, PDF file, Word (.docx) document, EPUB file,
video (local file or YouTube/Vimeo URL), or config file based on patterns.
"""
import os
import re
from dataclasses import dataclass
from typing import Any
from urllib.parse import urlparse
import logging
logger = logging.getLogger(__name__)
@dataclass
class SourceInfo:
    """Information about a detected source.

    Attributes:
        type: Source type ('web', 'github', 'local', 'pdf', 'word', 'epub',
            'video', 'config')
        parsed: Parsed source information; the keys depend on ``type``
            (e.g., {'url': '...'}, {'repo': '...'}, {'directory': '...'},
            {'file_path': '...'}, {'config_path': '...'})
        suggested_name: Auto-suggested name for the skill
        raw_input: Original user input
    """

    type: str
    parsed: dict[str, Any]
    suggested_name: str
    raw_input: str
class SourceDetector:
    """Detects source type from user input and extracts relevant information.

    Detection is pattern based; the only filesystem access in ``detect`` is
    the ``os.path.isdir`` check used to recognise local directories. Call
    ``validate_source`` afterwards to confirm that a detected local path
    actually exists.
    """

    # GitHub repo patterns
    # ``owner/repo`` shorthand.
    GITHUB_REPO_PATTERN = re.compile(r"^([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)$")
    # github.com URL with optional scheme and ``www.``. The look-behind keeps
    # the pattern from matching inside another host name such as
    # ``notgithub.com`` or ``docs.github.com``.
    GITHUB_URL_PATTERN = re.compile(
        r"(?<![\w.-])(?:https?://)?(?:www\.)?github\.com/"
        r"([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)(?:\.git)?"
    )

    # Video file extensions (compared against the lower-cased input)
    VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")

    # Substrings that mark a YouTube URL (compared against the lower-cased input)
    _YOUTUBE_KEYWORDS = (
        "youtube.com/watch",
        "youtu.be/",
        "youtube.com/playlist",
        "youtube.com/@",
        "youtube.com/channel/",
        "youtube.com/c/",
        "youtube.com/shorts/",
        "youtube.com/embed/",
    )

    # Prefixes that mark the input as a filesystem path, never a bare domain
    _PATH_PREFIXES = ("/", "./", "../", "~")

    # Path segments that can never be a GitHub owner or repository name
    _DOT_SEGMENTS = frozenset({".", ".."})

    # source type -> (key in ``parsed`` holding the path, label for error messages)
    _FILE_SOURCES = {
        "pdf": ("file_path", "PDF file"),
        "word": ("file_path", "Word document"),
        "epub": ("file_path", "EPUB file"),
        "video": ("file_path", "Video file"),
        "config": ("config_path", "Config file"),
    }

    @classmethod
    def detect(cls, source: str) -> SourceInfo:
        """Detect source type and extract information.

        Checks run in a fixed order: file extension, video URL, existing
        directory, GitHub patterns, explicit http(s) URL, bare domain.

        Args:
            source: User input (URL, path, repo, etc.)

        Returns:
            SourceInfo object with detected type and parsed data

        Raises:
            ValueError: If source type cannot be determined
        """
        lowered = source.lower()

        # 1. File extension detection (case-insensitive, so "Guide.PDF" is a
        #    PDF rather than falling through to domain inference)
        extension_handlers = (
            (".json", cls._detect_config),
            (".pdf", cls._detect_pdf),
            (".docx", cls._detect_word),
            (".epub", cls._detect_epub),
        )
        for extension, handler in extension_handlers:
            if lowered.endswith(extension):
                return handler(source)
        if lowered.endswith(cls.VIDEO_EXTENSIONS):
            return cls._detect_video_file(source)

        # 2. Video URL detection (before directory check)
        video_url_info = cls._detect_video_url(source)
        if video_url_info:
            return video_url_info

        # 3. Directory detection
        if os.path.isdir(source):
            return cls._detect_local(source)

        # 4. GitHub patterns
        github_info = cls._detect_github(source)
        if github_info:
            return github_info

        # 5. URL detection
        if source.startswith(("http://", "https://")):
            return cls._detect_web(source)

        # 6. Domain inference (add https://). Inputs that look like
        #    filesystem paths are excluded so a missing "./my-project" is
        #    reported as an error instead of becoming "https://./my-project".
        if "." in source and not source.startswith(cls._PATH_PREFIXES):
            return cls._detect_web(f"https://{source}")

        # 7. Error - cannot determine
        raise ValueError(
            f"Cannot determine source type for: {source}\n\n"
            "Examples:\n"
            "  Web:      skill-seekers create https://docs.react.dev/\n"
            "  GitHub:   skill-seekers create facebook/react\n"
            "  Local:    skill-seekers create ./my-project\n"
            "  PDF:      skill-seekers create tutorial.pdf\n"
            "  DOCX:     skill-seekers create document.docx\n"
            "  EPUB:     skill-seekers create ebook.epub\n"
            "  Video:    skill-seekers create https://youtube.com/watch?v=...\n"
            "  Video:    skill-seekers create recording.mp4\n"
            "  Config:   skill-seekers create configs/react.json"
        )

    @staticmethod
    def _file_info(
        source: str, source_type: str, path_key: str = "file_path", **extra: Any
    ) -> SourceInfo:
        """Build the SourceInfo for a single-file source.

        The suggested name is the file's base name without its extension.
        ``extra`` entries are added to ``parsed`` after the path entry.
        """
        name = os.path.splitext(os.path.basename(source))[0]
        return SourceInfo(
            type=source_type,
            parsed={path_key: source, **extra},
            suggested_name=name,
            raw_input=source,
        )

    @classmethod
    def _detect_config(cls, source: str) -> SourceInfo:
        """Detect config file source."""
        return cls._file_info(source, "config", path_key="config_path")

    @classmethod
    def _detect_pdf(cls, source: str) -> SourceInfo:
        """Detect PDF file source."""
        return cls._file_info(source, "pdf")

    @classmethod
    def _detect_word(cls, source: str) -> SourceInfo:
        """Detect Word document (.docx) source."""
        return cls._file_info(source, "word")

    @classmethod
    def _detect_epub(cls, source: str) -> SourceInfo:
        """Detect EPUB file source."""
        return cls._file_info(source, "epub")

    @classmethod
    def _detect_video_file(cls, source: str) -> SourceInfo:
        """Detect local video file source."""
        return cls._file_info(source, "video", source_kind="file")

    @classmethod
    def _detect_video_url(cls, source: str) -> SourceInfo | None:
        """Detect video platform URL (YouTube, Vimeo).

        Returns SourceInfo if the source is a video URL, None otherwise.
        """
        lower = source.lower()

        # YouTube patterns
        if any(keyword in lower for keyword in cls._YOUTUBE_KEYWORDS):
            # Determine suggested name
            if "playlist" in lower:
                name = "youtube_playlist"
            elif "/@" in lower or "/channel/" in lower or "/c/" in lower:
                name = "youtube_channel"
            else:
                name = "youtube_video"
        # Vimeo patterns
        elif "vimeo.com/" in lower:
            name = "vimeo_video"
        else:
            return None

        return SourceInfo(
            type="video",
            parsed={"url": source, "source_kind": "url"},
            suggested_name=name,
            raw_input=source,
        )

    @classmethod
    def _detect_local(cls, source: str) -> SourceInfo:
        """Detect local directory source."""
        # Clean up path
        directory = os.path.abspath(source)
        name = os.path.basename(directory)
        return SourceInfo(
            type="local", parsed={"directory": directory}, suggested_name=name, raw_input=source
        )

    @classmethod
    def _detect_github(cls, source: str) -> SourceInfo | None:
        """Detect GitHub repository source.

        Supports patterns:
        - owner/repo
        - github.com/owner/repo
        - https://github.com/owner/repo

        Returns SourceInfo if the source is a GitHub repository, None otherwise.
        """
        # Try simple owner/repo pattern first. "." and ".." are path
        # segments, not owners or repos, so "./my-project" is not a repository.
        match = cls.GITHUB_REPO_PATTERN.match(source)
        if match and not cls._DOT_SEGMENTS.intersection(match.groups()):
            owner, repo = match.groups()
        else:
            # Try GitHub URL pattern
            match = cls.GITHUB_URL_PATTERN.search(source)
            if not match:
                return None
            owner, repo = match.groups()
            # Clean up repo name (remove .git suffix if present)
            repo = repo.removesuffix(".git")

        return SourceInfo(
            type="github",
            parsed={"repo": f"{owner}/{repo}"},
            suggested_name=repo,
            raw_input=source,
        )

    @classmethod
    def _detect_web(cls, source: str) -> SourceInfo:
        """Detect web documentation source.

        The suggested name is the first host label left after removing any
        leading ``www.`` / ``docs.`` labels.
        """
        # Parse URL to extract domain for suggested name
        parsed_url = urlparse(source)
        name = parsed_url.netloc or parsed_url.path

        # Clean up domain for name suggestion
        # docs.react.dev -> react
        # www.reactjs.org -> reactjs
        # Only leading labels are dropped, so "mydocs.io" stays "mydocs".
        while name.startswith(("www.", "docs.")):
            name = name.split(".", 1)[1]
        name = name.split(".")[0]  # Take first part before TLD

        return SourceInfo(type="web", parsed={"url": source}, suggested_name=name, raw_input=source)

    @classmethod
    def validate_source(cls, source_info: SourceInfo) -> None:
        """Validate that source is accessible.

        Only local paths are checked here. For web, github and URL-based
        video sources, validation happens during scraping/processing
        (URL accessibility, repo existence).

        Args:
            source_info: Detected source information

        Raises:
            ValueError: If source is not accessible
        """
        source_type = source_info.type
        parsed = source_info.parsed

        if source_type == "local":
            directory = parsed["directory"]
            if not os.path.exists(directory):
                raise ValueError(f"Directory does not exist: {directory}")
            if not os.path.isdir(directory):
                raise ValueError(f"Path is not a directory: {directory}")
            return

        # URL-based video sources are validated during processing
        if source_type == "video" and parsed.get("source_kind") != "file":
            return

        file_spec = cls._FILE_SOURCES.get(source_type)
        if file_spec is None:
            return

        path_key, label = file_spec
        path = parsed[path_key]
        if not os.path.exists(path):
            raise ValueError(f"{label} does not exist: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"Path is not a file: {path}")