Adds EPUB as a first-class input source for skill generation. - EpubToSkillConverter (epub_scraper.py, ~1200 lines) following PDF scraper pattern - Dublin Core metadata, spine items, code blocks, tables, images extraction - DRM detection (Adobe ADEPT, Apple FairPlay, Readium LCP) with fail-fast - EPUB 3 NCX TOC bug workaround (ignore_ncx=True) - ebooklib as optional dep: pip install skill-seekers[epub] - Wired into create command with .epub auto-detection - 104 tests, all passing Review fixes: removed 3 empty test stubs, fixed SVG double-counting in _extract_images(), added logger.debug to bare except pass. Based on PR #310 by @christianbaumann. Co-authored-by: Christian Baumann <mail@chriss-baumann.de>
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Source type detection for unified create command.
|
|
|
|
Auto-detects whether a source is a web URL, GitHub repository,
|
|
local directory, PDF file, or config file based on patterns.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
from urllib.parse import urlparse
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class SourceInfo:
    """Information about a detected source.

    Attributes:
        type: Source type ('web', 'github', 'local', 'pdf', 'word',
            'epub', 'video', 'config')
        parsed: Parsed source information (e.g., {'url': '...'}, {'repo': '...'},
            {'file_path': '...'}, {'directory': '...'})
        suggested_name: Auto-suggested name for the skill
        raw_input: Original user input
    """

    # One of the detector's known source types; see SourceDetector.detect.
    type: str
    # Type-specific payload consumed downstream (keys vary per type).
    parsed: dict[str, Any]
    # Default skill name derived from the source (file stem, repo name, domain).
    suggested_name: str
    # Exact string the user supplied, kept for error messages/round-tripping.
    raw_input: str
|
|
|
|
|
|
class SourceDetector:
    """Detects source type from user input and extracts relevant information."""

    # GitHub "owner/repo" shorthand; must match the entire input.
    GITHUB_REPO_PATTERN = re.compile(r"^([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)$")
    # GitHub URL with optional scheme, "www." prefix, and ".git" suffix.
    # NOTE: "." is in the repo character class, so a trailing ".git" ends up
    # inside the captured repo name; _detect_github strips it afterwards.
    GITHUB_URL_PATTERN = re.compile(
        r"(?:https?://)?(?:www\.)?github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)(?:\.git)?"
    )

    # Local video container extensions, matched case-insensitively.
    # Class-level constant: previously rebuilt inside detect() on every call.
    VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")

    @classmethod
    def detect(cls, source: str) -> SourceInfo:
        """Detect source type and extract information.

        Args:
            source: User input (URL, path, repo, etc.)

        Returns:
            SourceInfo object with detected type and parsed data

        Raises:
            ValueError: If source type cannot be determined
        """
        lower = source.lower()

        # 1. File extension detection. Compared case-insensitively so that
        # e.g. "Manual.PDF" is recognized, consistent with the video check.
        if lower.endswith(".json"):
            return cls._detect_config(source)
        if lower.endswith(".pdf"):
            return cls._detect_pdf(source)
        if lower.endswith(".docx"):
            return cls._detect_word(source)
        if lower.endswith(".epub"):
            return cls._detect_epub(source)
        if lower.endswith(cls.VIDEO_EXTENSIONS):
            return cls._detect_video_file(source)

        # 2. Video URL detection (before directory check)
        video_url_info = cls._detect_video_url(source)
        if video_url_info:
            return video_url_info

        # 3. Directory detection
        if os.path.isdir(source):
            return cls._detect_local(source)

        # 4. GitHub patterns
        github_info = cls._detect_github(source)
        if github_info:
            return github_info

        # 5. URL detection
        if source.startswith(("http://", "https://")):
            return cls._detect_web(source)

        # 6. Domain inference (add https://)
        if "." in source and not source.startswith("/"):
            return cls._detect_web(f"https://{source}")

        # 7. Error - cannot determine
        raise ValueError(
            f"Cannot determine source type for: {source}\n\n"
            "Examples:\n"
            "  Web: skill-seekers create https://docs.react.dev/\n"
            "  GitHub: skill-seekers create facebook/react\n"
            "  Local: skill-seekers create ./my-project\n"
            "  PDF: skill-seekers create tutorial.pdf\n"
            "  DOCX: skill-seekers create document.docx\n"
            "  EPUB: skill-seekers create ebook.epub\n"
            "  Video: skill-seekers create https://youtube.com/watch?v=...\n"
            "  Video: skill-seekers create recording.mp4\n"
            "  Config: skill-seekers create configs/react.json"
        )

    @classmethod
    def _file_source(
        cls,
        source_type: str,
        source: str,
        *,
        path_key: str = "file_path",
        **extra: Any,
    ) -> SourceInfo:
        """Build a SourceInfo for a file-backed source.

        The suggested name is the file stem (basename without extension).
        ``path_key`` lets the config type keep its historical 'config_path'
        key; ``extra`` merges additional parsed fields (e.g. source_kind).
        """
        name = os.path.splitext(os.path.basename(source))[0]
        return SourceInfo(
            type=source_type,
            parsed={path_key: source, **extra},
            suggested_name=name,
            raw_input=source,
        )

    @classmethod
    def _detect_config(cls, source: str) -> SourceInfo:
        """Detect config file source."""
        return cls._file_source("config", source, path_key="config_path")

    @classmethod
    def _detect_pdf(cls, source: str) -> SourceInfo:
        """Detect PDF file source."""
        return cls._file_source("pdf", source)

    @classmethod
    def _detect_word(cls, source: str) -> SourceInfo:
        """Detect Word document (.docx) source."""
        return cls._file_source("word", source)

    @classmethod
    def _detect_epub(cls, source: str) -> SourceInfo:
        """Detect EPUB file source."""
        return cls._file_source("epub", source)

    @classmethod
    def _detect_video_file(cls, source: str) -> SourceInfo:
        """Detect local video file source."""
        return cls._file_source("video", source, source_kind="file")

    @classmethod
    def _detect_video_url(cls, source: str) -> SourceInfo | None:
        """Detect video platform URL (YouTube, Vimeo).

        Returns SourceInfo if the source is a video URL, None otherwise.
        """
        lower = source.lower()

        # YouTube patterns (videos, shorts, embeds, playlists, channels).
        youtube_keywords = [
            "youtube.com/watch",
            "youtu.be/",
            "youtube.com/playlist",
            "youtube.com/@",
            "youtube.com/channel/",
            "youtube.com/c/",
            "youtube.com/shorts/",
            "youtube.com/embed/",
        ]
        if any(kw in lower for kw in youtube_keywords):
            # Suggested name reflects whether this is a single video,
            # a playlist, or a whole channel.
            if "playlist" in lower:
                name = "youtube_playlist"
            elif "/@" in lower or "/channel/" in lower or "/c/" in lower:
                name = "youtube_channel"
            else:
                name = "youtube_video"
            return SourceInfo(
                type="video",
                parsed={"url": source, "source_kind": "url"},
                suggested_name=name,
                raw_input=source,
            )

        # Vimeo patterns
        if "vimeo.com/" in lower:
            return SourceInfo(
                type="video",
                parsed={"url": source, "source_kind": "url"},
                suggested_name="vimeo_video",
                raw_input=source,
            )

        return None

    @classmethod
    def _detect_local(cls, source: str) -> SourceInfo:
        """Detect local directory source."""
        # Normalize so downstream code always sees an absolute path.
        directory = os.path.abspath(source)
        name = os.path.basename(directory)

        return SourceInfo(
            type="local", parsed={"directory": directory}, suggested_name=name, raw_input=source
        )

    @classmethod
    def _detect_github(cls, source: str) -> SourceInfo | None:
        """Detect GitHub repository source.

        Supports patterns:
        - owner/repo
        - github.com/owner/repo
        - https://github.com/owner/repo
        """
        # Try simple owner/repo pattern first
        match = cls.GITHUB_REPO_PATTERN.match(source)
        if match:
            owner, repo = match.groups()
            return SourceInfo(
                type="github",
                parsed={"repo": f"{owner}/{repo}"},
                suggested_name=repo,
                raw_input=source,
            )

        # Try GitHub URL pattern
        match = cls.GITHUB_URL_PATTERN.search(source)
        if match:
            owner, repo = match.groups()
            # The regex captures ".git" inside the repo group; strip it here.
            repo = repo.removesuffix(".git")
            return SourceInfo(
                type="github",
                parsed={"repo": f"{owner}/{repo}"},
                suggested_name=repo,
                raw_input=source,
            )

        return None

    @classmethod
    def _detect_web(cls, source: str) -> SourceInfo:
        """Detect web documentation source."""
        # Parse URL to extract domain for suggested name
        parsed_url = urlparse(source)
        domain = parsed_url.netloc or parsed_url.path

        # Clean up domain for name suggestion:
        #   docs.react.dev -> react, reactjs.org -> reactjs
        # removeprefix (not replace) so mid-string occurrences survive,
        # e.g. "mydocs.example.com" -> "mydocs", not "myexample".
        name = domain.removeprefix("www.").removeprefix("docs.")
        name = name.split(".")[0]  # Take first part before TLD

        return SourceInfo(type="web", parsed={"url": source}, suggested_name=name, raw_input=source)

    @staticmethod
    def _require_file(path: str, label: str) -> None:
        """Raise ValueError unless *path* is an existing regular file."""
        if not os.path.exists(path):
            raise ValueError(f"{label} does not exist: {path}")
        if not os.path.isfile(path):
            raise ValueError(f"Path is not a file: {path}")

    @classmethod
    def validate_source(cls, source_info: SourceInfo) -> None:
        """Validate that source is accessible.

        Args:
            source_info: Detected source information

        Raises:
            ValueError: If source is not accessible
        """
        parsed = source_info.parsed

        if source_info.type == "local":
            directory = parsed["directory"]
            if not os.path.exists(directory):
                raise ValueError(f"Directory does not exist: {directory}")
            if not os.path.isdir(directory):
                raise ValueError(f"Path is not a directory: {directory}")

        elif source_info.type == "pdf":
            cls._require_file(parsed["file_path"], "PDF file")

        elif source_info.type == "word":
            cls._require_file(parsed["file_path"], "Word document")

        elif source_info.type == "epub":
            cls._require_file(parsed["file_path"], "EPUB file")

        elif source_info.type == "video":
            # URL-based video sources are validated during processing.
            if parsed.get("source_kind") == "file":
                cls._require_file(parsed["file_path"], "Video file")

        elif source_info.type == "config":
            cls._require_file(parsed["config_path"], "Config file")

        # For web and github, validation happens during scraping
        # (URL accessibility, repo existence)
|