feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement
Add a complete video tutorial extraction system that converts YouTube videos and local video files into AI-consumable skills. The pipeline extracts transcripts, performs visual OCR on each code editor panel independently, tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection, tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
270
src/skill_seekers/cli/video_metadata.py
Normal file
270
src/skill_seekers/cli/video_metadata.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Video metadata extraction module.
|
||||
|
||||
Uses yt-dlp for metadata extraction without downloading video content.
|
||||
Supports YouTube, Vimeo, and local video files.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from skill_seekers.cli.video_models import (
|
||||
Chapter,
|
||||
VideoInfo,
|
||||
VideoSourceType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)

# Optional dependency: yt-dlp.
# HAS_YTDLP records whether the import succeeded; functions that need yt-dlp
# call _check_ytdlp() first so a missing install produces a clear error.
try:
    import yt_dlp

    HAS_YTDLP = True
except ImportError:
    # Keep the name bound even when the package is missing, so a reference
    # that slips past the HAS_YTDLP guard fails on ``None`` rather than with
    # a NameError, and so the attribute always exists on this module.
    yt_dlp = None  # type: ignore[assignment]
    HAS_YTDLP = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Video ID Extraction
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# YouTube URL patterns.
# Each pattern captures the 11-character video ID in group 1. The scheme and
# the "www." prefix are optional, and the patterns are meant to be used with
# ``search`` so they may match anywhere inside the input string.
YOUTUBE_PATTERNS = [
    # Watch URLs: ``v`` may be the first query parameter or follow others
    # (e.g. ``watch?feature=share&v=<id>``). Both optional parts are lazy so
    # that a leading ``v=`` is still preferred when it is present.
    re.compile(
        r"(?:https?://)?(?:www\.)?youtube\.com/watch\?(?:[^#\s]*?&)??v=([a-zA-Z0-9_-]{11})"
    ),
    re.compile(r"(?:https?://)?youtu\.be/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/embed/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/v/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/shorts/([a-zA-Z0-9_-]{11})"),
]

# Playlist URLs: captures the playlist ID; ``list`` may be preceded by other
# query parameters.
YOUTUBE_PLAYLIST_PATTERN = re.compile(
    r"(?:https?://)?(?:www\.)?youtube\.com/playlist\?(?:[^#\s]*?&)??list=([a-zA-Z0-9_-]+)"
)

# Channel URLs: handle (``/@name``), channel ID (``/channel/<id>``) and
# custom name (``/c/<name>``) forms; group 1 is the identifier.
YOUTUBE_CHANNEL_PATTERNS = [
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([a-zA-Z0-9_-]+)"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([a-zA-Z0-9_-]+)"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/c/([a-zA-Z0-9_-]+)"),
]

# Vimeo video URLs: group 1 is the numeric video ID.
VIMEO_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?vimeo\.com/(\d+)")
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> str | None:
    """Pull the YouTube video ID out of a URL.

    Every regex in ``YOUTUBE_PATTERNS`` is tried in list order and the first
    one that matches anywhere in ``url`` wins.

    Args:
        url: YouTube URL in any of the formats covered by ``YOUTUBE_PATTERNS``.

    Returns:
        The ID captured by the first matching pattern (capture group 1), or
        None when no pattern matches.
    """
    # Lazy generator: patterns after the first hit are never evaluated.
    hits = (regex.search(url) for regex in YOUTUBE_PATTERNS)
    return next((hit.group(1) for hit in hits if hit), None)
|
||||
|
||||
|
||||
def detect_video_source_type(url_or_path: str) -> VideoSourceType:
    """Detect the source type of a video URL or file path.

    Existing filesystem entries are checked first. Otherwise the input is
    parsed as a URL and its host name is compared against the known video
    hosts. The host is compared (rather than searching the whole string) so
    that a path or an unrelated URL that merely contains "youtube.com"
    somewhere is not misclassified.

    Args:
        url_or_path: URL or local file path. URLs may omit the scheme
            (e.g. ``youtube.com/watch?v=...``).

    Returns:
        VideoSourceType enum value. Anything that is neither an existing
        directory nor a recognised host falls back to ``LOCAL_FILE``.
    """
    from urllib.parse import urlsplit

    if os.path.isfile(url_or_path):
        return VideoSourceType.LOCAL_FILE
    if os.path.isdir(url_or_path):
        return VideoSourceType.LOCAL_DIRECTORY

    candidate = url_or_path.strip()
    # Scheme-less input has no netloc for urlsplit to find; prefixing "//"
    # makes the leading component parse as the host.
    if not re.match(r"[a-zA-Z][a-zA-Z0-9+.-]*://", candidate):
        candidate = "//" + candidate
    try:
        host = (urlsplit(candidate).hostname or "").lower()
    except ValueError:
        # Malformed netloc (e.g. unbalanced IPv6 brackets): not a known host.
        host = ""

    def _host_is(domain: str) -> bool:
        # Exact domain or any subdomain of it (www., m., player., ...).
        return host == domain or host.endswith("." + domain)

    if _host_is("youtube.com") or _host_is("youtu.be"):
        return VideoSourceType.YOUTUBE
    if _host_is("vimeo.com"):
        return VideoSourceType.VIMEO

    return VideoSourceType.LOCAL_FILE
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# YouTube Metadata via yt-dlp
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _check_ytdlp():
    """Guard for code paths that need yt-dlp.

    Returns silently when the module-level ``HAS_YTDLP`` flag is true.

    Raises:
        RuntimeError: If ``HAS_YTDLP`` is false; the message includes
            installation instructions.
    """
    if HAS_YTDLP:
        return

    install_hint = (
        "yt-dlp is required for video metadata extraction.\n"
        'Install with: pip install "skill-seekers[video]"\n'
        "Or: pip install yt-dlp"
    )
    raise RuntimeError(install_hint)
|
||||
|
||||
|
||||
def extract_youtube_metadata(url: str) -> VideoInfo:
    """Extract metadata from a YouTube video URL without downloading.

    Fields in the yt-dlp info dict are treated as optional: a key that is
    missing *or* present with a None value falls back to a safe default, so
    a None duration/title/description cannot crash the conversion.

    Args:
        url: YouTube video URL.

    Returns:
        VideoInfo with metadata populated.

    Raises:
        RuntimeError: If yt-dlp is not installed, or if yt-dlp returns no
            metadata for the URL.
    """
    _check_ytdlp()

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": False,
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    if not info:
        # Without this guard the first ``info.get`` below would fail with an
        # opaque AttributeError.
        raise RuntimeError(f"yt-dlp returned no metadata for {url}")

    # Prefer the ID reported by yt-dlp, then the one parsed from the URL.
    video_id = info.get("id") or extract_video_id(url) or "unknown"

    # Parse chapters
    chapters = []
    raw_chapters = info.get("chapters") or []
    for i, ch in enumerate(raw_chapters):
        start_time = ch.get("start_time")
        if start_time is None:
            start_time = 0

        end_time = ch.get("end_time")
        if end_time is None:
            end_time = 0
        # A chapter ends where the next one starts, when there is a next one
        # with a known start time.
        if i + 1 < len(raw_chapters):
            next_start = raw_chapters[i + 1].get("start_time")
            if next_start is not None:
                end_time = next_start

        chapters.append(
            Chapter(
                title=ch.get("title") or f"Chapter {i + 1}",
                start_time=start_time,
                end_time=end_time,
            )
        )

    return VideoInfo(
        video_id=video_id,
        source_type=VideoSourceType.YOUTUBE,
        source_url=url,
        title=info.get("title") or "",
        description=info.get("description") or "",
        # ``or 0`` covers both a missing key and an explicit None value.
        duration=float(info.get("duration") or 0),
        upload_date=info.get("upload_date"),
        language=info.get("language") or "en",
        channel_name=info.get("channel") or info.get("uploader"),
        channel_url=info.get("channel_url") or info.get("uploader_url"),
        view_count=info.get("view_count"),
        like_count=info.get("like_count"),
        comment_count=info.get("comment_count"),
        tags=info.get("tags") or [],
        categories=info.get("categories") or [],
        thumbnail_url=info.get("thumbnail"),
        chapters=chapters,
    )
|
||||
|
||||
|
||||
def extract_local_metadata(file_path: str) -> VideoInfo:
    """Build a minimal VideoInfo for a video file on disk.

    Nothing is read from the file itself: the ID is derived from the
    absolute path and the title from the file name.

    Args:
        file_path: Path to video file.

    Returns:
        VideoInfo whose ``video_id`` is the first 16 hex characters of the
        SHA-256 of the absolute path, whose ``title`` is the file stem with
        hyphens/underscores turned into spaces and title-cased, and whose
        ``duration`` is 0.0.
    """
    absolute_path = os.path.abspath(file_path)
    stem, _extension = os.path.splitext(os.path.basename(absolute_path))

    path_digest = hashlib.sha256(absolute_path.encode()).hexdigest()

    # Map both separator characters to spaces in a single pass.
    separators_to_spaces = str.maketrans("-_", "  ")
    display_title = stem.translate(separators_to_spaces).title()

    return VideoInfo(
        video_id=path_digest[:16],
        source_type=VideoSourceType.LOCAL_FILE,
        file_path=absolute_path,
        title=display_title,
        duration=0.0,  # Would need ffprobe for accurate duration
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Playlist / Channel Resolution
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def resolve_playlist(url: str) -> list[str]:
    """Resolve a YouTube playlist URL to a list of video URLs.

    Uses flat extraction (``extract_flat``) so only the playlist listing is
    fetched, not the metadata of every video.

    Args:
        url: YouTube playlist URL.

    Returns:
        List of video URLs in playlist order. Empty if yt-dlp returns no
        playlist information or no entries.

    Raises:
        RuntimeError: If yt-dlp is not installed.
    """
    _check_ytdlp()

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # Tolerate a None result instead of failing on ``None.get``.
    entries = (info or {}).get("entries") or []
    video_urls = []
    for entry in entries:
        if not entry:
            # NOTE(review): entries can presumably be None placeholders for
            # unavailable videos; skip them rather than crash - confirm
            # against yt-dlp behaviour.
            continue
        vid_url = entry.get("url") or entry.get("webpage_url")
        if vid_url:
            video_urls.append(vid_url)
        elif entry.get("id"):
            # No URL on the entry: rebuild a watch URL from the bare ID.
            video_urls.append(f"https://www.youtube.com/watch?v={entry['id']}")

    return video_urls
|
||||
|
||||
|
||||
def resolve_channel(url: str, max_videos: int = 50) -> list[str]:
    """Resolve a YouTube channel URL to a list of recent video URLs.

    Uses flat extraction (``extract_flat``) and asks yt-dlp to stop after
    ``max_videos`` entries (``playlistend``); the result is additionally
    truncated to ``max_videos``.

    Args:
        url: YouTube channel URL.
        max_videos: Maximum number of videos to resolve.

    Returns:
        List of video URLs, in the order yt-dlp lists them (presumably most
        recent first - the order is not enforced here). Empty if yt-dlp
        returns no channel information or no entries.

    Raises:
        RuntimeError: If yt-dlp is not installed.
    """
    _check_ytdlp()

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "skip_download": True,
        "playlistend": max_videos,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # NOTE(review): with flat extraction a bare channel URL may yield per-tab
    # playlist entries (videos/shorts/live) rather than individual videos -
    # confirm against yt-dlp behaviour.
    # Tolerate a None result instead of failing on ``None.get``.
    entries = (info or {}).get("entries") or []
    video_urls = []
    for entry in entries:
        if not entry:
            # NOTE(review): entries can presumably be None placeholders for
            # unavailable videos; skip them rather than crash - confirm
            # against yt-dlp behaviour.
            continue
        vid_url = entry.get("url") or entry.get("webpage_url")
        if vid_url:
            video_urls.append(vid_url)
        elif entry.get("id"):
            # No URL on the entry: rebuild a watch URL from the bare ID.
            video_urls.append(f"https://www.youtube.com/watch?v={entry['id']}")

    return video_urls[:max_videos]
|
||||
Reference in New Issue
Block a user