feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement

Add a complete video tutorial extraction system that converts YouTube videos and local video files into AI-consumable skills. The pipeline extracts transcripts, performs visual OCR on each code editor panel independently, tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection, tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 23:10:19 +03:00
parent 3bad7cf365
commit 62071c4aa9
32 changed files with 15090 additions and 9 deletions
--- a/src/skill_seekers/cli/video_metadata.py
+++ b/src/skill_seekers/cli/video_metadata.py
@@ -0,0 +1,270 @@
+"""Video metadata extraction module.
+
+Uses yt-dlp for metadata extraction without downloading video content.
+Supports YouTube, Vimeo, and local video files.
+"""
+
+import hashlib
+import logging
+import os
+import re
+
+from skill_seekers.cli.video_models import (
+    Chapter,
+    VideoInfo,
+    VideoSourceType,
+)
+
+logger = logging.getLogger(__name__)
+
+# Optional dependency: yt-dlp
+try:
+    import yt_dlp
+
+    HAS_YTDLP = True
+except ImportError:
+    HAS_YTDLP = False
+
+
+# =============================================================================
+# Video ID Extraction
+# =============================================================================
+
+
+# YouTube video URL patterns. Each one captures the 11-character video ID
+# in group 1. They are unanchored, so they are meant to be used with
+# ``pattern.search(url)``.
+YOUTUBE_PATTERNS = [
+    # Standard watch URL. ``v`` does not have to be the first query
+    # parameter: ``watch?feature=share&v=<id>`` and ``watch?list=<pl>&v=<id>``
+    # are both common, so any number of leading ``key=value&`` pairs is
+    # skipped before ``v=``.
+    re.compile(
+        r"(?:https?://)?(?:www\.)?youtube\.com/watch\?(?:[^&#\s]*&)*v=([a-zA-Z0-9_-]{11})"
+    ),
+    # Short share links.
+    re.compile(r"(?:https?://)?youtu\.be/([a-zA-Z0-9_-]{11})"),
+    # Embedded player URLs.
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/embed/([a-zA-Z0-9_-]{11})"),
+    # Legacy ``/v/<id>`` URLs.
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/v/([a-zA-Z0-9_-]{11})"),
+    # Shorts.
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/shorts/([a-zA-Z0-9_-]{11})"),
+    # Live stream / premiere URLs.
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/live/([a-zA-Z0-9_-]{11})"),
+]
+
+# Playlist URL; group 1 is the playlist ID.
+YOUTUBE_PLAYLIST_PATTERN = re.compile(
+    r"(?:https?://)?(?:www\.)?youtube\.com/playlist\?list=([a-zA-Z0-9_-]+)"
+)
+
+# Channel URLs (``/@handle``, ``/channel/<id>``, ``/c/<name>``); group 1 is
+# the handle, channel ID, or custom name respectively.
+YOUTUBE_CHANNEL_PATTERNS = [
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([a-zA-Z0-9_-]+)"),
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([a-zA-Z0-9_-]+)"),
+    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/c/([a-zA-Z0-9_-]+)"),
+]
+
+# Vimeo video URL; group 1 is the numeric video ID.
+VIMEO_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?vimeo\.com/(\d+)")
+
+
+def extract_video_id(url: str) -> str | None:
+    """Pull the YouTube video ID out of a URL.
+
+    Every pattern in ``YOUTUBE_PATTERNS`` is tried in order, and the first
+    one that matches anywhere in ``url`` wins.
+
+    Args:
+        url: Candidate YouTube URL (watch, short-link, embed, ...).
+
+    Returns:
+        The captured 11-character video ID, or None when no pattern matches.
+    """
+    # Lazy generator: later patterns are only searched if earlier ones miss.
+    candidates = (regex.search(url) for regex in YOUTUBE_PATTERNS)
+    hit = next((found for found in candidates if found), None)
+    return hit.group(1) if hit else None
+
+
+def detect_video_source_type(url_or_path: str) -> VideoSourceType:
+    """Classify a video reference as a local path or a hosted video.
+
+    Filesystem checks run first, so an existing file or directory is
+    reported as local even if its name happens to contain a host string.
+    Otherwise the value is compared case-insensitively against known host
+    substrings. Anything unrecognised falls back to ``LOCAL_FILE``.
+
+    Args:
+        url_or_path: URL or local file/directory path.
+
+    Returns:
+        The matching ``VideoSourceType`` member.
+    """
+    if os.path.isfile(url_or_path):
+        return VideoSourceType.LOCAL_FILE
+    if os.path.isdir(url_or_path):
+        return VideoSourceType.LOCAL_DIRECTORY
+
+    lowered = url_or_path.lower()
+    # Ordered (host substrings, source type) table; the first hit wins.
+    host_kinds = (
+        (("youtube.com", "youtu.be"), VideoSourceType.YOUTUBE),
+        (("vimeo.com",), VideoSourceType.VIMEO),
+    )
+    for needles, kind in host_kinds:
+        if any(needle in lowered for needle in needles):
+            return kind
+
+    # Not an existing path and not a known host: treated as a local file.
+    return VideoSourceType.LOCAL_FILE
+
+
+# =============================================================================
+# YouTube Metadata via yt-dlp
+# =============================================================================
+
+
+def _check_ytdlp():
+    """Ensure the optional yt-dlp dependency is available.
+
+    Raises:
+        RuntimeError: If ``HAS_YTDLP`` is false (the ``yt_dlp`` import
+            failed at module load); the message includes install hints.
+    """
+    if HAS_YTDLP:
+        return
+
+    install_hint = (
+        "yt-dlp is required for video metadata extraction.\n"
+        'Install with: pip install "skill-seekers[video]"\n'
+        "Or: pip install yt-dlp"
+    )
+    raise RuntimeError(install_hint)
+
+
+def extract_youtube_metadata(url: str) -> VideoInfo:
+    """Extract metadata from a YouTube video URL without downloading.
+
+    Fields that yt-dlp reports as missing *or* as ``None`` fall back to
+    safe defaults (empty string, ``0.0``, empty list, ``"en"``), so a
+    present-but-null value such as ``duration`` does not raise.
+
+    Args:
+        url: YouTube video URL.
+
+    Returns:
+        VideoInfo with metadata populated.
+
+    Raises:
+        RuntimeError: If yt-dlp is not installed.
+    """
+    _check_ytdlp()
+
+    ydl_opts = {
+        "quiet": True,
+        "no_warnings": True,
+        "extract_flat": False,
+        "skip_download": True,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+
+    # ``or`` chains (rather than ``.get(key, default)``) also cover keys that
+    # are present with a ``None`` value.
+    video_id = info.get("id") or extract_video_id(url) or "unknown"
+    duration = float(info.get("duration") or 0)
+
+    return VideoInfo(
+        video_id=video_id,
+        source_type=VideoSourceType.YOUTUBE,
+        source_url=url,
+        title=info.get("title") or "",
+        description=info.get("description") or "",
+        duration=duration,
+        upload_date=info.get("upload_date"),
+        language=info.get("language") or "en",
+        channel_name=info.get("channel") or info.get("uploader"),
+        channel_url=info.get("channel_url") or info.get("uploader_url"),
+        view_count=info.get("view_count"),
+        like_count=info.get("like_count"),
+        comment_count=info.get("comment_count"),
+        tags=info.get("tags") or [],
+        categories=info.get("categories") or [],
+        thumbnail_url=info.get("thumbnail"),
+        chapters=_parse_chapters(info.get("chapters") or [], duration),
+    )
+
+
+def _parse_chapters(raw_chapters: list[dict], duration: float) -> list[Chapter]:
+    """Convert raw chapter dicts into ``Chapter`` objects.
+
+    A chapter ends where the next one starts. The last chapter (or one
+    whose successor has no start time) keeps its own ``end_time``, falling
+    back to the video ``duration`` when that is missing.
+
+    Args:
+        raw_chapters: Chapter dicts with optional ``title``, ``start_time``
+            and ``end_time`` keys.
+        duration: Video duration in seconds, used as the final fallback
+            end time.
+
+    Returns:
+        Chapters in their original order.
+    """
+    chapters = []
+    for i, ch in enumerate(raw_chapters):
+        end_time = ch.get("end_time") or duration
+        if i + 1 < len(raw_chapters):
+            next_start = raw_chapters[i + 1].get("start_time")
+            if next_start is not None:
+                end_time = next_start
+        chapters.append(
+            Chapter(
+                title=ch.get("title") or f"Chapter {i + 1}",
+                start_time=ch.get("start_time") or 0,
+                end_time=end_time,
+            )
+        )
+    return chapters
+
+
+def extract_local_metadata(file_path: str) -> VideoInfo:
+    """Build a minimal ``VideoInfo`` for a video file on disk.
+
+    Nothing is read from the file itself. The ID is the first 16 hex
+    characters of the SHA-256 of the absolute path. The title is the file
+    name without its extension, with dashes and underscores turned into
+    spaces, then title-cased.
+
+    Args:
+        file_path: Path to video file.
+
+    Returns:
+        VideoInfo carrying the derived ID, absolute path, and title.
+    """
+    absolute = os.path.abspath(file_path)
+    stem, _ext = os.path.splitext(os.path.basename(absolute))
+    digest = hashlib.sha256(absolute.encode()).hexdigest()
+    separators_to_spaces = str.maketrans("-_", "  ")
+
+    return VideoInfo(
+        video_id=digest[:16],
+        source_type=VideoSourceType.LOCAL_FILE,
+        file_path=absolute,
+        title=stem.translate(separators_to_spaces).title(),
+        # Duration is not probed here; an accurate value would need ffprobe.
+        duration=0.0,
+    )
+
+
+# =============================================================================
+# Playlist / Channel Resolution
+# =============================================================================
+
+
+def _resolve_flat_urls(url: str, max_videos: int | None = None) -> list[str]:
+    """Flat-extract a playlist-like URL and return its entries' video URLs.
+
+    Shared implementation for ``resolve_playlist`` and ``resolve_channel``.
+
+    Args:
+        url: Playlist or channel URL.
+        max_videos: When given, passed to yt-dlp as ``playlistend`` so
+            extraction stops after that many entries.
+
+    Returns:
+        Video URLs in the order yt-dlp reported the entries.
+
+    Raises:
+        RuntimeError: If yt-dlp is not installed.
+    """
+    _check_ytdlp()
+
+    ydl_opts = {
+        "quiet": True,
+        "no_warnings": True,
+        "extract_flat": True,
+        "skip_download": True,
+    }
+    if max_videos is not None:
+        ydl_opts["playlistend"] = max_videos
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+
+    # Tolerate a ``None`` result and ``None`` placeholders inside the entry
+    # list instead of failing with AttributeError on ``.get``.
+    entries = (info or {}).get("entries") or []
+    video_urls = []
+    for entry in entries:
+        if not entry:
+            continue
+        vid_url = entry.get("url") or entry.get("webpage_url")
+        if vid_url:
+            video_urls.append(vid_url)
+        elif entry.get("id"):
+            # No URL on the entry: rebuild a watch URL from the bare ID.
+            video_urls.append(f"https://www.youtube.com/watch?v={entry['id']}")
+
+    return video_urls
+
+
+def resolve_playlist(url: str) -> list[str]:
+    """Resolve a YouTube playlist URL to a list of video URLs.
+
+    Args:
+        url: YouTube playlist URL.
+
+    Returns:
+        List of video URLs in playlist order.
+
+    Raises:
+        RuntimeError: If yt-dlp is not installed.
+    """
+    return _resolve_flat_urls(url)
+
+
+def resolve_channel(url: str, max_videos: int = 50) -> list[str]:
+    """Resolve a YouTube channel URL to a list of recent video URLs.
+
+    Args:
+        url: YouTube channel URL.
+        max_videos: Maximum number of videos to resolve.
+
+    Returns:
+        List of video URLs (most recent first), at most ``max_videos`` long.
+
+    Raises:
+        RuntimeError: If yt-dlp is not installed.
+    """
+    # NOTE(review): depending on the yt-dlp version, a bare channel URL may
+    # yield per-tab playlists rather than individual videos — confirm the
+    # caller passes a ``/videos`` URL if that matters.
+    # The slice is a second guard in case ``playlistend`` is not honoured.
+    return _resolve_flat_urls(url, max_videos)[:max_videos]