Add complete video tutorial extraction system that converts YouTube videos and local video files into AI-consumable skills. The pipeline extracts transcripts, performs visual OCR on code editor panels independently, tracks code evolution across frames, and generates structured SKILL.md output. Key features: - Video metadata extraction (YouTube, local files, playlists) - Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback) - Chapter-based and time-window segmentation - Visual extraction: keyframe detection, frame classification, panel detection - Per-panel sub-section OCR (each IDE panel OCR'd independently) - Parallel OCR with ThreadPoolExecutor for multi-panel frames - Narrow panel filtering (300px min width) to skip UI chrome - Text block tracking with spatial panel position matching - Code timeline with edit tracking across frames - Audio-visual alignment (code + narrator pairs) - Video-specific AI enhancement prompt for OCR denoising and code reconstruction - video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection, tutorial synthesis, skill polish) - CLI integration: skill-seekers video --url/--video-file/--playlist - MCP tool: scrape_video for automation - 161 tests passing Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
814 lines
27 KiB
Python
814 lines
27 KiB
Python
"""Video source data models and type definitions.
|
|
|
|
Defines all enumerations and dataclasses for the video extraction pipeline:
|
|
- Enums: VideoSourceType, TranscriptSource, FrameType, CodeContext, SegmentContentType,
  SegmentationStrategy
|
|
- Core: VideoInfo, VideoSegment, VideoScraperResult
|
|
- Supporting: Chapter, TranscriptSegment, WordTimestamp, KeyFrame, OCRRegion,
|
|
  FrameSubSection, CodeBlock, TextGroupEdit, TextGroup, TextGroupTimeline,
  AudioVisualAlignment
|
|
- Config: VideoSourceConfig
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
|
|
# =============================================================================
|
|
# Enumerations
|
|
# =============================================================================
|
|
|
|
|
|
class VideoSourceType(Enum):
    """Origin of a video.

    Each member's value is the plain string used when the source type is
    written out (``member.value``) and parsed back (``VideoSourceType(value)``).
    """

    # Remote sources
    YOUTUBE = "youtube"
    VIMEO = "vimeo"

    # Local sources
    LOCAL_FILE = "local_file"
    LOCAL_DIRECTORY = "local_directory"
|
|
|
|
|
|
class TranscriptSource(Enum):
    """Provenance of a transcript.

    ``NONE`` is the default used by the dataclasses in this module when no
    source has been recorded.
    """

    YOUTUBE_MANUAL = "youtube_manual"
    # Note: the serialized value is longer than the member name.
    YOUTUBE_AUTO = "youtube_auto_generated"
    WHISPER = "whisper"
    SUBTITLE_FILE = "subtitle_file"
    NONE = "none"
|
|
|
|
|
|
class FrameType(Enum):
    """Visual category assigned to a keyframe (or to one panel of it).

    ``OTHER`` is the fallback used as the default by ``KeyFrame`` and
    ``FrameSubSection``.
    """

    CODE_EDITOR = "code_editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    DIAGRAM = "diagram"
    BROWSER = "browser"
    WEBCAM = "webcam"
    SCREENCAST = "screencast"
    OTHER = "other"
|
|
|
|
|
|
class CodeContext(Enum):
    """On-screen location in which a piece of code was shown.

    ``UNKNOWN`` is the default used by ``CodeBlock``.
    """

    EDITOR = "editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    BROWSER = "browser"
    UNKNOWN = "unknown"
|
|
|
|
|
|
class SegmentContentType(Enum):
    """Dominant kind of content in a video segment.

    ``MIXED`` is the default used by ``VideoSegment``.
    """

    EXPLANATION = "explanation"
    LIVE_CODING = "live_coding"
    DEMO = "demo"
    SLIDES = "slides"
    Q_AND_A = "q_and_a"
    INTRO = "intro"
    OUTRO = "outro"
    MIXED = "mixed"
|
|
|
|
|
|
class SegmentationStrategy(Enum):
    """Method used to decide where segment boundaries fall."""

    CHAPTERS = "chapters"
    TIME_WINDOW = "time_window"
    SCENE_CHANGE = "scene_change"
    HYBRID = "hybrid"
|
|
|
|
|
|
# =============================================================================
|
|
# Supporting Data Classes
|
|
# =============================================================================
|
|
|
|
|
|
@dataclass(frozen=True)
class Chapter:
    """Immutable chapter marker: a titled time span within a video."""

    title: str
    start_time: float
    end_time: float

    @property
    def duration(self) -> float:
        """Length of the chapter, i.e. ``end_time - start_time``."""
        span = self.end_time - self.start_time
        return span

    def to_dict(self) -> dict:
        """Return the three stored fields as a plain dict (``duration`` is not included)."""
        return {name: getattr(self, name) for name in ("title", "start_time", "end_time")}

    @classmethod
    def from_dict(cls, data: dict) -> Chapter:
        """Build a chapter from a dict.

        All of ``title``, ``start_time`` and ``end_time`` are required; a
        missing key raises ``KeyError``.
        """
        title = data["title"]
        begin = data["start_time"]
        finish = data["end_time"]
        return cls(title=title, start_time=begin, end_time=finish)
|
|
|
|
|
|
@dataclass(frozen=True)
class WordTimestamp:
    """Immutable record of one spoken word and its start/end timing.

    ``probability`` defaults to ``1.0`` when not supplied.
    """

    word: str
    start: float
    end: float
    probability: float = 1.0

    def to_dict(self) -> dict:
        """Return all four fields as a plain dict."""
        exported = ("word", "start", "end", "probability")
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, data: dict) -> WordTimestamp:
        """Build a word timestamp from a dict.

        ``word``, ``start`` and ``end`` are required (``KeyError`` if absent);
        ``probability`` falls back to ``1.0``.
        """
        token = data["word"]
        begin = data["start"]
        finish = data["end"]
        likelihood = data.get("probability", 1.0)
        return cls(word=token, start=begin, end=finish, probability=likelihood)
|
|
|
|
|
|
@dataclass(frozen=True)
class TranscriptSegment:
    """Immutable span of transcript text with timing and provenance.

    ``words`` optionally carries per-word timings, and ``source`` records
    which ``TranscriptSource`` the segment came from.
    """

    text: str
    start: float
    end: float
    confidence: float = 1.0
    words: list[WordTimestamp] | None = None
    source: TranscriptSource = TranscriptSource.NONE

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        ``words`` is emitted as ``None`` when it is ``None`` or empty;
        ``source`` is emitted as its string value.
        """
        word_dicts = None
        if self.words:
            word_dicts = [item.to_dict() for item in self.words]
        return {
            "text": self.text,
            "start": self.start,
            "end": self.end,
            "confidence": self.confidence,
            "words": word_dicts,
            "source": self.source.value,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TranscriptSegment:
        """Build a segment from a dict.

        ``text``, ``start`` and ``end`` are required. A missing or empty
        ``words`` entry yields ``words=None``; ``confidence`` defaults to
        ``1.0`` and ``source`` to ``"none"``.
        """
        raw_words = data.get("words")
        parsed_words = (
            [WordTimestamp.from_dict(item) for item in raw_words] if raw_words else None
        )
        return cls(
            text=data["text"],
            start=data["start"],
            end=data["end"],
            confidence=data.get("confidence", 1.0),
            words=parsed_words,
            source=TranscriptSource(data.get("source", "none")),
        )
|
|
|
|
|
|
@dataclass(frozen=True)
class OCRRegion:
    """Immutable text region found in a video frame.

    ``bbox`` is held as a 4-tuple of ints and converted to/from a list
    during (de)serialization.
    """

    text: str
    confidence: float
    bbox: tuple[int, int, int, int]
    is_monospace: bool = False

    def to_dict(self) -> dict:
        """Serialize to a plain dict, with ``bbox`` as a list."""
        box_as_list = list(self.bbox)
        return {
            "text": self.text,
            "confidence": self.confidence,
            "bbox": box_as_list,
            "is_monospace": self.is_monospace,
        }

    @classmethod
    def from_dict(cls, data: dict) -> OCRRegion:
        """Build a region from a dict.

        ``text``, ``confidence`` and ``bbox`` are required (``KeyError`` if
        absent); ``is_monospace`` defaults to ``False``.
        """
        content = data["text"]
        score = data["confidence"]
        box = tuple(data["bbox"])
        return cls(
            text=content,
            confidence=score,
            bbox=box,
            is_monospace=data.get("is_monospace", False),
        )
|
|
|
|
|
|
@dataclass
class FrameSubSection:
    """One panel (region) of a video frame together with its own OCR output.

    A frame showing several IDE panels (e.g. code editor, terminal, file
    tree) is split into sub-sections so that each panel keeps a separate
    OCR result rather than being merged with its neighbours into one blob.
    """

    bbox: tuple[int, int, int, int]  # (x1, y1, x2, y2)
    frame_type: FrameType = FrameType.OTHER
    ocr_text: str = ""
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_confidence: float = 0.0
    panel_id: str = ""  # e.g. "panel_0_0" (row_col)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (``bbox`` as a list, ``frame_type`` as its value)."""
        box_as_list = list(self.bbox)
        kind = self.frame_type.value
        region_dicts = [region.to_dict() for region in self.ocr_regions]
        return {
            "bbox": box_as_list,
            "frame_type": kind,
            "ocr_text": self.ocr_text,
            "ocr_regions": region_dicts,
            "ocr_confidence": self.ocr_confidence,
            "panel_id": self.panel_id,
        }

    @classmethod
    def from_dict(cls, data: dict) -> FrameSubSection:
        """Build a sub-section from a dict.

        Only ``bbox`` is required; every other key falls back to the same
        default the dataclass field uses.
        """
        box = tuple(data["bbox"])
        kind = FrameType(data.get("frame_type", "other"))
        text = data.get("ocr_text", "")
        regions = [OCRRegion.from_dict(entry) for entry in data.get("ocr_regions", [])]
        return cls(
            bbox=box,
            frame_type=kind,
            ocr_text=text,
            ocr_regions=regions,
            ocr_confidence=data.get("ocr_confidence", 0.0),
            panel_id=data.get("panel_id", ""),
        )
|
|
|
|
|
|
@dataclass
class KeyFrame:
    """A frame pulled from the video plus the results of analysing it.

    Holds frame-level OCR output (``ocr_regions``, ``ocr_text``,
    ``ocr_confidence``) as well as per-panel results in ``sub_sections``.
    """

    timestamp: float
    image_path: str
    frame_type: FrameType = FrameType.OTHER
    scene_change_score: float = 0.0
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_text: str = ""
    ocr_confidence: float = 0.0
    width: int = 0
    height: int = 0
    sub_sections: list[FrameSubSection] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into regions and sub-sections."""
        kind = self.frame_type.value
        region_dicts = [region.to_dict() for region in self.ocr_regions]
        panel_dicts = [panel.to_dict() for panel in self.sub_sections]
        return {
            "timestamp": self.timestamp,
            "image_path": self.image_path,
            "frame_type": kind,
            "scene_change_score": self.scene_change_score,
            "ocr_regions": region_dicts,
            "ocr_text": self.ocr_text,
            "ocr_confidence": self.ocr_confidence,
            "width": self.width,
            "height": self.height,
            "sub_sections": panel_dicts,
        }

    @classmethod
    def from_dict(cls, data: dict) -> KeyFrame:
        """Build a keyframe from a dict.

        ``timestamp`` and ``image_path`` are required; all other keys fall
        back to the dataclass defaults.
        """
        moment = data["timestamp"]
        image = data["image_path"]
        kind = FrameType(data.get("frame_type", "other"))
        score = data.get("scene_change_score", 0.0)
        regions = [OCRRegion.from_dict(entry) for entry in data.get("ocr_regions", [])]
        return cls(
            timestamp=moment,
            image_path=image,
            frame_type=kind,
            scene_change_score=score,
            ocr_regions=regions,
            ocr_text=data.get("ocr_text", ""),
            ocr_confidence=data.get("ocr_confidence", 0.0),
            width=data.get("width", 0),
            height=data.get("height", 0),
            sub_sections=[
                FrameSubSection.from_dict(entry) for entry in data.get("sub_sections", [])
            ],
        )
|
|
|
|
|
|
@dataclass
class CodeBlock:
    """Code recovered by OCR from video frames.

    ``text_group_id`` holds the id of an associated text group; it is an
    empty string by default.
    """

    code: str
    language: str | None = None
    source_frame: float = 0.0
    context: CodeContext = CodeContext.UNKNOWN
    confidence: float = 0.0
    text_group_id: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict, with ``context`` as its string value."""
        where = self.context.value
        return {
            "code": self.code,
            "language": self.language,
            "source_frame": self.source_frame,
            "context": where,
            "confidence": self.confidence,
            "text_group_id": self.text_group_id,
        }

    @classmethod
    def from_dict(cls, data: dict) -> CodeBlock:
        """Build a code block from a dict.

        Only ``code`` is required; the remaining keys fall back to the
        dataclass defaults (``context`` to ``"unknown"``).
        """
        source_text = data["code"]
        lang = data.get("language")
        frame_time = data.get("source_frame", 0.0)
        where = CodeContext(data.get("context", "unknown"))
        return cls(
            code=source_text,
            language=lang,
            source_frame=frame_time,
            context=where,
            confidence=data.get("confidence", 0.0),
            text_group_id=data.get("text_group_id", ""),
        )
|
|
|
|
|
|
@dataclass
class TextGroupEdit:
    """An edit detected between two appearances of a text group.

    Records the ``timestamp`` of the edit and the lines that were added,
    removed or modified. Each ``modified_lines`` entry is a dict whose
    schema is not defined in this class.
    """

    timestamp: float
    added_lines: list[str] = field(default_factory=list)
    removed_lines: list[str] = field(default_factory=list)
    modified_lines: list[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        The returned lists (and each ``modified_lines`` entry) are copies,
        so mutating the result never alters this instance.
        """
        return {
            "timestamp": self.timestamp,
            "added_lines": list(self.added_lines),
            "removed_lines": list(self.removed_lines),
            "modified_lines": [dict(entry) for entry in self.modified_lines],
        }

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupEdit:
        """Build an edit from a dict.

        ``timestamp`` is required (``KeyError`` if absent). The three line
        collections default to empty lists; an explicit ``None`` (e.g. JSON
        ``null``) is treated as empty as well. Input lists are copied so the
        new instance does not share mutable state with ``data``.
        """
        return cls(
            timestamp=data["timestamp"],
            # `or []` normalises both a missing key and an explicit null.
            added_lines=list(data.get("added_lines") or []),
            removed_lines=list(data.get("removed_lines") or []),
            modified_lines=[dict(entry) for entry in data.get("modified_lines") or []],
        )
|
|
|
|
|
|
@dataclass
class TextGroup:
    """Related text blocks followed across the video as one unit.

    Models a single code file/snippet as it shows up and changes over
    multiple frames. ``appearances`` holds (start, end) pairs and
    ``consensus_lines`` holds per-line dicts, of which only the ``"text"``
    key is read by this class.
    """

    group_id: str
    appearances: list[tuple[float, float]] = field(default_factory=list)
    consensus_lines: list[dict] = field(default_factory=list)
    edits: list[TextGroupEdit] = field(default_factory=list)
    detected_language: str | None = None
    frame_type: FrameType = FrameType.CODE_EDITOR
    panel_id: str = ""  # Tracks which panel this group originated from

    @property
    def full_text(self) -> str:
        """Newline-joined ``"text"`` of every consensus line that has non-empty text."""
        kept = []
        for entry in self.consensus_lines:
            if entry.get("text"):
                kept.append(entry["text"])
        return "\n".join(kept)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Appearances become two-element lists, and the derived ``full_text``
        is included as an extra key.
        """
        spans = [[begin, finish] for begin, finish in self.appearances]
        edit_dicts = [change.to_dict() for change in self.edits]
        return {
            "group_id": self.group_id,
            "appearances": spans,
            "consensus_lines": self.consensus_lines,
            "edits": edit_dicts,
            "detected_language": self.detected_language,
            "frame_type": self.frame_type.value,
            "panel_id": self.panel_id,
            "full_text": self.full_text,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TextGroup:
        """Build a text group from a dict.

        Only ``group_id`` is required. A ``full_text`` key, if present, is
        ignored because the property is recomputed from ``consensus_lines``.
        """
        identifier = data["group_id"]
        spans = [tuple(pair) for pair in data.get("appearances", [])]
        lines = data.get("consensus_lines", [])
        changes = [TextGroupEdit.from_dict(entry) for entry in data.get("edits", [])]
        return cls(
            group_id=identifier,
            appearances=spans,
            consensus_lines=lines,
            edits=changes,
            detected_language=data.get("detected_language"),
            frame_type=FrameType(data.get("frame_type", "code_editor")),
            panel_id=data.get("panel_id", ""),
        )
|
|
|
|
|
|
@dataclass
class TextGroupTimeline:
    """All text groups of a video plus aggregate counters.

    The ``total_*`` fields are stored values; this class does not compute
    them from ``text_groups``.
    """

    text_groups: list[TextGroup] = field(default_factory=list)
    total_code_time: float = 0.0
    total_groups: int = 0
    total_edits: int = 0

    def get_groups_at_time(self, timestamp: float) -> list[TextGroup]:
        """Return the text groups with an appearance span containing ``timestamp``.

        Span bounds are inclusive on both ends. Groups are returned in
        their stored order, each at most once.
        """
        visible = []
        for group in self.text_groups:
            for begin, finish in group.appearances:
                if begin <= timestamp <= finish:
                    visible.append(group)
                    break  # one matching span is enough for this group
        return visible

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into each text group."""
        group_dicts = [group.to_dict() for group in self.text_groups]
        return {
            "text_groups": group_dicts,
            "total_code_time": self.total_code_time,
            "total_groups": self.total_groups,
            "total_edits": self.total_edits,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupTimeline:
        """Build a timeline from a dict; every key is optional."""
        groups = [TextGroup.from_dict(entry) for entry in data.get("text_groups", [])]
        return cls(
            text_groups=groups,
            total_code_time=data.get("total_code_time", 0.0),
            total_groups=data.get("total_groups", 0),
            total_edits=data.get("total_edits", 0),
        )
|
|
|
|
|
|
@dataclass
class AudioVisualAlignment:
    """Pairs code shown on screen with the transcript spoken over the same span."""

    text_group_id: str
    start_time: float
    end_time: float
    on_screen_code: str
    transcript_during: str
    language: str | None = None

    def to_dict(self) -> dict:
        """Return all six fields as a plain dict."""
        exported = (
            "text_group_id",
            "start_time",
            "end_time",
            "on_screen_code",
            "transcript_during",
            "language",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, data: dict) -> AudioVisualAlignment:
        """Build an alignment from a dict.

        ``text_group_id``, ``start_time``, ``end_time`` and ``on_screen_code``
        are required; ``transcript_during`` defaults to ``""`` and
        ``language`` to ``None``.
        """
        group = data["text_group_id"]
        begin = data["start_time"]
        finish = data["end_time"]
        shown = data["on_screen_code"]
        return cls(
            text_group_id=group,
            start_time=begin,
            end_time=finish,
            on_screen_code=shown,
            transcript_during=data.get("transcript_during", ""),
            language=data.get("language"),
        )
|
|
|
|
|
|
# =============================================================================
|
|
# Core Data Classes
|
|
# =============================================================================
|
|
|
|
|
|
@dataclass
class VideoSegment:
    """One time-aligned slice of a video: transcript, visuals and metadata.

    ``duration`` is a stored field supplied by the caller; it is not
    derived from ``start_time``/``end_time`` here.
    """

    index: int
    start_time: float
    end_time: float
    duration: float

    # Stream 1: ASR (Audio)
    transcript: str = ""
    words: list[WordTimestamp] = field(default_factory=list)
    transcript_confidence: float = 0.0

    # Stream 2: OCR (Visual)
    keyframes: list[KeyFrame] = field(default_factory=list)
    ocr_text: str = ""
    detected_code_blocks: list[CodeBlock] = field(default_factory=list)
    has_code_on_screen: bool = False
    has_slides: bool = False
    has_diagram: bool = False

    # Stream 3: Metadata
    chapter_title: str | None = None
    topic: str | None = None
    category: str | None = None

    # Merged content
    content: str = ""
    summary: str | None = None

    # Quality metadata
    confidence: float = 0.0
    content_type: SegmentContentType = SegmentContentType.MIXED

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into words, keyframes and code blocks."""
        word_dicts = [item.to_dict() for item in self.words]
        frame_dicts = [frame.to_dict() for frame in self.keyframes]
        block_dicts = [block.to_dict() for block in self.detected_code_blocks]
        return {
            # Position
            "index": self.index,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration": self.duration,
            # Audio stream
            "transcript": self.transcript,
            "words": word_dicts,
            "transcript_confidence": self.transcript_confidence,
            # Visual stream
            "keyframes": frame_dicts,
            "ocr_text": self.ocr_text,
            "detected_code_blocks": block_dicts,
            "has_code_on_screen": self.has_code_on_screen,
            "has_slides": self.has_slides,
            "has_diagram": self.has_diagram,
            # Metadata
            "chapter_title": self.chapter_title,
            "topic": self.topic,
            "category": self.category,
            # Merged content
            "content": self.content,
            "summary": self.summary,
            # Quality
            "confidence": self.confidence,
            "content_type": self.content_type.value,
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoSegment:
        """Build a segment from a dict.

        ``index``, ``start_time``, ``end_time`` and ``duration`` are required
        (``KeyError`` if absent); every other key falls back to the dataclass
        default, with ``content_type`` defaulting to ``"mixed"``.
        """
        position = data["index"]
        begin = data["start_time"]
        finish = data["end_time"]
        length = data["duration"]
        return cls(
            index=position,
            start_time=begin,
            end_time=finish,
            duration=length,
            transcript=data.get("transcript", ""),
            words=[WordTimestamp.from_dict(item) for item in data.get("words", [])],
            transcript_confidence=data.get("transcript_confidence", 0.0),
            keyframes=[KeyFrame.from_dict(item) for item in data.get("keyframes", [])],
            ocr_text=data.get("ocr_text", ""),
            detected_code_blocks=[
                CodeBlock.from_dict(item) for item in data.get("detected_code_blocks", [])
            ],
            has_code_on_screen=data.get("has_code_on_screen", False),
            has_slides=data.get("has_slides", False),
            has_diagram=data.get("has_diagram", False),
            chapter_title=data.get("chapter_title"),
            topic=data.get("topic"),
            category=data.get("category"),
            content=data.get("content", ""),
            summary=data.get("summary"),
            confidence=data.get("confidence", 0.0),
            content_type=SegmentContentType(data.get("content_type", "mixed")),
        )

    @property
    def timestamp_display(self) -> str:
        """Human-readable time range, e.g. ``'05:30 - 08:15'``.

        Times are truncated to whole seconds. When either bound is 3600
        seconds or more, both are rendered as ``H:MM:SS``; otherwise both
        are rendered as ``MM:SS``.
        """
        begin_min, begin_sec = divmod(int(self.start_time), 60)
        finish_min, finish_sec = divmod(int(self.end_time), 60)

        # Short form: both bounds fall within the first hour.
        if self.start_time < 3600 and self.end_time < 3600:
            return f"{begin_min:02d}:{begin_sec:02d} - {finish_min:02d}:{finish_sec:02d}"

        begin_hr, begin_min = divmod(begin_min, 60)
        finish_hr, finish_min = divmod(finish_min, 60)
        begin_text = f"{begin_hr:d}:{begin_min:02d}:{begin_sec:02d}"
        finish_text = f"{finish_hr:d}:{finish_min:02d}:{finish_sec:02d}"
        return f"{begin_text} - {finish_text}"
|
|
|
|
|
|
@dataclass
class VideoInfo:
    """Everything known about one video: metadata plus extracted content.

    Only ``video_id`` and ``source_type`` are required; every other field
    has a default.
    """

    # Identity
    video_id: str
    source_type: VideoSourceType
    source_url: str | None = None
    file_path: str | None = None

    # Basic metadata
    title: str = ""
    description: str = ""
    duration: float = 0.0
    upload_date: str | None = None
    language: str = "en"

    # Channel / Author
    channel_name: str | None = None
    channel_url: str | None = None

    # Engagement metadata
    view_count: int | None = None
    like_count: int | None = None
    comment_count: int | None = None

    # Discovery metadata
    tags: list[str] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)
    thumbnail_url: str | None = None

    # Structure
    chapters: list[Chapter] = field(default_factory=list)

    # Playlist context
    playlist_title: str | None = None
    playlist_index: int | None = None
    playlist_total: int | None = None

    # Extracted content
    raw_transcript: list[TranscriptSegment] = field(default_factory=list)
    segments: list[VideoSegment] = field(default_factory=list)

    # Processing metadata
    transcript_source: TranscriptSource = TranscriptSource.NONE
    visual_extraction_enabled: bool = False
    whisper_model: str | None = None
    processing_time_seconds: float = 0.0
    extracted_at: str = ""

    # Quality scores
    transcript_confidence: float = 0.0
    content_richness_score: float = 0.0

    # Consensus-based text tracking (Phase A-D)
    text_group_timeline: TextGroupTimeline | None = None
    audio_visual_alignments: list[AudioVisualAlignment] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        Enum fields are emitted as their string values, nested objects via
        their own ``to_dict``, and ``text_group_timeline`` as ``None`` when
        it is unset.
        """
        timeline_dict = None
        if self.text_group_timeline:
            timeline_dict = self.text_group_timeline.to_dict()

        return {
            # Identity
            "video_id": self.video_id,
            "source_type": self.source_type.value,
            "source_url": self.source_url,
            "file_path": self.file_path,
            # Basic metadata
            "title": self.title,
            "description": self.description,
            "duration": self.duration,
            "upload_date": self.upload_date,
            "language": self.language,
            # Channel / author
            "channel_name": self.channel_name,
            "channel_url": self.channel_url,
            # Engagement
            "view_count": self.view_count,
            "like_count": self.like_count,
            "comment_count": self.comment_count,
            # Discovery
            "tags": self.tags,
            "categories": self.categories,
            "thumbnail_url": self.thumbnail_url,
            # Structure
            "chapters": [chapter.to_dict() for chapter in self.chapters],
            # Playlist context
            "playlist_title": self.playlist_title,
            "playlist_index": self.playlist_index,
            "playlist_total": self.playlist_total,
            # Extracted content
            "raw_transcript": [piece.to_dict() for piece in self.raw_transcript],
            "segments": [segment.to_dict() for segment in self.segments],
            # Processing metadata
            "transcript_source": self.transcript_source.value,
            "visual_extraction_enabled": self.visual_extraction_enabled,
            "whisper_model": self.whisper_model,
            "processing_time_seconds": self.processing_time_seconds,
            "extracted_at": self.extracted_at,
            # Quality scores
            "transcript_confidence": self.transcript_confidence,
            "content_richness_score": self.content_richness_score,
            # Text tracking
            "text_group_timeline": timeline_dict,
            "audio_visual_alignments": [
                pairing.to_dict() for pairing in self.audio_visual_alignments
            ],
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoInfo:
        """Build a video record from a dict.

        ``video_id`` and ``source_type`` are required (``KeyError`` if
        absent). A missing or empty ``text_group_timeline`` yields ``None``;
        all remaining keys fall back to the dataclass defaults.
        """
        raw_timeline = data.get("text_group_timeline")
        parsed_timeline = None
        if raw_timeline:
            parsed_timeline = TextGroupTimeline.from_dict(raw_timeline)

        return cls(
            # Identity
            video_id=data["video_id"],
            source_type=VideoSourceType(data["source_type"]),
            source_url=data.get("source_url"),
            file_path=data.get("file_path"),
            # Basic metadata
            title=data.get("title", ""),
            description=data.get("description", ""),
            duration=data.get("duration", 0.0),
            upload_date=data.get("upload_date"),
            language=data.get("language", "en"),
            # Channel / author
            channel_name=data.get("channel_name"),
            channel_url=data.get("channel_url"),
            # Engagement
            view_count=data.get("view_count"),
            like_count=data.get("like_count"),
            comment_count=data.get("comment_count"),
            # Discovery
            tags=data.get("tags", []),
            categories=data.get("categories", []),
            thumbnail_url=data.get("thumbnail_url"),
            # Structure
            chapters=[Chapter.from_dict(item) for item in data.get("chapters", [])],
            # Playlist context
            playlist_title=data.get("playlist_title"),
            playlist_index=data.get("playlist_index"),
            playlist_total=data.get("playlist_total"),
            # Extracted content
            raw_transcript=[
                TranscriptSegment.from_dict(item) for item in data.get("raw_transcript", [])
            ],
            segments=[VideoSegment.from_dict(item) for item in data.get("segments", [])],
            # Processing metadata
            transcript_source=TranscriptSource(data.get("transcript_source", "none")),
            visual_extraction_enabled=data.get("visual_extraction_enabled", False),
            whisper_model=data.get("whisper_model"),
            processing_time_seconds=data.get("processing_time_seconds", 0.0),
            extracted_at=data.get("extracted_at", ""),
            # Quality scores
            transcript_confidence=data.get("transcript_confidence", 0.0),
            content_richness_score=data.get("content_richness_score", 0.0),
            # Text tracking
            text_group_timeline=parsed_timeline,
            audio_visual_alignments=[
                AudioVisualAlignment.from_dict(item)
                for item in data.get("audio_visual_alignments", [])
            ],
        )
|
|
|
|
|
|
@dataclass
class VideoSourceConfig:
    """Settings describing which video source to process and how.

    ``validate`` checks that exactly one of the five source fields is set;
    it does not check any of the other settings.
    """

    # Source specification (exactly one should be set)
    url: str | None = None
    playlist: str | None = None
    channel: str | None = None
    path: str | None = None
    directory: str | None = None

    # Identity
    name: str = "video"
    description: str = ""

    # Filtering
    max_videos: int = 50
    languages: list[str] | None = None

    # Extraction
    visual_extraction: bool = False
    whisper_model: str = "base"

    # Segmentation
    time_window_seconds: float = 120.0
    min_segment_duration: float = 10.0
    max_segment_duration: float = 600.0

    # Categorization
    categories: dict[str, list[str]] | None = None

    # Subtitle files
    subtitle_patterns: list[str] | None = None

    @classmethod
    def from_dict(cls, data: dict) -> VideoSourceConfig:
        """Build a config from a dict; every key is optional.

        Missing keys take the fallbacks listed below, which mirror the
        dataclass field defaults.
        """
        # (key, fallback) pairs; all fallbacks are immutable, so sharing is safe.
        fallbacks = (
            ("url", None),
            ("playlist", None),
            ("channel", None),
            ("path", None),
            ("directory", None),
            ("name", "video"),
            ("description", ""),
            ("max_videos", 50),
            ("languages", None),
            ("visual_extraction", False),
            ("whisper_model", "base"),
            ("time_window_seconds", 120.0),
            ("min_segment_duration", 10.0),
            ("max_segment_duration", 600.0),
            ("categories", None),
            ("subtitle_patterns", None),
        )
        return cls(**{key: data.get(key, fallback) for key, fallback in fallbacks})

    def validate(self) -> list[str]:
        """Check the source specification and return a list of error messages.

        A source field counts as set when it is not ``None``. The list is
        empty when exactly one source field is set.
        """
        candidates = (self.url, self.playlist, self.channel, self.path, self.directory)
        configured = len([value for value in candidates if value is not None])

        problems = []
        if configured == 0:
            problems.append(
                "Video source must specify one of: url, playlist, channel, path, directory"
            )
        elif configured > 1:
            problems.append("Video source must specify exactly one source type")
        return problems
|
|
|
|
|
|
@dataclass
class VideoScraperResult:
    """Aggregate output of a scraper run.

    Note that ``config`` is kept on the instance only: ``to_dict`` does not
    emit it and ``from_dict`` does not restore it.
    """

    videos: list[VideoInfo] = field(default_factory=list)
    total_duration_seconds: float = 0.0
    total_segments: int = 0
    total_code_blocks: int = 0
    config: VideoSourceConfig | None = None
    processing_time_seconds: float = 0.0
    warnings: list[str] = field(default_factory=list)
    errors: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (every field except ``config``)."""
        video_dicts = [video.to_dict() for video in self.videos]
        return {
            "videos": video_dicts,
            "total_duration_seconds": self.total_duration_seconds,
            "total_segments": self.total_segments,
            "total_code_blocks": self.total_code_blocks,
            "processing_time_seconds": self.processing_time_seconds,
            "warnings": self.warnings,
            "errors": self.errors,
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoScraperResult:
        """Build a result from a dict; every key is optional and ``config`` stays ``None``."""
        parsed_videos = [VideoInfo.from_dict(entry) for entry in data.get("videos", [])]
        return cls(
            videos=parsed_videos,
            total_duration_seconds=data.get("total_duration_seconds", 0.0),
            total_segments=data.get("total_segments", 0),
            total_code_blocks=data.get("total_code_blocks", 0),
            processing_time_seconds=data.get("processing_time_seconds", 0.0),
            warnings=data.get("warnings", []),
            errors=data.get("errors", []),
        )
|