Files
skill-seekers-reference/src/skill_seekers/cli/video_models.py
YusufKaraaslanSpyke 62071c4aa9 feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement
Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 23:10:19 +03:00

814 lines
27 KiB
Python

"""Video source data models and type definitions.
Defines all enumerations and dataclasses for the video extraction pipeline:
- Enums: VideoSourceType, TranscriptSource, FrameType, CodeContext, SegmentContentType
- Core: VideoInfo, VideoSegment, VideoScraperResult
- Supporting: Chapter, TranscriptSegment, WordTimestamp, KeyFrame, OCRRegion,
FrameSubSection, CodeBlock
- Config: VideoSourceConfig
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Any
# =============================================================================
# Enumerations
# =============================================================================
class VideoSourceType(Enum):
    """Where a video came from."""

    YOUTUBE = "youtube"
    VIMEO = "vimeo"
    LOCAL_FILE = "local_file"  # a single video file on disk
    LOCAL_DIRECTORY = "local_directory"  # a directory of local video files
class TranscriptSource(Enum):
    """How the transcript was obtained."""

    YOUTUBE_MANUAL = "youtube_manual"  # manually created YouTube captions
    YOUTUBE_AUTO = "youtube_auto_generated"  # YouTube auto-generated captions
    WHISPER = "whisper"  # Whisper ASR transcription
    SUBTITLE_FILE = "subtitle_file"  # external subtitle file
    NONE = "none"  # no transcript available
class FrameType(Enum):
    """Classification of a keyframe's visual content."""

    CODE_EDITOR = "code_editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    DIAGRAM = "diagram"
    BROWSER = "browser"
    WEBCAM = "webcam"  # presenter camera view
    SCREENCAST = "screencast"  # generic screen recording
    OTHER = "other"  # fallback when no specific type matched
class CodeContext(Enum):
    """Where code was displayed in the video."""

    EDITOR = "editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    BROWSER = "browser"
    UNKNOWN = "unknown"  # fallback when the on-screen context could not be determined
class SegmentContentType(Enum):
    """Primary content type of a video segment."""

    EXPLANATION = "explanation"
    LIVE_CODING = "live_coding"
    DEMO = "demo"
    SLIDES = "slides"
    Q_AND_A = "q_and_a"
    INTRO = "intro"
    OUTRO = "outro"
    MIXED = "mixed"  # default when no single content type dominates
class SegmentationStrategy(Enum):
    """How segments are determined."""

    CHAPTERS = "chapters"  # split on video chapter markers
    TIME_WINDOW = "time_window"  # split on fixed-length time windows
    SCENE_CHANGE = "scene_change"  # split on detected visual scene changes
    HYBRID = "hybrid"  # combination of the above strategies
# =============================================================================
# Supporting Data Classes
# =============================================================================
@dataclass(frozen=True)
class Chapter:
    """An immutable chapter marker from a video (typically YouTube)."""

    title: str
    start_time: float  # seconds from the start of the video
    end_time: float  # seconds from the start of the video

    @property
    def duration(self) -> float:
        """Length of the chapter in seconds."""
        return self.end_time - self.start_time

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            title=self.title,
            start_time=self.start_time,
            end_time=self.end_time,
        )

    @classmethod
    def from_dict(cls, data: dict) -> Chapter:
        """Rebuild a Chapter from its dict form; all keys are required."""
        return cls(data["title"], data["start_time"], data["end_time"])
@dataclass(frozen=True)
class WordTimestamp:
    """A single word with precise timing information."""

    word: str
    start: float  # word onset, seconds
    end: float  # word offset, seconds
    probability: float = 1.0  # recognizer confidence for this word

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            word=self.word,
            start=self.start,
            end=self.end,
            probability=self.probability,
        )

    @classmethod
    def from_dict(cls, data: dict) -> WordTimestamp:
        """Rebuild from dict form; `probability` falls back to 1.0."""
        prob = data.get("probability", 1.0)
        return cls(data["word"], data["start"], data["end"], prob)
@dataclass(frozen=True)
class TranscriptSegment:
    """A raw transcript segment from the YouTube API or Whisper."""

    text: str
    start: float  # segment onset, seconds
    end: float  # segment offset, seconds
    confidence: float = 1.0
    words: list[WordTimestamp] | None = None  # per-word timings, when available
    source: TranscriptSource = TranscriptSource.NONE

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict; `words` is None when absent."""
        word_dicts = [w.to_dict() for w in self.words] if self.words else None
        return dict(
            text=self.text,
            start=self.start,
            end=self.end,
            confidence=self.confidence,
            words=word_dicts,
            source=self.source.value,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TranscriptSegment:
        """Rebuild from dict form; optional keys get neutral defaults."""
        raw_words = data.get("words")
        parsed = [WordTimestamp.from_dict(w) for w in raw_words] if raw_words else None
        return cls(
            text=data["text"],
            start=data["start"],
            end=data["end"],
            confidence=data.get("confidence", 1.0),
            words=parsed,
            source=TranscriptSource(data.get("source", "none")),
        )
@dataclass(frozen=True)
class OCRRegion:
    """A detected text region in a video frame."""

    text: str
    confidence: float
    bbox: tuple[int, int, int, int]  # pixel bounding box
    is_monospace: bool = False  # flagged by the OCR stage; monospace often means code

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict (bbox becomes a list)."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            bbox=list(self.bbox),
            is_monospace=self.is_monospace,
        )

    @classmethod
    def from_dict(cls, data: dict) -> OCRRegion:
        """Rebuild from dict form, restoring `bbox` to a tuple."""
        return cls(
            data["text"],
            data["confidence"],
            tuple(data["bbox"]),
            data.get("is_monospace", False),
        )
@dataclass
class FrameSubSection:
    """A single panel/region within a video frame, OCR'd independently.

    Each IDE panel (e.g. code editor, terminal, file tree) is detected
    as a separate sub-section so that side-by-side editors produce
    independent OCR results instead of being merged into one blob.
    """

    bbox: tuple[int, int, int, int]  # (x1, y1, x2, y2)
    frame_type: FrameType = FrameType.OTHER
    ocr_text: str = ""  # merged OCR text for this panel
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_confidence: float = 0.0
    panel_id: str = ""  # e.g. "panel_0_0" (row_col)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict (bbox becomes a list)."""
        return dict(
            bbox=list(self.bbox),
            frame_type=self.frame_type.value,
            ocr_text=self.ocr_text,
            ocr_regions=[region.to_dict() for region in self.ocr_regions],
            ocr_confidence=self.ocr_confidence,
            panel_id=self.panel_id,
        )

    @classmethod
    def from_dict(cls, data: dict) -> FrameSubSection:
        """Rebuild from dict form; missing keys get neutral defaults."""
        regions = [OCRRegion.from_dict(r) for r in data.get("ocr_regions", [])]
        return cls(
            bbox=tuple(data["bbox"]),
            frame_type=FrameType(data.get("frame_type", "other")),
            ocr_text=data.get("ocr_text", ""),
            ocr_regions=regions,
            ocr_confidence=data.get("ocr_confidence", 0.0),
            panel_id=data.get("panel_id", ""),
        )
@dataclass
class KeyFrame:
    """An extracted video frame with visual analysis results."""

    timestamp: float  # position of the frame in the video, seconds
    image_path: str  # where the extracted frame image was saved
    frame_type: FrameType = FrameType.OTHER
    scene_change_score: float = 0.0  # how strongly this frame differs from its predecessor
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_text: str = ""  # merged OCR text for the whole frame
    ocr_confidence: float = 0.0
    width: int = 0  # frame width in pixels
    height: int = 0  # frame height in pixels
    sub_sections: list[FrameSubSection] = field(default_factory=list)  # per-panel OCR results

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict, recursing into nested objects."""
        return dict(
            timestamp=self.timestamp,
            image_path=self.image_path,
            frame_type=self.frame_type.value,
            scene_change_score=self.scene_change_score,
            ocr_regions=[region.to_dict() for region in self.ocr_regions],
            ocr_text=self.ocr_text,
            ocr_confidence=self.ocr_confidence,
            width=self.width,
            height=self.height,
            sub_sections=[panel.to_dict() for panel in self.sub_sections],
        )

    @classmethod
    def from_dict(cls, data: dict) -> KeyFrame:
        """Rebuild from dict form; missing keys get neutral defaults."""
        regions = [OCRRegion.from_dict(r) for r in data.get("ocr_regions", [])]
        panels = [FrameSubSection.from_dict(s) for s in data.get("sub_sections", [])]
        return cls(
            timestamp=data["timestamp"],
            image_path=data["image_path"],
            frame_type=FrameType(data.get("frame_type", "other")),
            scene_change_score=data.get("scene_change_score", 0.0),
            ocr_regions=regions,
            ocr_text=data.get("ocr_text", ""),
            ocr_confidence=data.get("ocr_confidence", 0.0),
            width=data.get("width", 0),
            height=data.get("height", 0),
            sub_sections=panels,
        )
@dataclass
class CodeBlock:
    """A code block detected via OCR from video frames."""

    code: str
    language: str | None = None  # detected language, if any
    source_frame: float = 0.0  # timestamp of the frame the code was read from
    context: CodeContext = CodeContext.UNKNOWN  # where on screen the code appeared
    confidence: float = 0.0
    text_group_id: str = ""  # id of the TextGroup this block belongs to

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            code=self.code,
            language=self.language,
            source_frame=self.source_frame,
            context=self.context.value,
            confidence=self.confidence,
            text_group_id=self.text_group_id,
        )

    @classmethod
    def from_dict(cls, data: dict) -> CodeBlock:
        """Rebuild from dict form; missing keys get neutral defaults."""
        ctx = CodeContext(data.get("context", "unknown"))
        return cls(
            data["code"],
            data.get("language"),
            data.get("source_frame", 0.0),
            ctx,
            data.get("confidence", 0.0),
            data.get("text_group_id", ""),
        )
@dataclass
class TextGroupEdit:
    """An edit detected between appearances of a text group."""

    timestamp: float  # when the edit was observed, seconds
    added_lines: list[str] = field(default_factory=list)
    removed_lines: list[str] = field(default_factory=list)
    modified_lines: list[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            timestamp=self.timestamp,
            added_lines=self.added_lines,
            removed_lines=self.removed_lines,
            modified_lines=self.modified_lines,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupEdit:
        """Rebuild from dict form; missing line lists default to empty."""
        return cls(
            data["timestamp"],
            data.get("added_lines", []),
            data.get("removed_lines", []),
            data.get("modified_lines", []),
        )
@dataclass
class TextGroup:
    """A group of related text blocks tracked across the video.

    Represents a single code file/snippet as it appears and evolves
    across multiple video frames.
    """

    group_id: str
    appearances: list[tuple[float, float]] = field(default_factory=list)  # (start, end) visibility spans
    consensus_lines: list[dict] = field(default_factory=list)  # best-guess text per line
    edits: list[TextGroupEdit] = field(default_factory=list)
    detected_language: str | None = None
    frame_type: FrameType = FrameType.CODE_EDITOR
    panel_id: str = ""  # Tracks which panel this group originated from

    @property
    def full_text(self) -> str:
        """Join the non-empty consensus lines into one newline-separated string."""
        nonempty = (line["text"] for line in self.consensus_lines if line.get("text"))
        return "\n".join(nonempty)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict (includes derived `full_text`)."""
        return dict(
            group_id=self.group_id,
            appearances=[[begin, finish] for begin, finish in self.appearances],
            consensus_lines=self.consensus_lines,
            edits=[edit.to_dict() for edit in self.edits],
            detected_language=self.detected_language,
            frame_type=self.frame_type.value,
            panel_id=self.panel_id,
            full_text=self.full_text,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TextGroup:
        """Rebuild from dict form; the derived `full_text` key is ignored."""
        spans = [tuple(span) for span in data.get("appearances", [])]
        edit_objs = [TextGroupEdit.from_dict(e) for e in data.get("edits", [])]
        return cls(
            group_id=data["group_id"],
            appearances=spans,
            consensus_lines=data.get("consensus_lines", []),
            edits=edit_objs,
            detected_language=data.get("detected_language"),
            frame_type=FrameType(data.get("frame_type", "code_editor")),
            panel_id=data.get("panel_id", ""),
        )
@dataclass
class TextGroupTimeline:
    """Timeline of all text groups and their lifecycle in the video."""

    text_groups: list[TextGroup] = field(default_factory=list)
    total_code_time: float = 0.0  # aggregate seconds any code was on screen
    total_groups: int = 0
    total_edits: int = 0

    def get_groups_at_time(self, timestamp: float) -> list[TextGroup]:
        """Return all text groups visible at a given timestamp."""
        def visible(group: TextGroup) -> bool:
            return any(begin <= timestamp <= finish for begin, finish in group.appearances)

        return [group for group in self.text_groups if visible(group)]

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            text_groups=[group.to_dict() for group in self.text_groups],
            total_code_time=self.total_code_time,
            total_groups=self.total_groups,
            total_edits=self.total_edits,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupTimeline:
        """Rebuild from dict form; missing keys get neutral defaults."""
        groups = [TextGroup.from_dict(g) for g in data.get("text_groups", [])]
        return cls(
            text_groups=groups,
            total_code_time=data.get("total_code_time", 0.0),
            total_groups=data.get("total_groups", 0),
            total_edits=data.get("total_edits", 0),
        )
@dataclass
class AudioVisualAlignment:
    """Links on-screen code with concurrent transcript narration."""

    text_group_id: str  # the TextGroup whose code was on screen
    start_time: float  # alignment window start, seconds
    end_time: float  # alignment window end, seconds
    on_screen_code: str
    transcript_during: str  # what the narrator said during the window
    language: str | None = None

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict."""
        return dict(
            text_group_id=self.text_group_id,
            start_time=self.start_time,
            end_time=self.end_time,
            on_screen_code=self.on_screen_code,
            transcript_during=self.transcript_during,
            language=self.language,
        )

    @classmethod
    def from_dict(cls, data: dict) -> AudioVisualAlignment:
        """Rebuild from dict form; `transcript_during` defaults to empty."""
        return cls(
            text_group_id=data["text_group_id"],
            start_time=data["start_time"],
            end_time=data["end_time"],
            on_screen_code=data["on_screen_code"],
            transcript_during=data.get("transcript_during", ""),
            language=data.get("language"),
        )
# =============================================================================
# Core Data Classes
# =============================================================================
@dataclass
class VideoSegment:
    """A time-aligned segment combining transcript + visual + metadata."""

    index: int  # position of the segment within the video
    start_time: float
    end_time: float
    duration: float
    # Stream 1: ASR (Audio)
    transcript: str = ""
    words: list[WordTimestamp] = field(default_factory=list)
    transcript_confidence: float = 0.0
    # Stream 2: OCR (Visual)
    keyframes: list[KeyFrame] = field(default_factory=list)
    ocr_text: str = ""
    detected_code_blocks: list[CodeBlock] = field(default_factory=list)
    has_code_on_screen: bool = False
    has_slides: bool = False
    has_diagram: bool = False
    # Stream 3: Metadata
    chapter_title: str | None = None
    topic: str | None = None
    category: str | None = None
    # Merged content
    content: str = ""
    summary: str | None = None
    # Quality metadata
    confidence: float = 0.0
    content_type: SegmentContentType = SegmentContentType.MIXED

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict, recursing into nested objects."""
        return dict(
            index=self.index,
            start_time=self.start_time,
            end_time=self.end_time,
            duration=self.duration,
            transcript=self.transcript,
            words=[word.to_dict() for word in self.words],
            transcript_confidence=self.transcript_confidence,
            keyframes=[frame.to_dict() for frame in self.keyframes],
            ocr_text=self.ocr_text,
            detected_code_blocks=[block.to_dict() for block in self.detected_code_blocks],
            has_code_on_screen=self.has_code_on_screen,
            has_slides=self.has_slides,
            has_diagram=self.has_diagram,
            chapter_title=self.chapter_title,
            topic=self.topic,
            category=self.category,
            content=self.content,
            summary=self.summary,
            confidence=self.confidence,
            content_type=self.content_type.value,
        )

    @classmethod
    def from_dict(cls, data: dict) -> VideoSegment:
        """Rebuild from dict form; missing optional keys get neutral defaults."""
        parsed_words = [WordTimestamp.from_dict(w) for w in data.get("words", [])]
        parsed_frames = [KeyFrame.from_dict(k) for k in data.get("keyframes", [])]
        parsed_blocks = [CodeBlock.from_dict(c) for c in data.get("detected_code_blocks", [])]
        return cls(
            index=data["index"],
            start_time=data["start_time"],
            end_time=data["end_time"],
            duration=data["duration"],
            transcript=data.get("transcript", ""),
            words=parsed_words,
            transcript_confidence=data.get("transcript_confidence", 0.0),
            keyframes=parsed_frames,
            ocr_text=data.get("ocr_text", ""),
            detected_code_blocks=parsed_blocks,
            has_code_on_screen=data.get("has_code_on_screen", False),
            has_slides=data.get("has_slides", False),
            has_diagram=data.get("has_diagram", False),
            chapter_title=data.get("chapter_title"),
            topic=data.get("topic"),
            category=data.get("category"),
            content=data.get("content", ""),
            summary=data.get("summary"),
            confidence=data.get("confidence", 0.0),
            content_type=SegmentContentType(data.get("content_type", "mixed")),
        )

    @property
    def timestamp_display(self) -> str:
        """Human-readable timestamp (e.g., '05:30 - 08:15')."""
        # Switch to H:MM:SS as soon as either endpoint reaches the one-hour mark.
        use_hours = self.start_time >= 3600 or self.end_time >= 3600

        def fmt(seconds: float) -> str:
            minutes, secs = divmod(int(seconds), 60)
            if not use_hours:
                return f"{minutes:02d}:{secs:02d}"
            hours, minutes = divmod(minutes, 60)
            return f"{hours:d}:{minutes:02d}:{secs:02d}"

        return f"{fmt(self.start_time)} - {fmt(self.end_time)}"
@dataclass
class VideoInfo:
    """Complete metadata and extracted content for a single video."""

    # Identity
    video_id: str
    source_type: VideoSourceType
    source_url: str | None = None
    file_path: str | None = None
    # Basic metadata
    title: str = ""
    description: str = ""
    duration: float = 0.0  # seconds
    upload_date: str | None = None
    language: str = "en"
    # Channel / Author
    channel_name: str | None = None
    channel_url: str | None = None
    # Engagement metadata
    view_count: int | None = None
    like_count: int | None = None
    comment_count: int | None = None
    # Discovery metadata
    tags: list[str] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)
    thumbnail_url: str | None = None
    # Structure
    chapters: list[Chapter] = field(default_factory=list)
    # Playlist context
    playlist_title: str | None = None
    playlist_index: int | None = None
    playlist_total: int | None = None
    # Extracted content
    raw_transcript: list[TranscriptSegment] = field(default_factory=list)
    segments: list[VideoSegment] = field(default_factory=list)
    # Processing metadata
    transcript_source: TranscriptSource = TranscriptSource.NONE
    visual_extraction_enabled: bool = False
    whisper_model: str | None = None
    processing_time_seconds: float = 0.0
    extracted_at: str = ""
    # Quality scores
    transcript_confidence: float = 0.0
    content_richness_score: float = 0.0
    # Consensus-based text tracking (Phase A-D)
    text_group_timeline: TextGroupTimeline | None = None
    audio_visual_alignments: list[AudioVisualAlignment] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict, recursing into nested objects."""
        timeline = self.text_group_timeline.to_dict() if self.text_group_timeline else None
        return dict(
            video_id=self.video_id,
            source_type=self.source_type.value,
            source_url=self.source_url,
            file_path=self.file_path,
            title=self.title,
            description=self.description,
            duration=self.duration,
            upload_date=self.upload_date,
            language=self.language,
            channel_name=self.channel_name,
            channel_url=self.channel_url,
            view_count=self.view_count,
            like_count=self.like_count,
            comment_count=self.comment_count,
            tags=self.tags,
            categories=self.categories,
            thumbnail_url=self.thumbnail_url,
            chapters=[chapter.to_dict() for chapter in self.chapters],
            playlist_title=self.playlist_title,
            playlist_index=self.playlist_index,
            playlist_total=self.playlist_total,
            raw_transcript=[seg.to_dict() for seg in self.raw_transcript],
            segments=[seg.to_dict() for seg in self.segments],
            transcript_source=self.transcript_source.value,
            visual_extraction_enabled=self.visual_extraction_enabled,
            whisper_model=self.whisper_model,
            processing_time_seconds=self.processing_time_seconds,
            extracted_at=self.extracted_at,
            transcript_confidence=self.transcript_confidence,
            content_richness_score=self.content_richness_score,
            text_group_timeline=timeline,
            audio_visual_alignments=[a.to_dict() for a in self.audio_visual_alignments],
        )

    @classmethod
    def from_dict(cls, data: dict) -> VideoInfo:
        """Rebuild from dict form; nested structures are parsed recursively."""
        raw_timeline = data.get("text_group_timeline")
        timeline = TextGroupTimeline.from_dict(raw_timeline) if raw_timeline else None
        chapters = [Chapter.from_dict(c) for c in data.get("chapters", [])]
        transcript = [TranscriptSegment.from_dict(t) for t in data.get("raw_transcript", [])]
        segments = [VideoSegment.from_dict(s) for s in data.get("segments", [])]
        alignments = [
            AudioVisualAlignment.from_dict(a) for a in data.get("audio_visual_alignments", [])
        ]
        return cls(
            video_id=data["video_id"],
            source_type=VideoSourceType(data["source_type"]),
            source_url=data.get("source_url"),
            file_path=data.get("file_path"),
            title=data.get("title", ""),
            description=data.get("description", ""),
            duration=data.get("duration", 0.0),
            upload_date=data.get("upload_date"),
            language=data.get("language", "en"),
            channel_name=data.get("channel_name"),
            channel_url=data.get("channel_url"),
            view_count=data.get("view_count"),
            like_count=data.get("like_count"),
            comment_count=data.get("comment_count"),
            tags=data.get("tags", []),
            categories=data.get("categories", []),
            thumbnail_url=data.get("thumbnail_url"),
            chapters=chapters,
            playlist_title=data.get("playlist_title"),
            playlist_index=data.get("playlist_index"),
            playlist_total=data.get("playlist_total"),
            raw_transcript=transcript,
            segments=segments,
            transcript_source=TranscriptSource(data.get("transcript_source", "none")),
            visual_extraction_enabled=data.get("visual_extraction_enabled", False),
            whisper_model=data.get("whisper_model"),
            processing_time_seconds=data.get("processing_time_seconds", 0.0),
            extracted_at=data.get("extracted_at", ""),
            transcript_confidence=data.get("transcript_confidence", 0.0),
            content_richness_score=data.get("content_richness_score", 0.0),
            text_group_timeline=timeline,
            audio_visual_alignments=alignments,
        )
@dataclass
class VideoSourceConfig:
    """Configuration for video source processing."""

    # Source specification (exactly one should be set)
    url: str | None = None
    playlist: str | None = None
    channel: str | None = None
    path: str | None = None
    directory: str | None = None
    # Identity
    name: str = "video"
    description: str = ""
    # Filtering
    max_videos: int = 50
    languages: list[str] | None = None
    # Extraction
    visual_extraction: bool = False
    whisper_model: str = "base"
    # Segmentation
    time_window_seconds: float = 120.0
    min_segment_duration: float = 10.0
    max_segment_duration: float = 600.0
    # Categorization
    categories: dict[str, list[str]] | None = None
    # Subtitle files
    subtitle_patterns: list[str] | None = None

    @classmethod
    def from_dict(cls, data: dict) -> VideoSourceConfig:
        """Build a config from a plain dict; absent keys keep the dataclass defaults."""
        known_keys = (
            "url", "playlist", "channel", "path", "directory",
            "name", "description", "max_videos", "languages",
            "visual_extraction", "whisper_model", "time_window_seconds",
            "min_segment_duration", "max_segment_duration",
            "categories", "subtitle_patterns",
        )
        overrides = {key: data[key] for key in known_keys if key in data}
        return cls(**overrides)

    def validate(self) -> list[str]:
        """Validate configuration. Returns list of errors."""
        problems: list[str] = []
        candidates = (self.url, self.playlist, self.channel, self.path, self.directory)
        set_count = sum(source is not None for source in candidates)
        if set_count == 0:
            problems.append(
                "Video source must specify one of: url, playlist, channel, path, directory"
            )
        if set_count > 1:
            problems.append("Video source must specify exactly one source type")
        return problems
@dataclass
class VideoScraperResult:
    """Complete result from the video scraper."""

    videos: list[VideoInfo] = field(default_factory=list)
    total_duration_seconds: float = 0.0
    total_segments: int = 0
    total_code_blocks: int = 0
    config: VideoSourceConfig | None = None  # NOTE: not round-tripped by to_dict/from_dict
    processing_time_seconds: float = 0.0
    warnings: list[str] = field(default_factory=list)
    errors: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a JSON-compatible dict; `config` is not included."""
        return dict(
            videos=[video.to_dict() for video in self.videos],
            total_duration_seconds=self.total_duration_seconds,
            total_segments=self.total_segments,
            total_code_blocks=self.total_code_blocks,
            processing_time_seconds=self.processing_time_seconds,
            warnings=self.warnings,
            errors=self.errors,
        )

    @classmethod
    def from_dict(cls, data: dict) -> VideoScraperResult:
        """Rebuild from dict form; `config` stays None since it is not serialized."""
        parsed_videos = [VideoInfo.from_dict(v) for v in data.get("videos", [])]
        return cls(
            videos=parsed_videos,
            total_duration_seconds=data.get("total_duration_seconds", 0.0),
            total_segments=data.get("total_segments", 0),
            total_code_blocks=data.get("total_code_blocks", 0),
            processing_time_seconds=data.get("processing_time_seconds", 0.0),
            warnings=data.get("warnings", []),
            errors=data.get("errors", []),
        )