Files
skill-seekers-reference/src/skill_seekers/cli/video_visual.py
yusyus d19ad7d820 feat: video pipeline OCR quality fixes + two-pass AI enhancement
- Skip OCR on WEBCAM/OTHER frames (eliminates ~64 junk results per video)
- Add _clean_ocr_line() to strip line numbers, IDE decorations, collapse markers
- Add _fix_intra_line_duplication() for multi-engine OCR overlap artifacts
- Add _is_likely_code() filter to prevent UI junk in reference code fences
- Add language detection to get_text_groups() via LanguageDetector
- Apply OCR cleaning in _assemble_structured_text() pipeline
- Add two-pass AI enhancement: Pass 1 cleans reference Code Timeline
  using transcript context, Pass 2 generates SKILL.md from cleaned refs
- Update video-tutorial.yaml prompts for pre-cleaned references
- Add 17 new tests (197 total video tests), 2540 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 21:48:21 +03:00

2411 lines
81 KiB
Python

"""Video visual extraction module (Tier 2).
Extracts keyframes from videos, classifies them, and performs OCR
to extract text content from slides, code, and terminal screens.
Dependencies (Tier 2):
- opencv-python-headless: Frame extraction and image analysis
- scenedetect: Scene boundary detection
- easyocr: Text recognition in frames
"""
from __future__ import annotations
import concurrent.futures
import difflib
import gc
import logging
import os
import re
import tempfile
from dataclasses import dataclass, field
from skill_seekers.cli.video_models import (
CodeBlock,
CodeContext,
FrameSubSection,
FrameType,
KeyFrame,
OCRRegion,
TextGroup,
TextGroupEdit,
TextGroupTimeline,
)
# Module-level logger for this visual-extraction module.
logger = logging.getLogger(__name__)

# Set ROCm/MIOpen env vars BEFORE importing torch (via easyocr).
# Without MIOPEN_FIND_MODE=FAST, MIOpen tries to allocate huge workspace
# buffers (300MB+), gets 0 bytes, and silently falls back to CPU kernels.
if "MIOPEN_FIND_MODE" not in os.environ:
    os.environ["MIOPEN_FIND_MODE"] = "FAST"
if "MIOPEN_USER_DB_PATH" not in os.environ:
    # Persist MIOpen's tuning database under the user config dir so kernel
    # search results are reused across runs.
    _miopen_db = os.path.expanduser("~/.config/miopen")
    os.makedirs(_miopen_db, exist_ok=True)
    os.environ["MIOPEN_USER_DB_PATH"] = _miopen_db

# Tier 2 dependency flags — each optional dependency is probed once at import
# time; the HAS_* flags gate every later use, and the module alias is set to
# None so accidental unguarded use fails loudly.
try:
    import cv2

    HAS_OPENCV = True
except ImportError:
    cv2 = None  # type: ignore[assignment]
    HAS_OPENCV = False
try:
    import scenedetect as sd

    HAS_SCENEDETECT = True
except ImportError:
    sd = None  # type: ignore[assignment]
    HAS_SCENEDETECT = False
try:
    import easyocr

    HAS_EASYOCR = True
except ImportError:
    easyocr = None  # type: ignore[assignment]
    HAS_EASYOCR = False
try:
    import pytesseract

    HAS_PYTESSERACT = True
except ImportError:
    pytesseract = None  # type: ignore[assignment]
    HAS_PYTESSERACT = False

# Circuit breaker: after first tesseract failure, disable it for the session.
# Prevents wasting time spawning subprocesses that always fail.
_tesseract_broken = False

# Message raised when a caller needs Tier 2 features but deps are missing.
_INSTALL_MSG = (
    "Visual extraction requires additional dependencies.\n"
    "Recommended: skill-seekers video --setup (auto-detects GPU, installs correct PyTorch)\n"
    'Alternative: pip install "skill-seekers[video-full]" (may install wrong PyTorch variant)'
)

# Lazy-initialized EasyOCR reader (heavy, only load once)
_ocr_reader = None
def _detect_gpu() -> bool:
"""Check if a CUDA or ROCm GPU is available for EasyOCR/PyTorch."""
try:
import torch
return torch.cuda.is_available() or (
hasattr(torch.version, "hip") and torch.version.hip is not None
)
except ImportError:
return False
def _get_ocr_reader():
    """Return the process-wide EasyOCR reader, creating it on first use."""
    global _ocr_reader
    if _ocr_reader is not None:
        return _ocr_reader
    gpu_available = _detect_gpu()
    mode = "GPU" if gpu_available else "CPU"
    logger.info(
        f"Initializing OCR engine ({mode} mode, "
        "first run may download models)..."
    )
    # Heavy construction (model download on first run) happens exactly once.
    _ocr_reader = easyocr.Reader(["en"], gpu=gpu_available)
    return _ocr_reader
def _detect_theme(gray_img) -> str:
"""Detect 'dark' or 'light' theme from grayscale image.
Uses median brightness: < 128 = dark theme, >= 128 = light theme.
"""
import numpy as np
median = float(np.median(gray_img))
return "dark" if median < 128 else "light"
def _preprocess_frame_for_ocr(frame_path: str, frame_type: FrameType) -> str:
    """Apply frame-type-aware preprocessing before OCR.

    CODE_EDITOR/TERMINAL: COLOR inversion (preserves syntax highlighting) →
    grayscale → aggressive upscale → CLAHE contrast enhancement. Produces
    a high-res, high-contrast grayscale suitable for EasyOCR.

    SLIDE: mild sharpening.

    Others: no preprocessing.

    Args:
        frame_path: Path to the original frame image.
        frame_type: Classification of the frame.

    Returns:
        Path to the preprocessed image (may be a temp file or the original).
        When the returned path differs from ``frame_path`` it is a temp file
        the caller is responsible for deleting.
    """
    if not HAS_OPENCV:
        # Without OpenCV we cannot transform the image — OCR the original.
        return frame_path
    import numpy as np

    if frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
        img = cv2.imread(frame_path)
        if img is None:
            # Unreadable/corrupt image — fall back to the original path.
            return frame_path
        # 1. Theme detection on original grayscale
        gray_check = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        theme = _detect_theme(gray_check)
        # 2. COLOR inversion on BGR — preserves syntax highlighting distinctions.
        # Grayscale-then-invert loses the difference between blue/green/red text.
        if theme == "dark":
            img = cv2.bitwise_not(img)
        # 3. Convert inverted color to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # 4. Aggressive upscale BEFORE any processing — OCR needs ~12px+ char height.
        # Must be done on grayscale (not binary) for clean INTER_CUBIC interpolation.
        h, w = gray.shape
        if w < 1920:
            # Integer factor chosen so the upscaled width exceeds 1920 px.
            scale = max(2, (1920 // w) + 1)
            gray = cv2.resize(gray, (w * scale, h * scale), interpolation=cv2.INTER_CUBIC)
        # 5. CLAHE contrast enhancement — brings out faint text
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        gray = clahe.apply(gray)
        # delete=False: the path outlives this function; caller cleans up.
        with tempfile.NamedTemporaryFile(suffix=".png", prefix="ocr_pre_", delete=False) as tmp:
            tmp_path = tmp.name
        cv2.imwrite(tmp_path, gray)
        return tmp_path
    if frame_type == FrameType.SLIDE:
        img = cv2.imread(frame_path)
        if img is None:
            return frame_path
        # Mild sharpening kernel (center 3, cross -0.5) — light unsharp effect.
        kernel = np.array([[0, -0.5, 0], [-0.5, 3, -0.5], [0, -0.5, 0]])
        sharpened = cv2.filter2D(img, -1, kernel)
        with tempfile.NamedTemporaryFile(suffix=".png", prefix="ocr_pre_", delete=False) as tmp:
            tmp_path = tmp.name
        cv2.imwrite(tmp_path, sharpened)
        return tmp_path
    # WEBCAM/DIAGRAM/OTHER: OCR the original image unchanged.
    return frame_path
def _binarize_for_tesseract(grayscale_path: str) -> str:
    """Produce a clean binary image from a preprocessed grayscale, for Tesseract.

    Pipeline: Gaussian blur → Otsu's threshold → morphological close.
    Tesseract performs best on clean black-text-on-white binary images.

    Args:
        grayscale_path: Path to a preprocessed grayscale image.

    Returns:
        Path to the binary image (temp file), or ``grayscale_path`` unchanged
        when OpenCV is unavailable or the image cannot be read.
    """
    # Fix: the original used cv2 unconditionally; with OpenCV missing,
    # cv2 is None and this crashed with AttributeError. Guard like every
    # other cv2 user in this module.
    if not HAS_OPENCV:
        return grayscale_path
    import numpy as np

    gray = cv2.imread(grayscale_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        return grayscale_path
    # Gaussian blur to smooth noise before thresholding
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)
    # Otsu's binarization — globally optimal for bimodal (text vs background)
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Morphological close to fill small gaps in character strokes
    kernel = np.ones((2, 2), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
    # delete=False: path must outlive this function; caller unlinks it.
    with tempfile.NamedTemporaryFile(suffix=".png", prefix="ocr_bin_", delete=False) as tmp:
        tmp_path = tmp.name
    cv2.imwrite(tmp_path, binary)
    return tmp_path
def _get_ocr_params(frame_type: FrameType) -> dict:
    """Return EasyOCR readtext kwargs tuned per frame type.

    CODE_EDITOR/TERMINAL frames get lower detection thresholds plus beam
    search; every other frame type (SLIDE included) shares one set of
    defaults with the greedy decoder. mag_ratio stays 1.0 everywhere
    because preprocessing already upscaled the frame.
    """
    if frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
        return {
            "text_threshold": 0.4,
            "low_text": 0.3,
            "contrast_ths": 0.3,
            "mag_ratio": 1.0,  # Frame already upscaled in preprocessing
            "decoder": "beamsearch",
            "beamWidth": 10,
        }
    # SLIDE and all remaining frame types used identical dicts — merged.
    return {
        "text_threshold": 0.6,
        "low_text": 0.4,
        "mag_ratio": 1.0,
        "decoder": "greedy",
        "beamWidth": 5,
    }
_CODE_TOKENS = frozenset(
{
"func",
"var",
"def",
"class",
"return",
"if",
"for",
"while",
"import",
"from",
"const",
"let",
"function",
"extends",
"self",
"true",
"false",
"null",
"none",
"elif",
"else",
"try",
"except",
"async",
"await",
"yield",
"print",
"int",
"str",
"float",
"bool",
"=",
"(",
")",
"{",
"}",
"[",
"]",
":",
"->",
"=>",
"==",
"!=",
}
)
def _has_code_tokens(text: str) -> bool:
"""Check if text contains recognizable code tokens."""
lower = text.lower()
return any(token in lower for token in _CODE_TOKENS)
def _run_tesseract_ocr(preprocessed_path: str, frame_type: FrameType) -> list[tuple]:  # noqa: ARG001
    """Run pytesseract on a preprocessed frame.

    Creates a binarized version of the preprocessed grayscale (Tesseract
    performs best on clean binary images), then runs Tesseract with
    ``--psm 4`` (single column of variable-size text) and LSTM engine.

    Returns results in the same format as EasyOCR: list of (bbox, text, confidence).
    Groups words into lines by y-coordinate.

    Uses a circuit breaker: if tesseract fails once, it's disabled for the
    rest of the session to avoid wasting time on repeated subprocess failures.

    Args:
        preprocessed_path: Path to the preprocessed grayscale image.
        frame_type: Frame classification (reserved for future per-type tuning).
    """
    global _tesseract_broken
    if not HAS_PYTESSERACT or _tesseract_broken:
        return []
    # Produce clean binary for Tesseract
    binary_path = _binarize_for_tesseract(preprocessed_path)
    try:
        data = pytesseract.image_to_data(
            binary_path,
            config="--psm 4 --oem 1",  # --oem 1: LSTM-only engine
            output_type=pytesseract.Output.DICT,
        )
    except Exception:  # noqa: BLE001
        # Any failure (missing binary, bad image, subprocess error) trips
        # the session-wide circuit breaker.
        _tesseract_broken = True
        logger.warning(
            "pytesseract failed — disabling for this session. "
            "Install tesseract binary: skill-seekers video --setup"
        )
        return []
    finally:
        # Binarization may have written a temp file; never delete the input.
        if binary_path != preprocessed_path and os.path.exists(binary_path):
            os.unlink(binary_path)
    # Collect words with valid confidence. Tesseract reports conf -1 for
    # non-word boxes; anything under 30 (percent) is treated as noise.
    words = []
    for i in range(len(data["text"])):
        text = data["text"][i].strip()
        conf = float(data["conf"][i])
        if not text or conf < 30:
            continue
        x = data["left"][i]
        y = data["top"][i]
        w = data["width"][i]
        h = data["height"][i]
        # Four-corner bbox in EasyOCR corner order: TL, TR, BR, BL.
        bbox = [[x, y], [x + w, y], [x + w, y + h], [x, y + h]]
        words.append(
            {
                "bbox": bbox,
                "text": text,
                "conf": conf / 100.0,  # normalize 0-100 → 0.0-1.0 like EasyOCR
                "y_center": y + h / 2,
                "line_num": data["line_num"][i],
                "block_num": data["block_num"][i],
            }
        )
    if not words:
        return []
    # Group by (block_num, line_num) to form lines
    line_groups: dict[tuple[int, int], list[dict]] = {}
    for w in words:
        key = (w["block_num"], w["line_num"])
        line_groups.setdefault(key, []).append(w)
    results = []
    for _key, line_words in sorted(line_groups.items()):
        # Left-to-right reading order within each line.
        line_words.sort(key=lambda w: w["bbox"][0][0])
        line_text = " ".join(w["text"] for w in line_words)
        avg_conf = sum(w["conf"] for w in line_words) / len(line_words)
        # Build bounding box for the whole line
        x_min = min(w["bbox"][0][0] for w in line_words)
        y_min = min(w["bbox"][0][1] for w in line_words)
        x_max = max(w["bbox"][1][0] for w in line_words)
        y_max = max(w["bbox"][2][1] for w in line_words)
        bbox = [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]
        results.append((bbox, line_text, avg_conf))
    return results
def _run_multi_engine_ocr(
    frame_path: str,
    frame_type: FrameType,
) -> tuple[list[tuple], str]:
    """Preprocess a frame, then run the multi-engine OCR ensemble on it.

    Strategy:
    1. Preprocess the frame (inversion + binarization for code frames).
    2. Run EasyOCR on the preprocessed image.
    3. Run pytesseract on the preprocessed image.
    4. For each y-bucket line, pick the engine result with higher confidence.
    5. Prefer results that contain recognizable code tokens.

    Returns:
        Tuple of (raw_results, flat_text).
    """
    prepared = _preprocess_frame_for_ocr(frame_path, frame_type)
    try:
        return _ensemble_ocr_results(prepared, frame_type)
    finally:
        # Preprocessing may have produced a temp file; never delete the source.
        if prepared != frame_path and os.path.exists(prepared):
            os.unlink(prepared)
def _ensemble_ocr_results(
    preprocessed_path: str,
    frame_type: FrameType,
) -> tuple[list[tuple], str]:
    """Run EasyOCR + pytesseract and merge results by y-bucket."""
    easy_results: list[tuple] = []
    if HAS_EASYOCR:
        try:
            params = _get_ocr_params(frame_type)
            raw = _get_ocr_reader().readtext(
                preprocessed_path, detail=1, paragraph=False, **params
            )
            # Keep confident, non-empty detections only.
            easy_results = [
                (bbox, text.strip(), conf)
                for bbox, text, conf in raw
                if conf >= 0.3 and text.strip()
            ]
        except Exception:  # noqa: BLE001
            logger.debug("EasyOCR failed in multi-engine pipeline")
    tess_results = _run_tesseract_ocr(preprocessed_path, frame_type)
    # Degenerate cases: zero or one engine produced output.
    if not easy_results and not tess_results:
        return [], ""
    if not easy_results:
        return tess_results, " ".join(t for _, t, _ in tess_results)
    if not tess_results:
        return easy_results, " ".join(t for _, t, _ in easy_results)
    # Both engines produced output — merge line-by-line and flatten.
    merged = _merge_by_y_bucket(easy_results, tess_results)
    return merged, " ".join(t for _, t, _ in merged)
def _merge_by_y_bucket(
    easy_results: list[tuple],
    tess_results: list[tuple],
    y_tolerance: float = 20.0,
) -> list[tuple]:
    """Merge two sets of OCR results by matching y-coordinate lines.

    Each EasyOCR line is paired with the nearest not-yet-used tesseract
    line within *y_tolerance* pixels and the better of the pair wins
    (code tokens preferred, then confidence). Unpaired tesseract lines
    are appended, and the merged list is returned sorted top-to-bottom.
    """

    def _mid_y(bbox) -> float:
        ys = [pt[1] for pt in bbox]
        return (min(ys) + max(ys)) / 2

    easy_sorted = sorted(((r, _mid_y(r[0])) for r in easy_results), key=lambda p: p[1])
    tess_sorted = sorted(((r, _mid_y(r[0])) for r in tess_results), key=lambda p: p[1])
    consumed: set[int] = set()
    merged: list[tuple] = []
    for easy_r, ey in easy_sorted:
        # Nearest unused tesseract line within tolerance.
        closest_idx = None
        closest_dist = float("inf")
        for idx, (_tess_r, ty) in enumerate(tess_sorted):
            if idx in consumed:
                continue
            gap = abs(ey - ty)
            if gap <= y_tolerance and gap < closest_dist:
                closest_idx = idx
                closest_dist = gap
        if closest_idx is None:
            merged.append(easy_r)
        else:
            consumed.add(closest_idx)
            merged.append(_pick_better_ocr_result(easy_r, tess_sorted[closest_idx][0]))
    # Tesseract lines EasyOCR missed entirely.
    merged.extend(r for idx, (r, _) in enumerate(tess_sorted) if idx not in consumed)
    # Final list in top-to-bottom reading order.
    merged.sort(key=lambda r: _mid_y(r[0]))
    return merged
def _pick_better_ocr_result(result_a: tuple, result_b: tuple) -> tuple:
    """Choose between two OCR results covering the same text line.

    A result containing code tokens beats one that doesn't; otherwise the
    higher-confidence result wins, with ties going to *result_a*.
    """
    _, text_a, conf_a = result_a
    _, text_b, conf_b = result_b
    code_a = _has_code_tokens(text_a)
    code_b = _has_code_tokens(text_b)
    if code_a != code_b:
        # Exactly one side looks like code — prefer it.
        return result_a if code_a else result_b
    # Both or neither look like code: confidence decides (tie → result_a).
    return result_b if conf_b > conf_a else result_a
def _ocr_with_claude_vision(frame_path: str, frame_type: FrameType) -> tuple[str, float]:
    """Use Claude Vision API to extract code from a frame.

    Sends the frame image to Claude Haiku and asks it to extract all
    visible code/text exactly as shown.

    Args:
        frame_path: Path to the frame image (png/jpg/jpeg/gif/webp).
        frame_type: Only used to phrase the prompt (IDE vs terminal).

    Returns:
        (extracted_text, confidence). Confidence is 0.95 when successful.
        Returns ("", 0.0) if API key is not set or the call fails.
    """
    import base64

    # No key → nothing to do; caller falls back to conventional OCR.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        return "", 0.0
    try:
        import anthropic

        # Read image as base64
        with open(frame_path, "rb") as f:
            image_data = base64.standard_b64encode(f.read()).decode("utf-8")
        # Determine media type from the extension; default to PNG.
        ext = os.path.splitext(frame_path)[1].lower()
        media_type_map = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }
        media_type = media_type_map.get(ext, "image/png")
        context = "IDE screenshot" if frame_type == FrameType.CODE_EDITOR else "terminal screenshot"
        prompt = (
            f"Extract all visible code/text from this {context} exactly as shown. "
            "Preserve indentation, line breaks, and all characters. "
            "Return only the raw code text, no explanations."
        )
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": image_data,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                    ],
                }
            ],
        )
        # Empty content list and empty text both count as failure.
        text = response.content[0].text.strip() if response.content else ""
        if text:
            return text, 0.95
        return "", 0.0
    except Exception:  # noqa: BLE001
        # Best-effort enhancement: any import/IO/API failure is non-fatal.
        logger.debug("Claude Vision API call failed, falling back to OCR results")
        return "", 0.0
def check_visual_dependencies() -> dict[str, bool]:
    """Report availability of each Tier 2 visual-extraction dependency.

    Returns:
        Dict mapping dependency name ('opencv', 'scenedetect', 'easyocr')
        to whether its import succeeded at module load time.
    """
    availability: dict[str, bool] = {}
    availability["opencv"] = HAS_OPENCV
    availability["scenedetect"] = HAS_SCENEDETECT
    availability["easyocr"] = HAS_EASYOCR
    return availability
def detect_scenes(video_path: str) -> list[tuple[float, float]]:
    """Detect scene boundaries in a video using scenedetect.

    Args:
        video_path: Path to video file.

    Returns:
        List of (start_time, end_time) tuples for each scene in seconds.

    Raises:
        RuntimeError: If required dependencies are not installed.
    """
    if not (HAS_OPENCV and HAS_SCENEDETECT):
        raise RuntimeError(_INSTALL_MSG)
    logger.info(f"Detecting scenes in {video_path}...")
    video = sd.open_video(video_path)
    manager = sd.SceneManager()
    manager.add_detector(sd.ContentDetector(threshold=27.0))
    manager.detect_scenes(video)
    # Convert FrameTimecode pairs into plain float seconds.
    scenes = [
        (start.get_seconds(), end.get_seconds())
        for start, end in manager.get_scene_list()
    ]
    logger.info(f"Detected {len(scenes)} scenes")
    return scenes
def extract_keyframes(video_path: str, timestamps: list[float]) -> list[KeyFrame]:
    """Extract keyframes at specified timestamps using OpenCV.

    Args:
        video_path: Path to video file.
        timestamps: List of timestamps (in seconds) to extract frames at.

    Returns:
        List of KeyFrame objects with saved frame paths (temp .jpg files —
        callers own their cleanup). Timestamps whose frame cannot be read
        are skipped with a warning.

    Raises:
        RuntimeError: If required dependencies are not installed.
    """
    if not HAS_OPENCV:
        raise RuntimeError(_INSTALL_MSG)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f"Cannot open video: {video_path}")
        return []
    # Containers sometimes report 0/NaN fps; fall back to 30.
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    keyframes = []
    # Sorted order keeps seeks moving forward through the file.
    for ts in sorted(timestamps):
        frame_num = int(ts * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            logger.warning(f"Could not read frame at {ts:.1f}s")
            continue
        # Save frame to temp file (delete=False: path outlives this call).
        with tempfile.NamedTemporaryFile(
            suffix=".jpg", prefix=f"frame_{ts:.0f}s_", delete=False
        ) as tmp:
            tmp_path = tmp.name
        cv2.imwrite(tmp_path, frame)
        frame_type = classify_frame(tmp_path)
        kf = KeyFrame(
            timestamp=ts,
            image_path=tmp_path,
            frame_type=frame_type,
        )
        keyframes.append(kf)
    cap.release()
    logger.info(f"Extracted {len(keyframes)} keyframes")
    return keyframes
# Minimum panel dimensions for region-based classification.
# IDE panels smaller than these are toolbar/tab/scrollbar noise.
_MIN_PANEL_WIDTH = 200  # pixels
_MIN_PANEL_HEIGHT = 150  # pixels
_MIN_PANEL_AREA_PCT = 5.0  # percent of total frame area
def _classify_region(gray, edges, hsv) -> FrameType:
    """Classify a single rectangular region from pre-computed arrays.

    Heuristics over mean brightness, Canny edge density, and mean HSV
    saturation, with a Hough-line pass to count horizontal text rows
    in dark regions.
    """
    import numpy as np

    h, w = gray.shape
    brightness = float(gray.mean())
    edge_density = float(edges.mean()) / 255.0
    saturation = float(hsv[:, :, 1].mean())
    # Count near-horizontal Hough segments — code editors show many text rows.
    horiz_count = 0
    if brightness < 80 and edge_density > 0.008:
        segments = cv2.HoughLinesP(
            edges, 1, np.pi / 180, threshold=80, minLineLength=w // 8, maxLineGap=10
        )
        for seg in (segments if segments is not None else []):
            x1, y1, x2, y2 = seg[0]
            angle = abs(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
            if angle < 5 or angle > 175:
                horiz_count += 1
    # Dark + busy → code or terminal; low saturation means monochrome text.
    if brightness < 80 and (
        edge_density > 0.05 or (edge_density > 0.01 and horiz_count >= 3)
    ):
        return FrameType.TERMINAL if saturation < 30 else FrameType.CODE_EDITOR
    # Bright + busy → slide with text.
    if brightness > 180 and edge_density > 0.03:
        return FrameType.SLIDE
    # Bright + sparse → diagram.
    if brightness > 160 and edge_density < 0.02:
        return FrameType.DIAGRAM
    # Colorful mid-brightness regions are usually webcam footage.
    if saturation > 60 and brightness > 80:
        return FrameType.WEBCAM
    return FrameType.OTHER
def _detect_panel_dividers(gray) -> tuple[list[int], list[int]]:
"""Detect IDE panel divider positions using brightness gradients.
Panel dividers are thin lines where many rows (or columns) have a
sharp brightness change. Returns lists of x and y positions.
"""
import numpy as np
h, w = gray.shape
# Vertical dividers: column-wise horizontal gradient
dx = np.abs(np.diff(gray.astype(np.float32), axis=1))
v_sig = (dx > 25).sum(axis=0)
v_cols = np.where(v_sig > h * 0.3)[0]
v_dividers: list[int] = []
if len(v_cols) > 0:
group = [v_cols[0]]
for x in v_cols[1:]:
if x - group[-1] <= 15:
group.append(x)
else:
v_dividers.append(int(np.mean(group)))
group = [x]
v_dividers.append(int(np.mean(group)))
v_dividers = [d for d in v_dividers if w * 0.03 < d < w * 0.97]
# Horizontal dividers: row-wise vertical gradient
dy = np.abs(np.diff(gray.astype(np.float32), axis=0))
h_sig = (dy > 25).sum(axis=1)
h_rows = np.where(h_sig > w * 0.3)[0]
h_dividers: list[int] = []
if len(h_rows) > 0:
group = [h_rows[0]]
for y in h_rows[1:]:
if y - group[-1] <= 15:
group.append(y)
else:
h_dividers.append(int(np.mean(group)))
group = [y]
h_dividers.append(int(np.mean(group)))
h_dividers = [d for d in h_dividers if h * 0.03 < d < h * 0.97]
return v_dividers, h_dividers
def classify_frame_regions(
    frame_path: str,
) -> list[tuple[int, int, int, int, FrameType]]:
    """Classify a frame by detecting IDE panels as rectangles.

    Finds panel divider lines (vertical and horizontal brightness edges),
    builds a grid of rectangular panels, filters by minimum size, and
    classifies each panel independently. This handles split-screen IDE
    layouts where half the screen shows code and the other half shows a
    game viewport or inspector.

    Args:
        frame_path: Path to frame image file.

    Returns:
        List of ``(x1, y1, x2, y2, FrameType)`` for each detected panel
        that meets the minimum size threshold.

    Raises:
        RuntimeError: If required dependencies are not installed.
    """
    if not HAS_OPENCV:
        raise RuntimeError(_INSTALL_MSG)
    img = cv2.imread(frame_path)
    if img is None:
        # Unreadable image: single degenerate OTHER region.
        return [(0, 0, 0, 0, FrameType.OTHER)]
    h, w = img.shape[:2]
    gray_full = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges_full = cv2.Canny(gray_full, 50, 150)
    hsv_full = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    v_dividers, h_dividers = _detect_panel_dividers(gray_full)
    # Grid coordinates: frame edges plus every detected divider.
    col_edges = [0, *v_dividers, w]
    row_edges = [0, *h_dividers, h]
    frame_area = w * h
    panels: list[tuple[int, int, int, int, FrameType]] = []
    for y1, y2 in zip(row_edges, row_edges[1:]):
        for x1, x2 in zip(col_edges, col_edges[1:]):
            pw, ph = x2 - x1, y2 - y1
            # Skip slivers: toolbars, tab strips, scrollbars.
            if pw < _MIN_PANEL_WIDTH or ph < _MIN_PANEL_HEIGHT:
                continue
            if (pw * ph) / frame_area * 100 < _MIN_PANEL_AREA_PCT:
                continue
            panel_type = _classify_region(
                gray_full[y1:y2, x1:x2],
                edges_full[y1:y2, x1:x2],
                hsv_full[y1:y2, x1:x2],
            )
            panels.append((x1, y1, x2, y2, panel_type))
    if not panels:
        # Fallback: nothing survived the size filter — classify whole frame.
        panels.append((0, 0, w, h, _classify_region(gray_full, edges_full, hsv_full)))
    return panels
def _find_code_bbox(
    regions: list[tuple[int, int, int, int, FrameType]],
) -> tuple[int, int, int, int] | None:
    """Merge all code/terminal panels into one bounding box.

    Returns ``(x1, y1, x2, y2)`` covering all code regions, or None when
    no region was classified as CODE_EDITOR or TERMINAL.
    """
    code_panels = [
        (x1, y1, x2, y2)
        for x1, y1, x2, y2, ft in regions
        if ft in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
    ]
    if not code_panels:
        return None
    xs1, ys1, xs2, ys2 = zip(*code_panels)
    return (min(xs1), min(ys1), max(xs2), max(ys2))
# Panels narrower than this produce mostly OCR noise (inspector sidebars,
# narrow file-tree strips, thin toolbars). 300 px is roughly the width
# needed for a single readable code line at typical IDE font sizes.
_MIN_PANEL_OCR_WIDTH = 300


def _get_code_panels(
    regions: list[tuple[int, int, int, int, FrameType]],
    min_width: int = _MIN_PANEL_OCR_WIDTH,
) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes for individual code/terminal panels.

    Unlike ``_find_code_bbox`` which merges all code regions into one,
    this returns each code panel separately so they can be OCR'd
    independently. Panels narrower than *min_width* pixels are
    discarded — they typically contain inspector sidebars or toolbars
    that produce garbage OCR.
    """
    panels: list[tuple[int, int, int, int]] = []
    for x1, y1, x2, y2, ft in regions:
        if ft not in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
            continue
        if x2 - x1 < min_width:
            continue
        panels.append((x1, y1, x2, y2))
    return panels
def _crop_code_region(frame_path: str, bbox: tuple[int, int, int, int], suffix: str = "") -> str:
    """Crop the code region from a frame and save it next to the original.

    Args:
        frame_path: Path to the source frame image.
        bbox: ``(x1, y1, x2, y2)`` crop rectangle.
        suffix: Optional suffix to disambiguate when cropping multiple
            panels from the same frame (e.g. ``"_p0"``, ``"_p1"``).

    Returns:
        Path to the cropped image, or ``frame_path`` unchanged when the
        source image cannot be read.
    """
    img = cv2.imread(frame_path)
    if img is None:
        # Fix: the original sliced a None imread result (TypeError on an
        # unreadable/corrupt frame). Fall back to OCR-ing the full image,
        # matching the None-handling used elsewhere in this module.
        return frame_path
    x1, y1, x2, y2 = bbox
    cropped = img[y1:y2, x1:x2]
    base, ext = os.path.splitext(frame_path)
    cropped_path = f"{base}_code_crop{suffix}{ext}"
    cv2.imwrite(cropped_path, cropped)
    return cropped_path
def _frame_type_from_regions(
    regions: list[tuple[int, int, int, int, FrameType]],
) -> FrameType:
    """Derive the dominant frame type from pre-computed regions.

    Same logic as ``classify_frame`` but avoids re-loading the image:
    any TERMINAL panel wins, then any CODE_EDITOR panel, otherwise the
    most common panel type (OTHER when there are no regions).
    """
    types = [ft for *_coords, ft in regions]
    if FrameType.TERMINAL in types:
        return FrameType.TERMINAL
    if FrameType.CODE_EDITOR in types:
        return FrameType.CODE_EDITOR
    from collections import Counter

    counts = Counter(types)
    return counts.most_common(1)[0][0] if counts else FrameType.OTHER
def classify_frame(frame_path: str) -> FrameType:
    """Classify a video frame by its visual content.

    Uses region-based panel detection: finds IDE panel boundaries,
    classifies each rectangular panel, returns CODE_EDITOR/TERMINAL
    if *any* panel contains code. This handles split-screen layouts.

    Args:
        frame_path: Path to frame image file.

    Returns:
        FrameType classification (CODE_EDITOR if any panel has code).
    """
    # The terminal-first / code-second / most-common-fallback logic was
    # duplicated verbatim from _frame_type_from_regions; delegate instead
    # so the two cannot drift apart.
    return _frame_type_from_regions(classify_frame_regions(frame_path))
def extract_text_from_frame(
    frame_path: str,
    frame_type: FrameType = FrameType.OTHER,
) -> tuple[list[tuple], str]:
    """Extract text from a video frame using EasyOCR.

    Applies frame-type-aware preprocessing and OCR parameters for
    better accuracy on code, terminal, and slide frames.

    Args:
        frame_path: Path to frame image file.
        frame_type: Classification of the frame content.

    Returns:
        Tuple of (raw_easyocr_results, flat_text_string).
        Each raw result is (bbox, text, confidence).

    Raises:
        RuntimeError: If required dependencies are not installed.
    """
    if not HAS_EASYOCR:
        raise RuntimeError(_INSTALL_MSG)
    prepared = _preprocess_frame_for_ocr(frame_path, frame_type)
    try:
        raw = _get_ocr_reader().readtext(
            prepared, detail=1, paragraph=False, **_get_ocr_params(frame_type)
        )
    finally:
        # Remove the preprocessing temp file — never the caller's frame.
        if prepared != frame_path and os.path.exists(prepared):
            os.unlink(prepared)
    # Keep only confident, non-empty detections.
    kept = [
        (bbox, text.strip(), conf)
        for bbox, text, conf in raw
        if conf >= 0.3 and text.strip()
    ]
    return kept, " ".join(text for _, text, _ in kept)
def _cluster_ocr_into_lines(
    raw_results: list[tuple],
    frame_type: FrameType = FrameType.OTHER,
) -> list[OCRRegion]:
    """Cluster EasyOCR results into line-based OCRRegions.

    Groups text fragments that share similar y-coordinates into
    lines, sorts within each line by x-coordinate, and builds
    one OCRRegion per line. Gaps wider than ~2 average character
    widths between fragments become tab characters.

    Args:
        raw_results: List of (bbox, text, confidence) from EasyOCR.
        frame_type: Frame classification for monospace detection.

    Returns:
        List of OCRRegion objects, one per detected text line.
    """
    if not raw_results:
        return []
    # Compute y_center for each result and estimate line height.
    # bbox is four (x, y) corner points; derive the axis-aligned extent.
    items = []
    for bbox, text, conf in raw_results:
        y_top = min(pt[1] for pt in bbox)
        y_bottom = max(pt[1] for pt in bbox)
        x_left = min(pt[0] for pt in bbox)
        x_right = max(pt[0] for pt in bbox)
        y_center = (y_top + y_bottom) / 2
        line_height = y_bottom - y_top
        items.append(
            {
                "text": text,
                "conf": conf,
                "y_center": y_center,
                "y_top": y_top,
                "y_bottom": y_bottom,
                "x_left": x_left,
                "x_right": x_right,
                # Clamp to >= 1 so the average-height divisor is never zero.
                "line_height": max(line_height, 1),
            }
        )
    # Sort by y_center so clustering can scan top-to-bottom.
    items.sort(key=lambda it: it["y_center"])
    # Cluster into lines: a fragment joins the current line when its center
    # is within half the line's running average height of the last fragment.
    lines: list[list[dict]] = [[items[0]]]
    for item in items[1:]:
        current_line = lines[-1]
        avg_height = sum(it["line_height"] for it in current_line) / len(current_line)
        if abs(item["y_center"] - current_line[-1]["y_center"]) <= avg_height * 0.5:
            current_line.append(item)
        else:
            lines.append([item])
    # Estimate average character width for tab detection
    total_chars = sum(len(it["text"]) for it in items)
    total_width = sum(it["x_right"] - it["x_left"] for it in items)
    avg_char_width = total_width / max(total_chars, 1)
    # Code/terminal frames are flagged monospace for downstream formatting.
    is_mono = frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
    regions = []
    for line in lines:
        # Sort fragments within line by x-coordinate
        line.sort(key=lambda it: it["x_left"])
        # Join fragments with appropriate spacing: a horizontal gap wider
        # than two average characters becomes a tab, otherwise a space.
        parts = []
        for i, frag in enumerate(line):
            if i > 0:
                gap = frag["x_left"] - line[i - 1]["x_right"]
                if gap > avg_char_width * 2:
                    parts.append("\t")
                else:
                    parts.append(" ")
            parts.append(frag["text"])
        text = "".join(parts)
        avg_conf = sum(f["conf"] for f in line) / len(line)
        # Line bbox: integer union of all fragment extents.
        bbox = (
            int(min(f["x_left"] for f in line)),
            int(min(f["y_top"] for f in line)),
            int(max(f["x_right"] for f in line)),
            int(max(f["y_bottom"] for f in line)),
        )
        regions.append(
            OCRRegion(
                text=text,
                confidence=avg_conf,
                bbox=bbox,
                is_monospace=is_mono,
            )
        )
    return regions
# ── OCR line cleaning ────────────────────────────────────────────────
def _fuzzy_word_match(a: str, b: str) -> bool:
"""Check if two words are likely the same despite OCR noise.
Allows single-char prefix/suffix noise (e.g. 'gpublic' vs 'public')
and common OCR confusions (l/1, O/0, rn/m).
"""
if a == b:
return True
# Strip single-char OCR prefix noise (e.g. 'Jpublic' → 'public')
a_stripped = a.lstrip("gGjJlLiI|") if len(a) > 2 else a
b_stripped = b.lstrip("gGjJlLiI|") if len(b) > 2 else b
if a_stripped == b_stripped:
return True
# Allow edit distance ≤ 1 for short words
if abs(len(a) - len(b)) <= 1 and len(a) >= 3:
diffs = sum(1 for x, y in zip(a, b, strict=False) if x != y)
diffs += abs(len(a) - len(b))
return diffs <= 1
return False
def _fix_intra_line_duplication(line: str) -> str:
"""Fix lines where OCR duplicated content.
Detects when the same token sequence appears twice adjacent,
e.g. 'public class Card public class Card : MonoBehaviour'
'public class Card : MonoBehaviour'.
"""
words = line.split()
if len(words) < 4:
return line
half = len(words) // 2
for split_point in range(max(2, half - 2), min(len(words) - 1, half + 3)):
prefix = words[:split_point]
suffix = words[split_point:]
# Check if suffix starts with same sequence as prefix
match_len = 0
for i, w in enumerate(prefix):
if i < len(suffix) and _fuzzy_word_match(w, suffix[i]):
match_len += 1
else:
break
if match_len >= len(prefix) * 0.7 and match_len >= 2:
# Keep the longer/cleaner half (suffix usually has trailing content)
return (
" ".join(suffix)
if len(" ".join(suffix)) >= len(" ".join(prefix))
else " ".join(prefix)
)
return line
# Compiled patterns for _clean_ocr_line
_RE_LEADING_LINE_NUMBER = re.compile(r"^\s*\d{1,4}(?:\s+|\t)")
_RE_COLLAPSE_MARKERS = re.compile(r"[▶▼►◄…⋯⋮]")
_RE_IDE_TAB_BAR = re.compile(
r"^\s*(?:File|Edit|Assets|Window|Help|View|Tools|Debug|Run|Terminal)\s+",
re.IGNORECASE,
)
_RE_UNITY_INSPECTOR = re.compile(
r"^\s*(?:Inspector|Hierarchy|Project|Console|Scene|Game)\b.*$",
re.IGNORECASE,
)
def _clean_ocr_line(line: str) -> str:
    """Strip IDE decorations and OCR artifacts from a single line.

    Whole lines that look like Unity panels or IDE menu bars become "";
    otherwise line numbers, fold markers, and intra-line duplication
    are removed and the result is whitespace-trimmed.
    """
    if not line:
        return line
    # Full-line UI chrome is dropped outright.
    if _RE_UNITY_INSPECTOR.match(line) or _RE_IDE_TAB_BAR.match(line):
        return ""
    # Leading gutter line numbers, then collapse markers / decorations.
    stripped = _RE_LEADING_LINE_NUMBER.sub("", line)
    stripped = _RE_COLLAPSE_MARKERS.sub("", stripped)
    # Finally undo multi-engine OCR duplication within the line.
    return _fix_intra_line_duplication(stripped).strip()
def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -> str:
    """Join OCR line regions into structured text.

    CODE_EDITOR/TERMINAL: newline-separated with indentation from x-offset.
    SLIDE: double-newline paragraph spacing.
    Others: space-separated flat text.

    Args:
        regions: List of OCRRegion objects (one per line).
        frame_type: Frame classification.

    Returns:
        Formatted text string.
    """
    if not regions:
        return ""
    if frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
        # NOTE: the previous version re-checked `if not regions` here —
        # dead code, since the guard above already returned. Removed.
        # Estimate indentation from x-offset relative to leftmost region.
        min_x = min(r.bbox[0] for r in regions)
        raw_lines = []
        for r in regions:
            indent_px = r.bbox[0] - min_x
            # Estimate character width from the region so the pixel
            # offset can be converted into character columns.
            region_width = r.bbox[2] - r.bbox[0]
            char_count = len(r.text.replace("\t", " "))
            char_width = region_width / max(char_count, 1)
            indent_chars = int(indent_px / max(char_width, 1))
            # Round to nearest 4-space indent
            indent_level = round(indent_chars / 4)
            raw_lines.append(" " * indent_level + r.text)
        # Clean IDE decorations and OCR artifacts from each line
        cleaned = [_clean_ocr_line(line) for line in raw_lines]
        return "\n".join(c for c in cleaned if c)
    # Non-code frames: clean each region, then join — paragraphs for
    # slides, flat space-joined text for everything else.
    cleaned = [_clean_ocr_line(r.text) for r in regions]
    joiner = "\n\n" if frame_type == FrameType.SLIDE else " "
    return joiner.join(c for c in cleaned if c)
def _compute_frame_timestamps(
    video_path: str,
    duration: float,
    sample_interval: float = 0.7,
    min_gap: float = 0.5,
    start_offset: float = 0.0,
    end_limit: float | None = None,
) -> list[float]:
    """Build a deduplicated list of timestamps to extract frames at.

    Combines scene-change detection (catches visual transitions) with
    regular interval sampling (catches gradual changes). Nearby
    timestamps closer than *min_gap* seconds are merged.

    Args:
        video_path: Path to the video file.
        duration: Total video duration in seconds.
        sample_interval: Seconds between interval samples.
        min_gap: Minimum gap between kept timestamps.
        start_offset: Start sampling at this time (seconds).
        end_limit: Stop sampling at this time (seconds). None = full duration.

    Returns:
        Sorted, deduplicated list of timestamps (seconds).
    """
    cutoff = duration if end_limit is None else end_limit
    candidates: set[float] = set()
    # 1. Scene detection — catches cuts, slide transitions, editor switches.
    if HAS_SCENEDETECT:
        try:
            for scene_start, _scene_end in detect_scenes(video_path):
                # Sample 0.5s after the scene starts to avoid transition blur.
                ts = round(scene_start + 0.5, 1)
                if start_offset <= ts < cutoff:
                    candidates.add(ts)
        except Exception as exc:  # noqa: BLE001
            logger.warning(f"Scene detection failed, falling back to interval: {exc}")
    # 2. Regular interval sampling — fills gaps between scene cuts.
    t = max(0.5, start_offset)
    while t < cutoff:
        candidates.add(round(t, 1))
        t += sample_interval
    # Always include a sample near the end of the range.
    if cutoff > 2.0:
        candidates.add(round(cutoff - 1.0, 1))
    # 3. Sort, then merge neighbours closer than min_gap.
    ordered = sorted(candidates)
    if not ordered:
        return []
    kept = [ordered[0]]
    for ts in ordered[1:]:
        if ts - kept[-1] >= min_gap:
            kept.append(ts)
    return kept
def _frames_are_similar(frame_a, frame_b, threshold: float = 3.0) -> bool:
    """Check if two OpenCV frames are visually similar.

    Uses mean absolute pixel difference on downscaled grayscale.
    This catches text changes on dark backgrounds that histogram
    correlation would miss.

    Args:
        frame_a: First BGR frame (numpy array).
        frame_b: Second BGR frame (numpy array).
        threshold: Mean pixel difference below this = "duplicate".
            Typical values: 1-2 for identical, 3-5 for minor text
            changes, 10+ for scene changes.

    Returns:
        True if the frames are similar enough to skip one.
    """
    import numpy as np

    downscale = (320, 180)

    def _as_small_gray(img):
        # Grayscale + downscale keeps the comparison cheap.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return cv2.resize(gray, downscale).astype(np.float32)

    # Mean absolute pixel difference on a 0-255 scale.
    mean_delta = np.abs(_as_small_gray(frame_a) - _as_small_gray(frame_b)).mean()
    return mean_delta < threshold
def _text_similarity(text_a: str, text_b: str) -> float:
"""Compute text similarity ratio using SequenceMatcher.
Args:
text_a: First text string.
text_b: Second text string.
Returns:
Similarity ratio between 0.0 and 1.0.
"""
if not text_a or not text_b:
return 0.0
return difflib.SequenceMatcher(None, text_a, text_b).ratio()
@dataclass
class YBucketLine:
    """A line tracked by y-coordinate across multiple frames."""

    # Vertical midpoint (pixels) of the OCR bounding boxes in this bucket.
    y_center: float
    # Max |Δy| for a new observation to join this bucket.
    y_tolerance: float = 15.0
    # Per-frame observation dicts (text, confidence, frame_index, timestamp,
    # x_left, x_right) — see YBucketConsensusEngine.add_frame().
    observations: list[dict] = field(default_factory=list)
    # Filled in by YBucketConsensusEngine.build_consensus().
    consensus_text: str = ""
    consensus_confidence: float = 0.0
class YBucketConsensusEngine:
    """Build consensus text from OCR observations across multiple frames.
    Groups OCR regions by y-coordinate into buckets, then for each bucket
    selects the best text by clustering similar observations and picking
    the highest-confidence cluster winner.
    """
    def __init__(self, y_tolerance: float = 15.0):
        # Max vertical distance (px) for a region to join an existing bucket.
        self._y_tolerance = y_tolerance
        self._buckets: list[YBucketLine] = []
        self._frame_count = 0
    def add_frame(
        self,
        frame_index: int,
        timestamp: float,
        ocr_regions: list[OCRRegion],
    ) -> None:
        """Feed one frame's OCR regions into the engine.

        Each region is appended to the first bucket whose y-center is
        within tolerance; unmatched regions start a new bucket.
        """
        self._frame_count += 1
        for region in ocr_regions:
            # bbox is (x1, y1, x2, y2); buckets key on the vertical midpoint.
            y_center = (region.bbox[1] + region.bbox[3]) / 2.0
            obs = {
                "text": region.text,
                "confidence": region.confidence,
                "frame_index": frame_index,
                "timestamp": timestamp,
                "x_left": region.bbox[0],
                "x_right": region.bbox[2],
            }
            # Find matching bucket
            matched = False
            for bucket in self._buckets:
                if abs(bucket.y_center - y_center) <= bucket.y_tolerance:
                    bucket.observations.append(obs)
                    matched = True
                    break
            if not matched:
                self._buckets.append(
                    YBucketLine(
                        y_center=y_center,
                        y_tolerance=self._y_tolerance,
                        observations=[obs],
                    )
                )
    def build_consensus(self) -> list[YBucketLine]:
        """Build consensus text for each y-bucket.
        Algorithm:
        1. Sort observations by confidence (descending).
        2. Cluster observations by text similarity (ratio >= 0.6).
        3. Score clusters by sum of confidence weights.
        4. Winning cluster's highest-confidence observation = consensus_text.
        5. Single observations with confidence < 0.4 → empty (unreliable).
        """
        for bucket in self._buckets:
            if not bucket.observations:
                continue
            # Sort by confidence descending
            sorted_obs = sorted(bucket.observations, key=lambda o: o["confidence"], reverse=True)
            # Single observation with low confidence → skip
            if len(sorted_obs) == 1 and sorted_obs[0]["confidence"] < 0.4:
                bucket.consensus_text = ""
                bucket.consensus_confidence = 0.0
                continue
            # Cluster by text similarity. Because observations arrive in
            # descending-confidence order, each cluster's first element is
            # its highest-confidence member (its representative).
            clusters: list[list[dict]] = []
            for obs in sorted_obs:
                placed = False
                for cluster in clusters:
                    rep_text = cluster[0]["text"]
                    sim = _text_similarity(rep_text, obs["text"])
                    if sim >= 0.6:
                        cluster.append(obs)
                        placed = True
                        break
                if not placed:
                    clusters.append([obs])
            # Score clusters by sum of confidence
            best_cluster = max(clusters, key=lambda c: sum(o["confidence"] for o in c))
            # Winner = highest confidence in best cluster
            winner = best_cluster[0]  # already sorted by confidence
            bucket.consensus_text = winner["text"]
            # Confidence reported is the cluster mean, not the winner's own.
            bucket.consensus_confidence = sum(o["confidence"] for o in best_cluster) / len(
                best_cluster
            )
        # Sort buckets by y_center (top to bottom)
        self._buckets.sort(key=lambda b: b.y_center)
        return self._buckets
    def get_consensus_text(self) -> str:
        """Return assembled consensus text (newline-joined lines)."""
        return "\n".join(b.consensus_text for b in self._buckets if b.consensus_text)
    def get_consensus_confidence(self) -> float:
        """Return mean consensus confidence across non-empty buckets."""
        non_empty = [b for b in self._buckets if b.consensus_text]
        if not non_empty:
            return 0.0
        return sum(b.consensus_confidence for b in non_empty) / len(non_empty)
    def get_bucket_y_centers(self) -> set[float]:
        """Return the set of y-center values for all buckets."""
        return {b.y_center for b in self._buckets}
    def reset(self) -> None:
        """Clear all state."""
        self._buckets.clear()
        self._frame_count = 0
@dataclass
class TrackedTextBlock:
    """A text block tracked across multiple video frames."""

    # Timestamps (seconds) of the first and last frame this block appeared in.
    first_seen: float
    last_seen: float
    # Indices of the keyframes that contributed observations.
    frame_indices: list[int] = field(default_factory=list)
    # Raw OCR text captured at each contributing frame.
    text_snapshots: list[str] = field(default_factory=list)
    frame_type: FrameType = FrameType.OTHER
    # Highest-confidence text seen (may be replaced by consensus text
    # in TextBlockTracker._complete_all_active when consensus scores higher).
    best_text: str = ""
    best_confidence: float = 0.0
    # Consensus fields (Phase A)
    # Per-line consensus dicts: {"y_center", "text", "confidence"}.
    consensus_lines: list[dict] = field(default_factory=list)
    # TextGroup ID assigned during TextBlockTracker.finalize().
    text_group_id: str = ""
    ocr_regions_per_frame: list[list[OCRRegion]] = field(default_factory=list)
    # Panel bounds (x1, y1, x2, y2) when per-panel OCR supplied them.
    panel_bbox: tuple[int, int, int, int] | None = None
    panel_id: str = ""
class TextBlockTracker:
    """Track text blocks across video frames for continuity detection.
    Uses y-bucket overlap matching when OCR regions are available,
    falling back to text similarity matching otherwise.
    """
    def __init__(self, similarity_threshold: float = 0.6, y_tolerance: float = 15.0):
        # Blocks currently on screen (still receiving observations).
        self._active_blocks: list[TrackedTextBlock] = []
        # Blocks that have disappeared; consensus built at completion time.
        self._completed_blocks: list[TrackedTextBlock] = []
        self._similarity_threshold = similarity_threshold
        self._y_tolerance = y_tolerance
        # Y-bucket consensus engines keyed by active block index.
        # Indices stay valid because active blocks are only ever cleared
        # all at once (in _complete_all_active), never removed singly.
        self._engines: dict[int, YBucketConsensusEngine] = {}
        # Text group tracking
        self._text_groups: list[TextGroup] = []
        self._next_group_id = 1
    def update(
        self,
        frame_index: int,
        timestamp: float,
        ocr_text: str,
        confidence: float,
        frame_type: FrameType,
        ocr_regions: list[OCRRegion] | None = None,
        panel_bbox: tuple[int, int, int, int] | None = None,
    ) -> None:
        """Process a new frame's OCR results.
        For code/terminal frames: match against active blocks using panel
        position (when ``panel_bbox`` is provided), y-bucket overlap (when
        ``ocr_regions`` are provided), or text similarity as final fallback.
        For other frames: complete all active blocks.
        """
        is_code_frame = frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
        if not is_code_frame:
            # A non-code frame means all tracked code left the screen.
            self._complete_all_active()
            return
        # Ignore frames with no meaningful text (< 10 chars after strip).
        if not ocr_text or len(ocr_text.strip()) < 10:
            return
        best_match: TrackedTextBlock | None = None
        best_match_idx = -1
        # 1. Try panel position matching first (for per-panel OCR)
        if panel_bbox is not None:
            best_match, best_match_idx = self._match_by_panel_position(panel_bbox, ocr_text)
        # 2. Try y-bucket matching when regions are available
        if best_match is None and ocr_regions:
            best_match, best_match_idx = self._match_by_y_buckets(ocr_regions)
        # 3. Fallback to text similarity (skip when panel_bbox is provided —
        # spatial position is the authoritative signal for panel identity)
        if best_match is None and panel_bbox is None:
            best_sim = 0.0
            for i, block in enumerate(self._active_blocks):
                sim = _text_similarity(block.best_text, ocr_text)
                if sim >= self._similarity_threshold and sim > best_sim:
                    best_match = block
                    best_match_idx = i
                    best_sim = sim
        if best_match is not None:
            # Extend the matched block with this frame's observation.
            best_match.last_seen = timestamp
            best_match.frame_indices.append(frame_index)
            best_match.text_snapshots.append(ocr_text)
            if ocr_regions:
                best_match.ocr_regions_per_frame.append(list(ocr_regions))
            if confidence > best_match.best_confidence:
                best_match.best_text = ocr_text
                best_match.best_confidence = confidence
            # Update panel_bbox if not set yet
            if panel_bbox is not None and best_match.panel_bbox is None:
                best_match.panel_bbox = panel_bbox
            # Feed into consensus engine
            if ocr_regions and best_match_idx in self._engines:
                self._engines[best_match_idx].add_frame(frame_index, timestamp, ocr_regions)
        else:
            # No match — start tracking a brand-new block.
            new_idx = len(self._active_blocks)
            new_block = TrackedTextBlock(
                first_seen=timestamp,
                last_seen=timestamp,
                frame_indices=[frame_index],
                text_snapshots=[ocr_text],
                frame_type=frame_type,
                best_text=ocr_text,
                best_confidence=confidence,
                ocr_regions_per_frame=[list(ocr_regions)] if ocr_regions else [],
                panel_bbox=panel_bbox,
            )
            self._active_blocks.append(new_block)
            # Create consensus engine for new block
            engine = YBucketConsensusEngine(y_tolerance=self._y_tolerance)
            if ocr_regions:
                engine.add_frame(frame_index, timestamp, ocr_regions)
            self._engines[new_idx] = engine
    def _match_by_y_buckets(
        self, new_regions: list[OCRRegion]
    ) -> tuple[TrackedTextBlock | None, int]:
        """Match new frame regions against active blocks by y-bucket overlap.
        Returns (matched_block, block_index) or (None, -1) if no match.
        A match requires >= 40% of the new frame's region y-centers to
        fall within existing bucket y-centers (within tolerance).
        """
        if not self._active_blocks:
            return None, -1
        new_y_centers = []
        for r in new_regions:
            y_center = (r.bbox[1] + r.bbox[3]) / 2.0
            new_y_centers.append(y_center)
        if not new_y_centers:
            return None, -1
        best_block = None
        best_idx = -1
        best_overlap = 0.0
        for i, _block in enumerate(self._active_blocks):
            engine = self._engines.get(i)
            if engine is None:
                continue
            existing_y_centers = engine.get_bucket_y_centers()
            if not existing_y_centers:
                continue
            # Count how many new y-centers match existing buckets
            matched = 0
            for ny in new_y_centers:
                for ey in existing_y_centers:
                    if abs(ny - ey) <= self._y_tolerance:
                        matched += 1
                        break
            overlap = matched / len(new_y_centers)
            if overlap >= 0.4 and overlap > best_overlap:
                best_overlap = overlap
                best_block = self._active_blocks[i]
                best_idx = i
        return best_block, best_idx
    def _match_by_panel_position(
        self,
        panel_bbox: tuple[int, int, int, int],
        ocr_text: str,
    ) -> tuple[TrackedTextBlock | None, int]:
        """Match by panel x-range overlap (horizontal position).
        Two panels match if their x-ranges overlap by >= 50%.
        Also requires text similarity >= 0.3 to avoid matching
        completely different content that happens to be in the same position.
        """
        if not self._active_blocks:
            return None, -1
        px1, _py1, px2, _py2 = panel_bbox
        p_width = px2 - px1
        if p_width <= 0:
            return None, -1
        best_block: TrackedTextBlock | None = None
        best_idx = -1
        best_overlap = 0.0
        for i, block in enumerate(self._active_blocks):
            if block.panel_bbox is None:
                continue
            bx1, _by1, bx2, _by2 = block.panel_bbox
            b_width = bx2 - bx1
            if b_width <= 0:
                continue
            # Compute x-range overlap
            overlap_start = max(px1, bx1)
            overlap_end = min(px2, bx2)
            overlap_width = max(0, overlap_end - overlap_start)
            # Overlap as fraction of the smaller panel width
            min_width = min(p_width, b_width)
            x_overlap = overlap_width / min_width
            if x_overlap >= 0.5 and x_overlap > best_overlap:
                # Require minimal text similarity to avoid cross-matching
                sim = _text_similarity(block.best_text, ocr_text)
                if sim >= 0.3:
                    best_overlap = x_overlap
                    best_block = block
                    best_idx = i
        return best_block, best_idx
    def _complete_all_active(self) -> None:
        """Move all active blocks to completed, building consensus first."""
        for i, block in enumerate(self._active_blocks):
            engine = self._engines.get(i)
            if engine is not None:
                buckets = engine.build_consensus()
                block.consensus_lines = [
                    {
                        "y_center": b.y_center,
                        "text": b.consensus_text,
                        "confidence": b.consensus_confidence,
                    }
                    for b in buckets
                    if b.consensus_text
                ]
                consensus_text = engine.get_consensus_text()
                consensus_conf = engine.get_consensus_confidence()
                # Prefer multi-frame consensus over any single-frame snapshot
                # when it scores higher.
                if consensus_text and consensus_conf > block.best_confidence:
                    block.best_text = consensus_text
                    block.best_confidence = consensus_conf
            self._completed_blocks.append(block)
        self._active_blocks.clear()
        self._engines.clear()
    def _assign_text_group(self, block: TrackedTextBlock) -> None:
        """Assign a text group ID to a completed block.
        Compares consensus_lines against existing TextGroups:
        - Overlap >= 60% → same group (possibly edited)
        - Overlap < 60% → new group
        """
        block_lines = [cl["text"] for cl in block.consensus_lines if cl.get("text")]
        if not block_lines:
            # Fallback: use best_text lines
            block_lines = [line for line in block.best_text.split("\n") if line.strip()]
        if not block_lines:
            return
        best_group = None
        best_overlap = 0.0
        for group in self._text_groups:
            group_lines = [cl["text"] for cl in group.consensus_lines if cl.get("text")]
            if not group_lines:
                continue
            # Compute overlap
            shorter_len = min(len(block_lines), len(group_lines))
            if shorter_len == 0:
                continue
            matched = 0
            for bl in block_lines:
                for gl in group_lines:
                    if _text_similarity(bl, gl) >= 0.6:
                        matched += 1
                        break
            # Overlap is normalised by the shorter side so a scrolled or
            # partially visible block can still match its group.
            overlap = matched / shorter_len
            if overlap >= 0.6 and overlap > best_overlap:
                best_overlap = overlap
                best_group = group
        if best_group is not None:
            # Same group — compute edit
            old_lines = [cl["text"] for cl in best_group.consensus_lines if cl.get("text")]
            edit = self._compute_edit(old_lines, block_lines, block.first_seen)
            if edit is not None:
                best_group.edits.append(edit)
            # Update group's consensus lines to new version
            best_group.consensus_lines = (
                list(block.consensus_lines)
                if block.consensus_lines
                else [
                    {"y_center": 0.0, "text": line, "confidence": block.best_confidence}
                    for line in block_lines
                ]
            )
            best_group.appearances.append((block.first_seen, block.last_seen))
            block.text_group_id = best_group.group_id
            # Propagate panel_id if not already set
            if block.panel_id and not best_group.panel_id:
                best_group.panel_id = block.panel_id
        else:
            # New group
            group_id = f"TG-{self._next_group_id:03d}"
            self._next_group_id += 1
            new_group = TextGroup(
                group_id=group_id,
                appearances=[(block.first_seen, block.last_seen)],
                consensus_lines=list(block.consensus_lines)
                if block.consensus_lines
                else [
                    {"y_center": 0.0, "text": line, "confidence": block.best_confidence}
                    for line in block_lines
                ],
                edits=[],
                frame_type=block.frame_type,
                panel_id=block.panel_id,
            )
            self._text_groups.append(new_group)
            block.text_group_id = group_id
    def _compute_edit(
        self, old_lines: list[str], new_lines: list[str], timestamp: float
    ) -> TextGroupEdit | None:
        """Compute a TextGroupEdit between old and new line lists.

        Returns None when the lists are identical (no edit to record).
        """
        if old_lines == new_lines:
            return None
        matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
        added: list[str] = []
        removed: list[str] = []
        modified: list[dict] = []
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag == "equal":
                continue
            elif tag == "insert":
                added.extend(new_lines[j1:j2])
            elif tag == "delete":
                removed.extend(old_lines[i1:i2])
            elif tag == "replace":
                # Pair replaced lines positionally; surplus old lines count
                # as removals, surplus new lines as additions.
                for k, old_line in enumerate(old_lines[i1:i2]):
                    if k < (j2 - j1):
                        modified.append(
                            {
                                "line_num": i1 + k,
                                "old": old_line,
                                "new": new_lines[j1 + k],
                            }
                        )
                    else:
                        removed.append(old_line)
                if (j2 - j1) > (i2 - i1):
                    added.extend(new_lines[j1 + (i2 - i1) : j2])
        if not added and not removed and not modified:
            return None
        return TextGroupEdit(
            timestamp=timestamp,
            added_lines=added,
            removed_lines=removed,
            modified_lines=modified,
        )
    def finalize(self) -> list[TrackedTextBlock]:
        """Complete tracking, assign text groups, and return all blocks."""
        self._complete_all_active()
        for block in self._completed_blocks:
            self._assign_text_group(block)
        return list(self._completed_blocks)
    def get_text_groups(self) -> list[TextGroup]:
        """Return all text groups after finalize().
        Also runs language detection on groups that don't already have
        a detected_language set.
        """
        # Run language detection on each group
        try:
            from skill_seekers.cli.language_detector import LanguageDetector
            detector = LanguageDetector()
        except ImportError:
            detector = None
        if detector is not None:
            for group in self._text_groups:
                if group.detected_language:
                    continue  # Already detected
                text = group.full_text
                # Skip tiny snippets — too little signal for detection.
                if text and len(text) >= 20:
                    try:
                        lang, _conf = detector.detect_from_code(text)
                        if lang:
                            group.detected_language = lang
                    except Exception:
                        # Detection is best-effort; never fail the pipeline.
                        pass
        return list(self._text_groups)
def _extract_code_blocks(
    tracked_blocks: list[TrackedTextBlock],
    text_groups: list[TextGroup] | None = None,
) -> list[CodeBlock]:
    """Convert tracked text blocks into CodeBlock objects.

    Filters for code/terminal frames with sufficient text length
    and attempts language detection. When text_groups are provided
    and a block has a text_group_id, uses the group's consensus text
    for better quality.

    Args:
        tracked_blocks: Tracked text blocks from TextBlockTracker.
        text_groups: Optional list of TextGroup objects for consensus text.

    Returns:
        List of CodeBlock objects with detected language.
    """
    # Index text groups by ID for O(1) lookup.
    group_map: dict[str, TextGroup] = {tg.group_id: tg for tg in (text_groups or [])}
    # Lazy import language detector — optional dependency.
    try:
        from skill_seekers.cli.language_detector import LanguageDetector
        detector = LanguageDetector()
    except ImportError:
        detector = None
    results: list[CodeBlock] = []
    for block in tracked_blocks:
        if block.frame_type not in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
            continue
        if len(block.best_text) < 20:
            continue
        # Prefer the text group's consensus text when it is substantial.
        code_text = block.best_text
        group = group_map.get(block.text_group_id)
        if group is not None:
            candidate = group.full_text
            if candidate and len(candidate) >= 20:
                code_text = candidate
        # Best-effort language detection.
        language = None
        if detector is not None:
            try:
                detected, _conf = detector.detect_from_code(code_text)
            except Exception:  # noqa: BLE001
                detected = None
            if detected:
                language = detected
        # Map FrameType to CodeContext.
        context = (
            CodeContext.EDITOR
            if block.frame_type == FrameType.CODE_EDITOR
            else CodeContext.TERMINAL
            if block.frame_type == FrameType.TERMINAL
            else CodeContext.UNKNOWN
        )
        results.append(
            CodeBlock(
                code=code_text,
                language=language,
                source_frame=block.first_seen,
                context=context,
                confidence=block.best_confidence,
                text_group_id=block.text_group_id,
            )
        )
    return results
def _ocr_single_panel(
    frame_path: str,
    panel_bbox: tuple[int, int, int, int],
    panel_idx: int,
    frame_type: FrameType,
    full_area: int,
    regions: list[tuple[int, int, int, int, FrameType]],
    use_vision_api: bool,
) -> FrameSubSection | None:
    """OCR a single panel and return a FrameSubSection (or None).
    Designed to be called in parallel via ThreadPoolExecutor — each
    invocation is independent (unique crop path, no shared mutable state).

    Args:
        frame_path: Path to the saved full-frame image.
        panel_bbox: Panel bounds as (x1, y1, x2, y2) pixels.
        panel_idx: Panel index, used to make the temp crop path unique.
        frame_type: Classification of the parent frame.
        full_area: Area of the whole frame in pixels.
        regions: All detected frame regions; used to derive the grid
            row/col encoded in panel_id.
        use_vision_api: Enable Claude Vision fallback when multi-engine
            OCR confidence is below 0.5.

    Returns:
        FrameSubSection with OCR results, or None when no text was found.
    """
    x1, y1, x2, y2 = panel_bbox
    panel_area = (x2 - x1) * (y2 - y1)
    # Crop panel if it's a subset of the frame
    cropped_path: str | None = None
    if panel_area < full_area * 0.9:
        cropped_path = _crop_code_region(frame_path, panel_bbox, suffix=f"_p{panel_idx}")
        ocr_target = cropped_path
    else:
        ocr_target = frame_path
    try:
        raw_results, _ = _run_multi_engine_ocr(ocr_target, frame_type)
        p_regions = _cluster_ocr_into_lines(raw_results, frame_type) if raw_results else []
        p_text = _assemble_structured_text(p_regions, frame_type) if p_regions else ""
        p_conf = sum(r.confidence for r in p_regions) / len(p_regions) if p_regions else 0.0
        # Vision API fallback for low-confidence panels
        vision_used = False
        if use_vision_api and p_conf < 0.5:
            v_text, v_conf = _ocr_with_claude_vision(ocr_target, frame_type)
            if v_text and v_conf > p_conf:
                p_text, p_conf, p_regions = v_text, v_conf, []
                vision_used = True
    finally:
        # Always delete the temporary crop, even when OCR raises.
        # NOTE(review): there is no except clause, so OCR exceptions
        # propagate to the caller (fut.result()) — confirm intended.
        if cropped_path and os.path.exists(cropped_path):
            os.unlink(cropped_path)
    if not p_text.strip():
        return None
    # Grid position: row = panels strictly above; col = panels to the left
    # within (roughly) the same horizontal band.
    row = sum(1 for r in regions if r[1] < y1)
    col = sum(1 for r in regions if r[0] < x1 and abs(r[1] - y1) < 50)
    ss = FrameSubSection(
        bbox=panel_bbox,
        frame_type=frame_type,
        ocr_text=p_text,
        ocr_regions=p_regions,
        ocr_confidence=p_conf,
        panel_id=f"panel_{row}_{col}",
    )
    # Stash vision_used flag for the caller to count
    ss._vision_used = vision_used
    return ss
def extract_visual_data(
    video_path: str,
    segments: list,
    output_dir: str,
    sample_interval: float = 0.7,
    min_gap: float = 0.5,
    similarity_threshold: float = 3.0,
    use_vision_api: bool = False,
    clip_start: float | None = None,
    clip_end: float | None = None,
) -> tuple[list[KeyFrame], list[CodeBlock], TextGroupTimeline | None]:
    """Run continuous visual extraction on a video.
    Instead of extracting one frame per segment, this scans the entire
    video using scene-change detection + interval sampling, deduplicates
    near-identical frames, classifies each frame, runs OCR with
    frame-type-aware preprocessing, preserves spatial layout, tracks
    text across frames with y-bucket consensus, and builds a text group
    timeline for code lifecycle tracking.
    For code/terminal frames, uses multi-engine OCR (EasyOCR + pytesseract)
    with ensemble voting. When ``use_vision_api`` is True and multi-engine
    confidence is below 0.5, falls back to Claude Vision API.
    Args:
        video_path: Path to downloaded video file.
        segments: List of VideoSegment objects (used for duration hint).
        output_dir: Directory to save extracted frames.
        sample_interval: Seconds between interval samples (default 0.7s).
        min_gap: Minimum gap between kept timestamps (default 0.5s).
        similarity_threshold: Pixel-diff threshold for duplicate detection (default 3.0).
        use_vision_api: If True, use Claude Vision API as fallback for low-confidence
            code frames (requires ANTHROPIC_API_KEY).
        clip_start: Start of clip range in seconds (None = beginning).
        clip_end: End of clip range in seconds (None = full duration).
    Returns:
        Tuple of (keyframes, code_blocks, text_group_timeline).
        text_group_timeline is None when no code frames are found.
    """
    if not HAS_OPENCV:
        raise RuntimeError(_INSTALL_MSG)
    frames_dir = os.path.join(output_dir, "frames")
    # Clean stale frames from previous runs
    if os.path.exists(frames_dir):
        for old in os.listdir(frames_dir):
            if old.endswith(".jpg"):
                os.remove(os.path.join(frames_dir, old))
    os.makedirs(frames_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f"Cannot open video: {video_path}")
        return [], [], None
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = total_frames / fps if fps > 0 else 0.0
    # If segments give a better duration hint, use it
    if segments:
        seg_end = max(s.end_time for s in segments)
        if seg_end > duration:
            duration = seg_end
    logger.info(
        f"Continuous visual scan: {duration:.0f}s video, "
        f"interval={sample_interval}s, scene detection={'ON' if HAS_SCENEDETECT else 'OFF'}"
    )
    # Build candidate timestamps
    timestamps = _compute_frame_timestamps(
        video_path,
        duration,
        sample_interval=sample_interval,
        min_gap=min_gap,
        start_offset=clip_start or 0.0,
        end_limit=clip_end,
    )
    logger.info(f" {len(timestamps)} candidate timestamps after dedup")
    keyframes = []
    prev_frame = None
    skipped_similar = 0
    vision_api_frames = 0
    tracker = TextBlockTracker()
    for ts in timestamps:
        frame_num = int(ts * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            continue
        # Skip near-duplicate frames
        if prev_frame is not None and _frames_are_similar(
            prev_frame, frame, threshold=similarity_threshold
        ):
            skipped_similar += 1
            continue
        prev_frame = frame.copy()
        frame_h, frame_w = frame.shape[:2]
        # Save frame
        idx = len(keyframes)
        frame_filename = f"frame_{idx:03d}_{ts:.0f}s.jpg"
        frame_path = os.path.join(frames_dir, frame_filename)
        cv2.imwrite(frame_path, frame)
        del frame  # Free the numpy array early — saved to disk
        # Classify using region-based panel detection
        regions = classify_frame_regions(frame_path)
        code_panels = _get_code_panels(regions)
        # Derive frame_type from already-computed regions (avoids loading
        # the image a second time — classify_frame() would repeat the work).
        frame_type = _frame_type_from_regions(regions)
        is_code_frame = frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
        # Per-panel OCR: each code/terminal panel is OCR'd independently
        # so side-by-side editors produce separate code blocks.
        sub_sections: list[FrameSubSection] = []
        ocr_text = ""
        ocr_regions: list[OCRRegion] = []
        ocr_confidence = 0.0
        if is_code_frame and code_panels and (HAS_EASYOCR or HAS_PYTESSERACT):
            full_area = frame_h * frame_w
            if len(code_panels) > 1:
                # Parallel OCR — each panel is independent
                with concurrent.futures.ThreadPoolExecutor(
                    max_workers=min(2, len(code_panels))
                ) as pool:
                    futures = {
                        pool.submit(
                            _ocr_single_panel,
                            frame_path,
                            pb,
                            pi,
                            frame_type,
                            full_area,
                            regions,
                            use_vision_api,
                        ): pi
                        for pi, pb in enumerate(code_panels)
                    }
                    for fut in concurrent.futures.as_completed(futures):
                        ss = fut.result()
                        if ss is not None:
                            if ss._vision_used:
                                vision_api_frames += 1
                            sub_sections.append(ss)
            else:
                # Single panel — avoid thread overhead
                ss = _ocr_single_panel(
                    frame_path,
                    code_panels[0],
                    0,
                    frame_type,
                    full_area,
                    regions,
                    use_vision_api,
                )
                if ss is not None:
                    if ss._vision_used:
                        vision_api_frames += 1
                    sub_sections.append(ss)
            # Track each sub-section independently
            for ss in sub_sections:
                tracker.update(
                    idx,
                    ts,
                    ss.ocr_text,
                    ss.ocr_confidence,
                    ss.frame_type,
                    ocr_regions=ss.ocr_regions,
                    panel_bbox=ss.bbox,
                )
            # Set frame-level OCR to best sub-section for backward compat
            if sub_sections:
                best_ss = max(sub_sections, key=lambda s: s.ocr_confidence)
                ocr_text = best_ss.ocr_text
                ocr_regions = best_ss.ocr_regions
                ocr_confidence = best_ss.ocr_confidence
        elif is_code_frame and (HAS_EASYOCR or HAS_PYTESSERACT):
            # No code panels detected but frame is code — OCR whole frame
            raw_ocr_results, _flat_text = _run_multi_engine_ocr(frame_path, frame_type)
            if raw_ocr_results:
                ocr_regions = _cluster_ocr_into_lines(raw_ocr_results, frame_type)
                ocr_text = _assemble_structured_text(ocr_regions, frame_type)
                ocr_confidence = (
                    sum(r.confidence for r in ocr_regions) / len(ocr_regions)
                    if ocr_regions
                    else 0.0
                )
            if use_vision_api and ocr_confidence < 0.5:
                vision_text, vision_conf = _ocr_with_claude_vision(frame_path, frame_type)
                if vision_text and vision_conf > ocr_confidence:
                    ocr_text = vision_text
                    ocr_confidence = vision_conf
                    ocr_regions = []
                    vision_api_frames += 1
            tracker.update(idx, ts, ocr_text, ocr_confidence, frame_type, ocr_regions=ocr_regions)
        elif HAS_EASYOCR and frame_type not in (FrameType.WEBCAM, FrameType.OTHER):
            # Standard EasyOCR for slide/diagram frames (skip webcam/other)
            raw_ocr_results, _flat_text = extract_text_from_frame(frame_path, frame_type)
            if raw_ocr_results:
                ocr_regions = _cluster_ocr_into_lines(raw_ocr_results, frame_type)
                ocr_text = _assemble_structured_text(ocr_regions, frame_type)
                ocr_confidence = (
                    sum(r.confidence for r in ocr_regions) / len(ocr_regions)
                    if ocr_regions
                    else 0.0
                )
            # Feeding a non-code frame completes any active tracked blocks.
            tracker.update(idx, ts, ocr_text, ocr_confidence, frame_type, ocr_regions=ocr_regions)
        kf = KeyFrame(
            timestamp=ts,
            image_path=frame_path,
            frame_type=frame_type,
            ocr_text=ocr_text,
            ocr_regions=ocr_regions,
            ocr_confidence=ocr_confidence,
            width=frame_w,
            height=frame_h,
            sub_sections=sub_sections,
        )
        keyframes.append(kf)
        logger.debug(
            f" Frame {idx}: {frame_type.value} at {ts:.1f}s"
            + (
                f" | OCR: {ocr_text[:60]}..."
                if len(ocr_text) > 60
                else f" | OCR: {ocr_text}"
                if ocr_text
                else ""
            )
        )
        # Periodically collect to free PyTorch/numpy memory
        if idx % 10 == 9:
            gc.collect()
    cap.release()
    # Finalize text tracking and extract code blocks
    tracked_blocks = tracker.finalize()
    text_groups = tracker.get_text_groups()
    code_blocks = _extract_code_blocks(tracked_blocks, text_groups=text_groups)
    # Build timeline
    timeline: TextGroupTimeline | None = None
    if text_groups:
        # Total seconds any code group was visible on screen.
        total_code_time = sum(end - start for tg in text_groups for start, end in tg.appearances)
        total_edits = sum(len(tg.edits) for tg in text_groups)
        timeline = TextGroupTimeline(
            text_groups=text_groups,
            total_code_time=total_code_time,
            total_groups=len(text_groups),
            total_edits=total_edits,
        )
    vision_msg = f", {vision_api_frames} via Vision API" if vision_api_frames > 0 else ""
    logger.info(
        f"Extracted {len(keyframes)} unique keyframes "
        f"({skipped_similar} duplicates skipped), "
        f"{sum(1 for kf in keyframes if kf.ocr_text)} with OCR text, "
        f"{len(code_blocks)} code blocks detected, "
        f"{len(text_groups)} text groups{vision_msg}"
    )
    return keyframes, code_blocks, timeline
def download_video(
    url: str,
    output_dir: str,
    clip_start: float | None = None,
    clip_end: float | None = None,
) -> str | None:
    """Download a video using yt-dlp for visual processing.

    Downloads the best quality up to 1080p. Uses separate video+audio streams
    and merges them (via ffmpeg) since YouTube only offers combined streams at
    360p/720p — higher resolutions require downloading video-only + audio-only
    and muxing.

    Args:
        url: Video URL.
        output_dir: Directory to save the downloaded file.
        clip_start: Download from this time (seconds). None = beginning.
        clip_end: Download until this time (seconds). None = full video.

    Returns:
        Path to downloaded video file, or None on failure.
    """
    try:
        import yt_dlp
    except ImportError:
        logger.error("yt-dlp is required for video download")
        return None

    os.makedirs(output_dir, exist_ok=True)
    output_template = os.path.join(output_dir, "video.%(ext)s")

    opts = {
        # Prefer h264 (avc1) codec at <=1080p for broad decoder
        # compatibility, then fall back to any codec at <=1080p.
        "format": (
            "bestvideo[height<=1080][vcodec^=avc1]+bestaudio/best[height<=1080][vcodec^=avc1]/"
            "bestvideo[height<=1080][vcodec^=h264]+bestaudio/best[height<=1080][vcodec^=h264]/"
            "bestvideo[height<=1080]+bestaudio/best[height<=1080]"
        ),
        "merge_output_format": "mp4",
        "outtmpl": output_template,
        "quiet": True,
        "no_warnings": True,
    }

    # Apply download_ranges for clip support (yt-dlp 2023.01.02+)
    if clip_start is not None or clip_end is not None:
        try:
            from yt_dlp.utils import download_range_func

            ranges = [(clip_start or 0, clip_end or float("inf"))]
            opts["download_ranges"] = download_range_func(None, ranges)
        except (ImportError, TypeError):
            # Best-effort: older yt-dlp has no download_ranges; callers
            # filter frames by timestamp later, so a full download is safe.
            logger.warning(
                "yt-dlp version does not support download_ranges; "
                "downloading full video and relying on frame timestamp filtering"
            )

    logger.info("Downloading video for visual extraction...")
    try:
        with yt_dlp.YoutubeDL(opts) as ydl:
            info = ydl.extract_info(url, download=True)
            filename = ydl.prepare_filename(info)
            if os.path.exists(filename):
                logger.info("Downloaded: %s", filename)
                return filename
            # prepare_filename can disagree with the post-merge container
            # extension; probe the common ones as a fallback.
            for ext in ["mp4", "webm", "mkv"]:
                candidate = os.path.join(output_dir, f"video.{ext}")
                if os.path.exists(candidate):
                    logger.info("Downloaded: %s", candidate)
                    return candidate
    except Exception as e:
        logger.error(f"Failed to download video: {e}")

    return None