feat: add skill-seekers video --setup for GPU auto-detection and dependency installation

Auto-detects an NVIDIA (CUDA) or AMD (ROCm) GPU — falling back to CPU-only —
and installs the correct PyTorch variant + easyocr + all visual extraction dependencies.
Removes easyocr from video-full pip extras to avoid pulling ~2GB of wrong
CUDA packages on non-NVIDIA systems.

New files:
- video_setup.py (835 lines): GPU detection, PyTorch install, ROCm config,
  venv checks, system dep validation, module selection, verification
- test_video_setup.py (60 tests): Full coverage of detection, install, verify

Updated docs: CHANGELOG, AGENTS.md, CLAUDE.md, README.md, CLI_REFERENCE,
FAQ, TROUBLESHOOTING, installation guide, video dependency plan

All 2523 tests passing (15 skipped).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 18:39:16 +03:00
parent 12bc29ab36
commit cc9cc32417
24 changed files with 2439 additions and 508 deletions

View File

@@ -13,6 +13,7 @@ from __future__ import annotations
import concurrent.futures
import difflib
import gc
import logging
import os
import tempfile
@@ -32,6 +33,18 @@ from skill_seekers.cli.video_models import (
logger = logging.getLogger(__name__)
# Set ROCm/MIOpen env vars BEFORE importing torch (via easyocr).
# Without MIOPEN_FIND_MODE=FAST, MIOpen tries to allocate huge workspace
# buffers (300MB+), gets 0 bytes, and silently falls back to CPU kernels.
import os as _os
if "MIOPEN_FIND_MODE" not in _os.environ:
_os.environ["MIOPEN_FIND_MODE"] = "FAST"
if "MIOPEN_USER_DB_PATH" not in _os.environ:
_miopen_db = _os.path.expanduser("~/.config/miopen")
_os.makedirs(_miopen_db, exist_ok=True)
_os.environ["MIOPEN_USER_DB_PATH"] = _miopen_db
# Tier 2 dependency flags
try:
import cv2
@@ -65,23 +78,46 @@ except ImportError:
pytesseract = None # type: ignore[assignment]
HAS_PYTESSERACT = False
# Circuit breaker: after first tesseract failure, disable it for the session.
# Prevents wasting time spawning subprocesses that always fail.
_tesseract_broken = False
_INSTALL_MSG = (
"Visual extraction requires additional dependencies.\n"
'Install with: pip install "skill-seekers[video-full]"\n'
"Or: pip install opencv-python-headless scenedetect easyocr"
"Recommended: skill-seekers video --setup (auto-detects GPU, installs correct PyTorch)\n"
'Alternative: pip install "skill-seekers[video-full]" (may install wrong PyTorch variant)'
)
# Lazy-initialized EasyOCR reader (heavy, only load once)
_ocr_reader = None
def _detect_gpu() -> bool:
"""Check if a CUDA or ROCm GPU is available for EasyOCR/PyTorch."""
try:
import torch
if torch.cuda.is_available():
return True
# ROCm exposes GPU via torch.version.hip
if hasattr(torch.version, "hip") and torch.version.hip is not None:
return True
return False
except ImportError:
return False
def _get_ocr_reader():
"""Get or create the EasyOCR reader (lazy singleton)."""
global _ocr_reader
if _ocr_reader is None:
logger.info("Initializing OCR engine (first run may download models)...")
_ocr_reader = easyocr.Reader(["en"], gpu=False)
use_gpu = _detect_gpu()
logger.info(
f"Initializing OCR engine ({'GPU' if use_gpu else 'CPU'} mode, "
"first run may download models)..."
)
_ocr_reader = easyocr.Reader(["en"], gpu=use_gpu)
return _ocr_reader
@@ -296,11 +332,15 @@ def _run_tesseract_ocr(preprocessed_path: str, frame_type: FrameType) -> list[tu
Returns results in the same format as EasyOCR: list of (bbox, text, confidence).
Groups words into lines by y-coordinate.
Uses a circuit breaker: if tesseract fails once, it's disabled for the
rest of the session to avoid wasting time on repeated subprocess failures.
Args:
preprocessed_path: Path to the preprocessed grayscale image.
frame_type: Frame classification (reserved for future per-type tuning).
"""
if not HAS_PYTESSERACT:
global _tesseract_broken
if not HAS_PYTESSERACT or _tesseract_broken:
return []
# Produce clean binary for Tesseract
@@ -312,7 +352,11 @@ def _run_tesseract_ocr(preprocessed_path: str, frame_type: FrameType) -> list[tu
output_type=pytesseract.Output.DICT,
)
except Exception: # noqa: BLE001
logger.debug("pytesseract failed, returning empty results")
_tesseract_broken = True
logger.warning(
"pytesseract failed — disabling for this session. "
"Install tesseract binary: skill-seekers video --setup"
)
return []
finally:
if binary_path != preprocessed_path and os.path.exists(binary_path):
@@ -897,6 +941,25 @@ def _crop_code_region(frame_path: str, bbox: tuple[int, int, int, int], suffix:
return cropped_path
def _frame_type_from_regions(
    regions: list[tuple[int, int, int, int, FrameType]],
) -> FrameType:
    """Derive the dominant frame type from pre-computed regions.

    Same logic as ``classify_frame`` but avoids re-loading the image.

    Args:
        regions: ``(x1, y1, x2, y2, frame_type)`` tuples, in detection order.

    Returns:
        The type of the first terminal/code-editor region if one exists;
        otherwise the most common region type, or ``FrameType.OTHER`` when
        ``regions`` is empty.
    """
    from collections import Counter

    # Priority pass: the first terminal or editor panel decides outright.
    priority = next(
        (
            ft
            for *_coords, ft in regions
            if ft in (FrameType.TERMINAL, FrameType.CODE_EDITOR)
        ),
        None,
    )
    if priority is not None:
        return priority
    counts = Counter(region[4] for region in regions)
    if not counts:
        return FrameType.OTHER
    return counts.most_common(1)[0][0]
def classify_frame(frame_path: str) -> FrameType:
"""Classify a video frame by its visual content.
@@ -1114,6 +1177,8 @@ def _compute_frame_timestamps(
duration: float,
sample_interval: float = 0.7,
min_gap: float = 0.5,
start_offset: float = 0.0,
end_limit: float | None = None,
) -> list[float]:
"""Build a deduplicated list of timestamps to extract frames at.
@@ -1126,10 +1191,13 @@ def _compute_frame_timestamps(
duration: Total video duration in seconds.
sample_interval: Seconds between interval samples.
min_gap: Minimum gap between kept timestamps.
start_offset: Start sampling at this time (seconds).
end_limit: Stop sampling at this time (seconds). None = full duration.
Returns:
Sorted, deduplicated list of timestamps (seconds).
"""
effective_end = end_limit if end_limit is not None else duration
timestamps: set[float] = set()
# 1. Scene detection — catches cuts, slide transitions, editor switches
@@ -1138,19 +1206,21 @@ def _compute_frame_timestamps(
scenes = detect_scenes(video_path)
for start, _end in scenes:
# Take frame 0.5s after the scene starts (avoids transition blur)
timestamps.add(round(start + 0.5, 1))
ts = round(start + 0.5, 1)
if ts >= start_offset and ts < effective_end:
timestamps.add(ts)
except Exception as exc: # noqa: BLE001
logger.warning(f"Scene detection failed, falling back to interval: {exc}")
# 2. Regular interval sampling — fills gaps between scene cuts
t = 0.5 # start slightly after 0 to avoid black intro frames
while t < duration:
t = max(0.5, start_offset)
while t < effective_end:
timestamps.add(round(t, 1))
t += sample_interval
# Always include near the end
if duration > 2.0:
timestamps.add(round(duration - 1.0, 1))
if effective_end > 2.0:
timestamps.add(round(effective_end - 1.0, 1))
# 3. Sort and deduplicate (merge timestamps closer than min_gap)
sorted_ts = sorted(timestamps)
@@ -1876,6 +1946,8 @@ def extract_visual_data(
min_gap: float = 0.5,
similarity_threshold: float = 3.0,
use_vision_api: bool = False,
clip_start: float | None = None,
clip_end: float | None = None,
) -> tuple[list[KeyFrame], list[CodeBlock], TextGroupTimeline | None]:
"""Run continuous visual extraction on a video.
@@ -1899,6 +1971,8 @@ def extract_visual_data(
similarity_threshold: Pixel-diff threshold for duplicate detection (default 3.0).
use_vision_api: If True, use Claude Vision API as fallback for low-confidence
code frames (requires ANTHROPIC_API_KEY).
clip_start: Start of clip range in seconds (None = beginning).
clip_end: End of clip range in seconds (None = full duration).
Returns:
Tuple of (keyframes, code_blocks, text_group_timeline).
@@ -1937,7 +2011,12 @@ def extract_visual_data(
# Build candidate timestamps
timestamps = _compute_frame_timestamps(
video_path, duration, sample_interval=sample_interval, min_gap=min_gap
video_path,
duration,
sample_interval=sample_interval,
min_gap=min_gap,
start_offset=clip_start or 0.0,
end_limit=clip_end,
)
logger.info(f" {len(timestamps)} candidate timestamps after dedup")
@@ -1961,17 +2040,21 @@ def extract_visual_data(
skipped_similar += 1
continue
prev_frame = frame.copy()
frame_h, frame_w = frame.shape[:2]
# Save frame
idx = len(keyframes)
frame_filename = f"frame_{idx:03d}_{ts:.0f}s.jpg"
frame_path = os.path.join(frames_dir, frame_filename)
cv2.imwrite(frame_path, frame)
del frame # Free the numpy array early — saved to disk
# Classify using region-based panel detection
regions = classify_frame_regions(frame_path)
code_panels = _get_code_panels(regions)
frame_type = classify_frame(frame_path) # dominant type for metadata
# Derive frame_type from already-computed regions (avoids loading
# the image a second time — classify_frame() would repeat the work).
frame_type = _frame_type_from_regions(regions)
is_code_frame = frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
# Per-panel OCR: each code/terminal panel is OCR'd independently
@@ -1982,11 +2065,13 @@ def extract_visual_data(
ocr_confidence = 0.0
if is_code_frame and code_panels and (HAS_EASYOCR or HAS_PYTESSERACT):
full_area = frame.shape[0] * frame.shape[1]
full_area = frame_h * frame_w
if len(code_panels) > 1:
# Parallel OCR — each panel is independent
with concurrent.futures.ThreadPoolExecutor(max_workers=len(code_panels)) as pool:
with concurrent.futures.ThreadPoolExecutor(
max_workers=min(2, len(code_panels))
) as pool:
futures = {
pool.submit(
_ocr_single_panel,
@@ -2084,8 +2169,8 @@ def extract_visual_data(
ocr_text=ocr_text,
ocr_regions=ocr_regions,
ocr_confidence=ocr_confidence,
width=frame.shape[1],
height=frame.shape[0],
width=frame_w,
height=frame_h,
sub_sections=sub_sections,
)
keyframes.append(kf)
@@ -2101,6 +2186,10 @@ def extract_visual_data(
)
)
# Periodically collect to free PyTorch/numpy memory
if idx % 10 == 9:
gc.collect()
cap.release()
# Finalize text tracking and extract code blocks
@@ -2131,7 +2220,12 @@ def extract_visual_data(
return keyframes, code_blocks, timeline
def download_video(url: str, output_dir: str) -> str | None:
def download_video(
url: str,
output_dir: str,
clip_start: float | None = None,
clip_end: float | None = None,
) -> str | None:
"""Download a video using yt-dlp for visual processing.
Downloads the best quality up to 1080p. Uses separate video+audio streams
@@ -2142,6 +2236,8 @@ def download_video(url: str, output_dir: str) -> str | None:
Args:
url: Video URL.
output_dir: Directory to save the downloaded file.
clip_start: Download from this time (seconds). None = beginning.
clip_end: Download until this time (seconds). None = full video.
Returns:
Path to downloaded video file, or None on failure.
@@ -2156,13 +2252,30 @@ def download_video(url: str, output_dir: str) -> str | None:
output_template = os.path.join(output_dir, "video.%(ext)s")
opts = {
"format": "bestvideo[height<=1080]+bestaudio/best[height<=1080]",
"format": (
"bestvideo[height<=1080][vcodec^=avc1]+bestaudio/best[height<=1080][vcodec^=avc1]/"
"bestvideo[height<=1080][vcodec^=h264]+bestaudio/best[height<=1080][vcodec^=h264]/"
"bestvideo[height<=1080]+bestaudio/best[height<=1080]"
),
"merge_output_format": "mp4",
"outtmpl": output_template,
"quiet": True,
"no_warnings": True,
}
# Apply download_ranges for clip support (yt-dlp 2023.01.02+)
if clip_start is not None or clip_end is not None:
try:
from yt_dlp.utils import download_range_func
ranges = [(clip_start or 0, clip_end or float("inf"))]
opts["download_ranges"] = download_range_func(None, ranges)
except (ImportError, TypeError):
logger.warning(
"yt-dlp version does not support download_ranges; "
"downloading full video and relying on frame timestamp filtering"
)
logger.info(f"Downloading video for visual extraction...")
try:
with yt_dlp.YoutubeDL(opts) as ydl: