feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement

Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
YusufKaraaslanSpyke
2026-02-27 23:10:19 +03:00
parent 3bad7cf365
commit 62071c4aa9
32 changed files with 15090 additions and 9 deletions

View File

@@ -74,11 +74,12 @@ class UnifiedScraper:
"github": [], # List of github sources
"pdf": [], # List of pdf sources
"word": [], # List of word sources
"video": [], # List of video sources
"local": [], # List of local sources (docs or code)
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -154,6 +155,8 @@ class UnifiedScraper:
self._scrape_pdf(source)
elif source_type == "word":
self._scrape_word(source)
elif source_type == "video":
self._scrape_video(source)
elif source_type == "local":
self._scrape_local(source)
else:
@@ -576,6 +579,63 @@ class UnifiedScraper:
logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
def _scrape_video(self, source: dict[str, Any]):
    """Scrape video source (YouTube, local file, etc.)."""
    # The video pipeline is optional; bail out gracefully if it isn't installed.
    try:
        from skill_seekers.cli.video_scraper import VideoToSkillConverter
    except ImportError:
        logger.error("video_scraper.py not found")
        return

    # Reserve a unique index for this video source (multi-source naming support).
    idx = self._source_counters["video"]
    self._source_counters["video"] = idx + 1

    # Identify the video by URL first, then local path, then a synthetic name.
    url = source.get("url", "")
    video_id = url if url else source.get("path", f"video_{idx}")

    # Translate the unified-source entry into the video scraper's config shape.
    video_config = {
        "name": f"{self.name}_video_{idx}",
        "url": source.get("url"),
        "video_file": source.get("path"),
        "playlist": source.get("playlist"),
        "description": source.get("description", ""),
        "languages": ",".join(source.get("languages", ["en"])),
        "visual": source.get("visual_extraction", False),
        "whisper_model": source.get("whisper_model", "base"),
    }

    logger.info(f"Scraping video: {video_id}")
    converter = VideoToSkillConverter(video_config)
    try:
        result = converter.process()
        converter.save_extracted_data()
        # Record the extraction alongside the other scraped sources.
        record = {
            "video_id": video_id,
            "idx": idx,
            "data": result.to_dict(),
            "data_file": converter.data_file,
        }
        self.scraped_data["video"].append(record)
        # Also emit a standalone SKILL.md so synthesis can consume it directly.
        converter.build_skill()
        logger.info("✅ Video: Standalone SKILL.md created")
        logger.info(
            f"✅ Video: {len(result.videos)} videos, "
            f"{result.total_segments} segments extracted"
        )
    except Exception as e:
        # Best-effort: one failing video source must not abort the whole scrape.
        logger.error(f"Failed to process video source: {e}")
def _scrape_local(self, source: dict[str, Any]):
"""
Scrape local directory (documentation files or source code).