feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement

Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
YusufKaraaslanSpyke
2026-02-27 23:10:19 +03:00
parent 3bad7cf365
commit 62071c4aa9
32 changed files with 15090 additions and 9 deletions

View File

@@ -133,6 +133,8 @@ class CreateCommand:
return self._route_pdf()
elif self.source_info.type == "word":
return self._route_word()
elif self.source_info.type == "video":
return self._route_video()
elif self.source_info.type == "config":
return self._route_config()
else:
@@ -345,6 +347,55 @@ class CreateCommand:
finally:
sys.argv = original_argv
def _route_video(self) -> int:
"""Route to video scraper (video_scraper.py)."""
from skill_seekers.cli import video_scraper
# Reconstruct argv for video_scraper
argv = ["video_scraper"]
# Add video source (URL or file)
parsed = self.source_info.parsed
if parsed.get("source_kind") == "file":
argv.extend(["--video-file", parsed["file_path"]])
elif parsed.get("url"):
url = parsed["url"]
# Detect playlist vs single video
if "playlist" in url.lower():
argv.extend(["--playlist", url])
else:
argv.extend(["--url", url])
# Add universal arguments
self._add_common_args(argv)
# Add video-specific arguments
video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None)
if video_langs:
argv.extend(["--languages", video_langs])
if getattr(self.args, "visual", False):
argv.append("--visual")
if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base":
argv.extend(["--whisper-model", self.args.whisper_model])
vi = getattr(self.args, "visual_interval", None)
if vi is not None and vi != 0.7:
argv.extend(["--visual-interval", str(vi)])
vmg = getattr(self.args, "visual_min_gap", None)
if vmg is not None and vmg != 0.5:
argv.extend(["--visual-min-gap", str(vmg)])
vs = getattr(self.args, "visual_similarity", None)
if vs is not None and vs != 3.0:
argv.extend(["--visual-similarity", str(vs)])
# Call video_scraper with modified argv
logger.debug(f"Calling video_scraper with argv: {argv}")
original_argv = sys.argv
try:
sys.argv = argv
return video_scraper.main()
finally:
sys.argv = original_argv
def _route_config(self) -> int:
"""Route to unified scraper for config files (unified_scraper.py)."""
from skill_seekers.cli import unified_scraper
@@ -468,6 +519,8 @@ Examples:
Local: skill-seekers create ./my-project -p comprehensive
PDF: skill-seekers create tutorial.pdf --ocr
DOCX: skill-seekers create document.docx
Video: skill-seekers create https://youtube.com/watch?v=...
Video: skill-seekers create recording.mp4
Config: skill-seekers create configs/react.json
Source Auto-Detection:
@@ -476,6 +529,8 @@ Source Auto-Detection:
• ./path → local codebase
• file.pdf → PDF extraction
• file.docx → Word document extraction
• youtube.com/... → Video transcript extraction
• file.mp4 → Video file extraction
• file.json → multi-source config
Progressive Help (13 → 120+ flags):
@@ -483,6 +538,7 @@ Progressive Help (13 → 120+ flags):
--help-github GitHub repository options
--help-local Local codebase analysis
--help-pdf PDF extraction options
--help-video Video extraction options
--help-advanced Rare/advanced options
--help-all All options + compatibility
@@ -513,6 +569,9 @@ Common Workflows:
parser.add_argument(
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
)
parser.add_argument(
"--help-video", action="store_true", help=argparse.SUPPRESS, dest="_help_video"
)
parser.add_argument(
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
)
@@ -571,6 +630,15 @@ Common Workflows:
add_create_arguments(parser_word, mode="word")
parser_word.print_help()
return 0
elif args._help_video:
parser_video = argparse.ArgumentParser(
prog="skill-seekers create",
description="Create skill from video (YouTube, Vimeo, local files)",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
add_create_arguments(parser_video, mode="video")
parser_video.print_help()
return 0
elif args._help_config:
parser_config = argparse.ArgumentParser(
prog="skill-seekers create",