feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement

Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
YusufKaraaslanSpyke
2026-02-27 23:10:19 +03:00
parent 3bad7cf365
commit 62071c4aa9
32 changed files with 15090 additions and 9 deletions

View File

@@ -74,11 +74,12 @@ class UnifiedScraper:
"github": [], # List of github sources
"pdf": [], # List of pdf sources
"word": [], # List of word sources
"video": [], # List of video sources
"local": [], # List of local sources (docs or code)
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -154,6 +155,8 @@ class UnifiedScraper:
self._scrape_pdf(source)
elif source_type == "word":
self._scrape_word(source)
elif source_type == "video":
self._scrape_video(source)
elif source_type == "local":
self._scrape_local(source)
else:
@@ -576,6 +579,63 @@ class UnifiedScraper:
logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
def _scrape_video(self, source: dict[str, Any]):
    """Scrape video source (YouTube, local file, etc.)."""
    # The video pipeline is optional; bail out gracefully if it isn't installed.
    try:
        from skill_seekers.cli.video_scraper import VideoToSkillConverter
    except ImportError:
        logger.error("video_scraper.py not found")
        return

    # Reserve a unique index for this video source (multi-source naming support).
    idx = self._source_counters["video"]
    self._source_counters["video"] = idx + 1

    # Identify the video by URL first, then local path, then a synthetic name.
    url = source.get("url", "")
    video_id = url if url else source.get("path", f"video_{idx}")

    # Translate the unified-source entry into the video scraper's config shape.
    video_config = {
        "name": f"{self.name}_video_{idx}",
        "url": source.get("url"),
        "video_file": source.get("path"),
        "playlist": source.get("playlist"),
        "description": source.get("description", ""),
        "languages": ",".join(source.get("languages", ["en"])),
        "visual": source.get("visual_extraction", False),
        "whisper_model": source.get("whisper_model", "base"),
    }

    logger.info(f"Scraping video: {video_id}")
    converter = VideoToSkillConverter(video_config)
    try:
        result = converter.process()
        converter.save_extracted_data()
        # Record the extraction alongside the other scraped sources.
        record = {
            "video_id": video_id,
            "idx": idx,
            "data": result.to_dict(),
            "data_file": converter.data_file,
        }
        self.scraped_data["video"].append(record)
        # Also emit a standalone SKILL.md so synthesis can consume it directly.
        converter.build_skill()
        logger.info("✅ Video: Standalone SKILL.md created")
        logger.info(
            f"✅ Video: {len(result.videos)} videos, "
            f"{result.total_segments} segments extracted"
        )
    except Exception as e:
        # Best-effort: one failing video source must not abort the whole scrape.
        logger.error(f"Failed to process video source: {e}")
def _scrape_local(self, source: dict[str, Any]):
"""
Scrape local directory (documentation files or source code).