diff --git a/pyproject.toml b/pyproject.toml
index 0729726..30555cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,6 @@ dependencies = [
     "anthropic>=0.76.0", # Required for AI enhancement (core feature)
     "PyMuPDF>=1.24.14",
     "Pillow>=11.0.0",
-    "pytesseract>=0.3.13",
     "pydantic>=2.12.3",
     "pydantic-settings>=2.11.0",
     "python-dotenv>=1.1.1",
@@ -129,6 +128,7 @@ video-full = [
     "scenedetect[opencv]>=0.6.4",
     "easyocr>=1.7.0",
     "opencv-python-headless>=4.9.0",
+    "pytesseract>=0.3.13",
 ]
 
 # RAG vector database upload support
@@ -172,6 +172,8 @@ embedding = [
 ]
 
 # All optional dependencies combined (dev dependencies now in [dependency-groups])
+# Note: video-full deps (opencv, easyocr, faster-whisper) excluded due to heavy
+# native dependencies. Install separately: pip install skill-seekers[video-full]
 all = [
     "mammoth>=1.6.0",
     "python-docx>=1.1.0",
diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py
index 99737ff..ef1d6e0 100644
--- a/src/skill_seekers/cli/arguments/create.py
+++ b/src/skill_seekers/cli/arguments/create.py
@@ -488,6 +488,13 @@ VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
             "metavar": "THRESH",
         },
     },
+    "vision_ocr": {
+        "flags": ("--vision-ocr",),
+        "kwargs": {
+            "action": "store_true",
+            "help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)",
+        },
+    },
 }
 
 # Multi-source config specific (from unified_scraper.py)
diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py
index 22c8b21..cff35d2 100644
--- a/src/skill_seekers/cli/create_command.py
+++ b/src/skill_seekers/cli/create_command.py
@@ -360,8 +360,12 @@ class CreateCommand:
 
         # Add video source (URL or file)
         parsed = self.source_info.parsed
+        video_playlist = getattr(self.args, "video_playlist", None)
         if parsed.get("source_kind") == "file":
             argv.extend(["--video-file", parsed["file_path"]])
+        elif video_playlist:
+            # Explicit --video-playlist flag takes precedence
+            argv.extend(["--playlist", video_playlist])
         elif parsed.get("url"):
             url = parsed["url"]
             # Detect playlist vs single video
@@ -374,11 +378,15 @@ class CreateCommand:
         self._add_common_args(argv)
 
         # Add video-specific arguments
-        video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None)
+        video_langs = getattr(self.args, "video_languages", None) or getattr(
+            self.args, "languages", None
+        )
         if video_langs:
             argv.extend(["--languages", video_langs])
         if getattr(self.args, "visual", False):
             argv.append("--visual")
+        if getattr(self.args, "vision_ocr", False):
+            argv.append("--vision-ocr")
         if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base":
             argv.extend(["--whisper-model", self.args.whisper_model])
         vi = getattr(self.args, "visual_interval", None)
diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py
index c7980cf..83c1fd7 100644
--- a/src/skill_seekers/cli/main.py
+++ b/src/skill_seekers/cli/main.py
@@ -12,6 +12,8 @@ Commands:
     scrape               Scrape documentation website
     github               Scrape GitHub repository
     pdf                  Extract from PDF file
+    word                 Extract from Word (.docx) file
+    video                Extract from video (YouTube or local)
     unified              Multi-source scraping (docs + GitHub + PDF)
     analyze              Analyze local codebase and extract code knowledge
     enhance              AI-powered enhancement (auto: API or LOCAL mode)
diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py
index 6cfc9cf..81f7ed3 100644
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -79,7 +79,14 @@ class UnifiedScraper:
         }
 
         # Track source index for unique naming (multi-source support)
-        self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
+        self._source_counters = {
+            "documentation": 0,
+            "github": 0,
+            "pdf": 0,
+            "word": 0,
+            "video": 0,
+            "local": 0,
+        }
 
         # Output paths - cleaner organization
         self.name = self.config["name"]
@@ -583,8 +590,12 @@ class UnifiedScraper:
         """Scrape video source (YouTube, local file, etc.)."""
         try:
             from skill_seekers.cli.video_scraper import VideoToSkillConverter
-        except ImportError:
-            logger.error("video_scraper.py not found")
+        except ImportError as e:
+            logger.error(
+                f"Video scraper dependencies not installed: {e}\n"
+                "  Install with: pip install skill-seekers[video]\n"
+                "  For visual extraction (frame analysis, OCR): pip install skill-seekers[video-full]"
+            )
             return
 
         # Multi-source support: Get unique index for this video source
@@ -630,8 +641,7 @@ class UnifiedScraper:
             logger.info("✅ Video: Standalone SKILL.md created")
 
             logger.info(
-                f"✅ Video: {len(result.videos)} videos, "
-                f"{result.total_segments} segments extracted"
+                f"✅ Video: {len(result.videos)} videos, {result.total_segments} segments extracted"
             )
         except Exception as e:
             logger.error(f"Failed to process video source: {e}")
diff --git a/src/skill_seekers/cli/video_models.py b/src/skill_seekers/cli/video_models.py
index 3da19c7..de15a04 100644
--- a/src/skill_seekers/cli/video_models.py
+++ b/src/skill_seekers/cli/video_models.py
@@ -222,6 +222,7 @@ class FrameSubSection:
     ocr_regions: list[OCRRegion] = field(default_factory=list)
     ocr_confidence: float = 0.0
     panel_id: str = ""  # e.g. "panel_0_0" (row_col)
+    _vision_used: bool = False  # Whether Vision API was used for OCR
 
     def to_dict(self) -> dict:
         return {
diff --git a/src/skill_seekers/cli/video_scraper.py b/src/skill_seekers/cli/video_scraper.py
index eca06a2..d0aff1b 100644
--- a/src/skill_seekers/cli/video_scraper.py
+++ b/src/skill_seekers/cli/video_scraper.py
@@ -469,7 +469,12 @@ class VideoToSkillConverter:
 
         # Generate reference files for each video
         for video in self.result.videos:
-            ref_filename = f"video_{_sanitize_filename(video.title)}.md"
+            sanitized = (
+                _sanitize_filename(video.title)
+                or video.video_id
+                or f"video_{hash(video.title) % 10000:04d}"
+            )
+            ref_filename = f"video_{sanitized}.md"
             ref_path = os.path.join(refs_dir, ref_filename)
             ref_content = self._generate_reference_md(video)
             with open(ref_path, "w", encoding="utf-8") as f:
@@ -750,7 +755,12 @@ class VideoToSkillConverter:
                     preview += "..."
                 lines.append(f"{preview}\n")
 
-            ref_filename = f"video_{_sanitize_filename(video.title)}.md"
+            sanitized = (
+                _sanitize_filename(video.title)
+                or video.video_id
+                or f"video_{hash(video.title) % 10000:04d}"
+            )
+            ref_filename = f"video_{sanitized}.md"
             lines.append(
                 f"> Full transcript: [references/{ref_filename}](references/{ref_filename})\n"
             )
@@ -766,7 +776,12 @@ class VideoToSkillConverter:
         # References
         lines.append("## References\n")
         for video in self.result.videos:
-            ref_filename = f"video_{_sanitize_filename(video.title)}.md"
+            sanitized = (
+                _sanitize_filename(video.title)
+                or video.video_id
+                or f"video_{hash(video.title) % 10000:04d}"
+            )
+            ref_filename = f"video_{sanitized}.md"
             lines.append(f"- [{video.title}](references/{ref_filename})")
 
         return "\n".join(lines)
@@ -940,11 +955,25 @@ def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None:
         if api_key:
             enhance_cmd.extend(["--api-key", api_key])
 
-        result = subprocess.run(enhance_cmd, check=True)
-        if result.returncode == 0:
-            logger.info("✅ Video skill enhancement complete!")
-    except subprocess.CalledProcessError:
-        logger.warning("⚠ Enhancement failed, but skill was still built")
+        logger.info(
+            "Starting video skill enhancement (this may take 10+ minutes "
+            "for large videos with AI enhancement)..."
+        )
+        subprocess.run(enhance_cmd, check=True, timeout=1800)
+        logger.info("Video skill enhancement complete!")
+    except subprocess.TimeoutExpired:
+        logger.warning(
+            "⚠ Enhancement timed out after 30 minutes. "
+            "The skill was still built without enhancement. "
+            "You can retry manually with:\n"
+            f"  skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
+        )
+    except subprocess.CalledProcessError as exc:
+        logger.warning(
+            f"⚠ Enhancement failed (exit code {exc.returncode}), "
+            "but skill was still built. You can retry manually with:\n"
+            f"  skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
+        )
     except FileNotFoundError:
         logger.warning("⚠ skill-seekers-enhance not found. Run manually:")
         logger.info(f"  skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")
diff --git a/src/skill_seekers/cli/video_transcript.py b/src/skill_seekers/cli/video_transcript.py
index c527fac..04a9319 100644
--- a/src/skill_seekers/cli/video_transcript.py
+++ b/src/skill_seekers/cli/video_transcript.py
@@ -70,10 +70,36 @@ def extract_youtube_transcript(
 
     try:
         ytt_api = YouTubeTranscriptApi()
-        transcript = ytt_api.fetch(video_id, languages=languages)
+
+        # List the available transcripts (ytt_api.list) to detect whether the chosen one is auto-generated
+        source = TranscriptSource.YOUTUBE_MANUAL
+        try:
+            transcript_list = ytt_api.list(video_id)
+            # Prefer manually created transcripts; fall back to auto-generated
+            try:
+                transcript_entry = transcript_list.find_manually_created_transcript(languages)
+                source = TranscriptSource.YOUTUBE_MANUAL
+            except Exception:
+                try:
+                    transcript_entry = transcript_list.find_generated_transcript(languages)
+                    source = TranscriptSource.YOUTUBE_AUTO
+                except Exception:
+                    # Fall back to any available transcript
+                    transcript_entry = transcript_list.find_transcript(languages)
+                    source = (
+                        TranscriptSource.YOUTUBE_AUTO
+                        if transcript_entry.is_generated
+                        else TranscriptSource.YOUTUBE_MANUAL
+                    )
+            transcript = transcript_entry.fetch()
+        except Exception:
+            # Fall back to a direct fetch if listing, selection, or fetching above fails (e.g. older API versions)
+            transcript = ytt_api.fetch(video_id, languages=languages)
+            # Check is_generated on the FetchedTranscript if available
+            if getattr(transcript, "is_generated", False):
+                source = TranscriptSource.YOUTUBE_AUTO
 
         segments = []
-        source = TranscriptSource.YOUTUBE_MANUAL
         for snippet in transcript.snippets:
             text = snippet.text.strip()
             if not text:
diff --git a/src/skill_seekers/cli/video_visual.py b/src/skill_seekers/cli/video_visual.py
index be441ce..f8edd98 100644
--- a/src/skill_seekers/cli/video_visual.py
+++ b/src/skill_seekers/cli/video_visual.py
@@ -1864,7 +1864,7 @@ def _ocr_single_panel(
         panel_id=f"panel_{row}_{col}",
     )
     # Stash vision_used flag for the caller to count
-    ss._vision_used = vision_used  # type: ignore[attr-defined]
+    ss._vision_used = vision_used
     return ss
 
 
@@ -1918,7 +1918,7 @@ def extract_visual_data(
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
         logger.error(f"Cannot open video: {video_path}")
-        return [], []
+        return [], [], None
 
     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
     total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
@@ -2003,7 +2003,7 @@ def extract_visual_data(
                     for fut in concurrent.futures.as_completed(futures):
                         ss = fut.result()
                         if ss is not None:
-                            if getattr(ss, "_vision_used", False):
+                            if ss._vision_used:
                                 vision_api_frames += 1
                             sub_sections.append(ss)
             else:
@@ -2018,7 +2018,7 @@ def extract_visual_data(
                     use_vision_api,
                 )
                 if ss is not None:
-                    if getattr(ss, "_vision_used", False):
+                    if ss._vision_used:
                         vision_api_frames += 1
                     sub_sections.append(ss)
 
diff --git a/src/skill_seekers/mcp/server_fastmcp.py b/src/skill_seekers/mcp/server_fastmcp.py
index 5e8a581..4f6308e 100644
--- a/src/skill_seekers/mcp/server_fastmcp.py
+++ b/src/skill_seekers/mcp/server_fastmcp.py
@@ -3,20 +3,21 @@
 Skill Seeker MCP Server (FastMCP Implementation)
 
 Modern, decorator-based MCP server using FastMCP for simplified tool registration.
-Provides 25 tools for generating Claude AI skills from documentation.
+Provides 33 tools for generating Claude AI skills from documentation.
 
 This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
 All tool implementations are delegated to modular tool files in tools/ directory.
 
 **Architecture:**
 - FastMCP server with decorator-based tool registration
-- 25 tools organized into 6 categories:
+- 33 tools organized into 7 categories:
   * Config tools (3): generate_config, list_configs, validate_config
-  * Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
+  * Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
   * Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
   * Splitting tools (2): split_config, generate_router
-  * Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
+  * Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
   * Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
+  * Workflow tools (5): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow
 
 **Usage:**
   # Stdio transport (default, backward compatible)
@@ -140,6 +141,7 @@ except ImportError:
         scrape_docs_impl,
         scrape_github_impl,
         scrape_pdf_impl,
+        scrape_video_impl,
         split_config_impl,
         submit_config_impl,
         upload_skill_impl,
@@ -250,7 +252,7 @@ async def validate_config(config_path: str) -> str:
 
 
 # ============================================================================
-# SCRAPING TOOLS (4 tools)
+# SCRAPING TOOLS (10 tools)
 # ============================================================================
 
 
@@ -432,6 +434,12 @@ async def scrape_video(
     description: str | None = None,
     languages: str | None = None,
     from_json: str | None = None,
+    visual: bool = False,
+    whisper_model: str | None = None,
+    visual_interval: float | None = None,
+    visual_min_gap: float | None = None,
+    visual_similarity: float | None = None,
+    vision_ocr: bool = False,
 ) -> str:
     """
     Scrape video content and build Claude skill.
@@ -444,6 +452,12 @@ async def scrape_video(
         description: Skill description
         languages: Transcript language preferences (comma-separated)
         from_json: Build from extracted JSON file
+        visual: Enable visual frame extraction (requires video-full extras)
+        whisper_model: Whisper model size for local transcription (e.g., base, small, medium, large)
+        visual_interval: Seconds between frame captures (default: 5.0)
+        visual_min_gap: Minimum seconds between kept frames (default: 2.0)
+        visual_similarity: Similarity threshold to skip duplicate frames 0.0-1.0 (default: 0.95)
+        vision_ocr: Use vision model for OCR on extracted frames
 
     Returns:
         Video scraping results with file paths.
@@ -463,6 +477,18 @@ async def scrape_video(
         args["languages"] = languages
     if from_json:
         args["from_json"] = from_json
+    if visual:
+        args["visual"] = visual
+    if whisper_model:
+        args["whisper_model"] = whisper_model
+    if visual_interval is not None:
+        args["visual_interval"] = visual_interval
+    if visual_min_gap is not None:
+        args["visual_min_gap"] = visual_min_gap
+    if visual_similarity is not None:
+        args["visual_similarity"] = visual_similarity
+    if vision_ocr:
+        args["vision_ocr"] = vision_ocr
 
     result = await scrape_video_impl(args)
     if isinstance(result, list) and result:
diff --git a/src/skill_seekers/mcp/tools/scraping_tools.py b/src/skill_seekers/mcp/tools/scraping_tools.py
index a9c083c..5cea4e7 100644
--- a/src/skill_seekers/mcp/tools/scraping_tools.py
+++ b/src/skill_seekers/mcp/tools/scraping_tools.py
@@ -372,6 +372,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
             - description (str, optional): Skill description
             - languages (str, optional): Language preferences (comma-separated)
             - from_json (str, optional): Build from extracted JSON file
+            - visual (bool, optional): Enable visual frame extraction (default: False)
+            - whisper_model (str, optional): Whisper model size (default: base)
+            - visual_interval (float, optional): Seconds between frame captures (default: 5.0)
+            - visual_min_gap (float, optional): Minimum seconds between kept frames (default: 2.0)
+            - visual_similarity (float, optional): Similarity threshold to skip duplicate frames (default: 0.95)
+            - vision_ocr (bool, optional): Use vision model for OCR on frames (default: False)
 
     Returns:
         List[TextContent]: Tool execution results
@@ -383,6 +389,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
     description = args.get("description")
     languages = args.get("languages")
     from_json = args.get("from_json")
+    visual = args.get("visual", False)
+    whisper_model = args.get("whisper_model")
+    visual_interval = args.get("visual_interval")
+    visual_min_gap = args.get("visual_min_gap")
+    visual_similarity = args.get("visual_similarity")
+    vision_ocr = args.get("vision_ocr", False)
 
     # Build command
     cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")]
@@ -415,6 +427,20 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
             )
         ]
 
+    # Visual extraction parameters
+    if visual:
+        cmd.append("--visual")
+    if whisper_model:
+        cmd.extend(["--whisper-model", whisper_model])
+    if visual_interval is not None:
+        cmd.extend(["--visual-interval", str(visual_interval)])
+    if visual_min_gap is not None:
+        cmd.extend(["--visual-min-gap", str(visual_min_gap)])
+    if visual_similarity is not None:
+        cmd.extend(["--visual-similarity", str(visual_similarity)])
+    if vision_ocr:
+        cmd.append("--vision-ocr")
+
     # Run video_scraper.py with streaming
     timeout = 600  # 10 minutes for video extraction
 
diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py
index 57fafef..8d240d1 100644
--- a/tests/test_cli_parsers.py
+++ b/tests/test_cli_parsers.py
@@ -24,12 +24,12 @@ class TestParserRegistry:
 
     def test_all_parsers_registered(self):
         """Test that all parsers are registered."""
-        assert len(PARSERS) == 22, f"Expected 22 parsers, got {len(PARSERS)}"
+        assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}"
 
     def test_get_parser_names(self):
         """Test getting list of parser names."""
         names = get_parser_names()
-        assert len(names) == 22
+        assert len(names) == 23
         assert "scrape" in names
         assert "github" in names
         assert "package" in names
@@ -37,6 +37,7 @@ class TestParserRegistry:
         assert "analyze" in names
         assert "config" in names
         assert "workflows" in names
+        assert "video" in names
 
     def test_all_parsers_are_subcommand_parsers(self):
         """Test that all parsers inherit from SubcommandParser."""
@@ -242,9 +243,9 @@ class TestBackwardCompatibility:
             assert cmd in names, f"Command '{cmd}' not found in parser registry!"
 
     def test_command_count_matches(self):
-        """Test that we have exactly 22 commands (includes new create, workflows, and word commands)."""
-        assert len(PARSERS) == 22
-        assert len(get_parser_names()) == 22
+        """Test that we have exactly 23 commands (includes create, workflows, word, and video commands)."""
+        assert len(PARSERS) == 23
+        assert len(get_parser_names()) == 23
 
 
 if __name__ == "__main__":
diff --git a/uv.lock b/uv.lock
index 967c3d3..55d4cab 100644
--- a/uv.lock
+++ b/uv.lock
@@ -5983,7 +5983,6 @@ dependencies = [
     { name = "pygithub" },
     { name = "pygments" },
     { name = "pymupdf" },
-    { name = "pytesseract" },
     { name = "python-dotenv" },
     { name = "pyyaml" },
     { name = "requests" },
@@ -6084,6 +6083,7 @@ video-full = [
     { name = "easyocr" },
     { name = "faster-whisper" },
     { name = "opencv-python-headless" },
+    { name = "pytesseract" },
     { name = "scenedetect", extra = ["opencv"] },
     { name = "youtube-transcript-api" },
     { name = "yt-dlp" },
@@ -6164,7 +6164,7 @@ requires-dist = [
     { name = "pygithub", specifier = ">=2.5.0" },
     { name = "pygments", specifier = ">=2.19.2" },
     { name = "pymupdf", specifier = ">=1.24.14" },
-    { name = "pytesseract", specifier = ">=0.3.13" },
+    { name = "pytesseract", marker = "extra == 'video-full'", specifier = ">=0.3.13" },
     { name = "python-docx", marker = "extra == 'all'", specifier = ">=1.1.0" },
     { name = "python-docx", marker = "extra == 'docx'", specifier = ">=1.1.0" },
     { name = "python-dotenv", specifier = ">=1.1.1" },