fix: resolve 15 bugs and gaps in video scraper pipeline

- Fix extract_visual_data returning a 2-tuple instead of a 3-tuple (ValueError crash)
- Move pytesseract from core deps to [video-full] optional group
- Add 30-min timeout + user feedback to video enhancement subprocess
- Add scrape_video_impl to MCP server fallback import block
- Detect auto-generated YouTube captions via is_generated property
- Forward --vision-ocr and --video-playlist through create command
- Fix filename collision for non-ASCII video titles (fallback to video_id)
- Make _vision_used a proper dataclass field on FrameSubSection
- Expose 6 visual params in MCP scrape_video tool
- Add install instructions on missing video deps in unified scraper
- Update MCP docstring tool counts (25→33, 7 categories)
- Add video and word commands to main.py docstring
- Document video-full exclusion from [all] deps in pyproject.toml
- Update parser registry test count (22→23 for video parser)

All 2437 tests passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 12:39:21 +03:00
parent 066e19674a
commit 12bc29ab36
13 changed files with 171 additions and 33 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,6 @@ dependencies = [
    "anthropic>=0.76.0", # Required for AI enhancement (core feature)
    "PyMuPDF>=1.24.14",
    "Pillow>=11.0.0",
-    "pytesseract>=0.3.13",
    "pydantic>=2.12.3",
    "pydantic-settings>=2.11.0",
    "python-dotenv>=1.1.1",
@@ -129,6 +128,7 @@ video-full = [
    "scenedetect[opencv]>=0.6.4",
    "easyocr>=1.7.0",
    "opencv-python-headless>=4.9.0",
+    "pytesseract>=0.3.13",
 ]

 # RAG vector database upload support
@@ -172,6 +172,8 @@ embedding = [
 ]

 # All optional dependencies combined (dev dependencies now in [dependency-groups])
+# Note: video-full deps (opencv, easyocr, faster-whisper) are excluded due to heavy
+# native dependencies. Install separately: pip install "skill-seekers[video-full]"
 all = [
    "mammoth>=1.6.0",
    "python-docx>=1.1.0",