fix: resolve 15 bugs and gaps in video scraper pipeline
- Fix extract_visual_data returning 2-tuple instead of a 3-tuple (ValueError crash)
- Move pytesseract from core deps to [video-full] optional group
- Add 30-min timeout + user feedback to video enhancement subprocess
- Add scrape_video_impl to MCP server fallback import block
- Detect auto-generated YouTube captions via is_generated property
- Forward --vision-ocr and --video-playlist through create command
- Fix filename collision for non-ASCII video titles (fallback to video_id)
- Make _vision_used a proper dataclass field on FrameSubSection
- Expose 6 visual params in MCP scrape_video tool
- Add install instructions on missing video deps in unified scraper
- Update MCP docstring tool counts (25→33, 7 categories)
- Add video and word commands to main.py docstring
- Document video-full exclusion from [all] deps in pyproject.toml
- Update parser registry test count (22→23 for video parser)

All 2437 tests passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -52,7 +52,6 @@ dependencies = [
|
||||
"anthropic>=0.76.0", # Required for AI enhancement (core feature)
|
||||
"PyMuPDF>=1.24.14",
|
||||
"Pillow>=11.0.0",
|
||||
"pytesseract>=0.3.13",
|
||||
"pydantic>=2.12.3",
|
||||
"pydantic-settings>=2.11.0",
|
||||
"python-dotenv>=1.1.1",
|
||||
@@ -129,6 +128,7 @@ video-full = [
|
||||
"scenedetect[opencv]>=0.6.4",
|
||||
"easyocr>=1.7.0",
|
||||
"opencv-python-headless>=4.9.0",
|
||||
"pytesseract>=0.3.13",
|
||||
]
|
||||
|
||||
# RAG vector database upload support
|
||||
@@ -172,6 +172,8 @@ embedding = [
|
||||
]
|
||||
|
||||
# All optional dependencies combined (dev dependencies now in [dependency-groups])
|
||||
# Note: video-full deps (opencv, easyocr, faster-whisper) excluded due to heavy
|
||||
# native dependencies. Install separately: pip install "skill-seekers[video-full]"
|
||||
all = [
|
||||
"mammoth>=1.6.0",
|
||||
"python-docx>=1.1.0",
|
||||
|
||||
Reference in New Issue
Block a user