fix: resolve 14 bugs and gaps in video scraper pipeline

- Fix extract_visual_data returning 2-tuple instead of 3 (ValueError crash)
- Move pytesseract from core deps to [video-full] optional group
- Add 30-min timeout + user feedback to video enhancement subprocess
- Add scrape_video_impl to MCP server fallback import block
- Detect auto-generated YouTube captions via is_generated property
- Forward --vision-ocr and --video-playlist through create command
- Fix filename collision for non-ASCII video titles (fallback to video_id)
- Make _vision_used a proper dataclass field on FrameSubSection
- Expose 6 visual params in MCP scrape_video tool
- Add install instructions on missing video deps in unified scraper
- Update MCP docstring tool counts (25→33, 7 categories)
- Add video and word commands to main.py docstring
- Document video-full exclusion from [all] deps in pyproject.toml
- Update parser registry test count (22→23 for video parser)

All 2437 tests passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 12:39:21 +03:00
parent 066e19674a
commit 12bc29ab36
13 changed files with 171 additions and 33 deletions

View File

@@ -52,7 +52,6 @@ dependencies = [
"anthropic>=0.76.0", # Required for AI enhancement (core feature)
"PyMuPDF>=1.24.14",
"Pillow>=11.0.0",
"pytesseract>=0.3.13",
"pydantic>=2.12.3",
"pydantic-settings>=2.11.0",
"python-dotenv>=1.1.1",
@@ -129,6 +128,7 @@ video-full = [
"scenedetect[opencv]>=0.6.4",
"easyocr>=1.7.0",
"opencv-python-headless>=4.9.0",
"pytesseract>=0.3.13",
]
# RAG vector database upload support
@@ -172,6 +172,8 @@ embedding = [
]
# All optional dependencies combined (dev dependencies now in [dependency-groups])
# Note: video-full deps (opencv, easyocr, faster-whisper) excluded due to heavy
# native dependencies. Install separately: pip install skill-seekers[video-full]
all = [
"mammoth>=1.6.0",
"python-docx>=1.1.0",

View File

@@ -488,6 +488,13 @@ VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "THRESH",
},
},
"vision_ocr": {
"flags": ("--vision-ocr",),
"kwargs": {
"action": "store_true",
"help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)",
},
},
}
# Multi-source config specific (from unified_scraper.py)

View File

@@ -360,8 +360,12 @@ class CreateCommand:
# Add video source (URL or file)
parsed = self.source_info.parsed
video_playlist = getattr(self.args, "video_playlist", None)
if parsed.get("source_kind") == "file":
argv.extend(["--video-file", parsed["file_path"]])
elif video_playlist:
# Explicit --video-playlist flag takes precedence
argv.extend(["--playlist", video_playlist])
elif parsed.get("url"):
url = parsed["url"]
# Detect playlist vs single video
@@ -374,11 +378,15 @@ class CreateCommand:
self._add_common_args(argv)
# Add video-specific arguments
video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None)
video_langs = getattr(self.args, "video_languages", None) or getattr(
self.args, "languages", None
)
if video_langs:
argv.extend(["--languages", video_langs])
if getattr(self.args, "visual", False):
argv.append("--visual")
if getattr(self.args, "vision_ocr", False):
argv.append("--vision-ocr")
if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base":
argv.extend(["--whisper-model", self.args.whisper_model])
vi = getattr(self.args, "visual_interval", None)

View File

@@ -12,6 +12,8 @@ Commands:
scrape Scrape documentation website
github Scrape GitHub repository
pdf Extract from PDF file
word Extract from Word (.docx) file
video Extract from video (YouTube or local)
unified Multi-source scraping (docs + GitHub + PDF)
analyze Analyze local codebase and extract code knowledge
enhance AI-powered enhancement (auto: API or LOCAL mode)

View File

@@ -79,7 +79,14 @@ class UnifiedScraper:
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
self._source_counters = {
"documentation": 0,
"github": 0,
"pdf": 0,
"word": 0,
"video": 0,
"local": 0,
}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -583,8 +590,12 @@ class UnifiedScraper:
"""Scrape video source (YouTube, local file, etc.)."""
try:
from skill_seekers.cli.video_scraper import VideoToSkillConverter
except ImportError:
logger.error("video_scraper.py not found")
except ImportError as e:
logger.error(
f"Video scraper dependencies not installed: {e}\n"
" Install with: pip install skill-seekers[video]\n"
" For visual extraction (frame analysis, OCR): pip install skill-seekers[video-full]"
)
return
# Multi-source support: Get unique index for this video source
@@ -630,8 +641,7 @@ class UnifiedScraper:
logger.info("✅ Video: Standalone SKILL.md created")
logger.info(
f"✅ Video: {len(result.videos)} videos, "
f"{result.total_segments} segments extracted"
f"✅ Video: {len(result.videos)} videos, {result.total_segments} segments extracted"
)
except Exception as e:
logger.error(f"Failed to process video source: {e}")

View File

@@ -222,6 +222,7 @@ class FrameSubSection:
ocr_regions: list[OCRRegion] = field(default_factory=list)
ocr_confidence: float = 0.0
panel_id: str = "" # e.g. "panel_0_0" (row_col)
_vision_used: bool = False # Whether Vision API was used for OCR
def to_dict(self) -> dict:
return {

View File

@@ -469,7 +469,12 @@ class VideoToSkillConverter:
# Generate reference files for each video
for video in self.result.videos:
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
ref_path = os.path.join(refs_dir, ref_filename)
ref_content = self._generate_reference_md(video)
with open(ref_path, "w", encoding="utf-8") as f:
@@ -750,7 +755,12 @@ class VideoToSkillConverter:
preview += "..."
lines.append(f"{preview}\n")
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
lines.append(
f"> Full transcript: [references/{ref_filename}](references/{ref_filename})\n"
)
@@ -766,7 +776,12 @@ class VideoToSkillConverter:
# References
lines.append("## References\n")
for video in self.result.videos:
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
lines.append(f"- [{video.title}](references/{ref_filename})")
return "\n".join(lines)
@@ -940,11 +955,25 @@ def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None:
if api_key:
enhance_cmd.extend(["--api-key", api_key])
result = subprocess.run(enhance_cmd, check=True)
if result.returncode == 0:
logger.info("✅ Video skill enhancement complete!")
except subprocess.CalledProcessError:
logger.warning("⚠ Enhancement failed, but skill was still built")
logger.info(
"Starting video skill enhancement (this may take 10+ minutes "
"for large videos with AI enhancement)..."
)
subprocess.run(enhance_cmd, check=True, timeout=1800)
logger.info("Video skill enhancement complete!")
except subprocess.TimeoutExpired:
logger.warning(
"⚠ Enhancement timed out after 30 minutes. "
"The skill was still built without enhancement. "
"You can retry manually with:\n"
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
)
except subprocess.CalledProcessError as exc:
logger.warning(
f"⚠ Enhancement failed (exit code {exc.returncode}), "
"but skill was still built. You can retry manually with:\n"
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
)
except FileNotFoundError:
logger.warning("⚠ skill-seekers-enhance not found. Run manually:")
logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")

View File

@@ -70,10 +70,36 @@ def extract_youtube_transcript(
try:
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id, languages=languages)
# Use list_transcripts to detect whether the transcript is auto-generated
source = TranscriptSource.YOUTUBE_MANUAL
try:
transcript_list = ytt_api.list(video_id)
# Prefer manually created transcripts; fall back to auto-generated
try:
transcript_entry = transcript_list.find_manually_created_transcript(languages)
source = TranscriptSource.YOUTUBE_MANUAL
except Exception:
try:
transcript_entry = transcript_list.find_generated_transcript(languages)
source = TranscriptSource.YOUTUBE_AUTO
except Exception:
# Fall back to any available transcript
transcript_entry = transcript_list.find_transcript(languages)
source = (
TranscriptSource.YOUTUBE_AUTO
if transcript_entry.is_generated
else TranscriptSource.YOUTUBE_MANUAL
)
transcript = transcript_entry.fetch()
except Exception:
# Fall back to direct fetch if list fails (older API versions)
transcript = ytt_api.fetch(video_id, languages=languages)
# Check is_generated on the FetchedTranscript if available
if getattr(transcript, "is_generated", False):
source = TranscriptSource.YOUTUBE_AUTO
segments = []
source = TranscriptSource.YOUTUBE_MANUAL
for snippet in transcript.snippets:
text = snippet.text.strip()
if not text:

View File

@@ -1864,7 +1864,7 @@ def _ocr_single_panel(
panel_id=f"panel_{row}_{col}",
)
# Stash vision_used flag for the caller to count
ss._vision_used = vision_used # type: ignore[attr-defined]
ss._vision_used = vision_used
return ss
@@ -1918,7 +1918,7 @@ def extract_visual_data(
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
logger.error(f"Cannot open video: {video_path}")
return [], []
return [], [], None
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
@@ -2003,7 +2003,7 @@ def extract_visual_data(
for fut in concurrent.futures.as_completed(futures):
ss = fut.result()
if ss is not None:
if getattr(ss, "_vision_used", False):
if ss._vision_used:
vision_api_frames += 1
sub_sections.append(ss)
else:
@@ -2018,7 +2018,7 @@ def extract_visual_data(
use_vision_api,
)
if ss is not None:
if getattr(ss, "_vision_used", False):
if ss._vision_used:
vision_api_frames += 1
sub_sections.append(ss)

View File

@@ -3,20 +3,21 @@
Skill Seeker MCP Server (FastMCP Implementation)
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
Provides 25 tools for generating Claude AI skills from documentation.
Provides 33 tools for generating Claude AI skills from documentation.
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
All tool implementations are delegated to modular tool files in tools/ directory.
**Architecture:**
- FastMCP server with decorator-based tool registration
- 25 tools organized into 6 categories:
- 33 tools organized into 7 categories:
* Config tools (3): generate_config, list_configs, validate_config
* Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
* Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
* Splitting tools (2): split_config, generate_router
* Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
* Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
* Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
* Workflow tools (5): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow
**Usage:**
# Stdio transport (default, backward compatible)
@@ -140,6 +141,7 @@ except ImportError:
scrape_docs_impl,
scrape_github_impl,
scrape_pdf_impl,
scrape_video_impl,
split_config_impl,
submit_config_impl,
upload_skill_impl,
@@ -250,7 +252,7 @@ async def validate_config(config_path: str) -> str:
# ============================================================================
# SCRAPING TOOLS (4 tools)
# SCRAPING TOOLS (10 tools)
# ============================================================================
@@ -432,6 +434,12 @@ async def scrape_video(
description: str | None = None,
languages: str | None = None,
from_json: str | None = None,
visual: bool = False,
whisper_model: str | None = None,
visual_interval: float | None = None,
visual_min_gap: float | None = None,
visual_similarity: float | None = None,
vision_ocr: bool = False,
) -> str:
"""
Scrape video content and build Claude skill.
@@ -444,6 +452,12 @@ async def scrape_video(
description: Skill description
languages: Transcript language preferences (comma-separated)
from_json: Build from extracted JSON file
visual: Enable visual frame extraction (requires video-full extras)
whisper_model: Whisper model size for local transcription (e.g., base, small, medium, large)
visual_interval: Seconds between frame captures (default: 5.0)
visual_min_gap: Minimum seconds between kept frames (default: 2.0)
visual_similarity: Similarity threshold to skip duplicate frames 0.0-1.0 (default: 0.95)
vision_ocr: Use vision model for OCR on extracted frames
Returns:
Video scraping results with file paths.
@@ -463,6 +477,18 @@ async def scrape_video(
args["languages"] = languages
if from_json:
args["from_json"] = from_json
if visual:
args["visual"] = visual
if whisper_model:
args["whisper_model"] = whisper_model
if visual_interval is not None:
args["visual_interval"] = visual_interval
if visual_min_gap is not None:
args["visual_min_gap"] = visual_min_gap
if visual_similarity is not None:
args["visual_similarity"] = visual_similarity
if vision_ocr:
args["vision_ocr"] = vision_ocr
result = await scrape_video_impl(args)
if isinstance(result, list) and result:

View File

@@ -372,6 +372,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
- description (str, optional): Skill description
- languages (str, optional): Language preferences (comma-separated)
- from_json (str, optional): Build from extracted JSON file
- visual (bool, optional): Enable visual frame extraction (default: False)
- whisper_model (str, optional): Whisper model size (default: base)
- visual_interval (float, optional): Seconds between frame captures (default: 5.0)
- visual_min_gap (float, optional): Minimum seconds between kept frames (default: 2.0)
- visual_similarity (float, optional): Similarity threshold to skip duplicate frames (default: 0.95)
- vision_ocr (bool, optional): Use vision model for OCR on frames (default: False)
Returns:
List[TextContent]: Tool execution results
@@ -383,6 +389,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
description = args.get("description")
languages = args.get("languages")
from_json = args.get("from_json")
visual = args.get("visual", False)
whisper_model = args.get("whisper_model")
visual_interval = args.get("visual_interval")
visual_min_gap = args.get("visual_min_gap")
visual_similarity = args.get("visual_similarity")
vision_ocr = args.get("vision_ocr", False)
# Build command
cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")]
@@ -415,6 +427,20 @@ async def scrape_video_tool(args: dict) -> list[TextContent]:
)
]
# Visual extraction parameters
if visual:
cmd.append("--visual")
if whisper_model:
cmd.extend(["--whisper-model", whisper_model])
if visual_interval is not None:
cmd.extend(["--visual-interval", str(visual_interval)])
if visual_min_gap is not None:
cmd.extend(["--visual-min-gap", str(visual_min_gap)])
if visual_similarity is not None:
cmd.extend(["--visual-similarity", str(visual_similarity)])
if vision_ocr:
cmd.append("--vision-ocr")
# Run video_scraper.py with streaming
timeout = 600 # 10 minutes for video extraction

View File

@@ -24,12 +24,12 @@ class TestParserRegistry:
def test_all_parsers_registered(self):
"""Test that all parsers are registered."""
assert len(PARSERS) == 22, f"Expected 22 parsers, got {len(PARSERS)}"
assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}"
def test_get_parser_names(self):
"""Test getting list of parser names."""
names = get_parser_names()
assert len(names) == 22
assert len(names) == 23
assert "scrape" in names
assert "github" in names
assert "package" in names
@@ -37,6 +37,7 @@ class TestParserRegistry:
assert "analyze" in names
assert "config" in names
assert "workflows" in names
assert "video" in names
def test_all_parsers_are_subcommand_parsers(self):
"""Test that all parsers inherit from SubcommandParser."""
@@ -242,9 +243,9 @@ class TestBackwardCompatibility:
assert cmd in names, f"Command '{cmd}' not found in parser registry!"
def test_command_count_matches(self):
"""Test that we have exactly 22 commands (includes new create, workflows, and word commands)."""
assert len(PARSERS) == 22
assert len(get_parser_names()) == 22
"""Test that we have exactly 23 commands (includes create, workflows, word, and video commands)."""
assert len(PARSERS) == 23
assert len(get_parser_names()) == 23
if __name__ == "__main__":

4
uv.lock generated
View File

@@ -5983,7 +5983,6 @@ dependencies = [
{ name = "pygithub" },
{ name = "pygments" },
{ name = "pymupdf" },
{ name = "pytesseract" },
{ name = "python-dotenv" },
{ name = "pyyaml" },
{ name = "requests" },
@@ -6084,6 +6083,7 @@ video-full = [
{ name = "easyocr" },
{ name = "faster-whisper" },
{ name = "opencv-python-headless" },
{ name = "pytesseract" },
{ name = "scenedetect", extra = ["opencv"] },
{ name = "youtube-transcript-api" },
{ name = "yt-dlp" },
@@ -6164,7 +6164,7 @@ requires-dist = [
{ name = "pygithub", specifier = ">=2.5.0" },
{ name = "pygments", specifier = ">=2.19.2" },
{ name = "pymupdf", specifier = ">=1.24.14" },
{ name = "pytesseract", specifier = ">=0.3.13" },
{ name = "pytesseract", marker = "extra == 'video-full'", specifier = ">=0.3.13" },
{ name = "python-docx", marker = "extra == 'all'", specifier = ">=1.1.0" },
{ name = "python-docx", marker = "extra == 'docx'", specifier = ">=1.1.0" },
{ name = "python-dotenv", specifier = ">=1.1.1" },