fix: resolve 15 bugs and gaps in video scraper pipeline

- Fix extract_visual_data returning a 2-tuple instead of a 3-tuple when the video cannot be opened (ValueError crash)
- Move pytesseract from core deps to [video-full] optional group
- Add a 30-minute timeout, a start message, and manual-retry hints to the video enhancement subprocess
- Add scrape_video_impl to MCP server fallback import block
- Detect auto-generated YouTube captions via is_generated property
- Forward --vision-ocr and --video-playlist through create command
- Fix filename collision for non-ASCII video titles (fall back to video_id, then to a hash of the title)
- Make _vision_used a proper dataclass field on FrameSubSection
- Expose 6 visual params in MCP scrape_video tool
- Add install instructions on missing video deps in unified scraper
- Update MCP docstring tool counts (25→33, 7 categories)
- Add video and word commands to main.py docstring
- Document video-full exclusion from [all] deps in pyproject.toml
- Update parser registry test count (22→23 for video parser)

All 2437 tests passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 12:39:21 +03:00
parent 066e19674a
commit 12bc29ab36
13 changed files with 171 additions and 33 deletions

View File

@@ -488,6 +488,13 @@ VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "THRESH",
},
},
"vision_ocr": {
"flags": ("--vision-ocr",),
"kwargs": {
"action": "store_true",
"help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)",
},
},
}
# Multi-source config specific (from unified_scraper.py)

View File

@@ -360,8 +360,12 @@ class CreateCommand:
# Add video source (URL or file)
parsed = self.source_info.parsed
video_playlist = getattr(self.args, "video_playlist", None)
if parsed.get("source_kind") == "file":
argv.extend(["--video-file", parsed["file_path"]])
elif video_playlist:
# Explicit --video-playlist flag takes precedence
argv.extend(["--playlist", video_playlist])
elif parsed.get("url"):
url = parsed["url"]
# Detect playlist vs single video
@@ -374,11 +378,15 @@ class CreateCommand:
self._add_common_args(argv)
# Add video-specific arguments
video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None)
video_langs = getattr(self.args, "video_languages", None) or getattr(
self.args, "languages", None
)
if video_langs:
argv.extend(["--languages", video_langs])
if getattr(self.args, "visual", False):
argv.append("--visual")
if getattr(self.args, "vision_ocr", False):
argv.append("--vision-ocr")
if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base":
argv.extend(["--whisper-model", self.args.whisper_model])
vi = getattr(self.args, "visual_interval", None)

View File

@@ -12,6 +12,8 @@ Commands:
scrape Scrape documentation website
github Scrape GitHub repository
pdf Extract from PDF file
word Extract from Word (.docx) file
video Extract from video (YouTube or local)
unified Multi-source scraping (docs + GitHub + PDF)
analyze Analyze local codebase and extract code knowledge
enhance AI-powered enhancement (auto: API or LOCAL mode)

View File

@@ -79,7 +79,14 @@ class UnifiedScraper:
}
# Track source index for unique naming (multi-source support)
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
self._source_counters = {
"documentation": 0,
"github": 0,
"pdf": 0,
"word": 0,
"video": 0,
"local": 0,
}
# Output paths - cleaner organization
self.name = self.config["name"]
@@ -583,8 +590,12 @@ class UnifiedScraper:
"""Scrape video source (YouTube, local file, etc.)."""
try:
from skill_seekers.cli.video_scraper import VideoToSkillConverter
except ImportError:
logger.error("video_scraper.py not found")
except ImportError as e:
logger.error(
f"Video scraper dependencies not installed: {e}\n"
" Install with: pip install skill-seekers[video]\n"
" For visual extraction (frame analysis, OCR): pip install skill-seekers[video-full]"
)
return
# Multi-source support: Get unique index for this video source
@@ -630,8 +641,7 @@ class UnifiedScraper:
logger.info("✅ Video: Standalone SKILL.md created")
logger.info(
f"✅ Video: {len(result.videos)} videos, "
f"{result.total_segments} segments extracted"
f"✅ Video: {len(result.videos)} videos, {result.total_segments} segments extracted"
)
except Exception as e:
logger.error(f"Failed to process video source: {e}")

View File

@@ -222,6 +222,7 @@ class FrameSubSection:
ocr_regions: list[OCRRegion] = field(default_factory=list)
ocr_confidence: float = 0.0
panel_id: str = "" # e.g. "panel_0_0" (row_col)
_vision_used: bool = False # Whether Vision API was used for OCR
def to_dict(self) -> dict:
return {

View File

@@ -469,7 +469,12 @@ class VideoToSkillConverter:
# Generate reference files for each video
for video in self.result.videos:
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
ref_path = os.path.join(refs_dir, ref_filename)
ref_content = self._generate_reference_md(video)
with open(ref_path, "w", encoding="utf-8") as f:
@@ -750,7 +755,12 @@ class VideoToSkillConverter:
preview += "..."
lines.append(f"{preview}\n")
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
lines.append(
f"> Full transcript: [references/{ref_filename}](references/{ref_filename})\n"
)
@@ -766,7 +776,12 @@ class VideoToSkillConverter:
# References
lines.append("## References\n")
for video in self.result.videos:
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
sanitized = (
_sanitize_filename(video.title)
or video.video_id
or f"video_{hash(video.title) % 10000:04d}"
)
ref_filename = f"video_{sanitized}.md"
lines.append(f"- [{video.title}](references/{ref_filename})")
return "\n".join(lines)
@@ -940,11 +955,25 @@ def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None:
if api_key:
enhance_cmd.extend(["--api-key", api_key])
result = subprocess.run(enhance_cmd, check=True)
if result.returncode == 0:
logger.info("✅ Video skill enhancement complete!")
except subprocess.CalledProcessError:
logger.warning("⚠ Enhancement failed, but skill was still built")
logger.info(
"Starting video skill enhancement (this may take 10+ minutes "
"for large videos with AI enhancement)..."
)
subprocess.run(enhance_cmd, check=True, timeout=1800)
logger.info("Video skill enhancement complete!")
except subprocess.TimeoutExpired:
logger.warning(
"⚠ Enhancement timed out after 30 minutes. "
"The skill was still built without enhancement. "
"You can retry manually with:\n"
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
)
except subprocess.CalledProcessError as exc:
logger.warning(
f"⚠ Enhancement failed (exit code {exc.returncode}), "
"but skill was still built. You can retry manually with:\n"
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
)
except FileNotFoundError:
logger.warning("⚠ skill-seekers-enhance not found. Run manually:")
logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")

View File

@@ -70,10 +70,36 @@ def extract_youtube_transcript(
try:
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id, languages=languages)
# Use list_transcripts to detect whether the transcript is auto-generated
source = TranscriptSource.YOUTUBE_MANUAL
try:
transcript_list = ytt_api.list(video_id)
# Prefer manually created transcripts; fall back to auto-generated
try:
transcript_entry = transcript_list.find_manually_created_transcript(languages)
source = TranscriptSource.YOUTUBE_MANUAL
except Exception:
try:
transcript_entry = transcript_list.find_generated_transcript(languages)
source = TranscriptSource.YOUTUBE_AUTO
except Exception:
# Fall back to any available transcript
transcript_entry = transcript_list.find_transcript(languages)
source = (
TranscriptSource.YOUTUBE_AUTO
if transcript_entry.is_generated
else TranscriptSource.YOUTUBE_MANUAL
)
transcript = transcript_entry.fetch()
except Exception:
# Fall back to direct fetch if list fails (older API versions)
transcript = ytt_api.fetch(video_id, languages=languages)
# Check is_generated on the FetchedTranscript if available
if getattr(transcript, "is_generated", False):
source = TranscriptSource.YOUTUBE_AUTO
segments = []
source = TranscriptSource.YOUTUBE_MANUAL
for snippet in transcript.snippets:
text = snippet.text.strip()
if not text:

View File

@@ -1864,7 +1864,7 @@ def _ocr_single_panel(
panel_id=f"panel_{row}_{col}",
)
# Stash vision_used flag for the caller to count
ss._vision_used = vision_used # type: ignore[attr-defined]
ss._vision_used = vision_used
return ss
@@ -1918,7 +1918,7 @@ def extract_visual_data(
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
logger.error(f"Cannot open video: {video_path}")
return [], []
return [], [], None
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
@@ -2003,7 +2003,7 @@ def extract_visual_data(
for fut in concurrent.futures.as_completed(futures):
ss = fut.result()
if ss is not None:
if getattr(ss, "_vision_used", False):
if ss._vision_used:
vision_api_frames += 1
sub_sections.append(ss)
else:
@@ -2018,7 +2018,7 @@ def extract_visual_data(
use_vision_api,
)
if ss is not None:
if getattr(ss, "_vision_used", False):
if ss._vision_used:
vision_api_frames += 1
sub_sections.append(ss)