feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement
Add complete video tutorial extraction system that converts YouTube videos and local video files into AI-consumable skills. The pipeline extracts transcripts, performs visual OCR on code editor panels independently, tracks code evolution across frames, and generates structured SKILL.md output. Key features: - Video metadata extraction (YouTube, local files, playlists) - Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback) - Chapter-based and time-window segmentation - Visual extraction: keyframe detection, frame classification, panel detection - Per-panel sub-section OCR (each IDE panel OCR'd independently) - Parallel OCR with ThreadPoolExecutor for multi-panel frames - Narrow panel filtering (300px min width) to skip UI chrome - Text block tracking with spatial panel position matching - Code timeline with edit tracking across frames - Audio-visual alignment (code + narrator pairs) - Video-specific AI enhancement prompt for OCR denoising and code reconstruction - video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection, tutorial synthesis, skill polish) - CLI integration: skill-seekers video --url/--video-file/--playlist - MCP tool: scrape_video for automation - 161 tests passing Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -401,6 +401,86 @@ WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
},
|
||||
}
|
||||
|
||||
# Video specific (from video.py)
# Declarative table of video options for the `create` command: each entry maps
# an internal name to the argparse flag tuple ("flags") and the keyword
# arguments ("kwargs") that are passed to parser.add_argument().
# The source-selection and language flags carry a ``--video-`` prefix here
# (--video-url, --video-playlist, --video-languages).
VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
    # --- Source selection -------------------------------------------------
    "video_url": {
        "flags": ("--video-url",),
        "kwargs": {
            "type": str,
            "help": "Video URL (YouTube, Vimeo)",
            "metavar": "URL",
        },
    },
    "video_file": {
        "flags": ("--video-file",),
        "kwargs": {
            "type": str,
            "help": "Local video file path",
            "metavar": "PATH",
        },
    },
    "video_playlist": {
        "flags": ("--video-playlist",),
        "kwargs": {
            "type": str,
            "help": "Playlist URL",
            "metavar": "URL",
        },
    },
    # --- Transcript options -----------------------------------------------
    "video_languages": {
        "flags": ("--video-languages",),
        "kwargs": {
            "type": str,
            # A single comma-separated string, not a list.
            "default": "en",
            "help": "Transcript language preference (comma-separated)",
            "metavar": "LANGS",
        },
    },
    # --- Visual extraction --------------------------------------------------
    "visual": {
        "flags": ("--visual",),
        "kwargs": {
            "action": "store_true",
            "help": "Enable visual extraction (requires video-full deps)",
        },
    },
    "whisper_model": {
        "flags": ("--whisper-model",),
        "kwargs": {
            "type": str,
            "default": "base",
            "help": "Whisper model size (default: base)",
            "metavar": "MODEL",
        },
    },
    # --- Visual extraction tuning (floats) ----------------------------------
    "visual_interval": {
        "flags": ("--visual-interval",),
        "kwargs": {
            "type": float,
            "default": 0.7,
            "help": "Visual scan interval in seconds (default: 0.7)",
            "metavar": "SECS",
        },
    },
    "visual_min_gap": {
        "flags": ("--visual-min-gap",),
        "kwargs": {
            "type": float,
            "default": 0.5,
            "help": "Min gap between extracted frames in seconds (default: 0.5)",
            "metavar": "SECS",
        },
    },
    "visual_similarity": {
        "flags": ("--visual-similarity",),
        "kwargs": {
            "type": float,
            "default": 3.0,
            "help": "Pixel-diff threshold for duplicate detection; lower = more frames (default: 3.0)",
            "metavar": "THRESH",
        },
    },
}
|
||||
|
||||
# Multi-source config specific (from unified_scraper.py)
|
||||
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"merge_mode": {
|
||||
@@ -484,6 +564,7 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
|
||||
"local": LOCAL_ARGUMENTS,
|
||||
"pdf": PDF_ARGUMENTS,
|
||||
"word": WORD_ARGUMENTS,
|
||||
"video": VIDEO_ARGUMENTS,
|
||||
"config": CONFIG_ARGUMENTS,
|
||||
}
|
||||
return source_args.get(source_type, {})
|
||||
@@ -521,6 +602,7 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
- 'local': Universal + local-specific
|
||||
- 'pdf': Universal + pdf-specific
|
||||
- 'word': Universal + word-specific
|
||||
- 'video': Universal + video-specific
|
||||
- 'advanced': Advanced/rare arguments
|
||||
- 'all': All 120+ arguments
|
||||
|
||||
@@ -561,6 +643,10 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
for arg_name, arg_def in WORD_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["video", "all"]:
|
||||
for arg_name, arg_def in VIDEO_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
if mode in ["config", "all"]:
|
||||
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
141
src/skill_seekers/cli/arguments/video.py
Normal file
141
src/skill_seekers/cli/arguments/video.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""Video command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the video command in ONE place.
|
||||
Both video_scraper.py (standalone) and parsers/video_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Video-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
#
# Each entry maps an internal name to:
#   "flags"  - tuple of option strings passed positionally to add_argument()
#   "kwargs" - keyword arguments passed to add_argument()
VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = {
    # --- Source selection -------------------------------------------------
    "url": {
        "flags": ("--url",),
        "kwargs": {
            "type": str,
            "help": "Video URL (YouTube, Vimeo)",
            "metavar": "URL",
        },
    },
    "video_file": {
        "flags": ("--video-file",),
        "kwargs": {
            "type": str,
            "help": "Local video file path",
            "metavar": "PATH",
        },
    },
    "playlist": {
        "flags": ("--playlist",),
        "kwargs": {
            "type": str,
            "help": "Playlist URL",
            "metavar": "URL",
        },
    },
    # --- Transcript options -----------------------------------------------
    "languages": {
        "flags": ("--languages",),
        "kwargs": {
            "type": str,
            # A single comma-separated string, not a list.
            "default": "en",
            "help": "Transcript language preference (comma-separated, default: en)",
            "metavar": "LANGS",
        },
    },
    # --- Visual extraction --------------------------------------------------
    "visual": {
        "flags": ("--visual",),
        "kwargs": {
            "action": "store_true",
            "help": "Enable visual extraction (requires video-full deps)",
        },
    },
    "whisper_model": {
        "flags": ("--whisper-model",),
        "kwargs": {
            "type": str,
            "default": "base",
            "help": "Whisper model size (default: base)",
            "metavar": "MODEL",
        },
    },
    # --- Build from previously extracted data -------------------------------
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
    # --- Visual extraction tuning (floats) ----------------------------------
    "visual_interval": {
        "flags": ("--visual-interval",),
        "kwargs": {
            "type": float,
            "default": 0.7,
            "help": "Visual scan interval in seconds (default: 0.7)",
            "metavar": "SECS",
        },
    },
    "visual_min_gap": {
        "flags": ("--visual-min-gap",),
        "kwargs": {
            "type": float,
            "default": 0.5,
            "help": "Minimum gap between extracted frames in seconds (default: 0.5)",
            "metavar": "SECS",
        },
    },
    "visual_similarity": {
        "flags": ("--visual-similarity",),
        "kwargs": {
            "type": float,
            "default": 3.0,
            "help": "Pixel-diff threshold for duplicate frame detection; lower = more frames kept (default: 3.0)",
            "metavar": "THRESH",
        },
    },
    # --- Optional paid OCR fallback (opt-in flag) ----------------------------
    "vision_ocr": {
        "flags": ("--vision-ocr",),
        "kwargs": {
            "action": "store_true",
            "help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)",
        },
    },
}
|
||||
|
||||
|
||||
def add_video_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the video command on ``parser``.

    The shared arguments are added first through
    ``add_all_standard_arguments()``. The already-registered
    ``enhance_level`` action is then patched so that its default becomes 0
    and its help text says so. Finally each entry of ``VIDEO_ARGUMENTS`` is
    added on top.

    Args:
        parser: The argparse parser to populate; it is modified in place.
    """
    add_all_standard_arguments(parser)

    # Help text shown for --enhance-level once the video default is applied.
    video_enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for video), 1=SKILL.md only, 2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
    )
    # The shared registration owns --enhance-level, so the existing action is
    # patched in place rather than added a second time.
    for registered in parser._actions:
        if getattr(registered, "dest", None) != "enhance_level":
            continue
        registered.default = 0
        registered.help = video_enhance_help

    # Video-only options, driven by the declarative table.
    for spec in VIDEO_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
|
||||
@@ -27,7 +27,7 @@ class ConfigValidator:
|
||||
"""
|
||||
|
||||
# Valid source types
|
||||
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local"}
|
||||
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
|
||||
|
||||
# Valid merge modes
|
||||
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
|
||||
|
||||
@@ -133,6 +133,8 @@ class CreateCommand:
|
||||
return self._route_pdf()
|
||||
elif self.source_info.type == "word":
|
||||
return self._route_word()
|
||||
elif self.source_info.type == "video":
|
||||
return self._route_video()
|
||||
elif self.source_info.type == "config":
|
||||
return self._route_config()
|
||||
else:
|
||||
@@ -345,6 +347,55 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_video(self) -> int:
    """Build an argv for ``video_scraper`` and run its ``main()``.

    The source flag is chosen from ``self.source_info.parsed``: a local file
    becomes ``--video-file``; a URL becomes ``--playlist`` when the URL text
    contains "playlist", otherwise ``--url``. Common arguments are appended
    via ``self._add_common_args``. Video options are forwarded only when set
    and, for the model and numeric options, only when they differ from the
    values "base", 0.7, 0.5 and 3.0 respectively.

    ``sys.argv`` is swapped for the duration of the call and always restored.

    Returns:
        Whatever ``video_scraper.main()`` returns.
    """
    from skill_seekers.cli import video_scraper

    scraper_argv = ["video_scraper"]

    # Source selection: local file, playlist URL, or single-video URL.
    detected = self.source_info.parsed
    if detected.get("source_kind") == "file":
        scraper_argv += ["--video-file", detected["file_path"]]
    elif detected.get("url"):
        target = detected["url"]
        source_flag = "--playlist" if "playlist" in target.lower() else "--url"
        scraper_argv += [source_flag, target]

    # Universal arguments shared by every scraper.
    self._add_common_args(scraper_argv)

    cli_args = self.args

    # The create command names this option video_languages; fall back to the
    # plain "languages" attribute when that one is absent or empty.
    languages = getattr(cli_args, "video_languages", None) or getattr(cli_args, "languages", None)
    if languages:
        scraper_argv += ["--languages", languages]

    if getattr(cli_args, "visual", False):
        scraper_argv.append("--visual")

    model = getattr(cli_args, "whisper_model", None)
    if model and model != "base":
        scraper_argv += ["--whisper-model", model]

    # (attribute, flag, value that is not forwarded) for the numeric knobs.
    numeric_options = (
        ("visual_interval", "--visual-interval", 0.7),
        ("visual_min_gap", "--visual-min-gap", 0.5),
        ("visual_similarity", "--visual-similarity", 3.0),
    )
    for attribute, flag, skipped_value in numeric_options:
        value = getattr(cli_args, attribute, None)
        if value is not None and value != skipped_value:
            scraper_argv += [flag, str(value)]

    # video_scraper.main() reads sys.argv, so swap it in temporarily.
    logger.debug(f"Calling video_scraper with argv: {scraper_argv}")
    saved_argv = sys.argv
    try:
        sys.argv = scraper_argv
        return video_scraper.main()
    finally:
        sys.argv = saved_argv
|
||||
|
||||
def _route_config(self) -> int:
|
||||
"""Route to unified scraper for config files (unified_scraper.py)."""
|
||||
from skill_seekers.cli import unified_scraper
|
||||
@@ -468,6 +519,8 @@ Examples:
|
||||
Local: skill-seekers create ./my-project -p comprehensive
|
||||
PDF: skill-seekers create tutorial.pdf --ocr
|
||||
DOCX: skill-seekers create document.docx
|
||||
Video: skill-seekers create https://youtube.com/watch?v=...
|
||||
Video: skill-seekers create recording.mp4
|
||||
Config: skill-seekers create configs/react.json
|
||||
|
||||
Source Auto-Detection:
|
||||
@@ -476,6 +529,8 @@ Source Auto-Detection:
|
||||
• ./path → local codebase
|
||||
• file.pdf → PDF extraction
|
||||
• file.docx → Word document extraction
|
||||
• youtube.com/... → Video transcript extraction
|
||||
• file.mp4 → Video file extraction
|
||||
• file.json → multi-source config
|
||||
|
||||
Progressive Help (13 → 120+ flags):
|
||||
@@ -483,6 +538,7 @@ Progressive Help (13 → 120+ flags):
|
||||
--help-github GitHub repository options
|
||||
--help-local Local codebase analysis
|
||||
--help-pdf PDF extraction options
|
||||
--help-video Video extraction options
|
||||
--help-advanced Rare/advanced options
|
||||
--help-all All options + compatibility
|
||||
|
||||
@@ -513,6 +569,9 @@ Common Workflows:
|
||||
parser.add_argument(
|
||||
"--help-word", action="store_true", help=argparse.SUPPRESS, dest="_help_word"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-video", action="store_true", help=argparse.SUPPRESS, dest="_help_video"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--help-config", action="store_true", help=argparse.SUPPRESS, dest="_help_config"
|
||||
)
|
||||
@@ -571,6 +630,15 @@ Common Workflows:
|
||||
add_create_arguments(parser_word, mode="word")
|
||||
parser_word.print_help()
|
||||
return 0
|
||||
elif args._help_video:
|
||||
parser_video = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
description="Create skill from video (YouTube, Vimeo, local files)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
add_create_arguments(parser_video, mode="video")
|
||||
parser_video.print_help()
|
||||
return 0
|
||||
elif args._help_config:
|
||||
parser_config = argparse.ArgumentParser(
|
||||
prog="skill-seekers create",
|
||||
|
||||
@@ -97,9 +97,17 @@ class SkillEnhancer:
|
||||
print(f"❌ Error calling Claude API: {e}")
|
||||
return None
|
||||
|
||||
def _is_video_source(self, references):
|
||||
"""Check if the references come from video tutorial extraction."""
|
||||
return any(meta["source"] == "video_tutorial" for meta in references.values())
|
||||
|
||||
def _build_enhancement_prompt(self, references, current_skill_md):
|
||||
"""Build the prompt for Claude with multi-source awareness"""
|
||||
|
||||
# Dispatch to video-specific prompt if video source detected
|
||||
if self._is_video_source(references):
|
||||
return self._build_video_enhancement_prompt(references, current_skill_md)
|
||||
|
||||
# Extract skill name and description
|
||||
skill_name = self.skill_dir.name
|
||||
|
||||
@@ -276,6 +284,148 @@ Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
|
||||
return prompt
|
||||
|
||||
def _build_video_enhancement_prompt(self, references, current_skill_md):
|
||||
"""Build a video-specific enhancement prompt.
|
||||
|
||||
Video tutorial references contain transcript text, OCR'd code panels,
|
||||
code timelines with edits, and audio-visual alignment pairs. This prompt
|
||||
is tailored to reconstruct clean code from noisy OCR, detect programming
|
||||
languages from context, and synthesize a coherent tutorial skill.
|
||||
"""
|
||||
skill_name = self.skill_dir.name
|
||||
|
||||
prompt = f"""You are enhancing a Claude skill built from VIDEO TUTORIAL extraction. This skill is about: {skill_name}
|
||||
|
||||
The raw data was extracted from video tutorials using:
|
||||
1. **Transcript** (speech-to-text) — HIGH quality, this is the primary signal
|
||||
2. **OCR on code panels** — NOISY, may contain line numbers, UI chrome, garbled text
|
||||
3. **Code Timeline** — Tracks code evolution across frames with diffs
|
||||
4. **Audio-Visual Alignment** — Pairs of on-screen code + narrator explanation
|
||||
|
||||
CURRENT SKILL.MD:
|
||||
{"```markdown" if current_skill_md else "(none - create from scratch)"}
|
||||
{current_skill_md or "No existing SKILL.md"}
|
||||
{"```" if current_skill_md else ""}
|
||||
|
||||
REFERENCE FILES:
|
||||
"""
|
||||
|
||||
# Add all reference content
|
||||
for filename, metadata in references.items():
|
||||
content = metadata["content"]
|
||||
if len(content) > 30000:
|
||||
content = content[:30000] + "\n\n[Content truncated for size...]"
|
||||
prompt += f"\n#### {filename}\n"
|
||||
prompt += f"*Source: {metadata['source']}, Confidence: {metadata['confidence']}*\n\n"
|
||||
prompt += f"```markdown\n{content}\n```\n"
|
||||
|
||||
prompt += """
|
||||
|
||||
VIDEO-SPECIFIC ENHANCEMENT INSTRUCTIONS:
|
||||
|
||||
You are working with data extracted from programming tutorial videos. The data has
|
||||
specific characteristics you MUST handle:
|
||||
|
||||
## 1. OCR Code Reconstruction (CRITICAL)
|
||||
|
||||
The OCR'd code blocks are NOISY. Common issues you MUST fix:
|
||||
- **Line numbers in code**: OCR captures line numbers (1, 2, 3...) as part of the code — STRIP THEM
|
||||
- **UI chrome contamination**: Tab bars, file names, button text appear in code blocks — REMOVE
|
||||
- **Garbled characters**: OCR errors like `l` → `1`, `O` → `0`, `rn` → `m` — FIX using context
|
||||
- **Duplicate fragments**: Same code appears across multiple frames with minor OCR variations — DEDUPLICATE
|
||||
- **Incomplete lines**: Lines cut off at panel edges — RECONSTRUCT from transcript context
|
||||
- **Animation/timeline numbers**: Frame counters or timeline numbers in code — REMOVE
|
||||
|
||||
When reconstructing code:
|
||||
- The TRANSCRIPT is the ground truth for WHAT the code does
|
||||
- The OCR is the ground truth for HOW the code looks (syntax, structure)
|
||||
- Combine both: use transcript to understand intent, OCR for actual code structure
|
||||
- If OCR is too garbled, reconstruct the code based on what the narrator describes
|
||||
|
||||
## 2. Language Detection
|
||||
|
||||
The OCR-based language detection is often WRONG. Fix it by:
|
||||
- Reading the transcript for language mentions ("in GDScript", "this Python function", "our C# class")
|
||||
- Using code patterns: `extends`, `func`, `var`, `signal` = GDScript; `def`, `class`, `import` = Python;
|
||||
`function`, `const`, `let` = JavaScript/TypeScript; `using`, `namespace` = C#
|
||||
- Looking at file extensions mentioned in the transcript or visible in tab bars
|
||||
- Using proper language tags in all code fences (```gdscript, ```python, etc.)
|
||||
|
||||
## 3. Code Timeline Processing
|
||||
|
||||
The "Code Timeline" section shows how code EVOLVES during the tutorial. Use it to:
|
||||
- Show the FINAL version of each code block (not intermediate states)
|
||||
- Optionally show key intermediate steps if the tutorial is about building up code progressively
|
||||
- The edit diffs show exactly what changed between frames — use these to understand the tutorial flow
|
||||
|
||||
## 4. Audio-Visual Alignment
|
||||
|
||||
These are the MOST VALUABLE pairs: each links on-screen code with the narrator's explanation.
|
||||
- Use these to create annotated code examples with inline comments
|
||||
- The narrator text explains WHY each piece of code exists
|
||||
- Cross-reference these pairs to build the "how-to" sections
|
||||
|
||||
## 5. Tutorial Structure
|
||||
|
||||
Transform the raw chronological data into a LOGICAL tutorial structure:
|
||||
- Group by TOPIC, not by timestamp (e.g., "Setting Up the State Machine" not "Segment 3")
|
||||
- Create clear section headers that describe what is being TAUGHT
|
||||
- Build a progressive learning path: concepts build on each other
|
||||
- Include prerequisite knowledge mentioned by the narrator
|
||||
|
||||
YOUR TASK — Create an enhanced SKILL.md:
|
||||
|
||||
1. **Clean Overview Section**
|
||||
- What does this tutorial teach? (from transcript, NOT generic)
|
||||
- Prerequisites mentioned by the narrator
|
||||
- Key technologies/frameworks used (from actual code, not guesses)
|
||||
|
||||
2. **"When to Use This Skill" Section**
|
||||
- Specific trigger conditions based on what the tutorial covers
|
||||
- Use cases directly from the tutorial content
|
||||
- Reference the framework/library/tool being taught
|
||||
|
||||
3. **Quick Reference Section** (MOST IMPORTANT)
|
||||
- Extract 5-10 CLEAN, reconstructed code examples
|
||||
- Each example must be:
|
||||
a. Denoised (no line numbers, no UI chrome, no garbled text)
|
||||
b. Complete (not cut off mid-line)
|
||||
c. Properly language-tagged
|
||||
d. Annotated with a description from the transcript
|
||||
- Prefer code from Audio-Visual Alignment pairs (they have narrator context)
|
||||
- Show the FINAL working version of each code block
|
||||
|
||||
4. **Step-by-Step Tutorial Section**
|
||||
- Follow the tutorial's teaching flow
|
||||
- Each step includes: clean code + explanation from transcript
|
||||
- Use narrator's explanations as the descriptions (paraphrase, don't copy verbatim)
|
||||
- Show code evolution where the tutorial builds up code incrementally
|
||||
|
||||
5. **Key Concepts Section**
|
||||
- Extract terminology and concepts the narrator explains
|
||||
- Define them using the narrator's own explanations
|
||||
- Link concepts to specific code examples
|
||||
|
||||
6. **Reference Files Description**
|
||||
- Explain what each reference file contains
|
||||
- Note that OCR data is raw and may contain errors
|
||||
- Point to the most useful sections (Audio-Visual Alignment, Code Timeline)
|
||||
|
||||
7. **Keep the frontmatter** (---\\nname: ...\\n---) intact if present
|
||||
|
||||
CRITICAL RULES:
|
||||
- NEVER include raw OCR text with line numbers or UI chrome — always clean it first
|
||||
- ALWAYS use correct language tags (detect from context, not from OCR metadata)
|
||||
- The transcript is your BEST source for understanding content — trust it over garbled OCR
|
||||
- Extract REAL code from the references, reconstruct where needed, but never invent code
|
||||
- Keep code examples SHORT and focused (5-30 lines max per example)
|
||||
- Make the skill actionable: someone reading it should be able to implement what the tutorial teaches
|
||||
|
||||
OUTPUT:
|
||||
Return ONLY the complete SKILL.md content, starting with the frontmatter (---).
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def save_enhanced_skill_md(self, content):
|
||||
"""Save the enhanced SKILL.md"""
|
||||
# Backup original
|
||||
|
||||
@@ -48,6 +48,7 @@ COMMAND_MODULES = {
|
||||
"github": "skill_seekers.cli.github_scraper",
|
||||
"pdf": "skill_seekers.cli.pdf_scraper",
|
||||
"word": "skill_seekers.cli.word_scraper",
|
||||
"video": "skill_seekers.cli.video_scraper",
|
||||
"unified": "skill_seekers.cli.unified_scraper",
|
||||
"enhance": "skill_seekers.cli.enhance_command",
|
||||
"enhance-status": "skill_seekers.cli.enhance_status",
|
||||
@@ -142,7 +143,6 @@ def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]:
|
||||
# Handle positional arguments (no -- prefix)
|
||||
if key in [
|
||||
"source", # create command
|
||||
"url",
|
||||
"directory",
|
||||
"file",
|
||||
"job_id",
|
||||
|
||||
@@ -13,6 +13,7 @@ from .scrape_parser import ScrapeParser
|
||||
from .github_parser import GitHubParser
|
||||
from .pdf_parser import PDFParser
|
||||
from .word_parser import WordParser
|
||||
from .video_parser import VideoParser
|
||||
from .unified_parser import UnifiedParser
|
||||
from .enhance_parser import EnhanceParser
|
||||
from .enhance_status_parser import EnhanceStatusParser
|
||||
@@ -43,6 +44,7 @@ PARSERS = [
|
||||
EnhanceStatusParser(),
|
||||
PDFParser(),
|
||||
WordParser(),
|
||||
VideoParser(),
|
||||
UnifiedParser(),
|
||||
EstimateParser(),
|
||||
InstallParser(),
|
||||
|
||||
32
src/skill_seekers/cli/parsers/video_parser.py
Normal file
32
src/skill_seekers/cli/parsers/video_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Video subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.video to ensure
|
||||
consistency with the standalone video_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.video import add_video_arguments
|
||||
|
||||
|
||||
class VideoParser(SubcommandParser):
    """Subcommand parser that exposes the ``video`` command."""

    @property
    def name(self) -> str:
        """Subcommand name."""
        return "video"

    @property
    def help(self) -> str:
        """Short help line for the subcommand."""
        return "Extract from video (YouTube, local files)"

    @property
    def description(self) -> str:
        """Longer description of the subcommand."""
        return "Extract transcripts and metadata from videos and generate skill"

    def add_arguments(self, parser):
        """Populate ``parser`` with the video command's arguments.

        Delegates to ``add_video_arguments`` so the argument set comes from
        the shared definitions in ``arguments.video``.
        """
        add_video_arguments(parser)
|
||||
@@ -63,24 +63,34 @@ class SourceDetector:
|
||||
if source.endswith(".docx"):
|
||||
return cls._detect_word(source)
|
||||
|
||||
# 2. Directory detection
|
||||
# Video file extensions
|
||||
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
|
||||
if source.lower().endswith(VIDEO_EXTENSIONS):
|
||||
return cls._detect_video_file(source)
|
||||
|
||||
# 2. Video URL detection (before directory check)
|
||||
video_url_info = cls._detect_video_url(source)
|
||||
if video_url_info:
|
||||
return video_url_info
|
||||
|
||||
# 3. Directory detection
|
||||
if os.path.isdir(source):
|
||||
return cls._detect_local(source)
|
||||
|
||||
# 3. GitHub patterns
|
||||
# 4. GitHub patterns
|
||||
github_info = cls._detect_github(source)
|
||||
if github_info:
|
||||
return github_info
|
||||
|
||||
# 4. URL detection
|
||||
# 5. URL detection
|
||||
if source.startswith("http://") or source.startswith("https://"):
|
||||
return cls._detect_web(source)
|
||||
|
||||
# 5. Domain inference (add https://)
|
||||
# 6. Domain inference (add https://)
|
||||
if "." in source and not source.startswith("/"):
|
||||
return cls._detect_web(f"https://{source}")
|
||||
|
||||
# 6. Error - cannot determine
|
||||
# 7. Error - cannot determine
|
||||
raise ValueError(
|
||||
f"Cannot determine source type for: {source}\n\n"
|
||||
"Examples:\n"
|
||||
@@ -89,6 +99,8 @@ class SourceDetector:
|
||||
" Local: skill-seekers create ./my-project\n"
|
||||
" PDF: skill-seekers create tutorial.pdf\n"
|
||||
" DOCX: skill-seekers create document.docx\n"
|
||||
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
|
||||
" Video: skill-seekers create recording.mp4\n"
|
||||
" Config: skill-seekers create configs/react.json"
|
||||
)
|
||||
|
||||
@@ -116,6 +128,55 @@ class SourceDetector:
|
||||
type="word", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
def _detect_video_file(cls, source: str) -> SourceInfo:
    """Build the ``SourceInfo`` for a local video file.

    The suggested skill name is the file's base name without its extension.
    The path itself is not checked for existence here.
    """
    filename = os.path.basename(source)
    stem, _extension = os.path.splitext(filename)
    return SourceInfo(
        type="video",
        parsed={"file_path": source, "source_kind": "file"},
        suggested_name=stem,
        raw_input=source,
    )
|
||||
|
||||
@classmethod
def _detect_video_url(cls, source: str) -> SourceInfo | None:
    """Recognise YouTube and Vimeo URLs by case-insensitive substring match.

    Returns:
        A ``SourceInfo`` of type "video" when ``source`` contains one of the
        known YouTube URL fragments or "vimeo.com/"; otherwise None. The
        suggested name is "youtube_playlist", "youtube_channel",
        "youtube_video" or "vimeo_video".
    """
    normalized = source.lower()

    youtube_markers = (
        "youtube.com/watch",
        "youtu.be/",
        "youtube.com/playlist",
        "youtube.com/@",
        "youtube.com/channel/",
        "youtube.com/c/",
        "youtube.com/shorts/",
        "youtube.com/embed/",
    )
    channel_markers = ("/@", "/channel/", "/c/")

    if any(marker in normalized for marker in youtube_markers):
        # Playlist takes precedence over channel, which takes precedence
        # over a plain video.
        if "playlist" in normalized:
            suggested = "youtube_playlist"
        elif any(marker in normalized for marker in channel_markers):
            suggested = "youtube_channel"
        else:
            suggested = "youtube_video"
    elif "vimeo.com/" in normalized:
        suggested = "vimeo_video"
    else:
        return None

    return SourceInfo(
        type="video",
        parsed={"url": source, "source_kind": "url"},
        suggested_name=suggested,
        raw_input=source,
    )
|
||||
|
||||
@classmethod
|
||||
def _detect_local(cls, source: str) -> SourceInfo:
|
||||
"""Detect local directory source."""
|
||||
@@ -209,6 +270,15 @@ class SourceDetector:
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
|
||||
elif source_info.type == "video":
|
||||
if source_info.parsed.get("source_kind") == "file":
|
||||
file_path = source_info.parsed["file_path"]
|
||||
if not os.path.exists(file_path):
|
||||
raise ValueError(f"Video file does not exist: {file_path}")
|
||||
if not os.path.isfile(file_path):
|
||||
raise ValueError(f"Path is not a file: {file_path}")
|
||||
# URL-based video sources are validated during processing
|
||||
|
||||
elif source_info.type == "config":
|
||||
config_path = source_info.parsed["config_path"]
|
||||
if not os.path.exists(config_path):
|
||||
|
||||
@@ -74,11 +74,12 @@ class UnifiedScraper:
|
||||
"github": [], # List of github sources
|
||||
"pdf": [], # List of pdf sources
|
||||
"word": [], # List of word sources
|
||||
"video": [], # List of video sources
|
||||
"local": [], # List of local sources (docs or code)
|
||||
}
|
||||
|
||||
# Track source index for unique naming (multi-source support)
|
||||
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "local": 0}
|
||||
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
|
||||
|
||||
# Output paths - cleaner organization
|
||||
self.name = self.config["name"]
|
||||
@@ -154,6 +155,8 @@ class UnifiedScraper:
|
||||
self._scrape_pdf(source)
|
||||
elif source_type == "word":
|
||||
self._scrape_word(source)
|
||||
elif source_type == "video":
|
||||
self._scrape_video(source)
|
||||
elif source_type == "local":
|
||||
self._scrape_local(source)
|
||||
else:
|
||||
@@ -576,6 +579,63 @@ class UnifiedScraper:
|
||||
|
||||
logger.info(f"✅ Word: {len(word_data.get('pages', []))} sections extracted")
|
||||
|
||||
def _scrape_video(self, source: dict[str, Any]):
    """Scrape one video source (URL, local file or playlist) from the config.

    Builds a config dict for ``VideoToSkillConverter``, runs extraction,
    saves the extracted data, records the result in
    ``self.scraped_data["video"]`` and builds a standalone SKILL.md.

    Failures are logged and do not propagate: a missing ``video_scraper``
    module returns early, and any exception raised while processing is
    logged as an error.

    Args:
        source: One source entry from the unified config. Keys read here:
            "url", "path", "playlist", "description", "languages" (list of
            codes, or a single comma-separated string), "visual_extraction",
            "whisper_model".
    """
    try:
        from skill_seekers.cli.video_scraper import VideoToSkillConverter
    except ImportError:
        logger.error("video_scraper.py not found")
        return

    # Multi-source support: Get unique index for this video source
    idx = self._source_counters["video"]
    self._source_counters["video"] += 1

    # Determine video identifier
    video_url = source.get("url", "")
    video_id = video_url or source.get("path", f"video_{idx}")

    # Normalise "languages" before joining. Joining a plain string would
    # split it into characters ("en" -> "e,n"), and an explicit null would
    # raise TypeError.
    languages = source.get("languages", ["en"])
    if languages is None:
        languages = ["en"]
    if isinstance(languages, str):
        languages_arg = languages
    else:
        languages_arg = ",".join(languages)

    # Create config for video scraper
    video_config = {
        "name": f"{self.name}_video_{idx}",
        "url": source.get("url"),
        "video_file": source.get("path"),
        "playlist": source.get("playlist"),
        "description": source.get("description", ""),
        "languages": languages_arg,
        "visual": source.get("visual_extraction", False),
        "whisper_model": source.get("whisper_model", "base"),
    }

    # Process video
    logger.info(f"Scraping video: {video_id}")
    converter = VideoToSkillConverter(video_config)

    try:
        result = converter.process()
        converter.save_extracted_data()

        # Append to list
        self.scraped_data["video"].append(
            {
                "video_id": video_id,
                "idx": idx,
                "data": result.to_dict(),
                "data_file": converter.data_file,
            }
        )

        # Build standalone SKILL.md for synthesis
        converter.build_skill()
        logger.info("✅ Video: Standalone SKILL.md created")

        logger.info(
            f"✅ Video: {len(result.videos)} videos, "
            f"{result.total_segments} segments extracted"
        )
    except Exception as e:
        # Best effort: one failing video source must not abort the rest.
        logger.error(f"Failed to process video source: {e}")
|
||||
|
||||
def _scrape_local(self, source: dict[str, Any]):
|
||||
"""
|
||||
Scrape local directory (documentation files or source code).
|
||||
|
||||
@@ -289,6 +289,10 @@ def read_reference_files(
|
||||
else:
|
||||
return "codebase_analysis", "medium", repo_id
|
||||
|
||||
# Video tutorial sources (video_*.md from video scraper)
|
||||
elif relative_path.name.startswith("video_"):
|
||||
return "video_tutorial", "high", None
|
||||
|
||||
# Conflicts report (discrepancy detection)
|
||||
elif "conflicts" in path_str:
|
||||
return "conflicts", "medium", None
|
||||
|
||||
270
src/skill_seekers/cli/video_metadata.py
Normal file
270
src/skill_seekers/cli/video_metadata.py
Normal file
@@ -0,0 +1,270 @@
|
||||
"""Video metadata extraction module.
|
||||
|
||||
Uses yt-dlp for metadata extraction without downloading video content.
|
||||
Supports YouTube, Vimeo, and local video files.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
from skill_seekers.cli.video_models import (
|
||||
Chapter,
|
||||
VideoInfo,
|
||||
VideoSourceType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional dependency: yt-dlp
|
||||
try:
|
||||
import yt_dlp
|
||||
|
||||
HAS_YTDLP = True
|
||||
except ImportError:
|
||||
HAS_YTDLP = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Video ID Extraction
|
||||
# =============================================================================
|
||||
|
||||
|
||||
# YouTube single-video URL patterns.  Every pattern captures the 11-character
# video ID in group 1; scheme and "www." are optional in all of them.
YOUTUBE_PATTERNS = [
    # Watch URL.  "v" does not have to be the first query parameter
    # (e.g. "watch?list=...&v=<id>").  The lazy repeat skips as few leading
    # parameters as possible, so a leading "v=" is still the one captured.
    re.compile(
        r"(?:https?://)?(?:www\.)?youtube\.com/watch\?(?:[^&#\s]*&)*?v=([a-zA-Z0-9_-]{11})"
    ),
    re.compile(r"(?:https?://)?youtu\.be/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/embed/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/v/([a-zA-Z0-9_-]{11})"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/shorts/([a-zA-Z0-9_-]{11})"),
]

# Playlist URL; group 1 is the value of the "list" parameter.
YOUTUBE_PLAYLIST_PATTERN = re.compile(
    r"(?:https?://)?(?:www\.)?youtube\.com/playlist\?list=([a-zA-Z0-9_-]+)"
)

# Channel URLs in "@handle", "/channel/<id>" and "/c/<name>" form; group 1 is
# the path component that follows the prefix.
YOUTUBE_CHANNEL_PATTERNS = [
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/@([a-zA-Z0-9_-]+)"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/channel/([a-zA-Z0-9_-]+)"),
    re.compile(r"(?:https?://)?(?:www\.)?youtube\.com/c/([a-zA-Z0-9_-]+)"),
]

# Vimeo video URL; group 1 is the numeric path component.
VIMEO_PATTERN = re.compile(r"(?:https?://)?(?:www\.)?vimeo\.com/(\d+)")
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> str | None:
    """Return the YouTube video ID found in ``url``, if any.

    The patterns in ``YOUTUBE_PATTERNS`` are tried in list order and the
    first one that matches anywhere in ``url`` decides the result.

    Args:
        url: Candidate YouTube URL.

    Returns:
        The captured 11-character video ID, or None when no pattern matches.
    """
    hits = (pattern.search(url) for pattern in YOUTUBE_PATTERNS)
    first_hit = next((hit for hit in hits if hit), None)
    return first_hit.group(1) if first_hit else None
|
||||
|
||||
|
||||
def detect_video_source_type(url_or_path: str) -> VideoSourceType:
    """Classify a video reference as a local path or a known hosting site.

    Args:
        url_or_path: URL or local filesystem path.

    Returns:
        ``LOCAL_FILE`` / ``LOCAL_DIRECTORY`` when the argument is an existing
        file / directory; otherwise ``YOUTUBE`` or ``VIMEO`` when the
        lower-cased argument contains the corresponding domain; otherwise
        ``LOCAL_FILE`` as the fallback.
    """
    # Existing filesystem entries win over the domain heuristics below.
    if os.path.isfile(url_or_path):
        return VideoSourceType.LOCAL_FILE
    if os.path.isdir(url_or_path):
        return VideoSourceType.LOCAL_DIRECTORY

    lowered = url_or_path.lower()
    domain_table = (
        (("youtube.com", "youtu.be"), VideoSourceType.YOUTUBE),
        (("vimeo.com",), VideoSourceType.VIMEO),
    )
    for domains, source_type in domain_table:
        if any(domain in lowered for domain in domains):
            return source_type

    # Anything unrecognised is treated as a (possibly missing) local file.
    return VideoSourceType.LOCAL_FILE
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# YouTube Metadata via yt-dlp
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _check_ytdlp():
    """Guard for code paths that need the optional yt-dlp dependency.

    Raises:
        RuntimeError: If the module-level ``HAS_YTDLP`` flag is false, i.e.
            the ``yt_dlp`` import at the top of this module failed.  The
            message contains installation hints.
    """
    if HAS_YTDLP:
        return

    install_hint = (
        "yt-dlp is required for video metadata extraction.\n"
        'Install with: pip install "skill-seekers[video]"\n'
        "Or: pip install yt-dlp"
    )
    raise RuntimeError(install_hint)
|
||||
|
||||
|
||||
def extract_youtube_metadata(url: str) -> VideoInfo:
    """Extract metadata from a YouTube video URL without downloading.

    Args:
        url: YouTube video URL.

    Returns:
        VideoInfo with metadata and chapters populated; ``source_type`` is
        always ``VideoSourceType.YOUTUBE``.

    Raises:
        RuntimeError: If yt-dlp is not installed.
    """
    _check_ytdlp()

    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": False,
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # NOTE: the info dict may carry keys whose value is an explicit None
    # (e.g. ``duration``).  ``dict.get(key, default)`` does not cover that
    # case, so ``x or default`` is used for every value that must not be None.
    video_id = info.get("id") or extract_video_id(url) or "unknown"

    # Parse chapters
    chapters = []
    raw_chapters = info.get("chapters") or []
    for i, ch in enumerate(raw_chapters):
        end_time = ch.get("end_time") or 0
        if i + 1 < len(raw_chapters):
            # Prefer the next chapter's start so consecutive chapters are
            # contiguous; keep this chapter's own end when it is unavailable.
            next_start = raw_chapters[i + 1].get("start_time")
            if next_start is not None:
                end_time = next_start
        chapters.append(
            Chapter(
                title=ch.get("title") or f"Chapter {i + 1}",
                start_time=ch.get("start_time") or 0,
                end_time=end_time,
            )
        )

    return VideoInfo(
        video_id=video_id,
        source_type=VideoSourceType.YOUTUBE,
        source_url=url,
        title=info.get("title") or "",
        description=info.get("description") or "",
        # float(None) would raise TypeError when the duration is unknown.
        duration=float(info.get("duration") or 0),
        upload_date=info.get("upload_date"),
        language=info.get("language") or "en",
        channel_name=info.get("channel") or info.get("uploader"),
        channel_url=info.get("channel_url") or info.get("uploader_url"),
        view_count=info.get("view_count"),
        like_count=info.get("like_count"),
        comment_count=info.get("comment_count"),
        tags=info.get("tags") or [],
        categories=info.get("categories") or [],
        thumbnail_url=info.get("thumbnail"),
        chapters=chapters,
    )
|
||||
|
||||
|
||||
def extract_local_metadata(file_path: str) -> VideoInfo:
    """Build a minimal ``VideoInfo`` for a local video file.

    Nothing is read from the file itself: all values are derived from the
    path.

    Args:
        file_path: Path to the video file (made absolute before use).

    Returns:
        VideoInfo whose ``video_id`` is the first 16 hex digits of the
        SHA-256 of the absolute path, whose ``title`` is the file stem with
        ``-`` and ``_`` turned into spaces and title-cased, and whose
        ``duration`` is ``0.0``.
    """
    abs_path = os.path.abspath(file_path)
    stem, _extension = os.path.splitext(os.path.basename(abs_path))

    path_digest = hashlib.sha256(abs_path.encode()).hexdigest()
    readable_title = stem.translate(str.maketrans("-_", "  ")).title()

    return VideoInfo(
        video_id=path_digest[:16],
        source_type=VideoSourceType.LOCAL_FILE,
        file_path=abs_path,
        title=readable_title,
        duration=0.0,  # Would need ffprobe for accurate duration
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Playlist / Channel Resolution
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def resolve_playlist(url: str) -> list[str]:
    """Expand a YouTube playlist URL into the URLs of its videos.

    Uses yt-dlp with ``extract_flat`` so that only the playlist listing is
    fetched.

    Args:
        url: YouTube playlist URL.

    Returns:
        Video URLs in the order yt-dlp lists the playlist entries.  An entry
        contributes its ``url`` (or ``webpage_url``); when neither is set a
        watch URL is built from its ``id``; entries with none of the three
        are dropped.

    Raises:
        RuntimeError: If yt-dlp is not installed.
    """
    _check_ytdlp()

    flat_listing_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(flat_listing_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    video_urls = []
    for entry in info.get("entries") or []:
        link = entry.get("url") or entry.get("webpage_url")
        if not link and entry.get("id"):
            link = f"https://www.youtube.com/watch?v={entry['id']}"
        if link:
            video_urls.append(link)
    return video_urls
|
||||
|
||||
|
||||
def resolve_channel(url: str, max_videos: int = 50) -> list[str]:
    """Expand a YouTube channel URL into a bounded list of video URLs.

    Uses yt-dlp with ``extract_flat`` and ``playlistend=max_videos`` so that
    only the listing is fetched.

    Args:
        url: YouTube channel URL.
        max_videos: Upper bound on the number of URLs returned.

    Returns:
        At most ``max_videos`` video URLs, in the order yt-dlp lists the
        entries.  An entry contributes its ``url`` (or ``webpage_url``); when
        neither is set a watch URL is built from its ``id``; entries with
        none of the three are dropped.

    Raises:
        RuntimeError: If yt-dlp is not installed.
    """
    _check_ytdlp()

    flat_listing_opts = {
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "skip_download": True,
        "playlistend": max_videos,
    }
    with yt_dlp.YoutubeDL(flat_listing_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    collected = []
    for entry in info.get("entries") or []:
        link = entry.get("url") or entry.get("webpage_url")
        if not link and entry.get("id"):
            link = f"https://www.youtube.com/watch?v={entry['id']}"
        if link:
            collected.append(link)

    # Enforce the cap locally as well as through ``playlistend``.
    return collected[:max_videos]
|
||||
813
src/skill_seekers/cli/video_models.py
Normal file
813
src/skill_seekers/cli/video_models.py
Normal file
@@ -0,0 +1,813 @@
|
||||
"""Video source data models and type definitions.
|
||||
|
||||
Defines all enumerations and dataclasses for the video extraction pipeline:
|
||||
- Enums: VideoSourceType, TranscriptSource, FrameType, CodeContext, SegmentContentType,
  SegmentationStrategy
|
||||
- Core: VideoInfo, VideoSegment, VideoScraperResult
|
||||
- Supporting: Chapter, TranscriptSegment, WordTimestamp, KeyFrame, OCRRegion,
  FrameSubSection, CodeBlock, TextGroupEdit, TextGroup, TextGroupTimeline,
  AudioVisualAlignment
|
||||
- Config: VideoSourceConfig
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Enumerations
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class VideoSourceType(Enum):
    """Origin of a video: a hosting site or the local filesystem.

    The member values are the strings written to and read from serialized
    data.
    """

    # Hosted sources
    YOUTUBE = "youtube"
    VIMEO = "vimeo"

    # Local sources
    LOCAL_FILE = "local_file"
    LOCAL_DIRECTORY = "local_directory"
|
||||
|
||||
|
||||
class TranscriptSource(Enum):
    """Provenance of a transcript.

    ``NONE`` is the default used when no source has been recorded.  The
    member values are the strings written to and read from serialized data.
    """

    YOUTUBE_MANUAL = "youtube_manual"
    YOUTUBE_AUTO = "youtube_auto_generated"
    WHISPER = "whisper"
    SUBTITLE_FILE = "subtitle_file"
    NONE = "none"
|
||||
|
||||
|
||||
class FrameType(Enum):
    """Category assigned to the visual content of a frame or panel.

    ``OTHER`` is the default for frames and sub-sections that have not been
    classified.  The member values are the strings written to and read from
    serialized data.
    """

    CODE_EDITOR = "code_editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    DIAGRAM = "diagram"
    BROWSER = "browser"
    WEBCAM = "webcam"
    SCREENCAST = "screencast"
    OTHER = "other"
|
||||
|
||||
|
||||
class CodeContext(Enum):
    """Kind of on-screen surface a code block was captured from.

    ``UNKNOWN`` is the default when the surface has not been determined.
    The member values are the strings written to and read from serialized
    data.
    """

    EDITOR = "editor"
    TERMINAL = "terminal"
    SLIDE = "slide"
    BROWSER = "browser"
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
class SegmentContentType(Enum):
    """Dominant kind of content in a video segment.

    ``MIXED`` is the default for segments that have not been classified.
    The member values are the strings written to and read from serialized
    data.
    """

    EXPLANATION = "explanation"
    LIVE_CODING = "live_coding"
    DEMO = "demo"
    SLIDES = "slides"
    Q_AND_A = "q_and_a"
    INTRO = "intro"
    OUTRO = "outro"
    MIXED = "mixed"
|
||||
|
||||
|
||||
class SegmentationStrategy(Enum):
    """Method used to split a video into segments.

    The member values are plain lowercase strings.
    """

    CHAPTERS = "chapters"
    TIME_WINDOW = "time_window"
    SCENE_CHANGE = "scene_change"
    HYBRID = "hybrid"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Supporting Data Classes
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Chapter:
    """Immutable chapter marker: a titled time span inside a video.

    ``start_time`` and ``end_time`` are positions on the video timeline
    (presumably seconds; confirm against the producer of these values).
    """

    title: str
    start_time: float
    end_time: float

    @property
    def duration(self) -> float:
        """Length of the chapter, i.e. ``end_time - start_time``."""
        span = self.end_time - self.start_time
        return span

    def to_dict(self) -> dict:
        """Return the three fields as a plain dict."""
        return dict(
            title=self.title,
            start_time=self.start_time,
            end_time=self.end_time,
        )

    @classmethod
    def from_dict(cls, data: dict) -> Chapter:
        """Rebuild a chapter from ``to_dict`` output.

        Raises:
            KeyError: If ``title``, ``start_time`` or ``end_time`` is missing.
        """
        title, start, end = (data[key] for key in ("title", "start_time", "end_time"))
        return cls(title=title, start_time=start, end_time=end)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WordTimestamp:
    """Immutable record of one word together with its start/end timing.

    ``probability`` defaults to ``1.0`` when the producer supplies none.
    """

    word: str
    start: float
    end: float
    probability: float = 1.0

    def to_dict(self) -> dict:
        """Return all four fields as a plain dict."""
        return dict(
            word=self.word,
            start=self.start,
            end=self.end,
            probability=self.probability,
        )

    @classmethod
    def from_dict(cls, data: dict) -> WordTimestamp:
        """Rebuild from ``to_dict`` output; ``probability`` falls back to 1.0.

        Raises:
            KeyError: If ``word``, ``start`` or ``end`` is missing.
        """
        required = {key: data[key] for key in ("word", "start", "end")}
        return cls(probability=data.get("probability", 1.0), **required)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TranscriptSegment:
    """Immutable raw transcript segment: text plus its start/end timing.

    ``words`` optionally carries per-word timing and ``source`` records how
    the transcript was obtained.
    """

    text: str
    start: float
    end: float
    confidence: float = 1.0
    words: list[WordTimestamp] | None = None
    source: TranscriptSource = TranscriptSource.NONE

    def to_dict(self) -> dict:
        """Serialize to a plain dict.

        ``words`` becomes a list of word dicts, or None when it is None or
        empty; ``source`` is stored as the enum value.
        """
        word_dicts = [w.to_dict() for w in self.words] if self.words else None
        return {
            "text": self.text,
            "start": self.start,
            "end": self.end,
            "confidence": self.confidence,
            "words": word_dicts,
            "source": self.source.value,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TranscriptSegment:
        """Rebuild from ``to_dict`` output.

        A missing, None or empty ``words`` entry yields ``words=None``;
        missing ``confidence`` / ``source`` fall back to ``1.0`` / ``"none"``.

        Raises:
            KeyError: If ``text``, ``start`` or ``end`` is missing.
        """
        raw_words = data.get("words")
        parsed_words = [WordTimestamp.from_dict(w) for w in raw_words] if raw_words else None
        return cls(
            text=data["text"],
            start=data["start"],
            end=data["end"],
            confidence=data.get("confidence", 1.0),
            words=parsed_words,
            source=TranscriptSource(data.get("source", "none")),
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OCRRegion:
    """Immutable OCR result for one text region of a video frame.

    ``bbox`` is a 4-tuple of ints; it is serialized as a list and converted
    back to a tuple on load.
    """

    text: str
    confidence: float
    bbox: tuple[int, int, int, int]
    is_monospace: bool = False

    def to_dict(self) -> dict:
        """Serialize to a plain dict (``bbox`` as a list)."""
        return dict(
            text=self.text,
            confidence=self.confidence,
            bbox=list(self.bbox),
            is_monospace=self.is_monospace,
        )

    @classmethod
    def from_dict(cls, data: dict) -> OCRRegion:
        """Rebuild from ``to_dict`` output; ``is_monospace`` defaults to False.

        Raises:
            KeyError: If ``text``, ``confidence`` or ``bbox`` is missing.
        """
        text = data["text"]
        confidence = data["confidence"]
        box = tuple(data["bbox"])
        return cls(
            text=text,
            confidence=confidence,
            bbox=box,
            is_monospace=data.get("is_monospace", False),
        )
|
||||
|
||||
|
||||
@dataclass
class FrameSubSection:
    """One panel/region of a video frame with its own OCR result.

    Each IDE panel (e.g. code editor, terminal, file tree) is kept as its own
    sub-section so that side-by-side editors yield separate OCR output
    rather than one merged blob.
    """

    bbox: tuple[int, int, int, int]  # (x1, y1, x2, y2)
    frame_type: FrameType = FrameType.OTHER
    ocr_text: str = ""
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_confidence: float = 0.0
    panel_id: str = ""  # e.g. "panel_0_0" (row_col)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (``bbox`` as a list, enum as its value)."""
        region_dicts = [region.to_dict() for region in self.ocr_regions]
        return {
            "bbox": list(self.bbox),
            "frame_type": self.frame_type.value,
            "ocr_text": self.ocr_text,
            "ocr_regions": region_dicts,
            "ocr_confidence": self.ocr_confidence,
            "panel_id": self.panel_id,
        }

    @classmethod
    def from_dict(cls, data: dict) -> FrameSubSection:
        """Rebuild from ``to_dict`` output; only ``bbox`` is required.

        Raises:
            KeyError: If ``bbox`` is missing.
        """
        regions = [OCRRegion.from_dict(raw) for raw in data.get("ocr_regions", [])]
        return cls(
            bbox=tuple(data["bbox"]),
            frame_type=FrameType(data.get("frame_type", "other")),
            ocr_text=data.get("ocr_text", ""),
            ocr_regions=regions,
            ocr_confidence=data.get("ocr_confidence", 0.0),
            panel_id=data.get("panel_id", ""),
        )
|
||||
|
||||
|
||||
@dataclass
class KeyFrame:
    """A frame taken from a video, together with its visual analysis.

    Holds the frame's position on the timeline, the path of the saved image,
    its classification, whole-frame OCR output and the per-panel
    ``sub_sections``.
    """

    timestamp: float
    image_path: str
    frame_type: FrameType = FrameType.OTHER
    scene_change_score: float = 0.0
    ocr_regions: list[OCRRegion] = field(default_factory=list)
    ocr_text: str = ""
    ocr_confidence: float = 0.0
    width: int = 0
    height: int = 0
    sub_sections: list[FrameSubSection] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into regions and sub-sections."""
        region_dicts = [region.to_dict() for region in self.ocr_regions]
        panel_dicts = [panel.to_dict() for panel in self.sub_sections]
        return {
            "timestamp": self.timestamp,
            "image_path": self.image_path,
            "frame_type": self.frame_type.value,
            "scene_change_score": self.scene_change_score,
            "ocr_regions": region_dicts,
            "ocr_text": self.ocr_text,
            "ocr_confidence": self.ocr_confidence,
            "width": self.width,
            "height": self.height,
            "sub_sections": panel_dicts,
        }

    @classmethod
    def from_dict(cls, data: dict) -> KeyFrame:
        """Rebuild from ``to_dict`` output; optional keys use field defaults.

        Raises:
            KeyError: If ``timestamp`` or ``image_path`` is missing.
        """
        regions = [OCRRegion.from_dict(raw) for raw in data.get("ocr_regions", [])]
        panels = [FrameSubSection.from_dict(raw) for raw in data.get("sub_sections", [])]
        return cls(
            timestamp=data["timestamp"],
            image_path=data["image_path"],
            frame_type=FrameType(data.get("frame_type", "other")),
            scene_change_score=data.get("scene_change_score", 0.0),
            ocr_regions=regions,
            ocr_text=data.get("ocr_text", ""),
            ocr_confidence=data.get("ocr_confidence", 0.0),
            width=data.get("width", 0),
            height=data.get("height", 0),
            sub_sections=panels,
        )
|
||||
|
||||
|
||||
@dataclass
class CodeBlock:
    """Code text recovered from video frames via OCR.

    ``text_group_id`` links the block to a text group (empty when unset).
    """

    code: str
    language: str | None = None
    source_frame: float = 0.0
    context: CodeContext = CodeContext.UNKNOWN
    confidence: float = 0.0
    text_group_id: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict (``context`` as its enum value)."""
        return dict(
            code=self.code,
            language=self.language,
            source_frame=self.source_frame,
            context=self.context.value,
            confidence=self.confidence,
            text_group_id=self.text_group_id,
        )

    @classmethod
    def from_dict(cls, data: dict) -> CodeBlock:
        """Rebuild from ``to_dict`` output; only ``code`` is required.

        Raises:
            KeyError: If ``code`` is missing.
        """
        context = CodeContext(data.get("context", "unknown"))
        return cls(
            code=data["code"],
            language=data.get("language"),
            source_frame=data.get("source_frame", 0.0),
            context=context,
            confidence=data.get("confidence", 0.0),
            text_group_id=data.get("text_group_id", ""),
        )
|
||||
|
||||
|
||||
@dataclass
class TextGroupEdit:
    """Change observed between two appearances of a text group.

    Records when the change was seen and which lines were added, removed or
    modified.
    """

    timestamp: float
    added_lines: list[str] = field(default_factory=list)
    removed_lines: list[str] = field(default_factory=list)
    modified_lines: list[dict] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the four fields as a plain dict (lists are not copied)."""
        return dict(
            timestamp=self.timestamp,
            added_lines=self.added_lines,
            removed_lines=self.removed_lines,
            modified_lines=self.modified_lines,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupEdit:
        """Rebuild from ``to_dict`` output; missing line lists become empty.

        Raises:
            KeyError: If ``timestamp`` is missing.
        """
        line_lists = {
            key: data.get(key, [])
            for key in ("added_lines", "removed_lines", "modified_lines")
        }
        return cls(timestamp=data["timestamp"], **line_lists)
|
||||
|
||||
|
||||
@dataclass
class TextGroup:
    """Related text blocks followed across the frames of a video.

    Models a single code file/snippet as it shows up and changes over
    several frames: ``appearances`` holds (start, end) time pairs,
    ``consensus_lines`` the agreed line dicts and ``edits`` the detected
    changes.
    """

    group_id: str
    appearances: list[tuple[float, float]] = field(default_factory=list)
    consensus_lines: list[dict] = field(default_factory=list)
    edits: list[TextGroupEdit] = field(default_factory=list)
    detected_language: str | None = None
    frame_type: FrameType = FrameType.CODE_EDITOR
    panel_id: str = ""  # Tracks which panel this group originated from

    @property
    def full_text(self) -> str:
        """Newline-joined ``text`` of every consensus line with non-empty text."""
        texts = []
        for line in self.consensus_lines:
            if line.get("text"):
                texts.append(line["text"])
        return "\n".join(texts)

    def to_dict(self) -> dict:
        """Serialize to a plain dict; also includes the derived ``full_text``."""
        spans = [[begin, finish] for begin, finish in self.appearances]
        edit_dicts = [edit.to_dict() for edit in self.edits]
        return {
            "group_id": self.group_id,
            "appearances": spans,
            "consensus_lines": self.consensus_lines,
            "edits": edit_dicts,
            "detected_language": self.detected_language,
            "frame_type": self.frame_type.value,
            "panel_id": self.panel_id,
            "full_text": self.full_text,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TextGroup:
        """Rebuild from ``to_dict`` output; a stored ``full_text`` is ignored.

        Raises:
            KeyError: If ``group_id`` is missing.
        """
        spans = [tuple(pair) for pair in data.get("appearances", [])]
        edits = [TextGroupEdit.from_dict(raw) for raw in data.get("edits", [])]
        return cls(
            group_id=data["group_id"],
            appearances=spans,
            consensus_lines=data.get("consensus_lines", []),
            edits=edits,
            detected_language=data.get("detected_language"),
            frame_type=FrameType(data.get("frame_type", "code_editor")),
            panel_id=data.get("panel_id", ""),
        )
|
||||
|
||||
|
||||
@dataclass
class TextGroupTimeline:
    """All text groups of a video plus stored summary counters.

    The ``total_*`` fields are plain stored values; nothing in this class
    recomputes them from ``text_groups``.
    """

    text_groups: list[TextGroup] = field(default_factory=list)
    total_code_time: float = 0.0
    total_groups: int = 0
    total_edits: int = 0

    def get_groups_at_time(self, timestamp: float) -> list[TextGroup]:
        """Return the groups with an appearance span containing ``timestamp``.

        Both span ends are inclusive; the order of ``text_groups`` is kept.
        """
        visible = []
        for group in self.text_groups:
            for begin, finish in group.appearances:
                if begin <= timestamp <= finish:
                    visible.append(group)
                    break  # one matching span is enough for this group
        return visible

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into the text groups."""
        return dict(
            text_groups=[group.to_dict() for group in self.text_groups],
            total_code_time=self.total_code_time,
            total_groups=self.total_groups,
            total_edits=self.total_edits,
        )

    @classmethod
    def from_dict(cls, data: dict) -> TextGroupTimeline:
        """Rebuild from ``to_dict`` output; every key is optional."""
        groups = [TextGroup.from_dict(raw) for raw in data.get("text_groups", [])]
        return cls(
            text_groups=groups,
            total_code_time=data.get("total_code_time", 0.0),
            total_groups=data.get("total_groups", 0),
            total_edits=data.get("total_edits", 0),
        )
|
||||
|
||||
|
||||
@dataclass
class AudioVisualAlignment:
    """Pairs on-screen code with the narration from the same time span.

    ``text_group_id`` names the text group the code belongs to.
    """

    text_group_id: str
    start_time: float
    end_time: float
    on_screen_code: str
    transcript_during: str
    language: str | None = None

    def to_dict(self) -> dict:
        """Return all six fields as a plain dict."""
        return dict(
            text_group_id=self.text_group_id,
            start_time=self.start_time,
            end_time=self.end_time,
            on_screen_code=self.on_screen_code,
            transcript_during=self.transcript_during,
            language=self.language,
        )

    @classmethod
    def from_dict(cls, data: dict) -> AudioVisualAlignment:
        """Rebuild from ``to_dict`` output.

        ``transcript_during`` defaults to ``""`` and ``language`` to None.

        Raises:
            KeyError: If ``text_group_id``, ``start_time``, ``end_time`` or
                ``on_screen_code`` is missing.
        """
        required_keys = ("text_group_id", "start_time", "end_time", "on_screen_code")
        required = {key: data[key] for key in required_keys}
        return cls(
            transcript_during=data.get("transcript_during", ""),
            language=data.get("language"),
            **required,
        )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Core Data Classes
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@dataclass
class VideoSegment:
    """A time-aligned slice of a video: transcript, visual data and metadata.

    ``start_time`` / ``end_time`` are in seconds (``timestamp_display``
    formats them as minutes and seconds).  ``duration`` is a stored field
    and is not derived from the two.
    """

    index: int
    start_time: float
    end_time: float
    duration: float

    # Stream 1: ASR (Audio)
    transcript: str = ""
    words: list[WordTimestamp] = field(default_factory=list)
    transcript_confidence: float = 0.0

    # Stream 2: OCR (Visual)
    keyframes: list[KeyFrame] = field(default_factory=list)
    ocr_text: str = ""
    detected_code_blocks: list[CodeBlock] = field(default_factory=list)
    has_code_on_screen: bool = False
    has_slides: bool = False
    has_diagram: bool = False

    # Stream 3: Metadata
    chapter_title: str | None = None
    topic: str | None = None
    category: str | None = None

    # Merged content
    content: str = ""
    summary: str | None = None

    # Quality metadata
    confidence: float = 0.0
    content_type: SegmentContentType = SegmentContentType.MIXED

    def to_dict(self) -> dict:
        """Serialize to a plain dict, recursing into words, keyframes and code blocks."""
        word_dicts = [word.to_dict() for word in self.words]
        frame_dicts = [frame.to_dict() for frame in self.keyframes]
        block_dicts = [block.to_dict() for block in self.detected_code_blocks]
        return {
            "index": self.index,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration": self.duration,
            "transcript": self.transcript,
            "words": word_dicts,
            "transcript_confidence": self.transcript_confidence,
            "keyframes": frame_dicts,
            "ocr_text": self.ocr_text,
            "detected_code_blocks": block_dicts,
            "has_code_on_screen": self.has_code_on_screen,
            "has_slides": self.has_slides,
            "has_diagram": self.has_diagram,
            "chapter_title": self.chapter_title,
            "topic": self.topic,
            "category": self.category,
            "content": self.content,
            "summary": self.summary,
            "confidence": self.confidence,
            "content_type": self.content_type.value,
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoSegment:
        """Rebuild from ``to_dict`` output; optional keys use field defaults.

        Raises:
            KeyError: If ``index``, ``start_time``, ``end_time`` or
                ``duration`` is missing.
        """
        lookup = data.get
        words = [WordTimestamp.from_dict(raw) for raw in lookup("words", [])]
        keyframes = [KeyFrame.from_dict(raw) for raw in lookup("keyframes", [])]
        code_blocks = [CodeBlock.from_dict(raw) for raw in lookup("detected_code_blocks", [])]
        return cls(
            index=data["index"],
            start_time=data["start_time"],
            end_time=data["end_time"],
            duration=data["duration"],
            transcript=lookup("transcript", ""),
            words=words,
            transcript_confidence=lookup("transcript_confidence", 0.0),
            keyframes=keyframes,
            ocr_text=lookup("ocr_text", ""),
            detected_code_blocks=code_blocks,
            has_code_on_screen=lookup("has_code_on_screen", False),
            has_slides=lookup("has_slides", False),
            has_diagram=lookup("has_diagram", False),
            chapter_title=lookup("chapter_title"),
            topic=lookup("topic"),
            category=lookup("category"),
            content=lookup("content", ""),
            summary=lookup("summary"),
            confidence=lookup("confidence", 0.0),
            content_type=SegmentContentType(lookup("content_type", "mixed")),
        )

    @property
    def timestamp_display(self) -> str:
        """Human-readable time range, e.g. '05:30 - 08:15'.

        Switches to ``H:MM:SS - H:MM:SS`` for both ends as soon as either end
        is at or beyond one hour.  Fractions of a second are truncated.
        """
        start_min, start_sec = divmod(int(self.start_time), 60)
        end_min, end_sec = divmod(int(self.end_time), 60)

        needs_hours = self.start_time >= 3600 or self.end_time >= 3600
        if not needs_hours:
            return f"{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}"

        start_hr, start_min = divmod(start_min, 60)
        end_hr, end_min = divmod(end_min, 60)
        return f"{start_hr:d}:{start_min:02d}:{start_sec:02d} - {end_hr:d}:{end_min:02d}:{end_sec:02d}"
|
||||
|
||||
|
||||
@dataclass
class VideoInfo:
    """Metadata and extracted content describing one video.

    Only ``video_id`` and ``source_type`` are required; every other field has
    a default so instances can be filled in incrementally.
    """

    # Identity
    video_id: str
    source_type: VideoSourceType
    source_url: str | None = None
    file_path: str | None = None

    # Basic metadata
    title: str = ""
    description: str = ""
    duration: float = 0.0
    upload_date: str | None = None
    language: str = "en"

    # Channel / Author
    channel_name: str | None = None
    channel_url: str | None = None

    # Engagement metadata
    view_count: int | None = None
    like_count: int | None = None
    comment_count: int | None = None

    # Discovery metadata
    tags: list[str] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)
    thumbnail_url: str | None = None

    # Structure
    chapters: list[Chapter] = field(default_factory=list)

    # Playlist context
    playlist_title: str | None = None
    playlist_index: int | None = None
    playlist_total: int | None = None

    # Extracted content
    raw_transcript: list[TranscriptSegment] = field(default_factory=list)
    segments: list[VideoSegment] = field(default_factory=list)

    # Processing metadata
    transcript_source: TranscriptSource = TranscriptSource.NONE
    visual_extraction_enabled: bool = False
    whisper_model: str | None = None
    processing_time_seconds: float = 0.0
    extracted_at: str = ""

    # Quality scores
    transcript_confidence: float = 0.0
    content_richness_score: float = 0.0

    # Consensus-based text tracking (Phase A-D)
    text_group_timeline: TextGroupTimeline | None = None
    audio_visual_alignments: list[AudioVisualAlignment] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return this video as a plain dict.

        Enum fields are written as their ``.value``; nested objects are
        converted through their own ``to_dict()``.
        """
        timeline = self.text_group_timeline
        return {
            "video_id": self.video_id,
            "source_type": self.source_type.value,
            "source_url": self.source_url,
            "file_path": self.file_path,
            "title": self.title,
            "description": self.description,
            "duration": self.duration,
            "upload_date": self.upload_date,
            "language": self.language,
            "channel_name": self.channel_name,
            "channel_url": self.channel_url,
            "view_count": self.view_count,
            "like_count": self.like_count,
            "comment_count": self.comment_count,
            "tags": self.tags,
            "categories": self.categories,
            "thumbnail_url": self.thumbnail_url,
            "chapters": [chapter.to_dict() for chapter in self.chapters],
            "playlist_title": self.playlist_title,
            "playlist_index": self.playlist_index,
            "playlist_total": self.playlist_total,
            "raw_transcript": [entry.to_dict() for entry in self.raw_transcript],
            "segments": [segment.to_dict() for segment in self.segments],
            "transcript_source": self.transcript_source.value,
            "visual_extraction_enabled": self.visual_extraction_enabled,
            "whisper_model": self.whisper_model,
            "processing_time_seconds": self.processing_time_seconds,
            "extracted_at": self.extracted_at,
            "transcript_confidence": self.transcript_confidence,
            "content_richness_score": self.content_richness_score,
            "text_group_timeline": timeline.to_dict() if timeline else None,
            "audio_visual_alignments": [
                pair.to_dict() for pair in self.audio_visual_alignments
            ],
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoInfo:
        """Rebuild a ``VideoInfo`` from a dict shaped like ``to_dict()`` output.

        ``video_id`` and ``source_type`` must be present (``KeyError``
        otherwise); every other key falls back to the field default.
        """
        get = data.get
        raw_timeline = get("text_group_timeline")
        restored_timeline = (
            TextGroupTimeline.from_dict(raw_timeline) if raw_timeline else None
        )
        return cls(
            video_id=data["video_id"],
            source_type=VideoSourceType(data["source_type"]),
            source_url=get("source_url"),
            file_path=get("file_path"),
            title=get("title", ""),
            description=get("description", ""),
            duration=get("duration", 0.0),
            upload_date=get("upload_date"),
            language=get("language", "en"),
            channel_name=get("channel_name"),
            channel_url=get("channel_url"),
            view_count=get("view_count"),
            like_count=get("like_count"),
            comment_count=get("comment_count"),
            tags=get("tags", []),
            categories=get("categories", []),
            thumbnail_url=get("thumbnail_url"),
            chapters=[Chapter.from_dict(item) for item in get("chapters", [])],
            playlist_title=get("playlist_title"),
            playlist_index=get("playlist_index"),
            playlist_total=get("playlist_total"),
            raw_transcript=[
                TranscriptSegment.from_dict(item) for item in get("raw_transcript", [])
            ],
            segments=[VideoSegment.from_dict(item) for item in get("segments", [])],
            transcript_source=TranscriptSource(get("transcript_source", "none")),
            visual_extraction_enabled=get("visual_extraction_enabled", False),
            whisper_model=get("whisper_model"),
            processing_time_seconds=get("processing_time_seconds", 0.0),
            extracted_at=get("extracted_at", ""),
            transcript_confidence=get("transcript_confidence", 0.0),
            content_richness_score=get("content_richness_score", 0.0),
            text_group_timeline=restored_timeline,
            audio_visual_alignments=[
                AudioVisualAlignment.from_dict(item)
                for item in get("audio_visual_alignments", [])
            ],
        )
|
||||
|
||||
|
||||
@dataclass
|
||||
class VideoSourceConfig:
|
||||
"""Configuration for video source processing."""
|
||||
|
||||
# Source specification (exactly one should be set)
|
||||
url: str | None = None
|
||||
playlist: str | None = None
|
||||
channel: str | None = None
|
||||
path: str | None = None
|
||||
directory: str | None = None
|
||||
|
||||
# Identity
|
||||
name: str = "video"
|
||||
description: str = ""
|
||||
|
||||
# Filtering
|
||||
max_videos: int = 50
|
||||
languages: list[str] | None = None
|
||||
|
||||
# Extraction
|
||||
visual_extraction: bool = False
|
||||
whisper_model: str = "base"
|
||||
|
||||
# Segmentation
|
||||
time_window_seconds: float = 120.0
|
||||
min_segment_duration: float = 10.0
|
||||
max_segment_duration: float = 600.0
|
||||
|
||||
# Categorization
|
||||
categories: dict[str, list[str]] | None = None
|
||||
|
||||
# Subtitle files
|
||||
subtitle_patterns: list[str] | None = None
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> VideoSourceConfig:
|
||||
return cls(
|
||||
url=data.get("url"),
|
||||
playlist=data.get("playlist"),
|
||||
channel=data.get("channel"),
|
||||
path=data.get("path"),
|
||||
directory=data.get("directory"),
|
||||
name=data.get("name", "video"),
|
||||
description=data.get("description", ""),
|
||||
max_videos=data.get("max_videos", 50),
|
||||
languages=data.get("languages"),
|
||||
visual_extraction=data.get("visual_extraction", False),
|
||||
whisper_model=data.get("whisper_model", "base"),
|
||||
time_window_seconds=data.get("time_window_seconds", 120.0),
|
||||
min_segment_duration=data.get("min_segment_duration", 10.0),
|
||||
max_segment_duration=data.get("max_segment_duration", 600.0),
|
||||
categories=data.get("categories"),
|
||||
subtitle_patterns=data.get("subtitle_patterns"),
|
||||
)
|
||||
|
||||
def validate(self) -> list[str]:
|
||||
"""Validate configuration. Returns list of errors."""
|
||||
errors = []
|
||||
sources_set = sum(
|
||||
1
|
||||
for s in [self.url, self.playlist, self.channel, self.path, self.directory]
|
||||
if s is not None
|
||||
)
|
||||
if sources_set == 0:
|
||||
errors.append(
|
||||
"Video source must specify one of: url, playlist, channel, path, directory"
|
||||
)
|
||||
if sources_set > 1:
|
||||
errors.append("Video source must specify exactly one source type")
|
||||
return errors
|
||||
|
||||
|
||||
@dataclass
class VideoScraperResult:
    """Complete result from the video scraper.

    Holds the processed videos, aggregate counters, the source configuration
    that produced them, and any warnings/errors collected along the way.
    """

    videos: list[VideoInfo] = field(default_factory=list)
    total_duration_seconds: float = 0.0
    total_segments: int = 0
    total_code_blocks: int = 0
    config: VideoSourceConfig | None = None
    processing_time_seconds: float = 0.0
    warnings: list[str] = field(default_factory=list)
    errors: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the result to a plain dict.

        The ``config`` field is included (as a dict, or ``None``) so that a
        save/load round trip does not silently drop the configuration.
        """
        from dataclasses import asdict

        return {
            "videos": [video.to_dict() for video in self.videos],
            "total_duration_seconds": self.total_duration_seconds,
            "total_segments": self.total_segments,
            "total_code_blocks": self.total_code_blocks,
            "processing_time_seconds": self.processing_time_seconds,
            "warnings": self.warnings,
            "errors": self.errors,
            # Appended last so the order of the pre-existing keys is unchanged.
            "config": asdict(self.config) if self.config is not None else None,
        }

    @classmethod
    def from_dict(cls, data: dict) -> VideoScraperResult:
        """Rebuild a result from a dict shaped like ``to_dict()`` output.

        Dicts written before ``config`` was serialized (no ``"config"`` key)
        load with ``config=None``.
        """
        config_data = data.get("config")
        return cls(
            videos=[VideoInfo.from_dict(item) for item in data.get("videos", [])],
            total_duration_seconds=data.get("total_duration_seconds", 0.0),
            total_segments=data.get("total_segments", 0),
            total_code_blocks=data.get("total_code_blocks", 0),
            config=VideoSourceConfig.from_dict(config_data) if config_data else None,
            processing_time_seconds=data.get("processing_time_seconds", 0.0),
            warnings=data.get("warnings", []),
            errors=data.get("errors", []),
        )
|
||||
954
src/skill_seekers/cli/video_scraper.py
Normal file
954
src/skill_seekers/cli/video_scraper.py
Normal file
@@ -0,0 +1,954 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Video to Claude Skill Converter
|
||||
|
||||
Extracts transcripts, metadata, and visual content from videos
|
||||
and converts them into Claude AI skills.
|
||||
|
||||
Supports YouTube videos/playlists, Vimeo, and local video files.
|
||||
|
||||
Usage:
|
||||
python3 video_scraper.py --url https://www.youtube.com/watch?v=...
|
||||
python3 video_scraper.py --video-file recording.mp4
|
||||
python3 video_scraper.py --playlist https://www.youtube.com/playlist?list=...
|
||||
python3 video_scraper.py --from-json video_extracted.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
from skill_seekers.cli.video_models import (
|
||||
AudioVisualAlignment,
|
||||
TextGroupTimeline,
|
||||
TranscriptSource,
|
||||
VideoInfo,
|
||||
VideoScraperResult,
|
||||
VideoSourceConfig,
|
||||
VideoSourceType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Dependency Guard
|
||||
# =============================================================================
|
||||
|
||||
# Core video deps are optional
|
||||
# Each flag records whether the corresponding optional package can be imported.
try:
    import yt_dlp  # noqa: F401
except ImportError:
    HAS_YTDLP = False
else:
    HAS_YTDLP = True

try:
    from youtube_transcript_api import YouTubeTranscriptApi  # noqa: F401
except ImportError:
    HAS_YOUTUBE_TRANSCRIPT = False
else:
    HAS_YOUTUBE_TRANSCRIPT = True
|
||||
|
||||
|
||||
def check_video_dependencies(require_full: bool = False) -> None:
    """Verify that the optional video packages are importable.

    Args:
        require_full: When True, additionally require ``cv2`` and
            ``faster_whisper`` (the Tier 2 dependencies).

    Raises:
        RuntimeError: If any required package is missing; the message lists
            the missing packages and the pip commands to install them.
    """
    core_checks = (
        (HAS_YTDLP, "yt-dlp"),
        (HAS_YOUTUBE_TRANSCRIPT, "youtube-transcript-api"),
    )
    absent = [package for available, package in core_checks if not available]

    if require_full:
        try:
            import cv2  # noqa: F401
        except ImportError:
            absent.append("opencv-python-headless")
        try:
            import faster_whisper  # noqa: F401
        except ImportError:
            absent.append("faster-whisper")

    if not absent:
        return

    listed = ", ".join(absent)
    extra = "[video-full]" if require_full else "[video]"
    raise RuntimeError(
        f"Missing video dependencies: {listed}\n"
        f'Install with: pip install "skill-seekers{extra}"\n'
        f"Or: pip install {' '.join(absent)}"
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _sanitize_filename(title: str, max_length: int = 60) -> str:
|
||||
"""Sanitize a video title for use as a filename."""
|
||||
name = title.lower()
|
||||
name = re.sub(r"[^a-z0-9\s-]", "", name)
|
||||
name = re.sub(r"[\s]+", "-", name)
|
||||
name = re.sub(r"-+", "-", name)
|
||||
name = name.strip("-")
|
||||
return name[:max_length]
|
||||
|
||||
|
||||
def _format_duration(seconds: float) -> str:
|
||||
"""Format seconds as HH:MM:SS or MM:SS."""
|
||||
total = int(seconds)
|
||||
hours, remainder = divmod(total, 3600)
|
||||
minutes, secs = divmod(remainder, 60)
|
||||
if hours > 0:
|
||||
return f"{hours}:{minutes:02d}:{secs:02d}"
|
||||
return f"{minutes:02d}:{secs:02d}"
|
||||
|
||||
|
||||
def _format_count(count: int | None) -> str:
|
||||
"""Format a count with commas."""
|
||||
if count is None:
|
||||
return "N/A"
|
||||
return f"{count:,}"
|
||||
|
||||
|
||||
def infer_description_from_video(video_info: VideoInfo, name: str = "") -> str:
    """Infer a skill description from video metadata.

    Preference order: the video description (first 150 characters,
    lower-cased, with ``...`` appended when truncated), then the video title,
    then a generic sentence built from ``name``.

    Args:
        video_info: Video whose ``description`` and ``title`` are inspected.
        name: Skill name used in the generic fallback sentence.

    Returns:
        A sentence starting with ``"Use when"``.
    """
    description = video_info.description or ""
    desc = description[:150].strip()
    # A whitespace-only description must fall through to the title/name
    # fallbacks instead of producing a dangling "Use when ".
    if desc:
        if len(description) > 150:
            desc += "..."
        return f"Use when {desc.lower()}"

    title = video_info.title
    if title and title.strip():
        return f"Use when working with {title.lower()}"

    if name:
        return f"Use when referencing {name} video content"
    return "Use when referencing this video content"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Audio-Visual Alignment
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _build_audio_visual_alignments(
|
||||
timeline: TextGroupTimeline,
|
||||
transcript_segments: list,
|
||||
) -> list[AudioVisualAlignment]:
|
||||
"""Build audio-visual alignments pairing on-screen code with transcript.
|
||||
|
||||
For each text group appearance, finds overlapping transcript segments
|
||||
and pairs them into AudioVisualAlignment objects.
|
||||
|
||||
Args:
|
||||
timeline: TextGroupTimeline with text groups and appearances.
|
||||
transcript_segments: List of TranscriptSegment objects.
|
||||
|
||||
Returns:
|
||||
List of AudioVisualAlignment objects.
|
||||
"""
|
||||
alignments: list[AudioVisualAlignment] = []
|
||||
|
||||
for group in timeline.text_groups:
|
||||
for start, end in group.appearances:
|
||||
# Find overlapping transcript segments
|
||||
overlapping_text = []
|
||||
for seg in transcript_segments:
|
||||
seg_start = seg.start
|
||||
seg_end = seg.end
|
||||
# Check overlap
|
||||
if seg_end > start and seg_start < end:
|
||||
overlapping_text.append(seg.text)
|
||||
|
||||
transcript_during = " ".join(overlapping_text).strip()
|
||||
if not transcript_during:
|
||||
continue
|
||||
|
||||
alignments.append(
|
||||
AudioVisualAlignment(
|
||||
text_group_id=group.group_id,
|
||||
start_time=start,
|
||||
end_time=end,
|
||||
on_screen_code=group.full_text,
|
||||
transcript_during=transcript_during,
|
||||
language=group.detected_language,
|
||||
)
|
||||
)
|
||||
|
||||
return alignments
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Converter Class
|
||||
# =============================================================================
|
||||
|
||||
|
||||
class VideoToSkillConverter:
|
||||
"""Convert video content to Claude skill."""
|
||||
|
||||
def __init__(self, config: dict):
|
||||
"""Initialize converter.
|
||||
|
||||
Args:
|
||||
config: Configuration dict with keys:
|
||||
- name: Skill name
|
||||
- url/video_file/playlist: Video source
|
||||
- description: Optional description
|
||||
- languages: Optional language preferences
|
||||
- visual: Whether to enable visual extraction
|
||||
- whisper_model: Whisper model size
|
||||
"""
|
||||
self.config = config
|
||||
self.name = config["name"]
|
||||
self.description = config.get("description", "")
|
||||
self.languages = (config.get("languages") or "en").split(",")
|
||||
self.visual = config.get("visual", False)
|
||||
self.whisper_model = config.get("whisper_model", "base")
|
||||
self.visual_interval = config.get("visual_interval", 0.7)
|
||||
self.visual_min_gap = config.get("visual_min_gap", 0.5)
|
||||
self.visual_similarity = config.get("visual_similarity", 3.0)
|
||||
self.vision_ocr = config.get("vision_ocr", False)
|
||||
|
||||
# Paths
|
||||
self.skill_dir = config.get("output") or f"output/{self.name}"
|
||||
self.data_file = f"output/{self.name}_video_extracted.json"
|
||||
|
||||
# Results
|
||||
self.result: VideoScraperResult | None = None
|
||||
|
||||
def process(self) -> VideoScraperResult:
    """Run the full video processing pipeline.

    Resolves the configured source (playlist, single URL or local file) into
    a list of videos. For each one it extracts metadata, fetches a
    transcript, segments it and, when ``self.visual`` is set, runs visual
    extraction. A failure on one video is recorded in the result's ``errors``
    and does not stop the remaining videos.

    Returns:
        VideoScraperResult with all extracted data (also stored on
        ``self.result``).

    Raises:
        RuntimeError: If visual extraction is enabled and its dependencies
            are missing.
    """
    from skill_seekers.cli.video_metadata import (
        detect_video_source_type,
        extract_local_metadata,
        extract_youtube_metadata,
        resolve_playlist,
    )
    from skill_seekers.cli.video_segmenter import segment_video
    from skill_seekers.cli.video_transcript import get_transcript

    start_time = time.time()

    # Validate visual deps upfront so we fail fast
    if self.visual:
        check_video_dependencies(require_full=True)
        from skill_seekers.cli.video_visual import check_visual_dependencies

        deps = check_visual_dependencies()
        missing = [name for name, available in deps.items() if not available]
        if missing:
            raise RuntimeError(
                f"Visual extraction requires: {', '.join(missing)}\n"
                'Install with: pip install "skill-seekers[video-full]"\n'
                "Or: pip install opencv-python-headless scenedetect easyocr"
            )

    source_config = VideoSourceConfig(
        name=self.name,
        description=self.description,
        languages=self.languages,
        visual_extraction=self.visual,
        whisper_model=self.whisper_model,
    )

    videos: list[VideoInfo] = []
    warnings: list[str] = []
    errors: list[dict] = []

    # Determine source URLs
    urls_or_paths = []
    if self.config.get("playlist"):
        logger.info("Resolving playlist...")
        try:
            check_video_dependencies()
            urls_or_paths = resolve_playlist(self.config["playlist"])
            logger.info(f"Found {len(urls_or_paths)} videos in playlist")
        except Exception as e:
            errors.append({"source": self.config["playlist"], "error": str(e)})
            logger.error(f"Failed to resolve playlist: {e}")
    elif self.config.get("url"):
        urls_or_paths = [self.config["url"]]
    elif self.config.get("video_file"):
        urls_or_paths = [self.config["video_file"]]

    # Process each video
    for i, source in enumerate(urls_or_paths):
        logger.info(f"[{i + 1}/{len(urls_or_paths)}] Processing: {source}")
        # Timed per video so later playlist entries do not accumulate the
        # time spent on earlier ones.
        video_start = time.time()
        try:
            source_type = detect_video_source_type(source)

            # Extract metadata
            if source_type == VideoSourceType.YOUTUBE:
                check_video_dependencies()
                video_info = extract_youtube_metadata(source)
            else:
                video_info = extract_local_metadata(source)

            # Extract transcript
            transcript_segments, transcript_source = get_transcript(video_info, source_config)
            video_info.raw_transcript = transcript_segments
            video_info.transcript_source = transcript_source

            if not transcript_segments:
                warnings.append(f"No transcript available for '{video_info.title}'")

            # Compute transcript confidence (mean of per-segment confidence)
            if transcript_segments:
                video_info.transcript_confidence = sum(
                    s.confidence for s in transcript_segments
                ) / len(transcript_segments)

                # Auto-generated YouTube captions are scaled down by 0.8.
                if transcript_source == TranscriptSource.YOUTUBE_AUTO:
                    video_info.transcript_confidence *= 0.8

            # Segment video
            segments = segment_video(video_info, transcript_segments, source_config)
            video_info.segments = segments

            # Visual extraction (Tier 2)
            if self.visual:
                import shutil
                import tempfile

                from skill_seekers.cli.video_visual import (
                    download_video,
                    extract_visual_data,
                )

                video_path = video_info.file_path
                temp_video_dir = None
                try:
                    # Download if remote (YouTube/Vimeo)
                    if not video_path or not os.path.exists(video_path):
                        temp_video_dir = tempfile.mkdtemp(prefix="ss_video_")
                        video_path = download_video(source, temp_video_dir)

                    if video_path and os.path.exists(video_path):
                        keyframes, code_blocks, timeline = extract_visual_data(
                            video_path,
                            segments,
                            self.skill_dir,
                            sample_interval=self.visual_interval,
                            min_gap=self.visual_min_gap,
                            similarity_threshold=self.visual_similarity,
                            use_vision_api=self.vision_ocr,
                        )
                        # Attach keyframes to the first segment containing them
                        for kf in keyframes:
                            for seg in segments:
                                if seg.start_time <= kf.timestamp < seg.end_time:
                                    seg.keyframes.append(kf)
                                    break
                        # Assign code blocks to segments by timestamp
                        for cb in code_blocks:
                            for seg in segments:
                                if seg.start_time <= cb.source_frame < seg.end_time:
                                    seg.detected_code_blocks.append(cb)
                                    seg.has_code_on_screen = True
                                    break
                        # Set timeline and build audio-visual alignments
                        video_info.text_group_timeline = timeline
                        if timeline:
                            video_info.audio_visual_alignments = _build_audio_visual_alignments(
                                timeline, video_info.raw_transcript
                            )
                        logger.info(
                            f" Visual: {len(keyframes)} keyframes extracted, "
                            f"{sum(1 for kf in keyframes if kf.ocr_text)} with OCR text, "
                            f"{len(code_blocks)} code blocks detected"
                        )
                    else:
                        warnings.append(f"Could not download video for visual extraction: {source}")
                finally:
                    # Clean up the temp download even when extraction raises,
                    # otherwise the downloaded video is left on disk.
                    if temp_video_dir:
                        shutil.rmtree(temp_video_dir, ignore_errors=True)

            # Set processing metadata
            video_info.extracted_at = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
            video_info.visual_extraction_enabled = self.visual
            video_info.processing_time_seconds = time.time() - video_start

            videos.append(video_info)
            visual_msg = ""
            if self.visual:
                total_kf = sum(len(s.keyframes) for s in segments)
                total_ocr = sum(1 for s in segments for kf in s.keyframes if kf.ocr_text)
                visual_msg = f", {total_kf} keyframes, {total_ocr} with OCR"
            logger.info(
                f" => {len(segments)} segments, "
                f"{len(transcript_segments)} transcript chunks, "
                f"source: {transcript_source.value}{visual_msg}"
            )

        except Exception as e:
            # Best effort: record the failure and continue with the next video.
            errors.append({"source": source, "error": str(e)})
            logger.error(f"Failed to process {source}: {e}")
            logger.debug("Traceback:", exc_info=True)

    # Build result
    total_duration = sum(v.duration for v in videos)
    total_segments = sum(len(v.segments) for v in videos)
    total_code_blocks = sum(
        sum(len(s.detected_code_blocks) for s in v.segments) for v in videos
    )

    self.result = VideoScraperResult(
        videos=videos,
        total_duration_seconds=total_duration,
        total_segments=total_segments,
        total_code_blocks=total_code_blocks,
        config=source_config,
        processing_time_seconds=time.time() - start_time,
        warnings=warnings,
        errors=errors,
    )

    return self.result
|
||||
|
||||
def save_extracted_data(self) -> str:
    """Write ``self.result`` as JSON to ``self.data_file``.

    The parent directory is created when it does not exist yet.

    Returns:
        Path to saved JSON file.

    Raises:
        RuntimeError: If no result is available yet.
    """
    if self.result is None:
        raise RuntimeError("No data to save. Run process() first.")

    target = self.data_file
    parent = os.path.dirname(target) or "."
    os.makedirs(parent, exist_ok=True)

    with open(target, "w", encoding="utf-8") as handle:
        json.dump(self.result.to_dict(), handle, indent=2, ensure_ascii=False)

    logger.info(f"Saved extracted data to {self.data_file}")
    return target
|
||||
|
||||
def load_extracted_data(self, json_path: str) -> None:
    """Populate ``self.result`` from a previously saved JSON file.

    Args:
        json_path: Path to extracted JSON file.
    """
    with open(json_path, encoding="utf-8") as handle:
        payload = json.load(handle)

    restored = VideoScraperResult.from_dict(payload)
    self.result = restored
    logger.info(f"Loaded {len(self.result.videos)} videos from {json_path}")
|
||||
|
||||
def build_skill(self) -> str:
    """Write the skill directory: reference files, metadata JSON and SKILL.md.

    Returns:
        Path to skill directory.

    Raises:
        RuntimeError: If no result has been processed or loaded yet.
    """
    if self.result is None:
        raise RuntimeError(
            "No data to build from. Run process() or load_extracted_data() first."
        )

    # Create directories
    refs_dir = os.path.join(self.skill_dir, "references")
    video_data_dir = os.path.join(self.skill_dir, "video_data")
    for directory in (refs_dir, video_data_dir):
        os.makedirs(directory, exist_ok=True)

    # One reference markdown file per video.
    # NOTE(review): two videos whose titles sanitize to the same slug (or to
    # an empty one) write to the same path, so the later one wins - confirm
    # whether that is acceptable.
    for video in self.result.videos:
        ref_path = os.path.join(refs_dir, f"video_{_sanitize_filename(video.title)}.md")
        ref_content = self._generate_reference_md(video)
        with open(ref_path, "w", encoding="utf-8") as handle:
            handle.write(ref_content)

    # Save metadata JSON
    with open(os.path.join(video_data_dir, "metadata.json"), "w", encoding="utf-8") as handle:
        json.dump(self.result.to_dict(), handle, indent=2, ensure_ascii=False)

    # Generate SKILL.md
    skill_md = self._generate_skill_md()
    with open(os.path.join(self.skill_dir, "SKILL.md"), "w", encoding="utf-8") as handle:
        handle.write(skill_md)

    logger.info(f"Built skill at {self.skill_dir}")
    logger.info(f" {len(self.result.videos)} videos, {self.result.total_segments} segments")
    return self.skill_dir
|
||||
|
||||
def _generate_reference_md(self, video: VideoInfo) -> str:
    """Generate the reference markdown document for a single video.

    Sections, in order: title, metadata block quote, truncated description,
    table of contents, one section per segment (content, keyframe images and
    OCR text, detected code), the code timeline, the audio-visual alignment
    pairs, and a transcript source footer.

    Args:
        video: Video to render.

    Returns:
        The markdown document as a single string.
    """
    # Imported once here rather than inside the keyframe loop.
    from skill_seekers.cli.video_models import FrameType

    code_frame_types = (FrameType.CODE_EDITOR, FrameType.TERMINAL)
    # Reference files are written to <skill_dir>/references, so image links
    # must be relative to that directory.
    refs_dir = os.path.join(self.skill_dir, "references")

    lines: list[str] = []

    # Title
    lines.append(f"# {video.title}\n")

    # Metadata block
    meta_parts = []
    if video.channel_name:
        if video.channel_url:
            meta_parts.append(f"**Source:** [{video.channel_name}]({video.channel_url})")
        else:
            meta_parts.append(f"**Source:** {video.channel_name}")
    if video.duration > 0:
        meta_parts.append(f"**Duration:** {_format_duration(video.duration)}")
    if video.upload_date:
        meta_parts.append(f"**Published:** {video.upload_date}")

    if meta_parts:
        lines.append("> " + " | ".join(meta_parts))

    if video.source_url:
        lines.append(f"> **URL:** [{video.source_url}]({video.source_url})")

    engagement_parts = []
    if video.view_count is not None:
        engagement_parts.append(f"**Views:** {_format_count(video.view_count)}")
    if video.like_count is not None:
        engagement_parts.append(f"**Likes:** {_format_count(video.like_count)}")
    if engagement_parts:
        lines.append("> " + " | ".join(engagement_parts))

    if video.tags:
        lines.append(f"> **Tags:** {', '.join(video.tags[:10])}")

    lines.append("")

    # Description summary (first 300 characters)
    if video.description:
        desc = video.description[:300]
        if len(video.description) > 300:
            desc += "..."
        lines.append(desc)
        lines.append("")

    lines.append("---\n")

    # Table of contents (from chapters or segments)
    if video.segments:
        lines.append("## Table of Contents\n")
        for seg in video.segments:
            label = seg.chapter_title or f"Segment {seg.index + 1}"
            lines.append(
                f"- [{label}](#{_sanitize_filename(label)}-{seg.timestamp_display.replace(' ', '')})"
            )
        lines.append("\n---\n")

    # Segments as sections
    for seg in video.segments:
        lines.append(seg.content)

        # The first language detected in this segment is used as the fence
        # hint for every OCR code block rendered from its keyframes.
        lang_hint = next(
            (cb.language for cb in seg.detected_code_blocks or [] if cb.language),
            "",
        )

        # Visual data (keyframes + OCR)
        for kf in seg.keyframes or []:
            if kf.image_path and os.path.exists(kf.image_path):
                # Markdown links use forward slashes on every platform.
                rel_path = os.path.relpath(kf.image_path, refs_dir).replace(os.sep, "/")
                timestamp = _format_duration(kf.timestamp)
                lines.append(f"\n> **Frame** ({kf.frame_type.value} at {timestamp}):")
                # Link the keyframe image; previously rel_path was computed
                # but never written to the document.
                lines.append(f"> ![Frame at {timestamp}]({rel_path})")

            if kf.sub_sections:
                # Per-panel OCR: only code-like panels are rendered.
                for ss in kf.sub_sections:
                    if ss.frame_type in code_frame_types and ss.ocr_text:
                        lines.append(f"\n```{lang_hint}")
                        lines.append(ss.ocr_text)
                        lines.append("```")
            elif kf.ocr_text:
                if kf.frame_type in code_frame_types:
                    lines.append(f"\n```{lang_hint}")
                    lines.append(kf.ocr_text)
                    lines.append("```")
                elif kf.frame_type == FrameType.SLIDE:
                    for text_line in kf.ocr_text.split("\n"):
                        if text_line.strip():
                            lines.append(f"> {text_line}")
                else:
                    lines.append(f"> **On-screen text:** {kf.ocr_text}")

        # Detected code blocks subsection
        if seg.detected_code_blocks:
            lines.append("\n#### Detected Code\n")
            for cb in seg.detected_code_blocks:
                lang_label = cb.language or "unknown"
                context_label = cb.context.value if cb.context else "unknown"
                lines.append(
                    f"**{lang_label}** ({context_label} at "
                    f"{_format_duration(cb.source_frame)}):\n"
                )
                lines.append(f"```{cb.language or ''}")
                lines.append(cb.code)
                lines.append("```\n")

        lines.append("\n---\n")

    # Code Timeline section (from text groups)
    if video.text_group_timeline and video.text_group_timeline.text_groups:
        tl = video.text_group_timeline
        lines.append("\n## Code Timeline\n")
        lines.append(
            f"> {tl.total_groups} code groups tracked, "
            f"{tl.total_edits} edits detected, "
            f"{tl.total_code_time:.0f}s of on-screen code\n"
        )

        for group in tl.text_groups:
            group_lang = group.detected_language or ""
            lines.append(f"### {group.group_id}")
            appearance_strs = [
                f"{_format_duration(start)} - {_format_duration(end)}"
                for start, end in group.appearances
            ]
            lines.append(f"**Appearances:** {', '.join(appearance_strs)}\n")

            lines.append(f"```{group_lang}")
            lines.append(group.full_text)
            lines.append("```\n")

            if group.edits:
                lines.append("**Edits:**\n")
                for edit in group.edits:
                    lines.append(f"- At {_format_duration(edit.timestamp)}:")
                    for added in edit.added_lines:
                        lines.append(f" + `{added}`")
                    for removed in edit.removed_lines:
                        lines.append(f" - `{removed}`")
                    for mod in edit.modified_lines:
                        lines.append(
                            f" ~ L{mod.get('line_num', '?')}: "
                            f"`{mod.get('old', '')}` → `{mod.get('new', '')}`"
                        )
                lines.append("")

        lines.append("---\n")

    # Audio-Visual Alignment section
    if video.audio_visual_alignments:
        lines.append("\n## Audio-Visual Alignment\n")
        lines.append(f"> {len(video.audio_visual_alignments)} code-narration pairs\n")

        for av in video.audio_visual_alignments:
            av_lang = av.language or ""
            lines.append(
                f"**{av.text_group_id}** "
                f"({_format_duration(av.start_time)} - {_format_duration(av.end_time)})\n"
            )
            lines.append(f"```{av_lang}")
            lines.append(av.on_screen_code)
            lines.append("```\n")
            lines.append(f"> **Narrator:** {av.transcript_during}\n")

        lines.append("---\n")

    # Transcript source info
    lines.append(f"\n*Transcript source: {video.transcript_source.value}*")
    if video.transcript_confidence > 0:
        lines.append(f"*Confidence: {video.transcript_confidence:.0%}*")

    return "\n".join(lines)
|
||||
|
||||
def _generate_skill_md(self) -> str:
    """Render the top-level SKILL.md document as one Markdown string.

    The document contains, in order: a title and description, an overview
    (video count, total duration and, when keyframes exist, visual
    extraction counts), one entry per video (source line, chapter topics,
    a preview of the first segment's transcript and a link to the
    reference file), any warnings collected on the result, and a final
    list of reference links.

    Returns:
        The SKILL.md content, with sections joined by newlines.
    """
    videos = self.result.videos

    # Use the configured description, otherwise infer one from the first
    # video (or from a placeholder VideoInfo when nothing was processed).
    description = self.description
    if not description:
        if videos:
            basis = videos[0]
        else:
            basis = VideoInfo(video_id="none", source_type=VideoSourceType.YOUTUBE)
        description = infer_description_from_video(basis, self.name)

    out = [f"# {self.name}\n", f"{description}\n"]

    # Overview
    duration_label = _format_duration(self.result.total_duration_seconds)
    out.append("## Overview\n")
    summary = (
        "This skill includes knowledge extracted from "
        f"{len(videos)} video(s) totaling {duration_label} of content."
    )

    # Visual extraction summary
    all_segments = [seg for vid in videos for seg in vid.segments]
    keyframe_count = sum(len(seg.keyframes) for seg in all_segments)
    ocr_count = sum(1 for seg in all_segments for frame in seg.keyframes if frame.ocr_text)
    code_count = sum(len(seg.detected_code_blocks) for seg in all_segments)
    if keyframe_count > 0:
        visual = f"\nVisual extraction: {keyframe_count} keyframes, {ocr_count} with on-screen text"
        if code_count > 0:
            visual += f", {code_count} code blocks detected"
        summary += visual + "."
    out.append(f"{summary}\n")

    # Video tutorials section
    out.append("## Video Tutorials\n")
    for vid in videos:
        out.append(f"### {vid.title}")

        facts = []
        if vid.channel_name:
            # Link the channel name to the source when a URL is known.
            facts.append(
                f"[{vid.channel_name}]({vid.source_url})" if vid.source_url else vid.channel_name
            )
        if vid.duration > 0:
            facts.append(_format_duration(vid.duration))
        if vid.view_count is not None:
            facts.append(f"{_format_count(vid.view_count)} views")
        if facts:
            out.append(f"**Source:** {' | '.join(facts)}\n")

        # Topics covered
        chapter_titles = [seg.chapter_title for seg in vid.segments if seg.chapter_title]
        if chapter_titles:
            out.append(f"**Topics covered:** {', '.join(chapter_titles)}\n")

        # First segment preview, capped at 200 characters
        if vid.segments and vid.segments[0].transcript:
            opening = vid.segments[0].transcript
            snippet = opening[:200]
            if len(opening) > 200:
                snippet += "..."
            out.append(f"{snippet}\n")

        reference = f"video_{_sanitize_filename(vid.title)}.md"
        out.append(f"> Full transcript: [references/{reference}](references/{reference})\n")
        out.append("---\n")

    # Warnings
    if self.result.warnings:
        out.append("## Notes\n")
        out.extend(f"- {note}" for note in self.result.warnings)
        out.append("")

    # References
    out.append("## References\n")
    for vid in videos:
        reference = f"video_{_sanitize_filename(vid.title)}.md"
        out.append(f"- [{vid.title}](references/{reference})")

    return "\n".join(out)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Entry Point
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def main() -> int:
    """Run the video scraper command-line interface.

    Parses the CLI arguments, configures logging, and then follows one of
    three paths: a dry run that only logs what would be processed, a skill
    build from a previously extracted JSON file, or a full extraction
    followed by a skill build and, when an enhance level above zero is
    requested, the workflow stages and SKILL.md enhancement.

    Returns:
        Exit code (0 for success, non-zero for error).
    """
    from skill_seekers.cli.arguments.video import add_video_arguments

    parser = argparse.ArgumentParser(
        prog="skill-seekers-video",
        description="Extract transcripts and metadata from videos and generate skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""\
Examples:
skill-seekers video --url https://www.youtube.com/watch?v=...
skill-seekers video --video-file recording.mp4
skill-seekers video --playlist https://www.youtube.com/playlist?list=...
skill-seekers video --from-json video_extracted.json
skill-seekers video --url https://youtu.be/... --languages en,es
""",
    )
    add_video_arguments(parser)
    args = parser.parse_args()

    def option(name, default=None):
        """Read an optional argparse attribute, tolerating its absence."""
        return getattr(args, name, default)

    # Setup logging: --verbose wins over --quiet.
    if args.verbose:
        log_level = logging.DEBUG
    elif args.quiet:
        log_level = logging.WARNING
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s")

    # Validate inputs: at least one source or a JSON file is required.
    has_source = any(option(key) for key in ("url", "video_file", "playlist"))
    has_json = option("from_json")
    if not has_source and not has_json:
        parser.error("Must specify --url, --video-file, --playlist, or --from-json")

    # Build config
    config = {
        "name": args.name or "video_skill",
        "description": option("description") or "",
        "output": option("output"),
        "url": option("url"),
        "video_file": option("video_file"),
        "playlist": option("playlist"),
        "languages": option("languages", "en"),
        "visual": option("visual", False),
        "whisper_model": option("whisper_model", "base"),
        "visual_interval": option("visual_interval", 0.7),
        "visual_min_gap": option("visual_min_gap", 0.5),
        "visual_similarity": option("visual_similarity", 3.0),
        "vision_ocr": option("vision_ocr", False),
    }

    converter = VideoToSkillConverter(config)

    # Dry run: report the plan and stop before doing any work.
    if args.dry_run:
        logger.info("DRY RUN — would process:")
        for key in ("url", "video_file", "playlist"):
            if config.get(key):
                logger.info(f" {key}: {config[key]}")
        for key in ("name", "languages", "visual"):
            logger.info(f" {key}: {config[key]}")
        return 0

    # Workflow 1: Build from JSON
    if has_json:
        logger.info(f"Loading extracted data from {args.from_json}")
        converter.load_extracted_data(args.from_json)
        converter.build_skill()
        logger.info(f"Skill built at {converter.skill_dir}")
        return 0

    # Workflow 2: Full extraction + build
    try:
        result = converter.process()
        if not result.videos:
            logger.error("No videos were successfully processed")
            for failure in result.errors or ():
                logger.error(f" {failure['source']}: {failure['error']}")
            return 1

        converter.save_extracted_data()
        converter.build_skill()

        logger.info(f"\nSkill built successfully at {converter.skill_dir}")
        logger.info(f" Videos: {len(result.videos)}")
        logger.info(f" Segments: {result.total_segments}")
        logger.info(f" Duration: {_format_duration(result.total_duration_seconds)}")
        logger.info(f" Processing time: {result.processing_time_seconds:.1f}s")

        for note in result.warnings or ():
            logger.warning(f" {note}")
    except RuntimeError as exc:
        logger.error(str(exc))
        return 1

    # Enhancement
    enhance_level = option("enhance_level", 0)
    if enhance_level > 0:
        # Auto-inject video-tutorial workflow if no workflow specified
        if not option("enhance_workflow"):
            args.enhance_workflow = ["video-tutorial"]

        # Run workflow stages (specialized video analysis)
        try:
            from skill_seekers.cli.workflow_runner import run_workflows

            run_workflows(
                args,
                context={
                    "skill_name": converter.name,
                    "skill_dir": converter.skill_dir,
                    "source_type": "video_tutorial",
                },
            )
        except ImportError:
            logger.debug("Workflow runner not available, skipping workflow stages")

        # Run traditional SKILL.md enhancement (reads references + rewrites)
        _run_video_enhancement(converter.skill_dir, enhance_level, args)

    return 0
|
||||
|
||||
|
||||
def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None:
    """Invoke the external ``skill-seekers-enhance`` command on a built skill.

    When no API key is available (neither ``ANTHROPIC_API_KEY`` nor
    ``ANTHROPIC_AUTH_TOKEN`` in the environment, nor ``args.api_key``), the
    function only logs how to run the enhancement manually and returns.
    Failures of the subprocess are logged as warnings and never raised.

    Args:
        skill_dir: Directory of the skill to enhance.
        enhance_level: Level forwarded as ``--enhance-level``.
        args: Parsed CLI namespace; only its optional ``api_key`` is read.
    """
    import os
    import subprocess

    cli_key = getattr(args, "api_key", None)
    env_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")

    if not (env_key or cli_key):
        logger.info("\n💡 Enhance your video skill with AI:")
        logger.info(" export ANTHROPIC_API_KEY=sk-ant-...")
        logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")
        return

    logger.info(f"\n🤖 Running video-aware SKILL.md enhancement (level {enhance_level})...")

    command = ["skill-seekers-enhance", skill_dir, "--enhance-level", str(enhance_level)]
    if cli_key:
        command += ["--api-key", cli_key]

    try:
        # check=True turns any non-zero exit status into CalledProcessError.
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        logger.warning("⚠ Enhancement failed, but skill was still built")
    except FileNotFoundError:
        logger.warning("⚠ skill-seekers-enhance not found. Run manually:")
        logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")
    else:
        logger.info("✅ Video skill enhancement complete!")
|
||||
|
||||
|
||||
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
218
src/skill_seekers/cli/video_segmenter.py
Normal file
218
src/skill_seekers/cli/video_segmenter.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""Video segmentation module.
|
||||
|
||||
Aligns transcript + metadata into VideoSegment objects using:
|
||||
1. Chapter-based segmentation (primary — uses YouTube chapters)
|
||||
2. Time-window segmentation (fallback — fixed-duration windows)
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from skill_seekers.cli.video_models import (
|
||||
SegmentContentType,
|
||||
TranscriptSegment,
|
||||
VideoInfo,
|
||||
VideoSegment,
|
||||
VideoSourceConfig,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _classify_content_type(transcript: str) -> SegmentContentType:
    """Guess a segment's content type from keywords in its transcript.

    Matching is case-insensitive substring search. Outro phrases are
    checked first, then intro phrases; a segment counts as live coding
    when at least two distinct code keywords occur. Anything else is
    treated as explanation.

    Args:
        transcript: Transcript text of the segment.

    Returns:
        The SegmentContentType chosen by the keyword heuristic.
    """
    text = transcript.lower()

    outro_markers = ("thanks for watching", "subscribe", "see you next", "that's it for")
    intro_markers = ("welcome", "hello", "today we", "in this video", "let's get started")
    code_markers = ("import ", "def ", "class ", "function ", "const ", "npm ", "pip ", "git ")

    for marker in outro_markers:
        if marker in text:
            return SegmentContentType.OUTRO

    for marker in intro_markers:
        if marker in text:
            return SegmentContentType.INTRO

    code_hits = [marker for marker in code_markers if marker in text]
    if len(code_hits) >= 2:
        return SegmentContentType.LIVE_CODING

    return SegmentContentType.EXPLANATION
|
||||
|
||||
|
||||
def _build_segment_content(
|
||||
transcript: str,
|
||||
chapter_title: str | None,
|
||||
start_time: float,
|
||||
end_time: float,
|
||||
) -> str:
|
||||
"""Build merged content string for a segment."""
|
||||
parts = []
|
||||
|
||||
# Add chapter heading
|
||||
start_min, start_sec = divmod(int(start_time), 60)
|
||||
end_min, end_sec = divmod(int(end_time), 60)
|
||||
ts = f"{start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d}"
|
||||
|
||||
if chapter_title:
|
||||
parts.append(f"### {chapter_title} ({ts})\n")
|
||||
else:
|
||||
parts.append(f"### Segment ({ts})\n")
|
||||
|
||||
if transcript:
|
||||
parts.append(transcript)
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _get_transcript_in_range(
|
||||
transcript_segments: list[TranscriptSegment],
|
||||
start_time: float,
|
||||
end_time: float,
|
||||
) -> tuple[str, float]:
|
||||
"""Get concatenated transcript text and average confidence for a time range.
|
||||
|
||||
Returns:
|
||||
Tuple of (text, avg_confidence).
|
||||
"""
|
||||
texts = []
|
||||
confidences = []
|
||||
|
||||
for seg in transcript_segments:
|
||||
# Check overlap: segment overlaps with time range
|
||||
if seg.end > start_time and seg.start < end_time:
|
||||
texts.append(seg.text)
|
||||
confidences.append(seg.confidence)
|
||||
|
||||
text = " ".join(texts)
|
||||
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
||||
return text, avg_confidence
|
||||
|
||||
|
||||
def segment_by_chapters(
    video_info: VideoInfo,
    transcript_segments: list[TranscriptSegment],
) -> list[VideoSegment]:
    """Build one VideoSegment per chapter of the video.

    Each chapter's time range selects the overlapping transcript text,
    which is then classified and merged into the segment content. The
    transcript confidence is used for both confidence fields.

    Args:
        video_info: Video metadata with chapters.
        transcript_segments: Raw transcript segments.

    Returns:
        List of VideoSegment objects aligned to chapters, in chapter order.
    """

    def _chapter_segment(position: int, chapter) -> VideoSegment:
        """Convert a single chapter into a VideoSegment."""
        begin, finish = chapter.start_time, chapter.end_time
        text, score = _get_transcript_in_range(transcript_segments, begin, finish)
        kind = _classify_content_type(text)
        body = _build_segment_content(text, chapter.title, begin, finish)
        return VideoSegment(
            index=position,
            start_time=begin,
            end_time=finish,
            duration=finish - begin,
            transcript=text,
            transcript_confidence=score,
            chapter_title=chapter.title,
            content=body,
            confidence=score,
            content_type=kind,
        )

    return [
        _chapter_segment(position, chapter)
        for position, chapter in enumerate(video_info.chapters)
    ]
|
||||
|
||||
|
||||
def segment_by_time_window(
    video_info: VideoInfo,
    transcript_segments: list[TranscriptSegment],
    window_seconds: float = 120.0,
) -> list[VideoSegment]:
    """Segment video using fixed time windows.

    The timeline from 0 to the video duration is cut into consecutive
    windows of ``window_seconds`` (the last one may be shorter). Windows
    whose transcript is empty or whitespace-only are dropped, and the
    remaining segments are numbered consecutively from 0.

    Args:
        video_info: Video metadata. When its duration is not positive, the
            latest transcript end time is used instead.
        transcript_segments: Raw transcript segments.
        window_seconds: Duration of each window in seconds; must be > 0.

    Returns:
        List of VideoSegment objects; empty when no duration can be
        determined.

    Raises:
        ValueError: If ``window_seconds`` is not a positive number.
    """
    segments = []
    duration = video_info.duration

    # Fall back to the transcript's extent when metadata has no duration.
    if duration <= 0 and transcript_segments:
        duration = max(seg.end for seg in transcript_segments)

    if duration <= 0:
        return segments

    # A zero or negative window would never advance current_time and the
    # loop below would spin forever. `not x > 0` also rejects NaN.
    if not window_seconds > 0:
        raise ValueError(f"window_seconds must be positive, got {window_seconds!r}")

    current_time = 0.0
    index = 0

    while current_time < duration:
        end_time = min(current_time + window_seconds, duration)

        transcript, confidence = _get_transcript_in_range(
            transcript_segments, current_time, end_time
        )

        # Skip silent windows so the output contains only spoken content.
        if transcript.strip():
            content_type = _classify_content_type(transcript)
            content = _build_segment_content(transcript, None, current_time, end_time)

            segments.append(
                VideoSegment(
                    index=index,
                    start_time=current_time,
                    end_time=end_time,
                    duration=end_time - current_time,
                    transcript=transcript,
                    transcript_confidence=confidence,
                    content=content,
                    confidence=confidence,
                    content_type=content_type,
                )
            )
            index += 1

        current_time = end_time

    return segments
|
||||
|
||||
|
||||
def segment_video(
    video_info: VideoInfo,
    transcript_segments: list[TranscriptSegment],
    config: VideoSourceConfig,
) -> list[VideoSegment]:
    """Split a video into segments with the best strategy available.

    Chapter-based segmentation is tried first when the video has chapters.
    If there are no chapters, or chapter segmentation yields nothing, the
    video is cut into fixed windows of ``config.time_window_seconds``.

    Args:
        video_info: Video metadata.
        transcript_segments: Raw transcript segments.
        config: Video source configuration.

    Returns:
        List of VideoSegment objects.
    """
    if video_info.chapters:
        logger.info(f"Using chapter-based segmentation ({len(video_info.chapters)} chapters)")
        by_chapter = segment_by_chapters(video_info, transcript_segments)
        if by_chapter:
            return by_chapter

    # No usable chapters: fall back to fixed-size windows.
    window = config.time_window_seconds
    logger.info(f"Using time-window segmentation ({window}s windows)")
    return segment_by_time_window(video_info, transcript_segments, window)
|
||||
370
src/skill_seekers/cli/video_transcript.py
Normal file
370
src/skill_seekers/cli/video_transcript.py
Normal file
@@ -0,0 +1,370 @@
|
||||
"""Video transcript extraction module.
|
||||
|
||||
Handles all transcript acquisition:
|
||||
- YouTube captions via youtube-transcript-api (Tier 1)
|
||||
- Subtitle file parsing: SRT and VTT (Tier 1)
|
||||
- Whisper ASR stub (Tier 2 — raises RuntimeError with install instructions)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from skill_seekers.cli.video_models import (
|
||||
TranscriptSegment,
|
||||
TranscriptSource,
|
||||
VideoInfo,
|
||||
VideoSourceConfig,
|
||||
VideoSourceType,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Optional dependency: youtube-transcript-api
|
||||
try:
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
HAS_YOUTUBE_TRANSCRIPT = True
|
||||
except ImportError:
|
||||
HAS_YOUTUBE_TRANSCRIPT = False
|
||||
|
||||
# Optional dependency: faster-whisper (Tier 2)
|
||||
try:
|
||||
from faster_whisper import WhisperModel # noqa: F401
|
||||
|
||||
HAS_WHISPER = True
|
||||
except ImportError:
|
||||
HAS_WHISPER = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# YouTube Transcript Extraction (Tier 1)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def extract_youtube_transcript(
    video_id: str,
    languages: list[str] | None = None,
) -> tuple[list[TranscriptSegment], TranscriptSource]:
    """Download YouTube captions through youtube-transcript-api.

    Snippets whose text is empty after stripping are discarded. Every
    returned segment is tagged ``YOUTUBE_MANUAL`` with confidence 1.0.
    Any error raised while fetching is logged as a warning and reported as
    "no transcript" rather than propagated.

    Args:
        video_id: YouTube video ID (11 chars).
        languages: Language preference list (e.g., ['en', 'tr']);
            defaults to ['en'].

    Returns:
        Tuple of (transcript segments, source type); ``([], NONE)`` when
        nothing could be fetched.

    Raises:
        RuntimeError: If youtube-transcript-api is not installed.
    """
    if not HAS_YOUTUBE_TRANSCRIPT:
        raise RuntimeError(
            "youtube-transcript-api is required for YouTube transcript extraction.\n"
            'Install with: pip install "skill-seekers[video]"\n'
            "Or: pip install youtube-transcript-api"
        )

    preferred = ["en"] if languages is None else languages
    # NOTE(review): auto-generated captions are also labelled YOUTUBE_MANUAL
    # here — confirm whether the two should be distinguished.
    source = TranscriptSource.YOUTUBE_MANUAL

    try:
        fetched = YouTubeTranscriptApi().fetch(video_id, languages=preferred)
        segments = [
            TranscriptSegment(
                text=cleaned,
                start=snippet.start,
                end=snippet.start + snippet.duration,
                confidence=1.0,
                source=source,
            )
            for snippet in fetched.snippets
            if (cleaned := snippet.text.strip())
        ]
    except Exception as e:
        logger.warning(f"Failed to fetch YouTube transcript for {video_id}: {e}")
        return [], TranscriptSource.NONE

    if not segments:
        return [], TranscriptSource.NONE
    return segments, source
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Subtitle File Parsing (Tier 1)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def _parse_timestamp_srt(ts: str) -> float:
|
||||
"""Parse SRT timestamp (HH:MM:SS,mmm) to seconds."""
|
||||
ts = ts.strip().replace(",", ".")
|
||||
parts = ts.split(":")
|
||||
if len(parts) == 3:
|
||||
h, m, s = parts
|
||||
return int(h) * 3600 + int(m) * 60 + float(s)
|
||||
return 0.0
|
||||
|
||||
|
||||
def _parse_timestamp_vtt(ts: str) -> float:
|
||||
"""Parse VTT timestamp (HH:MM:SS.mmm or MM:SS.mmm) to seconds."""
|
||||
ts = ts.strip()
|
||||
parts = ts.split(":")
|
||||
if len(parts) == 3:
|
||||
h, m, s = parts
|
||||
return int(h) * 3600 + int(m) * 60 + float(s)
|
||||
elif len(parts) == 2:
|
||||
m, s = parts
|
||||
return int(m) * 60 + float(s)
|
||||
return 0.0
|
||||
|
||||
|
||||
def parse_srt(path: str) -> list[TranscriptSegment]:
    """Parse an SRT subtitle file into TranscriptSegments.

    The file is split into blank-line-separated cue blocks. In each block
    the first line containing ``-->`` is the timing line and every line
    after it is caption text; later lines are text even when they contain
    ``-->`` themselves. Tokens following the end timestamp on the timing
    line are ignored, HTML-style tags are removed from the text, and cues
    with unparseable timestamps are skipped instead of aborting the file.

    Args:
        path: Path to .srt file.

    Returns:
        List of TranscriptSegment objects in file order.
    """
    content = Path(path).read_text(encoding="utf-8", errors="replace")
    segments = []

    # SRT format: index\nstart --> end\ntext\n\n
    for block in re.split(r"\n\s*\n", content.strip()):
        # Drop the carriage return left behind by CRLF line endings so it
        # does not end up inside the caption text.
        lines = [ln.rstrip("\r") for ln in block.strip().split("\n")]
        if len(lines) < 2:
            continue

        # Only the FIRST arrow line is the timing line; a caption that
        # happens to contain "-->" must not replace it.
        ts_index = next((i for i, ln in enumerate(lines) if "-->" in ln), None)
        if ts_index is None:
            continue

        parts = lines[ts_index].split("-->")
        if len(parts) != 2:
            continue

        # Keep only the first token after the arrow so trailing tokens on
        # the timing line are ignored. A missing end timestamp parses as
        # 0.0, as before.
        end_tokens = parts[1].split()
        try:
            start = _parse_timestamp_srt(parts[0])
            end = _parse_timestamp_srt(end_tokens[0] if end_tokens else "")
        except ValueError:
            logger.debug(f"Skipping SRT cue with malformed timing line: {lines[ts_index]!r}")
            continue

        text = " ".join(lines[ts_index + 1 :]).strip()

        # Remove HTML tags, then trim whitespace the tags may have wrapped.
        text = re.sub(r"<[^>]+>", "", text).strip()

        if text:
            segments.append(
                TranscriptSegment(
                    text=text,
                    start=start,
                    end=end,
                    confidence=1.0,
                    source=TranscriptSource.SUBTITLE_FILE,
                )
            )

    return segments
|
||||
|
||||
|
||||
def parse_vtt(path: str) -> list[TranscriptSegment]:
    """Parse a WebVTT subtitle file into TranscriptSegments.

    Everything before the first line that starts with a timestamp is
    treated as header and skipped. A cue starts at a line containing
    ``-->`` and ends at the next blank line, the next timing line, or the
    end of the file. HTML-style tags are removed from the text, and cues
    whose timing line cannot be parsed are skipped instead of aborting
    the file.

    Args:
        path: Path to .vtt file.

    Returns:
        List of TranscriptSegment objects in file order.
    """
    content = Path(path).read_text(encoding="utf-8", errors="replace")
    segments = []

    def _emit(text_lines: list[str], start: float, end: float) -> None:
        """Append one cue to ``segments`` unless its cleaned text is empty."""
        text = " ".join(text_lines).strip()
        text = re.sub(r"<[^>]+>", "", text).strip()
        if text:
            segments.append(
                TranscriptSegment(
                    text=text,
                    start=start,
                    end=end,
                    confidence=1.0,
                    source=TranscriptSource.SUBTITLE_FILE,
                )
            )

    lines = content.strip().split("\n")

    # Skip WEBVTT header and any metadata
    i = 0
    while i < len(lines) and not re.match(r"\d{2}:\d{2}", lines[i]):
        i += 1

    current_text_lines = []
    current_start = 0.0
    current_end = 0.0
    in_cue = False

    for raw_line in lines[i:]:
        line = raw_line.strip()

        if "-->" in line:
            # A new timing line closes any cue that is still open.
            if in_cue and current_text_lines:
                _emit(current_text_lines, current_start, current_end)
            current_text_lines = []

            parts = line.split("-->")
            # Only the first token after the arrow is the end timestamp;
            # anything following it on the line is ignored.
            end_tokens = parts[1].split()
            try:
                if not end_tokens:
                    raise ValueError("missing end timestamp")
                current_start = _parse_timestamp_vtt(parts[0])
                current_end = _parse_timestamp_vtt(end_tokens[0])
            except ValueError:
                # Malformed timing: drop this cue but keep parsing the file.
                logger.debug(f"Skipping VTT cue with malformed timing line: {line!r}")
                in_cue = False
                continue
            in_cue = True

        elif line == "":
            # NOTE(review): a blank line is taken to end the cue even when
            # no text was collected — confirm against the original layout.
            if in_cue and current_text_lines:
                _emit(current_text_lines, current_start, current_end)
            current_text_lines = []
            in_cue = False

        elif in_cue and not line.isdigit():
            # Digit-only lines inside a cue are dropped; the original
            # comment treats them as cue identifiers.
            current_text_lines.append(line)

    # Handle last cue
    if in_cue and current_text_lines:
        _emit(current_text_lines, current_start, current_end)

    return segments
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Whisper Stub (Tier 2)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def transcribe_with_whisper(
    audio_path: str,  # noqa: ARG001
    model: str = "base",  # noqa: ARG001
    language: str | None = None,  # noqa: ARG001
) -> list[TranscriptSegment]:
    """Placeholder for faster-whisper transcription (Tier 2).

    This function never returns: the arguments are accepted for interface
    stability only, and every call raises.

    Args:
        audio_path: Path to the media file to transcribe (unused).
        model: Whisper model name (unused).
        language: Optional language hint (unused).

    Raises:
        RuntimeError: If faster-whisper is not installed.
        NotImplementedError: If faster-whisper is installed, because the
            transcription itself is not implemented yet.
    """
    if HAS_WHISPER:
        # Tier 2 implementation placeholder
        raise NotImplementedError("Whisper transcription will be implemented in Tier 2")

    raise RuntimeError(
        "faster-whisper is required for Whisper transcription.\n"
        'Install with: pip install "skill-seekers[video-full]"\n'
        "Or: pip install faster-whisper"
    )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main Entry Point
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def get_transcript(
    video_info: VideoInfo,
    config: VideoSourceConfig,
) -> tuple[list[TranscriptSegment], TranscriptSource]:
    """Get transcript for a video, trying available methods in priority order.

    Priority:
        1. YouTube API (for YouTube videos, when the client is installed)
        2. Subtitle files (.srt, then .vtt) next to a local video file
        3. Whisper fallback (Tier 2, only when faster-whisper is installed)
        4. NONE (no transcript available)

    A method that fails or yields no segments falls through to the next
    one; failures are logged, never raised from steps 1 and 3.

    Args:
        video_info: Video metadata.
        config: Video source configuration.

    Returns:
        Tuple of (transcript segments, source type); ``([], NONE)`` when
        every method came up empty.
    """
    languages = config.languages or ["en"]

    # 1. Try YouTube API for YouTube videos
    if video_info.source_type == VideoSourceType.YOUTUBE and HAS_YOUTUBE_TRANSCRIPT:
        try:
            segments, source = extract_youtube_transcript(video_info.video_id, languages)
            if segments:
                logger.info(
                    f"Got {len(segments)} transcript segments via YouTube API "
                    f"({source.value}) for '{video_info.title}'"
                )
                return segments, source
        except Exception as e:
            logger.warning(f"YouTube transcript failed: {e}")

    # 2. Try subtitle files for local videos
    if video_info.file_path:
        video_path = Path(video_info.file_path)

        # Look for a sibling file with the same stem; .srt wins over .vtt.
        for ext, parse in ((".srt", parse_srt), (".vtt", parse_vtt)):
            sub_path = video_path.parent / f"{video_path.stem}{ext}"
            if sub_path.exists():
                logger.info(f"Found subtitle file: {sub_path}")
                segments = parse(str(sub_path))
                if segments:
                    return segments, TranscriptSource.SUBTITLE_FILE

    # 3. Whisper fallback (Tier 2 — only if installed)
    if HAS_WHISPER and video_info.file_path:
        try:
            segments = transcribe_with_whisper(
                video_info.file_path,
                model=config.whisper_model,
                language=languages[0] if languages else None,
            )
            if segments:
                return segments, TranscriptSource.WHISPER
        except RuntimeError as e:
            # NotImplementedError is a RuntimeError subclass, so this also
            # covers the unimplemented stub. Record why the fallback was
            # skipped instead of discarding the reason silently.
            logger.debug(f"Whisper transcription unavailable for '{video_info.title}': {e}")

    # 4. No transcript available
    logger.warning(f"No transcript available for '{video_info.title}'")
    return [], TranscriptSource.NONE
|
||||
2182
src/skill_seekers/cli/video_visual.py
Normal file
2182
src/skill_seekers/cli/video_visual.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -98,6 +98,7 @@ try:
|
||||
scrape_docs_impl,
|
||||
scrape_github_impl,
|
||||
scrape_pdf_impl,
|
||||
scrape_video_impl,
|
||||
# Splitting tools
|
||||
split_config_impl,
|
||||
submit_config_impl,
|
||||
@@ -420,6 +421,55 @@ async def scrape_pdf(
|
||||
return str(result)
|
||||
|
||||
|
||||
@safe_tool_decorator(
    description="Extract transcripts and metadata from videos (YouTube, Vimeo, local files) and build Claude skill."
)
async def scrape_video(
    url: str | None = None,
    video_file: str | None = None,
    playlist: str | None = None,
    name: str | None = None,
    description: str | None = None,
    languages: str | None = None,
    from_json: str | None = None,
) -> str:
    """
    Scrape video content and build Claude skill.

    Only the options that were actually supplied (truthy values) are
    forwarded to the underlying implementation.

    Args:
        url: Video URL (YouTube, Vimeo)
        video_file: Local video file path
        playlist: Playlist URL
        name: Skill name
        description: Skill description
        languages: Transcript language preferences (comma-separated)
        from_json: Build from extracted JSON file

    Returns:
        Text of the first result item when the implementation returns a
        non-empty list, otherwise the string form of the whole result.
    """
    supplied = {
        "url": url,
        "video_file": video_file,
        "playlist": playlist,
        "name": name,
        "description": description,
        "languages": languages,
        "from_json": from_json,
    }
    args = {key: value for key, value in supplied.items() if value}

    result = await scrape_video_impl(args)
    if not (isinstance(result, list) and result):
        return str(result)

    first = result[0]
    return first.text if hasattr(first, "text") else str(first)
|
||||
|
||||
|
||||
@safe_tool_decorator(
|
||||
description="Analyze local codebase and extract code knowledge. Walks directory tree, analyzes code files, extracts signatures, docstrings, and optionally generates API reference documentation and dependency graphs."
|
||||
)
|
||||
|
||||
@@ -63,6 +63,9 @@ from .scraping_tools import (
|
||||
from .scraping_tools import (
|
||||
scrape_pdf_tool as scrape_pdf_impl,
|
||||
)
|
||||
from .scraping_tools import (
|
||||
scrape_video_tool as scrape_video_impl,
|
||||
)
|
||||
from .source_tools import (
|
||||
add_config_source_tool as add_config_source_impl,
|
||||
)
|
||||
@@ -123,6 +126,7 @@ __all__ = [
|
||||
"scrape_docs_impl",
|
||||
"scrape_github_impl",
|
||||
"scrape_pdf_impl",
|
||||
"scrape_video_impl",
|
||||
"scrape_codebase_impl",
|
||||
"detect_patterns_impl",
|
||||
"extract_test_examples_impl",
|
||||
|
||||
@@ -356,6 +356,81 @@ async def scrape_pdf_tool(args: dict) -> list[TextContent]:
|
||||
return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
|
||||
|
||||
|
||||
async def scrape_video_tool(args: dict) -> list[TextContent]:
    """
    Build and run a ``video_scraper.py`` command for a single video source.

    Exactly one source is used. When several are supplied, the first one
    present (and truthy) in this order wins: ``from_json``, ``url``,
    ``video_file``, ``playlist``. If none is supplied, an error message is
    returned and the scraper is not run.

    Args:
        args: Dictionary containing:
            - url (str, optional): Video URL (YouTube, Vimeo)
            - video_file (str, optional): Local video file path
            - playlist (str, optional): Playlist URL
            - name (str, optional): Skill name
            - description (str, optional): Skill description
            - languages (str, optional): Language preferences (comma-separated)
            - from_json (str, optional): Build from extracted JSON file

    Returns:
        List[TextContent]: A single text item holding a progress header
        followed by the scraper's stdout. When the scraper exits with a
        non-zero return code, its stderr is appended under an error heading.
    """
    # Source keys in precedence order: (args key, CLI flag, option keys that
    # are forwarded together with that source).
    # NOTE(review): options are forwarded only for the sources listed here
    # (e.g. "languages" only with "url", nothing with "from_json"). Confirm
    # against the video_scraper.py CLI whether that restriction is intended.
    source_table = (
        ("from_json", "--from-json", ()),
        ("url", "--url", ("name", "description", "languages")),
        ("video_file", "--video-file", ("name", "description")),
        ("playlist", "--playlist", ("name",)),
    )
    option_flags = {
        "name": "--name",
        "description": "--description",
        "languages": "--languages",
    }

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")]

    chosen = next((row for row in source_table if args.get(row[0])), None)
    if chosen is None:
        return [
            TextContent(
                type="text",
                text="❌ Error: Must specify --url, --video-file, --playlist, or --from-json",
            )
        ]

    source_key, source_flag, forwarded_options = chosen
    cmd += [source_flag, args.get(source_key)]
    for option_key in forwarded_options:
        option_value = args.get(option_key)
        if option_value:
            cmd += [option_flags[option_key], option_value]

    # Run video_scraper.py with streaming
    timeout = 600  # 10 minutes for video extraction

    header = "".join(
        (
            "🎬 Scraping video content...\n",
            f"⏱️ Maximum time: {timeout // 60} minutes\n\n",
        )
    )

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    report = header + stdout
    if returncode != 0:
        # Failure: keep whatever was printed and attach stderr below it.
        report = f"{report}\n\n❌ Error:\n{stderr}"
    return [TextContent(type="text", text=report)]
|
||||
|
||||
|
||||
async def scrape_github_tool(args: dict) -> list[TextContent]:
|
||||
"""
|
||||
Scrape GitHub repository and build Claude skill.
|
||||
|
||||
111
src/skill_seekers/workflows/video-tutorial.yaml
Normal file
111
src/skill_seekers/workflows/video-tutorial.yaml
Normal file
@@ -0,0 +1,111 @@
|
||||
name: video-tutorial
|
||||
description: >
|
||||
Video tutorial enhancement workflow. Cleans OCR noise, reconstructs code from
|
||||
transcript + visual data, detects programming languages, and synthesizes a
|
||||
coherent tutorial skill from raw video extraction output.
|
||||
version: "1.0"
|
||||
applies_to:
|
||||
- video_scraping
|
||||
variables: {}
|
||||
stages:
|
||||
- name: ocr_code_cleanup
|
||||
type: custom
|
||||
target: skill_md
|
||||
enabled: true
|
||||
uses_history: false
|
||||
prompt: >
|
||||
You are reviewing code blocks extracted from video tutorial OCR.
|
||||
The OCR output is noisy — it contains line numbers, UI chrome text,
|
||||
garbled characters, and incomplete lines.
|
||||
|
||||
Clean each code block by:
|
||||
1. Remove line numbers that OCR captured (leading digits like "1 ", "2 ", "23 ")
|
||||
2. Remove UI elements (tab bar text, file names, button labels)
|
||||
3. Fix common OCR errors (l/1, O/0, rn/m confusions)
|
||||
4. Remove animation timeline numbers or frame counters
|
||||
5. Strip trailing whitespace and normalize indentation
|
||||
|
||||
Output JSON with:
|
||||
- "cleaned_blocks": array of cleaned code strings
|
||||
- "languages_detected": map of block index to detected language
|
||||
- "confidence": overall confidence in the cleanup (0-1)
|
||||
|
||||
- name: language_detection
|
||||
type: custom
|
||||
target: skill_md
|
||||
enabled: true
|
||||
uses_history: true
|
||||
prompt: >
|
||||
Based on the previous OCR cleanup results and the transcript content,
|
||||
determine the programming language for each code block.
|
||||
|
||||
Detection strategy (in priority order):
|
||||
1. Narrator mentions: "in GDScript", "this Python function", "our C# class"
|
||||
2. Code patterns: extends/func/signal=GDScript, def/import=Python,
|
||||
function/const/let=JavaScript, using/namespace=C#
|
||||
3. File extensions visible in OCR (.gd, .py, .js, .cs)
|
||||
4. Framework context from transcript (Godot=GDScript, Unity=C#, Django=Python)
|
||||
|
||||
Output JSON with:
|
||||
- "language_map": map of block index to language identifier
|
||||
- "primary_language": the main language used in the tutorial
|
||||
- "framework": detected framework/engine if any
|
||||
|
||||
- name: tutorial_synthesis
|
||||
type: custom
|
||||
target: skill_md
|
||||
enabled: true
|
||||
uses_history: true
|
||||
prompt: >
|
||||
Synthesize the cleaned code blocks, detected languages, and transcript
|
||||
into a coherent tutorial structure.
|
||||
|
||||
Group content by TOPIC rather than timestamp:
|
||||
1. Identify the main concepts taught in the tutorial
|
||||
2. Group related code blocks under concept headings
|
||||
3. Use narrator explanations as descriptions for each code block
|
||||
4. Build a progressive learning path where concepts build on each other
|
||||
5. Show final working code for each concept, not intermediate OCR states
|
||||
|
||||
Use the Audio-Visual Alignment pairs (code + narrator text) as the
|
||||
primary source for creating annotated examples.
|
||||
|
||||
Output JSON with:
|
||||
- "sections": array of tutorial sections with title, description, code examples
|
||||
- "prerequisites": what the viewer should know beforehand
|
||||
- "key_concepts": important terms and their definitions from the tutorial
|
||||
- "learning_path": ordered list of concept names
|
||||
|
||||
- name: skill_polish
|
||||
type: custom
|
||||
target: skill_md
|
||||
enabled: true
|
||||
uses_history: true
|
||||
prompt: >
|
||||
Using all previous stage results, polish the SKILL.md for this video tutorial.
|
||||
|
||||
Create:
|
||||
1. Clear "When to Use This Skill" with specific trigger conditions
|
||||
2. Quick Reference with 5-10 clean, annotated code examples
|
||||
3. Step-by-step guide following the tutorial flow
|
||||
4. Key concepts with definitions from the narrator
|
||||
5. Proper language tags on all code fences
|
||||
|
||||
Rules:
|
||||
- Never include raw OCR artifacts (line numbers, UI chrome)
|
||||
- Always use correct language tags
|
||||
- Keep code examples short and focused (5-30 lines)
|
||||
- Make it actionable for someone implementing what the tutorial teaches
|
||||
|
||||
Output JSON with:
|
||||
- "improved_overview": enhanced overview section
|
||||
- "quick_start": concise getting-started snippet
|
||||
- "key_concepts": essential concepts with definitions
|
||||
- "code_examples": array of clean, annotated code examples
|
||||
|
||||
post_process:
|
||||
reorder_sections: []
|
||||
add_metadata:
|
||||
enhanced: true
|
||||
workflow: video-tutorial
|
||||
source_type: video
|
||||
Reference in New Issue
Block a user