Add complete video tutorial extraction system that converts YouTube videos and local video files into AI-consumable skills.

The pipeline extracts transcripts, performs visual OCR on code editor panels independently, tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection, tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
341 lines
12 KiB
Python
341 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Unified Config Validator
|
|
|
|
Validates unified config format that supports multiple sources:
|
|
- documentation (website scraping)
|
|
- github (repository scraping)
|
|
- pdf (PDF document scraping)
|
|
- local (local codebase analysis)
|
|
|
|
Legacy config format support removed in v2.11.0.
|
|
All configs must use unified format with 'sources' array.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Import-time logging setup: request INFO-level output on the root logger.
# logging.basicConfig() is a no-op when the root logger already has handlers,
# so an application that configured logging earlier keeps its own settings.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger used by ConfigValidator for progress and warnings.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ConfigValidator:
|
|
"""
|
|
Validates unified config format (legacy support removed in v2.11.0).
|
|
"""
|
|
|
|
# Valid source types
|
|
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
|
|
|
|
# Valid merge modes
|
|
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
|
|
|
|
# Valid code analysis depth levels
|
|
VALID_DEPTH_LEVELS = {"surface", "deep", "full"}
|
|
|
|
# Valid AI modes for C3.x enhancement
|
|
VALID_AI_MODES = {"auto", "api", "local", "none"}
|
|
|
|
def __init__(self, config_or_path: dict[str, Any] | str):
|
|
"""
|
|
Initialize validator with config dict or file path.
|
|
|
|
Args:
|
|
config_or_path: Either a config dict or path to config JSON file
|
|
"""
|
|
if isinstance(config_or_path, dict):
|
|
self.config_path = None
|
|
self.config = config_or_path
|
|
else:
|
|
self.config_path = config_or_path
|
|
self.config = self._load_config()
|
|
self.is_unified = True # Always unified format now
|
|
|
|
def _load_config(self) -> dict[str, Any]:
|
|
"""Load JSON config file."""
|
|
try:
|
|
with open(self.config_path, encoding="utf-8") as f:
|
|
return json.load(f)
|
|
except FileNotFoundError as e:
|
|
raise ValueError(f"Config file not found: {self.config_path}") from e
|
|
except json.JSONDecodeError as e:
|
|
raise ValueError(f"Invalid JSON in config file: {e}") from e
|
|
|
|
def validate(self) -> bool:
|
|
"""
|
|
Validate unified config format.
|
|
|
|
Returns:
|
|
True if valid
|
|
|
|
Raises:
|
|
ValueError if invalid with detailed error message
|
|
"""
|
|
# Check if legacy format (no sources array)
|
|
if "sources" not in self.config:
|
|
raise ValueError(
|
|
"\n❌ LEGACY CONFIG FORMAT DETECTED\n\n"
|
|
" Legacy config format was removed in v2.11.0.\n"
|
|
" All configs must now use unified format with 'sources' array.\n\n"
|
|
" OLD FORMAT (removed):\n"
|
|
" {\n"
|
|
' "name": "example",\n'
|
|
' "base_url": "https://..."\n'
|
|
" }\n\n"
|
|
" NEW FORMAT (required):\n"
|
|
" {\n"
|
|
' "name": "example",\n'
|
|
' "description": "...",\n'
|
|
' "sources": [\n'
|
|
" {\n"
|
|
' "type": "documentation",\n'
|
|
' "base_url": "https://..."\n'
|
|
" }\n"
|
|
" ]\n"
|
|
" }\n\n"
|
|
" 📖 See: https://skillseekersweb.com/docs/config-format\n"
|
|
)
|
|
|
|
return self._validate_unified()
|
|
|
|
def _validate_unified(self) -> bool:
|
|
"""Validate unified config format."""
|
|
logger.info("Validating unified config format...")
|
|
|
|
# Required top-level fields
|
|
if "name" not in self.config:
|
|
raise ValueError("Missing required field: 'name'")
|
|
|
|
if "description" not in self.config:
|
|
raise ValueError("Missing required field: 'description'")
|
|
|
|
if "sources" not in self.config:
|
|
raise ValueError("Missing required field: 'sources'")
|
|
|
|
# Validate sources array
|
|
sources = self.config["sources"]
|
|
|
|
if not isinstance(sources, list):
|
|
raise ValueError("'sources' must be an array")
|
|
|
|
if len(sources) == 0:
|
|
raise ValueError("'sources' array cannot be empty")
|
|
|
|
# Validate merge_mode (optional)
|
|
merge_mode = self.config.get("merge_mode", "rule-based")
|
|
if merge_mode not in self.VALID_MERGE_MODES:
|
|
raise ValueError(
|
|
f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}"
|
|
)
|
|
|
|
# Validate each source
|
|
for i, source in enumerate(sources):
|
|
self._validate_source(source, i)
|
|
|
|
logger.info(f"✅ Unified config valid: {len(sources)} sources")
|
|
return True
|
|
|
|
def _validate_source(self, source: dict[str, Any], index: int):
|
|
"""Validate individual source configuration."""
|
|
# Check source has 'type' field
|
|
if "type" not in source:
|
|
raise ValueError(f"Source {index}: Missing required field 'type'")
|
|
|
|
source_type = source["type"]
|
|
|
|
if source_type not in self.VALID_SOURCE_TYPES:
|
|
raise ValueError(
|
|
f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}"
|
|
)
|
|
|
|
# Type-specific validation
|
|
if source_type == "documentation":
|
|
self._validate_documentation_source(source, index)
|
|
elif source_type == "github":
|
|
self._validate_github_source(source, index)
|
|
elif source_type == "pdf":
|
|
self._validate_pdf_source(source, index)
|
|
elif source_type == "local":
|
|
self._validate_local_source(source, index)
|
|
|
|
def _validate_documentation_source(self, source: dict[str, Any], index: int):
|
|
"""Validate documentation source configuration."""
|
|
if "base_url" not in source:
|
|
raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")
|
|
|
|
# Optional but recommended fields
|
|
if "selectors" not in source:
|
|
logger.warning(
|
|
f"Source {index} (documentation): No 'selectors' specified, using defaults"
|
|
)
|
|
|
|
if "max_pages" in source and not isinstance(source["max_pages"], int):
|
|
raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")
|
|
|
|
def _validate_github_source(self, source: dict[str, Any], index: int):
|
|
"""Validate GitHub source configuration."""
|
|
if "repo" not in source:
|
|
raise ValueError(f"Source {index} (github): Missing required field 'repo'")
|
|
|
|
# Validate repo format (owner/repo)
|
|
repo = source["repo"]
|
|
if "/" not in repo:
|
|
raise ValueError(
|
|
f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
|
|
)
|
|
|
|
# Validate code_analysis_depth if specified
|
|
if "code_analysis_depth" in source:
|
|
depth = source["code_analysis_depth"]
|
|
if depth not in self.VALID_DEPTH_LEVELS:
|
|
raise ValueError(
|
|
f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
|
|
f"Must be one of {self.VALID_DEPTH_LEVELS}"
|
|
)
|
|
|
|
# Validate max_issues if specified
|
|
if "max_issues" in source and not isinstance(source["max_issues"], int):
|
|
raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")
|
|
|
|
# Validate enable_codebase_analysis if specified (C3.5)
|
|
if "enable_codebase_analysis" in source and not isinstance(
|
|
source["enable_codebase_analysis"], bool
|
|
):
|
|
raise ValueError(
|
|
f"Source {index} (github): 'enable_codebase_analysis' must be a boolean"
|
|
)
|
|
|
|
# Validate ai_mode if specified (C3.5)
|
|
if "ai_mode" in source:
|
|
ai_mode = source["ai_mode"]
|
|
if ai_mode not in self.VALID_AI_MODES:
|
|
raise ValueError(
|
|
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
|
|
)
|
|
|
|
def _validate_pdf_source(self, source: dict[str, Any], index: int):
|
|
"""Validate PDF source configuration."""
|
|
if "path" not in source:
|
|
raise ValueError(f"Source {index} (pdf): Missing required field 'path'")
|
|
|
|
# Check if file exists
|
|
pdf_path = source["path"]
|
|
if not Path(pdf_path).exists():
|
|
logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")
|
|
|
|
def _validate_local_source(self, source: dict[str, Any], index: int):
|
|
"""Validate local codebase source configuration."""
|
|
if "path" not in source:
|
|
raise ValueError(f"Source {index} (local): Missing required field 'path'")
|
|
|
|
# Check if directory exists
|
|
local_path = source["path"]
|
|
if not Path(local_path).exists():
|
|
logger.warning(f"Source {index} (local): Directory not found: {local_path}")
|
|
elif not Path(local_path).is_dir():
|
|
raise ValueError(f"Source {index} (local): Path is not a directory: {local_path}")
|
|
|
|
# Validate analysis_depth if provided
|
|
if "analysis_depth" in source:
|
|
depth = source["analysis_depth"]
|
|
if depth not in self.VALID_DEPTH_LEVELS:
|
|
raise ValueError(
|
|
f"Source {index} (local): Invalid analysis_depth '{depth}'. Must be one of {self.VALID_DEPTH_LEVELS}"
|
|
)
|
|
|
|
# Validate ai_mode if provided
|
|
if "ai_mode" in source:
|
|
ai_mode = source["ai_mode"]
|
|
if ai_mode not in self.VALID_AI_MODES:
|
|
raise ValueError(
|
|
f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
|
|
)
|
|
|
|
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
|
|
"""
|
|
Get all sources of a specific type.
|
|
|
|
Args:
|
|
source_type: 'documentation', 'github', 'pdf', or 'local'
|
|
|
|
Returns:
|
|
List of sources matching the type
|
|
"""
|
|
sources = self.config["sources"]
|
|
return [s for s in sources if s.get("type") == source_type]
|
|
|
|
def has_multiple_sources(self) -> bool:
|
|
"""Check if config has multiple sources (requires merging)."""
|
|
return len(self.config["sources"]) > 1
|
|
|
|
def needs_api_merge(self) -> bool:
|
|
"""
|
|
Check if config needs API merging.
|
|
|
|
Returns True if both documentation and github sources exist
|
|
with API extraction enabled.
|
|
"""
|
|
if not self.has_multiple_sources():
|
|
return False
|
|
|
|
has_docs_api = any(
|
|
s.get("type") == "documentation" and s.get("extract_api", True)
|
|
for s in self.config["sources"]
|
|
)
|
|
|
|
has_github_code = any(
|
|
s.get("type") == "github" and s.get("include_code", False)
|
|
for s in self.config["sources"]
|
|
)
|
|
|
|
return has_docs_api and has_github_code
|
|
|
|
|
|
def validate_config(config_path: str) -> ConfigValidator:
    """
    Load a config file, validate it, and hand back the validator.

    Args:
        config_path: Path to config JSON file; passed unchanged to
            ConfigValidator.

    Returns:
        The ConfigValidator instance whose validate() call succeeded

    Raises:
        ValueError if the config cannot be loaded or is invalid
    """
    checked = ConfigValidator(config_path)
    # validate() raises on failure; its True return value carries no extra information
    checked.validate()
    return checked
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: validate the config file named by the first
    # argument, print a short summary, and exit with status 1 on bad usage or
    # an invalid config.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python config_validator.py <config.json>")
        sys.exit(1)

    config_file = sys.argv[1]

    try:
        checked = validate_config(config_file)

        print("\n✅ Config valid!")
        print(f" Name: {checked.config.get('name')}")

        entries = checked.config["sources"]
        print(f" Sources: {len(entries)}")
        # Number the sources from 1 for display
        for position, entry in enumerate(entries, start=1):
            print(f" {position}. {entry['type']}")

        if checked.needs_api_merge():
            mode = checked.config.get("merge_mode", "rule-based")
            print(f" ⚠️ API merge required (mode: {mode})")

    except ValueError as err:
        print(f"\n❌ Config invalid: {err}")
        sys.exit(1)
|