Files
skill-seekers-reference/src/skill_seekers/cli/config_validator.py
YusufKaraaslanSpyke 62071c4aa9 feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement
Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 23:10:19 +03:00

341 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Unified Config Validator
Validates unified config format that supports multiple sources:
- documentation (website scraping)
- github (repository scraping)
- pdf (PDF document scraping)
- local (local codebase analysis)
- word and video (accepted source types; no type-specific checks are applied here)
Legacy config format support removed in v2.11.0.
All configs must use unified format with 'sources' array.
"""
import json
import logging
from pathlib import Path
from typing import Any
# Configure root logging at INFO level when this module is imported
# (basicConfig does nothing if the root logger already has handlers).
# NOTE(review): this runs for every importer, not only when the file is
# executed as a script — confirm that the import-time side effect is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by ConfigValidator for progress and warning messages.
logger = logging.getLogger(__name__)
class ConfigValidator:
"""
Validates unified config format (legacy support removed in v2.11.0).
"""
# Valid source types
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
# Valid merge modes
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
# Valid code analysis depth levels
VALID_DEPTH_LEVELS = {"surface", "deep", "full"}
# Valid AI modes for C3.x enhancement
VALID_AI_MODES = {"auto", "api", "local", "none"}
def __init__(self, config_or_path: dict[str, Any] | str):
"""
Initialize validator with config dict or file path.
Args:
config_or_path: Either a config dict or path to config JSON file
"""
if isinstance(config_or_path, dict):
self.config_path = None
self.config = config_or_path
else:
self.config_path = config_or_path
self.config = self._load_config()
self.is_unified = True # Always unified format now
def _load_config(self) -> dict[str, Any]:
"""Load JSON config file."""
try:
with open(self.config_path, encoding="utf-8") as f:
return json.load(f)
except FileNotFoundError as e:
raise ValueError(f"Config file not found: {self.config_path}") from e
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in config file: {e}") from e
def validate(self) -> bool:
"""
Validate unified config format.
Returns:
True if valid
Raises:
ValueError if invalid with detailed error message
"""
# Check if legacy format (no sources array)
if "sources" not in self.config:
raise ValueError(
"\n❌ LEGACY CONFIG FORMAT DETECTED\n\n"
" Legacy config format was removed in v2.11.0.\n"
" All configs must now use unified format with 'sources' array.\n\n"
" OLD FORMAT (removed):\n"
" {\n"
' "name": "example",\n'
' "base_url": "https://..."\n'
" }\n\n"
" NEW FORMAT (required):\n"
" {\n"
' "name": "example",\n'
' "description": "...",\n'
' "sources": [\n'
" {\n"
' "type": "documentation",\n'
' "base_url": "https://..."\n'
" }\n"
" ]\n"
" }\n\n"
" 📖 See: https://skillseekersweb.com/docs/config-format\n"
)
return self._validate_unified()
def _validate_unified(self) -> bool:
"""Validate unified config format."""
logger.info("Validating unified config format...")
# Required top-level fields
if "name" not in self.config:
raise ValueError("Missing required field: 'name'")
if "description" not in self.config:
raise ValueError("Missing required field: 'description'")
if "sources" not in self.config:
raise ValueError("Missing required field: 'sources'")
# Validate sources array
sources = self.config["sources"]
if not isinstance(sources, list):
raise ValueError("'sources' must be an array")
if len(sources) == 0:
raise ValueError("'sources' array cannot be empty")
# Validate merge_mode (optional)
merge_mode = self.config.get("merge_mode", "rule-based")
if merge_mode not in self.VALID_MERGE_MODES:
raise ValueError(
f"Invalid merge_mode: '{merge_mode}'. Must be one of {self.VALID_MERGE_MODES}"
)
# Validate each source
for i, source in enumerate(sources):
self._validate_source(source, i)
logger.info(f"✅ Unified config valid: {len(sources)} sources")
return True
def _validate_source(self, source: dict[str, Any], index: int):
"""Validate individual source configuration."""
# Check source has 'type' field
if "type" not in source:
raise ValueError(f"Source {index}: Missing required field 'type'")
source_type = source["type"]
if source_type not in self.VALID_SOURCE_TYPES:
raise ValueError(
f"Source {index}: Invalid type '{source_type}'. Must be one of {self.VALID_SOURCE_TYPES}"
)
# Type-specific validation
if source_type == "documentation":
self._validate_documentation_source(source, index)
elif source_type == "github":
self._validate_github_source(source, index)
elif source_type == "pdf":
self._validate_pdf_source(source, index)
elif source_type == "local":
self._validate_local_source(source, index)
def _validate_documentation_source(self, source: dict[str, Any], index: int):
"""Validate documentation source configuration."""
if "base_url" not in source:
raise ValueError(f"Source {index} (documentation): Missing required field 'base_url'")
# Optional but recommended fields
if "selectors" not in source:
logger.warning(
f"Source {index} (documentation): No 'selectors' specified, using defaults"
)
if "max_pages" in source and not isinstance(source["max_pages"], int):
raise ValueError(f"Source {index} (documentation): 'max_pages' must be an integer")
def _validate_github_source(self, source: dict[str, Any], index: int):
"""Validate GitHub source configuration."""
if "repo" not in source:
raise ValueError(f"Source {index} (github): Missing required field 'repo'")
# Validate repo format (owner/repo)
repo = source["repo"]
if "/" not in repo:
raise ValueError(
f"Source {index} (github): Invalid repo format '{repo}'. Must be 'owner/repo' (e.g., 'facebook/react')"
)
# Validate code_analysis_depth if specified
if "code_analysis_depth" in source:
depth = source["code_analysis_depth"]
if depth not in self.VALID_DEPTH_LEVELS:
raise ValueError(
f"Source {index} (github): Invalid code_analysis_depth '{depth}'. "
f"Must be one of {self.VALID_DEPTH_LEVELS}"
)
# Validate max_issues if specified
if "max_issues" in source and not isinstance(source["max_issues"], int):
raise ValueError(f"Source {index} (github): 'max_issues' must be an integer")
# Validate enable_codebase_analysis if specified (C3.5)
if "enable_codebase_analysis" in source and not isinstance(
source["enable_codebase_analysis"], bool
):
raise ValueError(
f"Source {index} (github): 'enable_codebase_analysis' must be a boolean"
)
# Validate ai_mode if specified (C3.5)
if "ai_mode" in source:
ai_mode = source["ai_mode"]
if ai_mode not in self.VALID_AI_MODES:
raise ValueError(
f"Source {index} (github): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
)
def _validate_pdf_source(self, source: dict[str, Any], index: int):
"""Validate PDF source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (pdf): Missing required field 'path'")
# Check if file exists
pdf_path = source["path"]
if not Path(pdf_path).exists():
logger.warning(f"Source {index} (pdf): File not found: {pdf_path}")
def _validate_local_source(self, source: dict[str, Any], index: int):
"""Validate local codebase source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (local): Missing required field 'path'")
# Check if directory exists
local_path = source["path"]
if not Path(local_path).exists():
logger.warning(f"Source {index} (local): Directory not found: {local_path}")
elif not Path(local_path).is_dir():
raise ValueError(f"Source {index} (local): Path is not a directory: {local_path}")
# Validate analysis_depth if provided
if "analysis_depth" in source:
depth = source["analysis_depth"]
if depth not in self.VALID_DEPTH_LEVELS:
raise ValueError(
f"Source {index} (local): Invalid analysis_depth '{depth}'. Must be one of {self.VALID_DEPTH_LEVELS}"
)
# Validate ai_mode if provided
if "ai_mode" in source:
ai_mode = source["ai_mode"]
if ai_mode not in self.VALID_AI_MODES:
raise ValueError(
f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
)
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
"""
Get all sources of a specific type.
Args:
source_type: 'documentation', 'github', 'pdf', or 'local'
Returns:
List of sources matching the type
"""
sources = self.config["sources"]
return [s for s in sources if s.get("type") == source_type]
def has_multiple_sources(self) -> bool:
"""Check if config has multiple sources (requires merging)."""
return len(self.config["sources"]) > 1
def needs_api_merge(self) -> bool:
"""
Check if config needs API merging.
Returns True if both documentation and github sources exist
with API extraction enabled.
"""
if not self.has_multiple_sources():
return False
has_docs_api = any(
s.get("type") == "documentation" and s.get("extract_api", True)
for s in self.config["sources"]
)
has_github_code = any(
s.get("type") == "github" and s.get("include_code", False)
for s in self.config["sources"]
)
return has_docs_api and has_github_code
def validate_config(config_path: str) -> ConfigValidator:
    """
    Build a ConfigValidator for a config file and run its validation.

    Args:
        config_path: Path to config JSON file

    Returns:
        The ConfigValidator instance, after validate() has succeeded

    Raises:
        ValueError if the config cannot be loaded or is invalid
    """
    checked = ConfigValidator(config_path)
    # validate() raises on failure, so reaching the return means the config passed.
    checked.validate()
    return checked
if __name__ == "__main__":
    import sys

    # The config path is the single required command-line argument.
    if len(sys.argv) < 2:
        print("Usage: python config_validator.py <config.json>")
        sys.exit(1)

    config_file = sys.argv[1]
    try:
        checked = validate_config(config_file)

        # Summarise the validated config: name, source count, one line per source.
        print("\n✅ Config valid!")
        print(f" Name: {checked.config.get('name')}")
        source_list = checked.config["sources"]
        print(f" Sources: {len(source_list)}")
        for position, entry in enumerate(source_list, start=1):
            print(f" {position}. {entry['type']}")

        # Mention the merge mode only when an API merge is needed.
        if checked.needs_api_merge():
            merge_mode = checked.config.get("merge_mode", "rule-based")
            print(f" ⚠️ API merge required (mode: {merge_mode})")
    except ValueError as e:
        print(f"\n❌ Config invalid: {e}")
        sys.exit(1)