diff --git a/pyproject.toml b/pyproject.toml index 0729726..30555cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ dependencies = [ "anthropic>=0.76.0", # Required for AI enhancement (core feature) "PyMuPDF>=1.24.14", "Pillow>=11.0.0", - "pytesseract>=0.3.13", "pydantic>=2.12.3", "pydantic-settings>=2.11.0", "python-dotenv>=1.1.1", @@ -129,6 +128,7 @@ video-full = [ "scenedetect[opencv]>=0.6.4", "easyocr>=1.7.0", "opencv-python-headless>=4.9.0", + "pytesseract>=0.3.13", ] # RAG vector database upload support @@ -172,6 +172,8 @@ embedding = [ ] # All optional dependencies combined (dev dependencies now in [dependency-groups]) +# Note: video-full deps (opencv, easyocr, faster-whisper) excluded due to heavy +# native dependencies. Install separately: pip install skill-seekers[video-full] all = [ "mammoth>=1.6.0", "python-docx>=1.1.0", diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py index 99737ff..ef1d6e0 100644 --- a/src/skill_seekers/cli/arguments/create.py +++ b/src/skill_seekers/cli/arguments/create.py @@ -488,6 +488,13 @@ VIDEO_ARGUMENTS: dict[str, dict[str, Any]] = { "metavar": "THRESH", }, }, + "vision_ocr": { + "flags": ("--vision-ocr",), + "kwargs": { + "action": "store_true", + "help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)", + }, + }, } # Multi-source config specific (from unified_scraper.py) diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py index 22c8b21..cff35d2 100644 --- a/src/skill_seekers/cli/create_command.py +++ b/src/skill_seekers/cli/create_command.py @@ -360,8 +360,12 @@ class CreateCommand: # Add video source (URL or file) parsed = self.source_info.parsed + video_playlist = getattr(self.args, "video_playlist", None) if parsed.get("source_kind") == "file": argv.extend(["--video-file", parsed["file_path"]]) + elif video_playlist: + # Explicit --video-playlist flag 
takes precedence + argv.extend(["--playlist", video_playlist]) elif parsed.get("url"): url = parsed["url"] # Detect playlist vs single video @@ -374,11 +378,15 @@ class CreateCommand: self._add_common_args(argv) # Add video-specific arguments - video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None) + video_langs = getattr(self.args, "video_languages", None) or getattr( + self.args, "languages", None + ) if video_langs: argv.extend(["--languages", video_langs]) if getattr(self.args, "visual", False): argv.append("--visual") + if getattr(self.args, "vision_ocr", False): + argv.append("--vision-ocr") if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base": argv.extend(["--whisper-model", self.args.whisper_model]) vi = getattr(self.args, "visual_interval", None) diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py index c7980cf..83c1fd7 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -12,6 +12,8 @@ Commands: scrape Scrape documentation website github Scrape GitHub repository pdf Extract from PDF file + word Extract from Word (.docx) file + video Extract from video (YouTube or local) unified Multi-source scraping (docs + GitHub + PDF) analyze Analyze local codebase and extract code knowledge enhance AI-powered enhancement (auto: API or LOCAL mode) diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index 6cfc9cf..81f7ed3 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -79,7 +79,14 @@ class UnifiedScraper: } # Track source index for unique naming (multi-source support) - self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0} + self._source_counters = { + "documentation": 0, + "github": 0, + "pdf": 0, + "word": 0, + "video": 0, + "local": 0, + } # Output paths - cleaner organization self.name = 
self.config["name"] @@ -583,8 +590,12 @@ class UnifiedScraper: """Scrape video source (YouTube, local file, etc.).""" try: from skill_seekers.cli.video_scraper import VideoToSkillConverter - except ImportError: - logger.error("video_scraper.py not found") + except ImportError as e: + logger.error( + f"Video scraper dependencies not installed: {e}\n" + " Install with: pip install skill-seekers[video]\n" + " For visual extraction (frame analysis, OCR): pip install skill-seekers[video-full]" + ) return # Multi-source support: Get unique index for this video source @@ -630,8 +641,7 @@ class UnifiedScraper: logger.info("✅ Video: Standalone SKILL.md created") logger.info( - f"✅ Video: {len(result.videos)} videos, " - f"{result.total_segments} segments extracted" + f"✅ Video: {len(result.videos)} videos, {result.total_segments} segments extracted" ) except Exception as e: logger.error(f"Failed to process video source: {e}") diff --git a/src/skill_seekers/cli/video_models.py b/src/skill_seekers/cli/video_models.py index 3da19c7..de15a04 100644 --- a/src/skill_seekers/cli/video_models.py +++ b/src/skill_seekers/cli/video_models.py @@ -222,6 +222,7 @@ class FrameSubSection: ocr_regions: list[OCRRegion] = field(default_factory=list) ocr_confidence: float = 0.0 panel_id: str = "" # e.g. 
"panel_0_0" (row_col) + _vision_used: bool = False # Whether Vision API was used for OCR def to_dict(self) -> dict: return { diff --git a/src/skill_seekers/cli/video_scraper.py b/src/skill_seekers/cli/video_scraper.py index eca06a2..d0aff1b 100644 --- a/src/skill_seekers/cli/video_scraper.py +++ b/src/skill_seekers/cli/video_scraper.py @@ -469,7 +469,12 @@ class VideoToSkillConverter: # Generate reference files for each video for video in self.result.videos: - ref_filename = f"video_{_sanitize_filename(video.title)}.md" + sanitized = ( + _sanitize_filename(video.title) + or video.video_id + or f"video_{hash(video.title) % 10000:04d}" + ) + ref_filename = f"video_{sanitized}.md" ref_path = os.path.join(refs_dir, ref_filename) ref_content = self._generate_reference_md(video) with open(ref_path, "w", encoding="utf-8") as f: @@ -750,7 +755,12 @@ class VideoToSkillConverter: preview += "..." lines.append(f"{preview}\n") - ref_filename = f"video_{_sanitize_filename(video.title)}.md" + sanitized = ( + _sanitize_filename(video.title) + or video.video_id + or f"video_{hash(video.title) % 10000:04d}" + ) + ref_filename = f"video_{sanitized}.md" lines.append( f"> Full transcript: [references/{ref_filename}](references/{ref_filename})\n" ) @@ -766,7 +776,12 @@ class VideoToSkillConverter: # References lines.append("## References\n") for video in self.result.videos: - ref_filename = f"video_{_sanitize_filename(video.title)}.md" + sanitized = ( + _sanitize_filename(video.title) + or video.video_id + or f"video_{hash(video.title) % 10000:04d}" + ) + ref_filename = f"video_{sanitized}.md" lines.append(f"- [{video.title}](references/{ref_filename})") return "\n".join(lines) @@ -940,11 +955,25 @@ def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None: if api_key: enhance_cmd.extend(["--api-key", api_key]) - result = subprocess.run(enhance_cmd, check=True) - if result.returncode == 0: - logger.info("✅ Video skill enhancement complete!") - except 
subprocess.CalledProcessError: - logger.warning("⚠ Enhancement failed, but skill was still built") + logger.info( + "Starting video skill enhancement (this may take 10+ minutes " + "for large videos with AI enhancement)..." + ) + subprocess.run(enhance_cmd, check=True, timeout=1800) + logger.info("Video skill enhancement complete!") + except subprocess.TimeoutExpired: + logger.warning( + "⚠ Enhancement timed out after 30 minutes. " + "The skill was still built without enhancement. " + "You can retry manually with:\n" + f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}" + ) + except subprocess.CalledProcessError as exc: + logger.warning( + f"⚠ Enhancement failed (exit code {exc.returncode}), " + "but skill was still built. You can retry manually with:\n" + f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}" + ) except FileNotFoundError: logger.warning("⚠ skill-seekers-enhance not found. Run manually:") logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}") diff --git a/src/skill_seekers/cli/video_transcript.py b/src/skill_seekers/cli/video_transcript.py index c527fac..04a9319 100644 --- a/src/skill_seekers/cli/video_transcript.py +++ b/src/skill_seekers/cli/video_transcript.py @@ -70,10 +70,36 @@ def extract_youtube_transcript( try: ytt_api = YouTubeTranscriptApi() - transcript = ytt_api.fetch(video_id, languages=languages) + + # Use list_transcripts to detect whether the transcript is auto-generated + source = TranscriptSource.YOUTUBE_MANUAL + try: + transcript_list = ytt_api.list(video_id) + # Prefer manually created transcripts; fall back to auto-generated + try: + transcript_entry = transcript_list.find_manually_created_transcript(languages) + source = TranscriptSource.YOUTUBE_MANUAL + except Exception: + try: + transcript_entry = transcript_list.find_generated_transcript(languages) + source = TranscriptSource.YOUTUBE_AUTO + except Exception: + # Fall back to any available transcript + 
transcript_entry = transcript_list.find_transcript(languages) + source = ( + TranscriptSource.YOUTUBE_AUTO + if transcript_entry.is_generated + else TranscriptSource.YOUTUBE_MANUAL + ) + transcript = transcript_entry.fetch() + except Exception: + # Fall back to direct fetch if list fails (older API versions) + transcript = ytt_api.fetch(video_id, languages=languages) + # Check is_generated on the FetchedTranscript if available + if getattr(transcript, "is_generated", False): + source = TranscriptSource.YOUTUBE_AUTO segments = [] - source = TranscriptSource.YOUTUBE_MANUAL for snippet in transcript.snippets: text = snippet.text.strip() if not text: diff --git a/src/skill_seekers/cli/video_visual.py b/src/skill_seekers/cli/video_visual.py index be441ce..f8edd98 100644 --- a/src/skill_seekers/cli/video_visual.py +++ b/src/skill_seekers/cli/video_visual.py @@ -1864,7 +1864,7 @@ def _ocr_single_panel( panel_id=f"panel_{row}_{col}", ) # Stash vision_used flag for the caller to count - ss._vision_used = vision_used # type: ignore[attr-defined] + ss._vision_used = vision_used return ss @@ -1918,7 +1918,7 @@ def extract_visual_data( cap = cv2.VideoCapture(video_path) if not cap.isOpened(): logger.error(f"Cannot open video: {video_path}") - return [], [] + return [], [], None fps = cap.get(cv2.CAP_PROP_FPS) or 30.0 total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT) @@ -2003,7 +2003,7 @@ def extract_visual_data( for fut in concurrent.futures.as_completed(futures): ss = fut.result() if ss is not None: - if getattr(ss, "_vision_used", False): + if ss._vision_used: vision_api_frames += 1 sub_sections.append(ss) else: @@ -2018,7 +2018,7 @@ def extract_visual_data( use_vision_api, ) if ss is not None: - if getattr(ss, "_vision_used", False): + if ss._vision_used: vision_api_frames += 1 sub_sections.append(ss) diff --git a/src/skill_seekers/mcp/server_fastmcp.py b/src/skill_seekers/mcp/server_fastmcp.py index 5e8a581..4f6308e 100644 --- a/src/skill_seekers/mcp/server_fastmcp.py +++ 
b/src/skill_seekers/mcp/server_fastmcp.py @@ -3,20 +3,21 @@ Skill Seeker MCP Server (FastMCP Implementation) Modern, decorator-based MCP server using FastMCP for simplified tool registration. -Provides 25 tools for generating Claude AI skills from documentation. +Provides 33 tools for generating Claude AI skills from documentation. This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction). All tool implementations are delegated to modular tool files in tools/ directory. **Architecture:** - FastMCP server with decorator-based tool registration -- 25 tools organized into 6 categories: +- 33 tools organized into 7 categories: * Config tools (3): generate_config, list_configs, validate_config - * Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns + * Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns * Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill * Splitting tools (2): split_config, generate_router - * Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source + * Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source * Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant + * Workflow tools (5): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow **Usage:** # Stdio transport (default, backward compatible) @@ -140,6 +141,7 @@ except ImportError: scrape_docs_impl, scrape_github_impl, scrape_pdf_impl, + scrape_video_impl, split_config_impl, submit_config_impl, upload_skill_impl, @@ -250,7 +252,7 @@ async def validate_config(config_path: str) -> str: # 
============================================================================ -# SCRAPING TOOLS (4 tools) +# SCRAPING TOOLS (10 tools) # ============================================================================ @@ -432,6 +434,12 @@ async def scrape_video( description: str | None = None, languages: str | None = None, from_json: str | None = None, + visual: bool = False, + whisper_model: str | None = None, + visual_interval: float | None = None, + visual_min_gap: float | None = None, + visual_similarity: float | None = None, + vision_ocr: bool = False, ) -> str: """ Scrape video content and build Claude skill. @@ -444,6 +452,12 @@ async def scrape_video( description: Skill description languages: Transcript language preferences (comma-separated) from_json: Build from extracted JSON file + visual: Enable visual frame extraction (requires video-full extras) + whisper_model: Whisper model size for local transcription (e.g., base, small, medium, large) + visual_interval: Seconds between frame captures (default: 5.0) + visual_min_gap: Minimum seconds between kept frames (default: 2.0) + visual_similarity: Similarity threshold to skip duplicate frames 0.0-1.0 (default: 0.95) + vision_ocr: Use Claude Vision API as fallback OCR for low-confidence code frames (requires ANTHROPIC_API_KEY; billed per frame, ~$0.004/frame) Returns: Video scraping results with file paths. 
@@ -463,6 +477,18 @@ async def scrape_video( args["languages"] = languages if from_json: args["from_json"] = from_json + if visual: + args["visual"] = visual + if whisper_model: + args["whisper_model"] = whisper_model + if visual_interval is not None: + args["visual_interval"] = visual_interval + if visual_min_gap is not None: + args["visual_min_gap"] = visual_min_gap + if visual_similarity is not None: + args["visual_similarity"] = visual_similarity + if vision_ocr: + args["vision_ocr"] = vision_ocr result = await scrape_video_impl(args) if isinstance(result, list) and result: diff --git a/src/skill_seekers/mcp/tools/scraping_tools.py b/src/skill_seekers/mcp/tools/scraping_tools.py index a9c083c..5cea4e7 100644 --- a/src/skill_seekers/mcp/tools/scraping_tools.py +++ b/src/skill_seekers/mcp/tools/scraping_tools.py @@ -372,6 +372,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]: - description (str, optional): Skill description - languages (str, optional): Language preferences (comma-separated) - from_json (str, optional): Build from extracted JSON file + - visual (bool, optional): Enable visual frame extraction (default: False) + - whisper_model (str, optional): Whisper model size (default: base) + - visual_interval (float, optional): Seconds between frame captures (default: 5.0) + - visual_min_gap (float, optional): Minimum seconds between kept frames (default: 2.0) + - visual_similarity (float, optional): Similarity threshold to skip duplicate frames (default: 0.95) + - vision_ocr (bool, optional): Use Claude Vision API as fallback OCR for low-confidence code frames; requires ANTHROPIC_API_KEY, ~$0.004/frame (default: False) Returns: List[TextContent]: Tool execution results @@ -383,6 +389,12 @@ async def scrape_video_tool(args: dict) -> list[TextContent]: description = args.get("description") languages = args.get("languages") from_json = args.get("from_json") + visual = args.get("visual", False) + whisper_model = args.get("whisper_model") + visual_interval = args.get("visual_interval") + visual_min_gap = 
args.get("visual_min_gap") + visual_similarity = args.get("visual_similarity") + vision_ocr = args.get("vision_ocr", False) # Build command cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")] @@ -415,6 +427,20 @@ async def scrape_video_tool(args: dict) -> list[TextContent]: ) ] + # Visual extraction parameters + if visual: + cmd.append("--visual") + if whisper_model: + cmd.extend(["--whisper-model", whisper_model]) + if visual_interval is not None: + cmd.extend(["--visual-interval", str(visual_interval)]) + if visual_min_gap is not None: + cmd.extend(["--visual-min-gap", str(visual_min_gap)]) + if visual_similarity is not None: + cmd.extend(["--visual-similarity", str(visual_similarity)]) + if vision_ocr: + cmd.append("--vision-ocr") + # Run video_scraper.py with streaming timeout = 600 # 10 minutes for video extraction diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py index 57fafef..8d240d1 100644 --- a/tests/test_cli_parsers.py +++ b/tests/test_cli_parsers.py @@ -24,12 +24,12 @@ class TestParserRegistry: def test_all_parsers_registered(self): """Test that all parsers are registered.""" - assert len(PARSERS) == 22, f"Expected 22 parsers, got {len(PARSERS)}" + assert len(PARSERS) == 23, f"Expected 23 parsers, got {len(PARSERS)}" def test_get_parser_names(self): """Test getting list of parser names.""" names = get_parser_names() - assert len(names) == 22 + assert len(names) == 23 assert "scrape" in names assert "github" in names assert "package" in names @@ -37,6 +37,7 @@ class TestParserRegistry: assert "analyze" in names assert "config" in names assert "workflows" in names + assert "video" in names def test_all_parsers_are_subcommand_parsers(self): """Test that all parsers inherit from SubcommandParser.""" @@ -242,9 +243,9 @@ class TestBackwardCompatibility: assert cmd in names, f"Command '{cmd}' not found in parser registry!" 
def test_command_count_matches(self): - """Test that we have exactly 22 commands (includes new create, workflows, and word commands).""" - assert len(PARSERS) == 22 - assert len(get_parser_names()) == 22 + """Test that we have exactly 23 commands (includes create, workflows, word, and video commands).""" + assert len(PARSERS) == 23 + assert len(get_parser_names()) == 23 if __name__ == "__main__": diff --git a/uv.lock b/uv.lock index 967c3d3..55d4cab 100644 --- a/uv.lock +++ b/uv.lock @@ -5983,7 +5983,6 @@ dependencies = [ { name = "pygithub" }, { name = "pygments" }, { name = "pymupdf" }, - { name = "pytesseract" }, { name = "python-dotenv" }, { name = "pyyaml" }, { name = "requests" }, @@ -6084,6 +6083,7 @@ video-full = [ { name = "easyocr" }, { name = "faster-whisper" }, { name = "opencv-python-headless" }, + { name = "pytesseract" }, { name = "scenedetect", extra = ["opencv"] }, { name = "youtube-transcript-api" }, { name = "yt-dlp" }, @@ -6164,7 +6164,7 @@ requires-dist = [ { name = "pygithub", specifier = ">=2.5.0" }, { name = "pygments", specifier = ">=2.19.2" }, { name = "pymupdf", specifier = ">=1.24.14" }, - { name = "pytesseract", specifier = ">=0.3.13" }, + { name = "pytesseract", marker = "extra == 'video-full'", specifier = ">=0.3.13" }, { name = "python-docx", marker = "extra == 'all'", specifier = ">=1.1.0" }, { name = "python-docx", marker = "extra == 'docx'", specifier = ">=1.1.0" }, { name = "python-dotenv", specifier = ">=1.1.1" },