feat: video pipeline OCR quality fixes + two-pass AI enhancement

- Skip OCR on WEBCAM/OTHER frames (eliminates ~64 junk results per video)
- Add _clean_ocr_line() to strip line numbers, IDE decorations, and collapse markers
- Add _fix_intra_line_duplication() for multi-engine OCR overlap artifacts
- Add _is_likely_code() filter to prevent UI junk in reference code fences
- Add language detection to get_text_groups() via LanguageDetector
- Apply OCR cleaning in the _assemble_structured_text() pipeline
- Add two-pass AI enhancement: Pass 1 cleans the reference Code Timeline using transcript context; Pass 2 generates SKILL.md from the cleaned refs
- Update video-tutorial.yaml prompts for pre-cleaned references
- Add 17 new tests (197 total video tests); 2540 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 21:48:21 +03:00
parent bb54b3f7b6
commit d19ad7d820
6 changed files with 489 additions and 23 deletions
--- a/src/skill_seekers/cli/video_scraper.py
+++ b/src/skill_seekers/cli/video_scraper.py
@@ -233,6 +233,86 @@ def _build_audio_visual_alignments(
    return alignments


+# =============================================================================
+# OCR Quality Filters
+# =============================================================================
+
+
+# Keywords are anchored with \b on both sides: without the leading anchor,
+# everyday words that merely END in a keyword ("avoid", "tablet", "outlet",
+# "republic") were counted as code tokens.
+_RE_CODE_TOKENS = re.compile(
+    r"[=(){};]|\b(?:def|class|function|import|return|var|let|const|public|private|void|static|override|virtual|protected)\b"
+)
+# UI labels whose matches are weighed against the code-token count.
+_RE_UI_PATTERNS = re.compile(
+    r"\b(?:Inspector|Hierarchy|Project|Console|Image Type|Sorting Layer|Button|Canvas|Scene|Game)\b",
+    re.IGNORECASE,
+)
+
+
+def _is_likely_code(text: str | None) -> bool:
+    """Return True if text likely contains programming code, not UI junk.
+
+    The text counts as code when it has at least two code tokens (one of
+    ``= ( ) { } ;`` or a whole-word keyword such as ``def`` or ``public``)
+    and strictly more code tokens than UI-label matches.
+
+    Args:
+        text: OCR text to classify. ``None``, an empty string, or text with
+            fewer than 10 characters after stripping is never code.
+
+    Returns:
+        True when the token counts suggest code, False otherwise.
+    """
+    if not text or len(text.strip()) < 10:
+        return False
+    code_tokens = _RE_CODE_TOKENS.findall(text)
+    ui_patterns = _RE_UI_PATTERNS.findall(text)
+    return len(code_tokens) >= 2 and len(code_tokens) > len(ui_patterns)
+
+
+# =============================================================================
+# Two-Pass AI Reference Enhancement
+# =============================================================================
+
+
+def _ai_clean_reference(ref_path: str, content: str, api_key: str | None = None) -> None:
+    """Use AI to clean the Code Timeline section of a reference file in place.
+
+    Sends the reference file content to Claude with a focused prompt to
+    reconstruct the Code Timeline from noisy OCR + transcript context, then
+    overwrites ``ref_path`` with the response.
+
+    This is best-effort. The file is left untouched when the ``anthropic``
+    package is not installed, no API key is available, the request raises,
+    the reply was truncated at the token limit, or the reply is not longer
+    than half of the original content.
+
+    Args:
+        ref_path: Path of the reference file to overwrite with the cleaned text.
+        content: Current text of that file; embedded verbatim in the prompt.
+        api_key: Explicit API key. Falls back to the ``ANTHROPIC_API_KEY`` and
+            then ``ANTHROPIC_AUTH_TOKEN`` environment variables.
+    """
+    try:
+        import anthropic
+    except ImportError:
+        return
+
+    key = api_key or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")
+    if not key:
+        return
+
+    # An optional ANTHROPIC_BASE_URL redirects the client to another endpoint.
+    base_url = os.environ.get("ANTHROPIC_BASE_URL")
+    client_kwargs: dict = {"api_key": key}
+    if base_url:
+        client_kwargs["base_url"] = base_url
+
+    prompt = (
+        "You are cleaning a video tutorial reference file. The Code Timeline section "
+        "contains OCR-extracted code that is noisy (duplicated lines, garbled characters, "
+        "UI decorations mixed in). The transcript sections above provide context about "
+        "what the code SHOULD be.\n\n"
+        "Tasks:\n"
+        "1. Reconstruct each code block in the file using transcript context\n"
+        "2. Fix OCR errors (l/1, O/0, rn/m confusions)\n"
+        "3. Remove any UI text (Inspector, Hierarchy, button labels)\n"
+        "4. Set correct language tags on code fences\n"
+        "5. Keep the document structure but clean the code text\n\n"
+        "Return the COMPLETE reference file with cleaned code blocks. "
+        "Do NOT modify the transcript or metadata sections.\n\n"
+        f"Reference file:\n{content}"
+    )
+
+    try:
+        client = anthropic.Anthropic(**client_kwargs)
+        response = client.messages.create(
+            model="claude-sonnet-4-20250514",
+            max_tokens=8000,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        # The prompt asks for the COMPLETE file back, so a reply that was cut
+        # off at max_tokens would drop the tail of the reference while still
+        # being able to pass the length check below. Keep the original instead.
+        if getattr(response, "stop_reason", None) == "max_tokens":
+            logger.warning(
+                f"Skipping AI-cleaned reference {os.path.basename(ref_path)}: "
+                "response was truncated at max_tokens"
+            )
+            return
+        result = response.content[0].text
+        # Guard against replies that are far shorter than the input
+        # (e.g. a refusal or a summary instead of the full file).
+        if result and len(result) > len(content) * 0.5:
+            with open(ref_path, "w", encoding="utf-8") as f:
+                f.write(result)
+            logger.info(f"AI-cleaned reference: {os.path.basename(ref_path)}")
+    except Exception as e:
+        # Enhancement is optional; never let it abort the pipeline.
+        logger.debug(f"Reference enhancement failed: {e}")
+
+
 # =============================================================================
 # Main Converter Class
 # =============================================================================
@@ -675,6 +755,7 @@ class VideoToSkillConverter:
                            if (
                                ss.frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
                                and ss.ocr_text
+                                and _is_likely_code(ss.ocr_text)
                            ):
                                lines.append(f"\n```{lang_hint}")
                                lines.append(ss.ocr_text)
@@ -683,15 +764,16 @@ class VideoToSkillConverter:
                        from skill_seekers.cli.video_models import FrameType

                        if kf.frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
-                            lang_hint = ""
-                            if seg.detected_code_blocks:
-                                for cb in seg.detected_code_blocks:
-                                    if cb.language:
-                                        lang_hint = cb.language
-                                        break
-                            lines.append(f"\n```{lang_hint}")
-                            lines.append(kf.ocr_text)
-                            lines.append("```")
+                            if _is_likely_code(kf.ocr_text):
+                                lang_hint = ""
+                                if seg.detected_code_blocks:
+                                    for cb in seg.detected_code_blocks:
+                                        if cb.language:
+                                            lang_hint = cb.language
+                                            break
+                                lines.append(f"\n```{lang_hint}")
+                                lines.append(kf.ocr_text)
+                                lines.append("```")
                        elif kf.frame_type == FrameType.SLIDE:
                            for text_line in kf.ocr_text.split("\n"):
                                if text_line.strip():
@@ -779,6 +861,44 @@ class VideoToSkillConverter:

        return "\n".join(lines)

+    def _enhance_reference_files(self, enhance_level: int, args) -> None:
+        """Pass 1 of enhancement: AI-clean the reference files.
+
+        Runs before SKILL.md enhancement. Does nothing unless
+        ``enhance_level`` is at least 2 and an API key is available from
+        ``ANTHROPIC_API_KEY``, ``ANTHROPIC_AUTH_TOKEN`` or ``args.api_key``.
+        Each ``.md`` file under ``<skill_dir>/references`` that contains a
+        code fence is handed to ``_ai_clean_reference``.
+        """
+        cli_key = getattr(args, "api_key", None)
+        env_key = os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")
+        if enhance_level < 2 or not (env_key or cli_key):
+            return
+
+        refs_dir = os.path.join(self.skill_dir, "references")
+        if not os.path.isdir(refs_dir):
+            return
+
+        logger.info("\n📝 Pass 1: AI-cleaning reference files (Code Timeline reconstruction)...")
+
+        markdown_names = (name for name in sorted(os.listdir(refs_dir)) if name.endswith(".md"))
+        for name in markdown_names:
+            ref_path = os.path.join(refs_dir, name)
+            try:
+                with open(ref_path, encoding="utf-8") as fh:
+                    text = fh.read()
+            except OSError:
+                # Unreadable file: skip it and carry on with the rest.
+                continue
+
+            # Files without any code fence have nothing to reconstruct.
+            if "```" in text:
+                _ai_clean_reference(ref_path, text, cli_key)
+
    def _generate_skill_md(self) -> str:
        """Generate the main SKILL.md file."""
        lines = []
@@ -1044,11 +1164,14 @@ Examples:
    # Enhancement
    enhance_level = getattr(args, "enhance_level", 0)
    if enhance_level > 0:
+        # Pass 1: Clean reference files (Code Timeline reconstruction)
+        converter._enhance_reference_files(enhance_level, args)
+
        # Auto-inject video-tutorial workflow if no workflow specified
        if not getattr(args, "enhance_workflow", None):
            args.enhance_workflow = ["video-tutorial"]

-        # Run workflow stages (specialized video analysis)
+        # Pass 2: Run workflow stages (specialized video analysis)
        try:
            from skill_seekers.cli.workflow_runner import run_workflows

--- a/src/skill_seekers/cli/video_visual.py
+++ b/src/skill_seekers/cli/video_visual.py
@@ -16,6 +16,7 @@ import difflib
 import gc
 import logging
 import os
+import re
 import tempfile
 from dataclasses import dataclass, field

@@ -1126,6 +1127,92 @@ def _cluster_ocr_into_lines(
    return regions


+# ── OCR line cleaning ────────────────────────────────────────────────
+
+
+def _fuzzy_word_match(a: str, b: str) -> bool:
+    """Check if two words are likely the same despite OCR noise.
+
+    Two words match when any of the following holds:
+
+    * they are identical;
+    * they are identical after dropping ONE leading noise character
+      (e.g. 'gpublic' vs 'public') from either or both words;
+    * they are identical once the rn/m confusion is normalised
+      (e.g. 'return' vs 'retum');
+    * both have at least 3 characters and are within edit distance 1,
+      i.e. one substitution, insertion or deletion (covers l/1 and O/0).
+
+    The result is symmetric in ``a`` and ``b``.
+
+    Args:
+        a: First word.
+        b: Second word.
+
+    Returns:
+        True if the words are considered the same token, False otherwise.
+    """
+    if a == b:
+        return True
+
+    def _prefix_variants(word: str) -> tuple[str, ...]:
+        # Drop at most one leading noise char. str.lstrip() would remove a
+        # whole run of them and turn both 'illegal' and 'legal' into 'egal'.
+        if len(word) > 2 and word[0] in "gGjJlLiI|":
+            return (word, word[1:])
+        return (word,)
+
+    if any(x == y for x in _prefix_variants(a) for y in _prefix_variants(b)):
+        return True
+
+    # 'rn' and 'm' are read as each other; compare in a canonical form.
+    if a.replace("rn", "m") == b.replace("rn", "m"):
+        return True
+
+    shorter, longer = sorted((a, b), key=len)
+    # Very short words ('in' vs 'int', 'or' vs 'for') are distinct tokens
+    # that differ by one character, so fuzzy matching is disabled for them.
+    if len(shorter) < 3 or len(longer) - len(shorter) > 1:
+        return False
+
+    # Skip the common prefix, then allow exactly one edit at the first difference.
+    i = 0
+    while i < len(shorter) and shorter[i] == longer[i]:
+        i += 1
+    if len(shorter) == len(longer):
+        return shorter[i + 1 :] == longer[i + 1 :]  # one substitution
+    return shorter[i:] == longer[i + 1 :]  # one insertion/deletion
+
+
+def _fix_intra_line_duplication(line: str) -> str:
+    """Collapse a line whose leading token run was emitted twice by OCR.
+
+    Example: 'public class Card public class Card : MonoBehaviour'
+    becomes 'public class Card : MonoBehaviour'.
+
+    Candidate cut points are tried in a small window around the middle of
+    the line. At each cut the words before it are compared, in order and
+    with fuzzy matching, against the words after it. Lines with fewer than
+    four words, or with no qualifying cut, are returned unchanged.
+    """
+    tokens = line.split()
+    total = len(tokens)
+    if total < 4:
+        return line
+
+    mid = total // 2
+    first_cut = max(2, mid - 2)
+    stop_cut = min(total - 1, mid + 3)
+
+    for cut in range(first_cut, stop_cut):
+        head, tail = tokens[:cut], tokens[cut:]
+
+        # Length of the leading run where head[i] fuzzily equals tail[i].
+        run = 0
+        for left, right in zip(head, tail):
+            if not _fuzzy_word_match(left, right):
+                break
+            run += 1
+
+        # Need at least two matching words covering >= 70% of the head.
+        if run < 2 or run < len(head) * 0.7:
+            continue
+
+        head_text = " ".join(head)
+        tail_text = " ".join(tail)
+        # Prefer the tail on ties: it usually carries the trailing content.
+        return tail_text if len(tail_text) >= len(head_text) else head_text
+
+    return line
+
+
+# Compiled patterns for _clean_ocr_line
+_RE_LEADING_LINE_NUMBER = re.compile(r"^\s*\d{1,4}(?:\s+|\t)")
+_RE_COLLAPSE_MARKERS = re.compile(r"[▶▼►◄…⋯⋮]")
+# A line is only treated as UI chrome when, after the menu/panel name, it
+# contains none of the punctuation that marks real code. Otherwise lines such
+# as 'console.log("hi");', 'scene.add(mesh)' or 'file = open(path)' would be
+# deleted just because they start with a word that is also a UI label.
+_NO_CODE_PUNCT_TO_EOL = r"[^.=(){}\[\];:,<>\"']*$"
+_RE_IDE_TAB_BAR = re.compile(
+    r"^\s*(?:File|Edit|Assets|Window|Help|View|Tools|Debug|Run|Terminal)\s+"
+    + _NO_CODE_PUNCT_TO_EOL,
+    re.IGNORECASE,
+)
+_RE_UNITY_INSPECTOR = re.compile(
+    r"^\s*(?:Inspector|Hierarchy|Project|Console|Scene|Game)\b" + _NO_CODE_PUNCT_TO_EOL,
+    re.IGNORECASE,
+)
+
+
+def _clean_ocr_line(line: str) -> str:
+    """Remove IDE decorations and OCR artifacts from a single line.
+
+    Steps, in order:
+
+    1. Return "" for full-line UI chrome: a line that starts with a panel
+       name (Inspector, Hierarchy, ...) or a menu name followed by
+       whitespace (File, Edit, ...) and contains no code punctuation.
+    2. Strip a leading 1-4 digit line number.
+    3. Remove collapse/fold marker glyphs.
+    4. Collapse intra-line duplication.
+    5. Strip surrounding whitespace.
+
+    NOTE(review): step 5 also discards any indentation the caller prepended
+    before cleaning, as ``_assemble_structured_text`` does for code frames.
+    Confirm whether indentation is meant to survive cleaning.
+
+    Args:
+        line: One line of OCR text.
+
+    Returns:
+        The cleaned line, or "" when the whole line is UI chrome. A falsy
+        input is returned unchanged.
+    """
+    if not line:
+        return line
+    # Remove full-line UI chrome
+    if _RE_UNITY_INSPECTOR.match(line):
+        return ""
+    if _RE_IDE_TAB_BAR.match(line):
+        return ""
+    # Strip leading line numbers (e.g. '23  public class Card')
+    line = _RE_LEADING_LINE_NUMBER.sub("", line)
+    # Remove collapse markers / VS Code decorations
+    line = _RE_COLLAPSE_MARKERS.sub("", line)
+    # Fix intra-line duplication from multi-engine overlap
+    line = _fix_intra_line_duplication(line)
+    return line.strip()
+
+
 def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -> str:
    """Join OCR line regions into structured text.

@@ -1148,7 +1235,7 @@ def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -
            return ""
        # Estimate indentation from x-offset relative to leftmost region
        min_x = min(r.bbox[0] for r in regions)
-        lines = []
+        raw_lines = []
        for r in regions:
            indent_px = r.bbox[0] - min_x
            # Estimate character width from the region
@@ -1158,13 +1245,21 @@ def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -
            indent_chars = int(indent_px / max(char_width, 1))
            # Round to nearest 4-space indent
            indent_level = round(indent_chars / 4)
-            lines.append("    " * indent_level + r.text)
-        return "\n".join(lines)
+            raw_lines.append("    " * indent_level + r.text)
+        # Clean IDE decorations and OCR artifacts from each line
+        cleaned = []
+        for line in raw_lines:
+            c = _clean_ocr_line(line)
+            if c:
+                cleaned.append(c)
+        return "\n".join(cleaned)

    if frame_type == FrameType.SLIDE:
-        return "\n\n".join(r.text for r in regions)
+        cleaned = [_clean_ocr_line(r.text) for r in regions]
+        return "\n\n".join(c for c in cleaned if c)

-    return " ".join(r.text for r in regions)
+    cleaned = [_clean_ocr_line(r.text) for r in regions]
+    return " ".join(c for c in cleaned if c)


 def _compute_frame_timestamps(
@@ -1788,7 +1883,32 @@ class TextBlockTracker:
        return list(self._completed_blocks)

    def get_text_groups(self) -> list[TextGroup]:
-        """Return all text groups after finalize()."""
+        """Return all text groups after finalize().
+
+        Also runs language detection on groups that don't already have
+        a detected_language set. Detection is best-effort: it is skipped
+        when LanguageDetector cannot be imported, and a failure on one
+        group is logged at debug level without affecting the others.
+
+        Returns:
+            A shallow copy of the tracked text groups.
+        """
+        # Run language detection on each group
+        try:
+            from skill_seekers.cli.language_detector import LanguageDetector
+
+            detector = LanguageDetector()
+        except ImportError:
+            detector = None
+
+        if detector is not None:
+            for group in self._text_groups:
+                if group.detected_language:
+                    continue  # Already detected
+                text = group.full_text
+                # Texts under 20 characters are skipped.
+                if text and len(text) >= 20:
+                    try:
+                        lang, _conf = detector.detect_from_code(text)
+                    except Exception:
+                        # Stay best-effort, but leave a trace instead of
+                        # hiding detector failures behind a bare pass.
+                        logging.getLogger(__name__).debug(
+                            "Language detection failed for a text group", exc_info=True
+                        )
+                        continue
+                    if lang:
+                        group.detected_language = lang
+
+        return list(self._text_groups)


@@ -2143,8 +2263,8 @@ def extract_visual_data(

            tracker.update(idx, ts, ocr_text, ocr_confidence, frame_type, ocr_regions=ocr_regions)

-        elif HAS_EASYOCR:
-            # Standard EasyOCR for non-code frames
+        elif HAS_EASYOCR and frame_type not in (FrameType.WEBCAM, FrameType.OTHER):
+            # Standard EasyOCR for slide/diagram frames (skip webcam/other)
            raw_ocr_results, _flat_text = extract_text_from_frame(frame_path, frame_type)
            if raw_ocr_results:
                ocr_regions = _cluster_ocr_into_lines(raw_ocr_results, frame_type)