feat: video pipeline OCR quality fixes + two-pass AI enhancement

- Skip OCR on WEBCAM/OTHER frames (eliminates ~64 junk results per video)
- Add _clean_ocr_line() to strip line numbers, IDE decorations, collapse markers
- Add _fix_intra_line_duplication() for multi-engine OCR overlap artifacts
- Add _is_likely_code() filter to prevent UI junk in reference code fences
- Add language detection to get_text_groups() via LanguageDetector
- Apply OCR cleaning in _assemble_structured_text() pipeline
- Add two-pass AI enhancement: Pass 1 cleans reference Code Timeline
  using transcript context, Pass 2 generates SKILL.md from cleaned refs
- Update video-tutorial.yaml prompts for pre-cleaned references
- Add 17 new tests (197 total video tests), 2540 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 21:48:21 +03:00
parent bb54b3f7b6
commit d19ad7d820
6 changed files with 489 additions and 23 deletions

View File

@@ -233,6 +233,86 @@ def _build_audio_visual_alignments(
return alignments
# =============================================================================
# OCR Quality Filters
# =============================================================================
_RE_CODE_TOKENS = re.compile(
r"[=(){};]|(?:def|class|function|import|return|var|let|const|public|private|void|static|override|virtual|protected)\b"
)
_RE_UI_PATTERNS = re.compile(
r"\b(?:Inspector|Hierarchy|Project|Console|Image Type|Sorting Layer|Button|Canvas|Scene|Game)\b",
re.IGNORECASE,
)
def _is_likely_code(text: str) -> bool:
"""Return True if text likely contains programming code, not UI junk."""
if not text or len(text.strip()) < 10:
return False
code_tokens = _RE_CODE_TOKENS.findall(text)
ui_patterns = _RE_UI_PATTERNS.findall(text)
return len(code_tokens) >= 2 and len(code_tokens) > len(ui_patterns)
# =============================================================================
# Two-Pass AI Reference Enhancement
# =============================================================================
def _ai_clean_reference(ref_path: str, content: str, api_key: str | None = None) -> None:
    """Use AI to clean Code Timeline section in a reference file.

    Sends the reference file content to Claude with a focused prompt to
    reconstruct the Code Timeline from noisy OCR + transcript context, and
    overwrites ``ref_path`` with the reply when it passes sanity checks.

    This is a best-effort step: it silently returns when the ``anthropic``
    package or an API key is unavailable, and any failure while calling the
    API or writing the file is logged at debug level instead of raised.

    Args:
        ref_path: Path of the reference file to overwrite.
        content: Current text of that file (sent to the model).
        api_key: Explicit key; falls back to ``ANTHROPIC_API_KEY`` and then
            ``ANTHROPIC_AUTH_TOKEN`` from the environment.
    """
    try:
        import anthropic
    except ImportError:
        return

    key = api_key or os.environ.get("ANTHROPIC_API_KEY") or os.environ.get("ANTHROPIC_AUTH_TOKEN")
    if not key:
        return

    client_kwargs: dict = {"api_key": key}
    base_url = os.environ.get("ANTHROPIC_BASE_URL")
    if base_url:
        client_kwargs["base_url"] = base_url

    prompt = (
        "You are cleaning a video tutorial reference file. The Code Timeline section "
        "contains OCR-extracted code that is noisy (duplicated lines, garbled characters, "
        "UI decorations mixed in). The transcript sections above provide context about "
        "what the code SHOULD be.\n\n"
        "Tasks:\n"
        "1. Reconstruct each code block in the file using transcript context\n"
        "2. Fix OCR errors (l/1, O/0, rn/m confusions)\n"
        "3. Remove any UI text (Inspector, Hierarchy, button labels)\n"
        "4. Set correct language tags on code fences\n"
        "5. Keep the document structure but clean the code text\n\n"
        "Return the COMPLETE reference file with cleaned code blocks. "
        "Do NOT modify the transcript or metadata sections.\n\n"
        f"Reference file:\n{content}"
    )

    try:
        client = anthropic.Anthropic(**client_kwargs)
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=8000,
            messages=[{"role": "user", "content": prompt}],
        )

        # The model is asked to return the whole file. If generation stopped
        # because it hit max_tokens, the reply is cut off mid-document and
        # writing it would destroy the tail of the reference file.
        if getattr(response, "stop_reason", None) == "max_tokens":
            logger.debug(
                f"Reference enhancement skipped (truncated response): {os.path.basename(ref_path)}"
            )
            return

        blocks = response.content
        result = blocks[0].text if blocks else ""

        # Sanity check: reject replies that shrank the file by more than half.
        if result and len(result) > len(content) * 0.5:
            with open(ref_path, "w", encoding="utf-8") as f:
                f.write(result)
            logger.info(f"AI-cleaned reference: {os.path.basename(ref_path)}")
    except Exception as e:
        # Deliberately broad: enhancement must never abort the pipeline.
        logger.debug(f"Reference enhancement failed: {e}")
# =============================================================================
# Main Converter Class
# =============================================================================
@@ -675,6 +755,7 @@ class VideoToSkillConverter:
if (
ss.frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL)
and ss.ocr_text
and _is_likely_code(ss.ocr_text)
):
lines.append(f"\n```{lang_hint}")
lines.append(ss.ocr_text)
@@ -683,15 +764,16 @@ class VideoToSkillConverter:
from skill_seekers.cli.video_models import FrameType
if kf.frame_type in (FrameType.CODE_EDITOR, FrameType.TERMINAL):
lang_hint = ""
if seg.detected_code_blocks:
for cb in seg.detected_code_blocks:
if cb.language:
lang_hint = cb.language
break
lines.append(f"\n```{lang_hint}")
lines.append(kf.ocr_text)
lines.append("```")
if _is_likely_code(kf.ocr_text):
lang_hint = ""
if seg.detected_code_blocks:
for cb in seg.detected_code_blocks:
if cb.language:
lang_hint = cb.language
break
lines.append(f"\n```{lang_hint}")
lines.append(kf.ocr_text)
lines.append("```")
elif kf.frame_type == FrameType.SLIDE:
for text_line in kf.ocr_text.split("\n"):
if text_line.strip():
@@ -779,6 +861,44 @@ class VideoToSkillConverter:
return "\n".join(lines)
def _enhance_reference_files(self, enhance_level: int, args) -> None:
"""First-pass: AI-clean reference files before SKILL.md enhancement.
When enhance_level >= 2 and an API key is available, sends each
reference file to Claude to reconstruct noisy Code Timeline
sections using transcript context.
"""
has_api_key = bool(
os.environ.get("ANTHROPIC_API_KEY")
or os.environ.get("ANTHROPIC_AUTH_TOKEN")
or getattr(args, "api_key", None)
)
if not has_api_key or enhance_level < 2:
return
refs_dir = os.path.join(self.skill_dir, "references")
if not os.path.isdir(refs_dir):
return
logger.info("\n📝 Pass 1: AI-cleaning reference files (Code Timeline reconstruction)...")
api_key = getattr(args, "api_key", None)
for ref_file in sorted(os.listdir(refs_dir)):
if not ref_file.endswith(".md"):
continue
ref_path = os.path.join(refs_dir, ref_file)
try:
with open(ref_path, encoding="utf-8") as f:
content = f.read()
except OSError:
continue
# Only enhance if there are code fences to clean
if "```" not in content:
continue
_ai_clean_reference(ref_path, content, api_key)
def _generate_skill_md(self) -> str:
"""Generate the main SKILL.md file."""
lines = []
@@ -1044,11 +1164,14 @@ Examples:
# Enhancement
enhance_level = getattr(args, "enhance_level", 0)
if enhance_level > 0:
# Pass 1: Clean reference files (Code Timeline reconstruction)
converter._enhance_reference_files(enhance_level, args)
# Auto-inject video-tutorial workflow if no workflow specified
if not getattr(args, "enhance_workflow", None):
args.enhance_workflow = ["video-tutorial"]
# Run workflow stages (specialized video analysis)
# Pass 2: Run workflow stages (specialized video analysis)
try:
from skill_seekers.cli.workflow_runner import run_workflows

View File

@@ -16,6 +16,7 @@ import difflib
import gc
import logging
import os
import re
import tempfile
from dataclasses import dataclass, field
@@ -1126,6 +1127,92 @@ def _cluster_ocr_into_lines(
return regions
# ── OCR line cleaning ────────────────────────────────────────────────
def _fuzzy_word_match(a: str, b: str) -> bool:
"""Check if two words are likely the same despite OCR noise.
Allows single-char prefix/suffix noise (e.g. 'gpublic' vs 'public')
and common OCR confusions (l/1, O/0, rn/m).
"""
if a == b:
return True
# Strip single-char OCR prefix noise (e.g. 'Jpublic' → 'public')
a_stripped = a.lstrip("gGjJlLiI|") if len(a) > 2 else a
b_stripped = b.lstrip("gGjJlLiI|") if len(b) > 2 else b
if a_stripped == b_stripped:
return True
# Allow edit distance ≤ 1 for short words
if abs(len(a) - len(b)) <= 1 and len(a) >= 3:
diffs = sum(1 for x, y in zip(a, b, strict=False) if x != y)
diffs += abs(len(a) - len(b))
return diffs <= 1
return False
def _fix_intra_line_duplication(line: str) -> str:
"""Fix lines where OCR duplicated content.
Detects when the same token sequence appears twice adjacent,
e.g. 'public class Card public class Card : MonoBehaviour'
'public class Card : MonoBehaviour'.
"""
words = line.split()
if len(words) < 4:
return line
half = len(words) // 2
for split_point in range(max(2, half - 2), min(len(words) - 1, half + 3)):
prefix = words[:split_point]
suffix = words[split_point:]
# Check if suffix starts with same sequence as prefix
match_len = 0
for i, w in enumerate(prefix):
if i < len(suffix) and _fuzzy_word_match(w, suffix[i]):
match_len += 1
else:
break
if match_len >= len(prefix) * 0.7 and match_len >= 2:
# Keep the longer/cleaner half (suffix usually has trailing content)
return (
" ".join(suffix)
if len(" ".join(suffix)) >= len(" ".join(prefix))
else " ".join(prefix)
)
return line
# Compiled patterns for _clean_ocr_line
_RE_LEADING_LINE_NUMBER = re.compile(r"^\s*\d{1,4}(?:\s+|\t)")
_RE_COLLAPSE_MARKERS = re.compile(r"[▶▼►◄…⋯⋮]")
_RE_IDE_TAB_BAR = re.compile(
r"^\s*(?:File|Edit|Assets|Window|Help|View|Tools|Debug|Run|Terminal)\s+",
re.IGNORECASE,
)
_RE_UNITY_INSPECTOR = re.compile(
r"^\s*(?:Inspector|Hierarchy|Project|Console|Scene|Game)\b.*$",
re.IGNORECASE,
)
def _clean_ocr_line(line: str) -> str:
"""Remove IDE decorations and OCR artifacts from a single line."""
if not line:
return line
# Remove full-line UI chrome
if _RE_UNITY_INSPECTOR.match(line):
return ""
if _RE_IDE_TAB_BAR.match(line):
return ""
# Strip leading line numbers (e.g. '23 public class Card')
line = _RE_LEADING_LINE_NUMBER.sub("", line)
# Remove collapse markers / VS Code decorations
line = _RE_COLLAPSE_MARKERS.sub("", line)
# Fix intra-line duplication from multi-engine overlap
line = _fix_intra_line_duplication(line)
return line.strip()
def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -> str:
"""Join OCR line regions into structured text.
@@ -1148,7 +1235,7 @@ def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -
return ""
# Estimate indentation from x-offset relative to leftmost region
min_x = min(r.bbox[0] for r in regions)
lines = []
raw_lines = []
for r in regions:
indent_px = r.bbox[0] - min_x
# Estimate character width from the region
@@ -1158,13 +1245,21 @@ def _assemble_structured_text(regions: list[OCRRegion], frame_type: FrameType) -
indent_chars = int(indent_px / max(char_width, 1))
# Round to nearest 4-space indent
indent_level = round(indent_chars / 4)
lines.append(" " * indent_level + r.text)
return "\n".join(lines)
raw_lines.append(" " * indent_level + r.text)
# Clean IDE decorations and OCR artifacts from each line
cleaned = []
for line in raw_lines:
c = _clean_ocr_line(line)
if c:
cleaned.append(c)
return "\n".join(cleaned)
if frame_type == FrameType.SLIDE:
return "\n\n".join(r.text for r in regions)
cleaned = [_clean_ocr_line(r.text) for r in regions]
return "\n\n".join(c for c in cleaned if c)
return " ".join(r.text for r in regions)
cleaned = [_clean_ocr_line(r.text) for r in regions]
return " ".join(c for c in cleaned if c)
def _compute_frame_timestamps(
@@ -1788,7 +1883,32 @@ class TextBlockTracker:
return list(self._completed_blocks)
def get_text_groups(self) -> list[TextGroup]:
    """Return all text groups after finalize().

    Also runs language detection on groups that don't already have a
    ``detected_language`` set and whose ``full_text`` is at least 20
    characters long. Detection is best-effort: if the detector cannot be
    imported the groups are returned untouched, and a failure on one group
    is logged at debug level and does not affect the others.

    Returns:
        A new list containing the tracker's text groups.
    """
    try:
        from skill_seekers.cli.language_detector import LanguageDetector

        detector = LanguageDetector()
    except ImportError:
        detector = None

    if detector is not None:
        for group in self._text_groups:
            if group.detected_language:
                continue  # Already detected
            text = group.full_text
            if not text or len(text) < 20:
                continue
            try:
                lang, _conf = detector.detect_from_code(text)
                if lang:
                    group.detected_language = lang
            except Exception as exc:
                # Keep going, but leave a trace instead of swallowing silently.
                logging.getLogger(__name__).debug(
                    "Language detection failed for text group: %s", exc
                )

    return list(self._text_groups)
@@ -2143,8 +2263,8 @@ def extract_visual_data(
tracker.update(idx, ts, ocr_text, ocr_confidence, frame_type, ocr_regions=ocr_regions)
elif HAS_EASYOCR:
# Standard EasyOCR for non-code frames
elif HAS_EASYOCR and frame_type not in (FrameType.WEBCAM, FrameType.OTHER):
# Standard EasyOCR for slide/diagram frames (skip webcam/other)
raw_ocr_results, _flat_text = extract_text_from_frame(frame_path, frame_type)
if raw_ocr_results:
ocr_regions = _cluster_ocr_into_lines(raw_ocr_results, frame_type)