From 4a36e891952741d8cd05b73fff53a2c674d0b24b Mon Sep 17 00:00:00 2001 From: daymade Date: Sun, 21 Dec 2025 12:58:32 +0800 Subject: [PATCH] Add word-level diff generator script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add scripts/generate_word_diff.py for generating word-by-word comparison HTML - Shows complete word replacements (e.g., 'japanese 3 pro' → 'Gemini 3 Pro') - More readable than character-level or line-level diffs - Update SKILL.md with usage instructions and script documentation --- transcript-fixer/SKILL.md | 10 + .../scripts/generate_word_diff.py | 315 ++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100755 transcript-fixer/scripts/generate_word_diff.py diff --git a/transcript-fixer/SKILL.md b/transcript-fixer/SKILL.md index 13984da..4830456 100644 --- a/transcript-fixer/SKILL.md +++ b/transcript-fixer/SKILL.md @@ -65,6 +65,15 @@ uv run scripts/fix_transcription.py --review-learned - `*_stage2.md` - AI corrections applied (final version) - `*_对比.html` - Visual diff (open in browser for best experience) +**Generate word-level diff** (recommended for reviewing corrections): +```bash +uv run scripts/generate_word_diff.py original.md corrected.md output.html +``` + +This creates an HTML file showing word-by-word differences with clear highlighting: +- 🔴 `japanese 3 pro` → 🟢 `Gemini 3 Pro` (complete word replacements) +- Easy to spot exactly what changed without character-level noise + ## Example Session **Input transcript** (`meeting.md`): @@ -153,6 +162,7 @@ sqlite3 ~/.transcript-fixer/corrections.db "SELECT value FROM system_config WHER - `ensure_deps.py` - Initialize shared virtual environment (run once, optional) - `fix_transcript_enhanced.py` - Enhanced wrapper (recommended for interactive use) - `fix_transcription.py` - Core CLI (for automation) +- `generate_word_diff.py` - Generate word-level diff HTML for reviewing corrections - `examples/bulk_import.py` - Bulk import example 
#!/usr/bin/env python3
"""
Generate Word-Level Diff HTML Comparison

Creates an HTML file showing word-by-word differences between original and
corrected transcripts. This helps users review corrections more easily than
character-level or line-level diffs.

Usage:
    python scripts/generate_word_diff.py <original_file> <corrected_file> [output_file]

Example:
    python scripts/generate_word_diff.py original.md corrected.md comparison.html
"""

import difflib
import html
import re
import sys
from itertools import zip_longest
from pathlib import Path

# Token alternatives, tried in this order:
#   1. a run of CJK ideographs (U+4E00-U+9FFF)
#   2. a run of ASCII letters/digits
#   3. exactly one other non-whitespace character (punctuation, symbols)
#   4. a run of whitespace
# Every character matches one alternative, so joining the tokens reproduces
# the input exactly.
_TOKEN_RE = re.compile(
    r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+|[^\u4e00-\u9fffa-zA-Z0-9\s]|\s+'
)

# Stylesheet kept outside the f-string template so CSS braces need no doubling.
_STYLE = '''
body { font-family: -apple-system, "Segoe UI", "PingFang SC", sans-serif;
       background: #f5f5f5; margin: 0; padding: 20px; line-height: 1.8; }
.container { max-width: 1100px; margin: 0 auto; background: #fff;
             padding: 24px; border-radius: 8px; }
.stats, .legend { background: #f0f4f8; padding: 12px 16px; margin: 16px 0;
                  border-radius: 6px; }
.stats span, .legend span { margin-right: 24px; }
.diff-line { padding: 6px 10px; border-bottom: 1px solid #eee;
             white-space: pre-wrap; }
.diff-line.context { color: #888; }
.line-num { display: inline-block; min-width: 6em; color: #999;
            font-size: 0.85em; }
.separator { text-align: center; color: #bbb; padding: 4px 0; }
del { background: #ffd7d5; color: #b31d28; text-decoration: line-through; }
ins { background: #ccffd8; color: #176f2c; text-decoration: none; }
'''


def tokenize(text):
    """
    Split text into tokens while preserving every character.

    Tokens are runs of Chinese characters, runs of ASCII letters/digits,
    single punctuation/special characters, and runs of whitespace.

    Args:
        text: The string to split.

    Returns:
        List of token strings; ''.join(tokens) equals the input.
    """
    return _TOKEN_RE.findall(text)


def get_word_diff(old, new):
    """
    Generate word-level diff with HTML highlighting.

    Removed tokens are wrapped in <del>, added tokens in <ins>; a replacement
    is rendered as the removed text immediately followed by the added text.
    All text is HTML-escaped.

    Args:
        old: Original text line
        new: Corrected text line

    Returns:
        HTML string with word-level diff highlighting
    """
    old_tokens = tokenize(old)
    new_tokens = tokenize(new)

    # autojunk=False: long lines (200+ tokens) would otherwise have frequent
    # tokens such as spaces treated as junk, which degrades the alignment.
    matcher = difflib.SequenceMatcher(
        None, old_tokens, new_tokens, autojunk=False
    )
    pieces = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_part = html.escape(''.join(old_tokens[i1:i2]))
        new_part = html.escape(''.join(new_tokens[j1:j2]))

        if tag == 'equal':
            pieces.append(old_part)
        elif tag == 'delete':
            pieces.append(f'<del>{old_part}</del>')
        elif tag == 'insert':
            pieces.append(f'<ins>{new_part}</ins>')
        elif tag == 'replace':
            pieces.append(f'<del>{old_part}</del><ins>{new_part}</ins>')

    return ''.join(pieces)


def _is_changed(old_line, new_line):
    """Return True when two lines differ, ignoring surrounding whitespace."""
    return old_line.strip() != new_line.strip()


def _pair_lines(original_lines, corrected_lines):
    """Pair lines by position, padding the shorter file with empty lines."""
    return list(zip_longest(original_lines, corrected_lines, fillvalue=''))


def _change_rate(total_changes, total_lines):
    """Return the percentage of changed lines; 0 when there are no lines."""
    return total_changes / total_lines * 100 if total_lines > 0 else 0


def generate_html_header(total_lines, total_changes):
    """
    Generate the HTML document head, statistics panel, and legend.

    Args:
        total_lines: Number of lines compared
        total_changes: Number of lines that differ

    Returns:
        HTML string that opens the document; it must be closed with the
        output of generate_html_footer().
    """
    change_rate = _change_rate(total_changes, total_lines)

    return f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>录音转写修正对比(词语级别)</title>
<style>{_STYLE}</style>
</head>
<body>
<div class="container">
<h1>🎙️ 录音转写修正对比(词语级别)</h1>

<div class="stats">
<h2>📊 修正统计</h2>
<span>总行数: {total_lines}</span>
<span>修改行数: {total_changes}</span>
<span>修改率: {change_rate:.1f}%</span>
</div>

<div class="legend">
<strong>📖 图例说明:</strong><br>
<span><del>删除的词</del> 原文中的错误</span>
<span><ins>添加的词</ins> 修正后的内容</span>
<span><del>错误</del><ins>正确</ins> 词语替换</span>
</div>

<div class="content">
'''


def generate_diff_content(original_lines, corrected_lines, context_lines=1):
    """
    Generate diff content HTML showing changed lines with context.

    Lines are compared by position. When the files have different lengths the
    shorter one is padded with empty lines, so trailing additions or removals
    are reported instead of being dropped.

    Args:
        original_lines: List of original text lines
        corrected_lines: List of corrected text lines
        context_lines: Number of unchanged lines to show before and after
            each change (negative values are treated as 0)

    Returns:
        HTML string with diff content; empty when nothing changed
    """
    pairs = _pair_lines(original_lines, corrected_lines)
    context_lines = max(context_lines, 0)

    changed = {
        number
        for number, (old_line, new_line) in enumerate(pairs, 1)
        if _is_changed(old_line, new_line)
    }

    # Every changed line plus its surrounding context window (1-based).
    visible = set()
    for number in changed:
        first = max(1, number - context_lines)
        last = min(len(pairs), number + context_lines)
        visible.update(range(first, last + 1))

    html_parts = []
    previous = None

    for number in sorted(visible):
        # A gap in the displayed line numbers means lines were skipped.
        if previous is not None and number - previous > 1:
            html_parts.append('<div class="separator">⋯ ⋯ ⋯</div>\n')
        previous = number

        old_line, new_line = pairs[number - 1]
        old_line = old_line.rstrip('\n')
        new_line = new_line.rstrip('\n')

        if number in changed:
            diff_html = get_word_diff(old_line, new_line)
            html_parts.append(
                f'<div class="diff-line">'
                f'<span class="line-num">第 {number} 行</span>{diff_html}'
                f'</div>\n'
            )
        else:
            escaped = html.escape(old_line)
            html_parts.append(
                f'<div class="diff-line context">'
                f'<span class="line-num">第 {number} 行</span>{escaped}'
                f'</div>\n'
            )

    return ''.join(html_parts)


def generate_html_footer():
    """Generate HTML footer closing the elements opened by the header."""
    return '''</div>
</div>
</body>
</html>
'''


def main(argv=None):
    """
    Main entry point for the script.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:].

    Returns:
        0 on success. Exits with status 1 on usage or I/O errors.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if len(args) < 2:
        print("Usage: python generate_word_diff.py "
              "<original_file> <corrected_file> [output_file]")
        print("\nExample:")
        print("  python generate_word_diff.py original.md corrected.md comparison.html")
        sys.exit(1)

    original_path = Path(args[0])
    corrected_path = Path(args[1])

    # Determine output path
    if len(args) >= 3:
        output_path = Path(args[2])
    else:
        # Default: save next to original file with _对比_词语级.html suffix
        output_path = original_path.parent / f"{original_path.stem}_对比_词语级.html"

    # Validate input files
    if not original_path.exists():
        print(f"❌ Error: Original file not found: {original_path}")
        sys.exit(1)

    if not corrected_path.exists():
        print(f"❌ Error: Corrected file not found: {corrected_path}")
        sys.exit(1)

    # Read files
    try:
        with open(original_path, 'r', encoding='utf-8') as f:
            original_lines = f.readlines()

        with open(corrected_path, 'r', encoding='utf-8') as f:
            corrected_lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading files: {e}")
        sys.exit(1)

    # Count changes over the same padded pairing the diff body uses, so the
    # statistics agree with what is displayed.
    pairs = _pair_lines(original_lines, corrected_lines)
    total_lines = len(pairs)
    total_changes = sum(1 for old, new in pairs if _is_changed(old, new))

    # Generate HTML
    html_content = ''.join([
        generate_html_header(total_lines, total_changes),
        generate_diff_content(original_lines, corrected_lines),
        generate_html_footer(),
    ])

    # Write output
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
    except OSError as e:
        print(f"❌ Error writing output file: {e}")
        sys.exit(1)

    # _change_rate guards the empty-file case that used to divide by zero.
    change_rate = _change_rate(total_changes, total_lines)
    print(f"✅ 词语级 diff HTML 已生成: {output_path}")
    print(f"📊 共修改了 {total_changes} 行,占总行数的 {change_rate:.1f}%")

    return 0


if __name__ == "__main__":
    sys.exit(main())