Add word-level diff generator script
- Add scripts/generate_word_diff.py for generating word-by-word comparison HTML - Shows complete word replacements (e.g., 'japanese 3 pro' → 'Gemini 3 Pro') - More readable than character-level or line-level diffs - Update SKILL.md with usage instructions and script documentation
This commit is contained in:
@@ -65,6 +65,15 @@ uv run scripts/fix_transcription.py --review-learned
|
||||
- `*_stage2.md` - AI corrections applied (final version)
|
||||
- `*_对比.html` - Visual diff (open in browser for best experience)
|
||||
|
||||
**Generate word-level diff** (recommended for reviewing corrections):
|
||||
```bash
|
||||
uv run scripts/generate_word_diff.py original.md corrected.md output.html
|
||||
```
|
||||
|
||||
This creates an HTML file showing word-by-word differences with clear highlighting:
|
||||
- 🔴 `japanese 3 pro` → 🟢 `Gemini 3 Pro` (complete word replacements)
|
||||
- Easy to spot exactly what changed without character-level noise
|
||||
|
||||
## Example Session
|
||||
|
||||
**Input transcript** (`meeting.md`):
|
||||
@@ -153,6 +162,7 @@ sqlite3 ~/.transcript-fixer/corrections.db "SELECT value FROM system_config WHER
|
||||
- `ensure_deps.py` - Initialize shared virtual environment (run once, optional)
|
||||
- `fix_transcript_enhanced.py` - Enhanced wrapper (recommended for interactive use)
|
||||
- `fix_transcription.py` - Core CLI (for automation)
|
||||
- `generate_word_diff.py` - Generate word-level diff HTML for reviewing corrections
|
||||
- `examples/bulk_import.py` - Bulk import example
|
||||
|
||||
**References** (load as needed):
|
||||
|
||||
315
transcript-fixer/scripts/generate_word_diff.py
Executable file
315
transcript-fixer/scripts/generate_word_diff.py
Executable file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate Word-Level Diff HTML Comparison
|
||||
|
||||
Creates an HTML file showing word-by-word differences between original and corrected transcripts.
|
||||
This helps users review corrections more easily than character-level or line-level diffs.
|
||||
|
||||
Usage:
|
||||
python scripts/generate_word_diff.py <original_file> <corrected_file> [output_file]
|
||||
|
||||
Example:
|
||||
python scripts/generate_word_diff.py original.md corrected.md comparison.html
|
||||
"""
|
||||
|
||||
import difflib
|
||||
import html
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def tokenize(text):
|
||||
"""
|
||||
Split text into tokens (words) while preserving Chinese, English, numbers, and punctuation.
|
||||
|
||||
Pattern explanation:
|
||||
- [\\u4e00-\\u9fff]+ : Chinese characters (one or more)
|
||||
- [a-zA-Z0-9]+ : English words and numbers (one or more)
|
||||
- [^\\u4e00-\\u9fffa-zA-Z0-9\\s] : Single punctuation/special chars
|
||||
- \\s+ : Whitespace sequences
|
||||
|
||||
Returns:
|
||||
List of token strings
|
||||
"""
|
||||
pattern = r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+|[^\u4e00-\u9fffa-zA-Z0-9\s]|\s+'
|
||||
return re.findall(pattern, text)
|
||||
|
||||
|
||||
def get_word_diff(old, new):
|
||||
"""
|
||||
Generate word-level diff with HTML highlighting.
|
||||
|
||||
Args:
|
||||
old: Original text line
|
||||
new: Corrected text line
|
||||
|
||||
Returns:
|
||||
HTML string with word-level diff highlighting
|
||||
"""
|
||||
old_tokens = tokenize(old)
|
||||
new_tokens = tokenize(new)
|
||||
|
||||
s = difflib.SequenceMatcher(None, old_tokens, new_tokens)
|
||||
result = []
|
||||
|
||||
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
||||
old_part = ''.join(old_tokens[i1:i2])
|
||||
new_part = ''.join(new_tokens[j1:j2])
|
||||
|
||||
if tag == 'equal':
|
||||
result.append(html.escape(old_part))
|
||||
elif tag == 'delete':
|
||||
result.append(f'<del class="word-del" title="删除: {html.escape(old_part)}">{html.escape(old_part)}</del>')
|
||||
elif tag == 'insert':
|
||||
result.append(f'<ins class="word-ins" title="添加: {html.escape(new_part)}">{html.escape(new_part)}</ins>')
|
||||
elif tag == 'replace':
|
||||
result.append(f'<span class="word-change"><del class="word-del" title="原文">{html.escape(old_part)}</del> → <ins class="word-ins" title="修正后">{html.escape(new_part)}</ins></span>')
|
||||
|
||||
return ''.join(result)
|
||||
|
||||
|
||||
def generate_html_header(total_lines, total_changes):
|
||||
"""Generate HTML header with statistics and styling."""
|
||||
change_rate = total_changes / total_lines * 100 if total_lines > 0 else 0
|
||||
|
||||
return f'''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>录音转写修正对比(词语级别)</title>
|
||||
<style>
|
||||
body {{
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', sans-serif;
|
||||
font-size: 16px;
|
||||
line-height: 2;
|
||||
max-width: 1400px;
|
||||
margin: 20px auto;
|
||||
padding: 20px;
|
||||
background: #f8f9fa;
|
||||
}}
|
||||
h1 {{
|
||||
color: #1a1a1a;
|
||||
border-bottom: 3px solid #007aff;
|
||||
padding-bottom: 15px;
|
||||
margin-bottom: 25px;
|
||||
}}
|
||||
.summary {{
|
||||
background: white;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 25px;
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
||||
}}
|
||||
.summary h2 {{
|
||||
margin: 0 0 15px 0;
|
||||
color: #007aff;
|
||||
font-size: 18px;
|
||||
}}
|
||||
.stat {{
|
||||
display: inline-block;
|
||||
margin-right: 25px;
|
||||
padding: 8px 15px;
|
||||
background: #f0f0f0;
|
||||
border-radius: 5px;
|
||||
font-weight: 500;
|
||||
}}
|
||||
.legend {{
|
||||
background: #fff9e6;
|
||||
padding: 15px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 20px;
|
||||
border-left: 4px solid #ffc107;
|
||||
}}
|
||||
.legend-item {{
|
||||
display: inline-block;
|
||||
margin-right: 20px;
|
||||
margin-bottom: 5px;
|
||||
}}
|
||||
.diff-container {{
|
||||
background: white;
|
||||
padding: 25px;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
||||
}}
|
||||
.diff-line {{
|
||||
padding: 12px 15px;
|
||||
margin: 10px 0;
|
||||
border-left: 4px solid transparent;
|
||||
border-radius: 4px;
|
||||
background: #fafafa;
|
||||
}}
|
||||
.diff-line.changed {{
|
||||
background: #fff;
|
||||
border-left-color: #ff9800;
|
||||
}}
|
||||
.diff-line.unchanged {{
|
||||
opacity: 0.5;
|
||||
font-size: 14px;
|
||||
}}
|
||||
del.word-del {{
|
||||
background-color: #ffcdd2;
|
||||
color: #c62828;
|
||||
text-decoration: line-through;
|
||||
padding: 3px 6px;
|
||||
border-radius: 4px;
|
||||
font-weight: 700;
|
||||
margin: 0 2px;
|
||||
}}
|
||||
ins.word-ins {{
|
||||
background-color: #c8e6c9;
|
||||
color: #2e7d32;
|
||||
text-decoration: none;
|
||||
padding: 3px 6px;
|
||||
border-radius: 4px;
|
||||
font-weight: 700;
|
||||
margin: 0 2px;
|
||||
}}
|
||||
.word-change {{
|
||||
display: inline-block;
|
||||
background: #fff3e0;
|
||||
padding: 4px 8px;
|
||||
border-radius: 4px;
|
||||
margin: 0 2px;
|
||||
}}
|
||||
.line-number {{
|
||||
display: inline-block;
|
||||
min-width: 60px;
|
||||
color: #999;
|
||||
font-size: 13px;
|
||||
margin-right: 15px;
|
||||
user-select: none;
|
||||
font-family: 'SF Mono', 'Monaco', monospace;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>🎙️ 录音转写修正对比(词语级别)</h1>
|
||||
|
||||
<div class="summary">
|
||||
<h2>📊 修正统计</h2>
|
||||
<span class="stat">总行数: <strong>{total_lines}</strong></span>
|
||||
<span class="stat" style="color: #ff9800;">修改行数: <strong>{total_changes}</strong></span>
|
||||
<span class="stat" style="color: #4caf50;">修改率: <strong>{change_rate:.1f}%</strong></span>
|
||||
</div>
|
||||
|
||||
<div class="legend">
|
||||
<strong>📖 图例说明:</strong><br>
|
||||
<span class="legend-item"><del class="word-del">删除的词</del> 原文中的错误</span>
|
||||
<span class="legend-item"><ins class="word-ins">添加的词</ins> 修正后的内容</span>
|
||||
<span class="legend-item"><span class="word-change"><del class="word-del">错误</del> → <ins class="word-ins">正确</ins></span> 词语替换</span>
|
||||
</div>
|
||||
|
||||
<div class="diff-container">
|
||||
'''
|
||||
|
||||
|
||||
def generate_diff_content(original_lines, corrected_lines, context_lines=1):
|
||||
"""
|
||||
Generate diff content HTML showing changed lines with context.
|
||||
|
||||
Args:
|
||||
original_lines: List of original text lines
|
||||
corrected_lines: List of corrected text lines
|
||||
context_lines: Number of context lines to show around changes
|
||||
|
||||
Returns:
|
||||
HTML string with diff content
|
||||
"""
|
||||
html_parts = []
|
||||
last_change_idx = -999
|
||||
|
||||
for i, (old_line, new_line) in enumerate(zip(original_lines, corrected_lines), 1):
|
||||
old_line = old_line.rstrip('\n')
|
||||
new_line = new_line.rstrip('\n')
|
||||
|
||||
if old_line.strip() != new_line.strip():
|
||||
# Show separator if gap is large
|
||||
if i - last_change_idx > context_lines + 1:
|
||||
if last_change_idx > 0:
|
||||
html_parts.append('<div style="text-align: center; color: #999; margin: 20px 0; font-size: 18px;">⋯ ⋯ ⋯</div>\n')
|
||||
|
||||
# Show changed line
|
||||
diff_html = get_word_diff(old_line, new_line)
|
||||
html_parts.append(f'<div class="diff-line changed"><span class="line-number">第 {i} 行</span>{diff_html}</div>\n')
|
||||
last_change_idx = i
|
||||
elif abs(i - last_change_idx) <= context_lines and last_change_idx > 0:
|
||||
# Show context line
|
||||
escaped = html.escape(old_line)
|
||||
html_parts.append(f'<div class="diff-line unchanged"><span class="line-number">第 {i} 行</span>{escaped}</div>\n')
|
||||
|
||||
return ''.join(html_parts)
|
||||
|
||||
|
||||
def generate_html_footer():
|
||||
"""Generate HTML footer."""
|
||||
return '''
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for the script."""
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: python generate_word_diff.py <original_file> <corrected_file> [output_file]")
|
||||
print("\nExample:")
|
||||
print(" python generate_word_diff.py original.md corrected.md comparison.html")
|
||||
sys.exit(1)
|
||||
|
||||
original_path = Path(sys.argv[1])
|
||||
corrected_path = Path(sys.argv[2])
|
||||
|
||||
# Determine output path
|
||||
if len(sys.argv) >= 4:
|
||||
output_path = Path(sys.argv[3])
|
||||
else:
|
||||
# Default: save next to original file with _对比_词语级.html suffix
|
||||
output_path = original_path.parent / f"{original_path.stem}_对比_词语级.html"
|
||||
|
||||
# Validate input files
|
||||
if not original_path.exists():
|
||||
print(f"❌ Error: Original file not found: {original_path}")
|
||||
sys.exit(1)
|
||||
|
||||
if not corrected_path.exists():
|
||||
print(f"❌ Error: Corrected file not found: {corrected_path}")
|
||||
sys.exit(1)
|
||||
|
||||
# Read files
|
||||
try:
|
||||
with open(original_path, 'r', encoding='utf-8') as f:
|
||||
original_lines = f.readlines()
|
||||
|
||||
with open(corrected_path, 'r', encoding='utf-8') as f:
|
||||
corrected_lines = f.readlines()
|
||||
except Exception as e:
|
||||
print(f"❌ Error reading files: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Count changes
|
||||
total_changes = sum(1 for old, new in zip(original_lines, corrected_lines)
|
||||
if old.strip() != new.strip())
|
||||
|
||||
# Generate HTML
|
||||
html_content = generate_html_header(len(original_lines), total_changes)
|
||||
html_content += generate_diff_content(original_lines, corrected_lines)
|
||||
html_content += generate_html_footer()
|
||||
|
||||
# Write output
|
||||
try:
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(html_content)
|
||||
|
||||
print(f"✅ 词语级 diff HTML 已生成: {output_path}")
|
||||
print(f"📊 共修改了 {total_changes} 行,占总行数的 {total_changes/len(original_lines)*100:.1f}%")
|
||||
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f"❌ Error writing output file: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user