Files
claude-code-skills-reference/transcript-fixer/scripts/generate_word_diff.py
2026-04-06 08:50:10 +08:00

320 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""
Generate Word-Level Diff HTML Comparison
Creates an HTML file showing word-by-word differences between original and corrected transcripts.
This helps users review corrections more easily than character-level or line-level diffs.
Usage:
python scripts/generate_word_diff.py <original_file> <corrected_file> [output_file]
Example:
python scripts/generate_word_diff.py original.md corrected.md comparison.html
"""
import difflib
import html
import re
import sys
from itertools import zip_longest
from pathlib import Path
def tokenize(text):
    r"""
    Break text into a flat list of tokens for word-level diffing.

    At each position the first matching alternative wins:
        - a run of CJK ideographs in the range \u4e00-\u9fff
        - a run of ASCII letters and digits
        - one single character that is neither of the above nor whitespace
          (punctuation, symbols, other scripts)
        - a run of whitespace

    Every character of the input belongs to exactly one of these classes, so
    joining the returned tokens reproduces the input unchanged.

    Args:
        text: String to split

    Returns:
        List of token strings, in input order
    """
    token_re = re.compile(r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+|[^\u4e00-\u9fffa-zA-Z0-9\s]|\s+')
    return [match.group(0) for match in token_re.finditer(text)]
def get_word_diff(old, new):
    """
    Generate word-level diff with HTML highlighting.

    Both lines are split with tokenize() and the two token lists are aligned
    with difflib.SequenceMatcher.  Unchanged tokens are emitted as escaped
    text; deleted, inserted and replaced runs are wrapped in <del>/<ins>
    markup.  All text taken from the inputs is passed through html.escape()
    before being placed in element content or title attributes.

    Args:
        old: Original text line
        new: Corrected text line

    Returns:
        HTML string with word-level diff highlighting
    """
    old_tokens = tokenize(old)
    new_tokens = tokenize(new)

    # autojunk=False: by default SequenceMatcher, once a sequence has 200+
    # items, stops using tokens that make up more than ~1% of it (spaces,
    # commas, common words) as match anchors.  On long transcript lines that
    # can turn a small edit into one giant "replace" block.
    matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens, autojunk=False)

    result = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_part = html.escape(''.join(old_tokens[i1:i2]))
        new_part = html.escape(''.join(new_tokens[j1:j2]))
        if tag == 'equal':
            result.append(old_part)
        elif tag == 'delete':
            result.append(f'<del class="word-del" title="删除: {old_part}">{old_part}</del>')
        elif tag == 'insert':
            result.append(f'<ins class="word-ins" title="添加: {new_part}">{new_part}</ins>')
        elif tag == 'replace':
            result.append(f'<span class="word-change"><del class="word-del" title="原文">{old_part}</del> → <ins class="word-ins" title="修正后">{new_part}</ins></span>')
    return ''.join(result)
def generate_html_header(total_lines, total_changes):
    """
    Build the opening part of the report page.

    The result contains the document head (including the inline stylesheet),
    the page title, a statistics summary, the legend, and the opening tag of
    the diff container that the diff rows are appended to.

    Args:
        total_lines: Number of lines reported as the total
        total_changes: Number of lines reported as modified

    Returns:
        HTML string ending right after the opening diff-container <div>
    """
    # Percentage of modified lines; a zero (or negative) total yields 0
    if total_lines > 0:
        change_rate = total_changes / total_lines * 100
    else:
        change_rate = 0

    # Static document head: nothing is interpolated here, so it is a plain
    # string and the CSS braces need no escaping.
    static_head = '''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>录音转写修正对比（词语级别）</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', sans-serif;
font-size: 16px;
line-height: 2;
max-width: 1400px;
margin: 20px auto;
padding: 20px;
background: #f8f9fa;
}
h1 {
color: #1a1a1a;
border-bottom: 3px solid #007aff;
padding-bottom: 15px;
margin-bottom: 25px;
}
.summary {
background: white;
padding: 20px;
border-radius: 8px;
margin-bottom: 25px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.summary h2 {
margin: 0 0 15px 0;
color: #007aff;
font-size: 18px;
}
.stat {
display: inline-block;
margin-right: 25px;
padding: 8px 15px;
background: #f0f0f0;
border-radius: 5px;
font-weight: 500;
}
.legend {
background: #fff9e6;
padding: 15px;
border-radius: 8px;
margin-bottom: 20px;
border-left: 4px solid #ffc107;
}
.legend-item {
display: inline-block;
margin-right: 20px;
margin-bottom: 5px;
}
.diff-container {
background: white;
padding: 25px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.diff-line {
padding: 12px 15px;
margin: 10px 0;
border-left: 4px solid transparent;
border-radius: 4px;
background: #fafafa;
}
.diff-line.changed {
background: #fff;
border-left-color: #ff9800;
}
.diff-line.unchanged {
opacity: 0.5;
font-size: 14px;
}
del.word-del {
background-color: #ffcdd2;
color: #c62828;
text-decoration: line-through;
padding: 3px 6px;
border-radius: 4px;
font-weight: 700;
margin: 0 2px;
}
ins.word-ins {
background-color: #c8e6c9;
color: #2e7d32;
text-decoration: none;
padding: 3px 6px;
border-radius: 4px;
font-weight: 700;
margin: 0 2px;
}
.word-change {
display: inline-block;
background: #fff3e0;
padding: 4px 8px;
border-radius: 4px;
margin: 0 2px;
}
.line-number {
display: inline-block;
min-width: 60px;
color: #999;
font-size: 13px;
margin-right: 15px;
user-select: none;
font-family: 'SF Mono', 'Monaco', monospace;
}
</style>
</head>
<body>
'''

    # Dynamic part: title, statistics, legend and the opening container tag
    summary_section = f'''<h1>🎙️ 录音转写修正对比（词语级别）</h1>
<div class="summary">
<h2>📊 修正统计</h2>
<span class="stat">总行数: <strong>{total_lines}</strong></span>
<span class="stat" style="color: #ff9800;">修改行数: <strong>{total_changes}</strong></span>
<span class="stat" style="color: #4caf50;">修改率: <strong>{change_rate:.1f}%</strong></span>
</div>
<div class="legend">
<strong>📖 图例说明：</strong><br>
<span class="legend-item"><del class="word-del">删除的词</del> 原文中的错误</span>
<span class="legend-item"><ins class="word-ins">添加的词</ins> 修正后的内容</span>
<span class="legend-item"><span class="word-change"><del class="word-del">错误</del> → <ins class="word-ins">正确</ins></span> 词语替换</span>
</div>
<div class="diff-container">
'''
    return static_head + summary_section
def generate_diff_content(original_lines, corrected_lines, context_lines=1):
    """
    Generate diff content HTML showing changed lines with context.

    Lines are paired by position (line N of the original against line N of
    the corrected text) and compared after stripping surrounding whitespace.
    When one input is longer than the other, its extra lines are paired with
    an empty string so they are reported instead of being silently dropped.

    Each changed line is rendered with get_word_diff().  Unchanged lines
    within context_lines of a change, before or after it, are rendered as
    "unchanged" rows.  A "⋯ ⋯ ⋯" separator is inserted wherever two
    consecutively rendered rows are not adjacent in the file.

    Args:
        original_lines: Iterable of original text lines
        corrected_lines: Iterable of corrected text lines
        context_lines: Number of unchanged lines to show on each side of a
            change; values below 0 are treated as 0

    Returns:
        HTML string with diff content (empty string when no line changed)
    """
    separator = '<div style="text-align: center; color: #999; margin: 20px 0; font-size: 18px;">⋯ ⋯ ⋯</div>\n'
    radius = max(context_lines, 0)

    # Pad the shorter input with '' so trailing lines are not lost
    rows = [
        (old_line.rstrip('\n'), new_line.rstrip('\n'))
        for old_line, new_line in zip_longest(original_lines, corrected_lines, fillvalue='')
    ]
    changed = [old_line.strip() != new_line.strip() for old_line, new_line in rows]

    # 1-based numbers of every row to render: each change plus its context
    # window.  Numbers outside the file are harmless; they never match a row.
    visible = set()
    for line_no, is_changed in enumerate(changed, 1):
        if is_changed:
            visible.update(range(line_no - radius, line_no + radius + 1))

    html_parts = []
    previous_shown = None
    for line_no, (old_line, new_line) in enumerate(rows, 1):
        if line_no not in visible:
            continue
        # Mark the skipped stretch between two rendered groups
        if previous_shown is not None and line_no - previous_shown > 1:
            html_parts.append(separator)
        if changed[line_no - 1]:
            diff_html = get_word_diff(old_line, new_line)
            html_parts.append(f'<div class="diff-line changed"><span class="line-number">第 {line_no} 行</span>{diff_html}</div>\n')
        else:
            escaped = html.escape(old_line)
            html_parts.append(f'<div class="diff-line unchanged"><span class="line-number">第 {line_no} 行</span>{escaped}</div>\n')
        previous_shown = line_no
    return ''.join(html_parts)
def generate_html_footer():
    """
    Return the closing markup of the report page.

    Closes, in order, the diff container <div>, <body> and <html>.  The
    string starts and ends with a newline.
    """
    closing_tags = ('</div>', '</body>', '</html>')
    return '\n' + '\n'.join(closing_tags) + '\n'
def main():
    """
    Main entry point for the script.

    Reads the original and corrected file paths from sys.argv, compares the
    two files line by line, and writes the HTML report to the optional third
    argument or, by default, next to the original file as
    "<stem>_对比_词语级.html".

    Returns:
        0 on success.  On a usage error, a missing input file, or a
        read/write failure, a message is printed and the process terminates
        via sys.exit(1).
    """
    if len(sys.argv) < 3:
        print("Usage: python generate_word_diff.py <original_file> <corrected_file> [output_file]")
        print("\nExample:")
        print(" python generate_word_diff.py original.md corrected.md comparison.html")
        sys.exit(1)

    original_path = Path(sys.argv[1])
    corrected_path = Path(sys.argv[2])

    # Determine output path
    if len(sys.argv) >= 4:
        output_path = Path(sys.argv[3])
    else:
        # Default: save next to original file with _对比_词语级.html suffix
        output_path = original_path.parent / f"{original_path.stem}_对比_词语级.html"

    # Validate input files
    if not original_path.exists():
        print(f"❌ Error: Original file not found: {original_path}")
        sys.exit(1)
    if not corrected_path.exists():
        print(f"❌ Error: Corrected file not found: {corrected_path}")
        sys.exit(1)

    # Read files
    try:
        with open(original_path, 'r', encoding='utf-8') as f:
            original_lines = f.readlines()
        with open(corrected_path, 'r', encoding='utf-8') as f:
            corrected_lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading files: {e}")
        sys.exit(1)

    # Lines are compared by position, so a differing line count means the
    # tail of the longer file has no counterpart.
    if len(original_lines) != len(corrected_lines):
        print(f"⚠️ Warning: line counts differ (original: {len(original_lines)}, "
              f"corrected: {len(corrected_lines)}); unmatched lines are counted as changes")

    # Count changes; pad the shorter file with '' so trailing lines present
    # in only one file are counted instead of being dropped by zip()
    total_lines = max(len(original_lines), len(corrected_lines))
    total_changes = sum(
        1
        for old, new in zip_longest(original_lines, corrected_lines, fillvalue='')
        if old.strip() != new.strip()
    )
    # Guard against empty input: no lines means a 0% change rate, not a crash
    change_rate = total_changes / total_lines * 100 if total_lines else 0.0

    # Generate HTML
    html_content = generate_html_header(total_lines, total_changes)
    html_content += generate_diff_content(original_lines, corrected_lines)
    html_content += generate_html_footer()

    # Write output; only the write itself is inside the try so that a later
    # failure cannot be misreported as a write error
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
    except OSError as e:
        print(f"❌ Error writing output file: {e}")
        sys.exit(1)

    print(f"✅ 词语级 diff HTML 已生成: {output_path}")
    print(f"📊 共修改了 {total_changes} 行，占总行数的 {change_rate:.1f}%")
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())