#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = []
# ///
"""
Generate Word-Level Diff HTML Comparison

Creates an HTML file showing word-by-word differences between original and
corrected transcripts. This helps users review corrections more easily than
character-level or line-level diffs.

Usage:
    python scripts/generate_word_diff.py <original_file> <corrected_file> [output_file]

Example:
    python scripts/generate_word_diff.py original.md corrected.md comparison.html
"""

import difflib
import html
import itertools
import re
import sys
from pathlib import Path

# Compiled once: tokenize() is called twice for every changed line.
_TOKEN_RE = re.compile(
    r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+|[^\u4e00-\u9fffa-zA-Z0-9\s]|\s+'
)

# Minimal stylesheet for the report. Kept outside the f-string template so the
# CSS braces do not need to be doubled.
# NOTE(review): class names and colours are a plain default; adjust to taste.
_STYLE = """\
body { font-family: -apple-system, "PingFang SC", "Microsoft YaHei", sans-serif;
       max-width: 960px; margin: 2em auto; padding: 0 1em; line-height: 1.7; }
.stats, .legend { background: #f6f8fa; border-radius: 6px; padding: 0.5em 1em;
                  margin-bottom: 1em; }
.line { white-space: pre-wrap; padding: 0.25em 0.5em; border-left: 3px solid transparent; }
.line.changed { border-left-color: #f0ad4e; background: #fffdf5; }
.line.context { color: #666; }
.line-no { display: inline-block; min-width: 5em; margin-right: 0.75em;
           color: #999; font-size: 0.85em; }
.separator { text-align: center; color: #bbb; margin: 0.5em 0; }
del { background: #ffd7d5; color: #82071e; }
ins { background: #ccffd8; color: #055d20; text-decoration: none; }
"""

_SEPARATOR_HTML = '<div class="separator">⋯ ⋯ ⋯</div>\n'


def tokenize(text: str) -> list[str]:
    r"""
    Split text into tokens (words) while preserving Chinese, English,
    numbers, and punctuation.

    Pattern explanation:
    - [\u4e00-\u9fff]+              : Chinese characters (one or more)
    - [a-zA-Z0-9]+                  : English words and numbers (one or more)
    - [^\u4e00-\u9fffa-zA-Z0-9\s]   : Single punctuation/special chars
    - \s+                           : Whitespace sequences

    Every character of the input belongs to exactly one token, so
    ``''.join(tokenize(text)) == text``.

    Returns:
        List of token strings
    """
    return _TOKEN_RE.findall(text)


def get_word_diff(old: str, new: str) -> str:
    """
    Generate word-level diff with HTML highlighting.

    Deleted tokens are wrapped in ``<del>``, inserted tokens in ``<ins>``,
    and a replacement is rendered as ``<del>old</del> → <ins>new</ins>``.
    All text is HTML-escaped before being wrapped.

    Args:
        old: Original text line
        new: Corrected text line

    Returns:
        HTML string with word-level diff highlighting
    """
    old_tokens = tokenize(old)
    new_tokens = tokenize(new)
    matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens)

    result = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_part = html.escape(''.join(old_tokens[i1:i2]))
        new_part = html.escape(''.join(new_tokens[j1:j2]))
        if tag == 'equal':
            result.append(old_part)
        elif tag == 'delete':
            result.append(f'<del>{old_part}</del>')
        elif tag == 'insert':
            # Inserted text must be marked up; otherwise it is
            # indistinguishable from unchanged text in the report.
            result.append(f'<ins>{new_part}</ins>')
        elif tag == 'replace':
            result.append(f'<del>{old_part}</del> → <ins>{new_part}</ins>')
    return ''.join(result)


def _change_rate(total_changes: int, total_lines: int) -> float:
    """Return the percentage of changed lines, or 0 when there are no lines."""
    return total_changes / total_lines * 100 if total_lines > 0 else 0


def _is_changed(old_line: str, new_line: str) -> bool:
    """Return True when two lines differ, ignoring surrounding whitespace."""
    return old_line.strip() != new_line.strip()


def _pair_lines(original_lines, corrected_lines) -> list[tuple[str, str]]:
    """Pair lines by position, padding the shorter input with empty lines.

    Trailing newlines are removed. Padding (rather than truncating) means
    lines that exist in only one file still show up as changes.
    """
    return [
        (old.rstrip('\n'), new.rstrip('\n'))
        for old, new in itertools.zip_longest(
            original_lines, corrected_lines, fillvalue=''
        )
    ]


def generate_html_header(total_lines, total_changes):
    """Generate HTML header with statistics and styling.

    Opens the document and the diff container; the matching closing tags
    are produced by generate_html_footer().

    Args:
        total_lines: Number of compared lines
        total_changes: Number of lines that differ

    Returns:
        HTML string up to and including the opening diff container
    """
    change_rate = _change_rate(total_changes, total_lines)
    return f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>录音转写修正对比（词语级别）</title>
<style>
{_STYLE}</style>
</head>
<body>
<h1>🎙️ 录音转写修正对比（词语级别）</h1>
<div class="stats">
<h2>📊 修正统计</h2>
<p>总行数: {total_lines} 修改行数: {total_changes} 修改率: {change_rate:.1f}%</p>
</div>
<div class="legend">
<h3>📖 图例说明：</h3>
<p><del>删除的词</del> 原文中的错误 <ins>添加的词</ins> 修正后的内容 <del>错误</del> → <ins>正确</ins> 词语替换</p>
</div>
<div class="diff">
'''


def generate_diff_content(original_lines, corrected_lines, context_lines=1):
    """
    Generate diff content HTML showing changed lines with context.

    Lines are compared by position, ignoring surrounding whitespace. Each
    changed line is rendered with a word-level diff; up to ``context_lines``
    unchanged lines are shown both before and after it. A separator is
    emitted wherever the displayed line numbers are not contiguous. If the
    inputs differ in length, the surplus lines are compared against empty
    lines instead of being dropped.

    Args:
        original_lines: List of original text lines
        corrected_lines: List of corrected text lines
        context_lines: Number of context lines to show around changes

    Returns:
        HTML string with diff content (empty if nothing changed)
    """
    pairs = _pair_lines(original_lines, corrected_lines)
    changed = {
        line_no
        for line_no, (old_line, new_line) in enumerate(pairs, 1)
        if _is_changed(old_line, new_line)
    }
    if not changed:
        return ''

    # Every line within `context_lines` of a change is displayed.
    radius = max(context_lines, 0)
    visible = set()
    for line_no in changed:
        first = max(1, line_no - radius)
        last = min(len(pairs), line_no + radius)
        visible.update(range(first, last + 1))

    html_parts = []
    previous = None
    for line_no in sorted(visible):
        # A jump in line numbers means hidden lines lie in between.
        if previous is not None and line_no - previous > 1:
            html_parts.append(_SEPARATOR_HTML)
        previous = line_no

        old_line, new_line = pairs[line_no - 1]
        label = f'<span class="line-no">第 {line_no} 行</span>'
        if line_no in changed:
            body = get_word_diff(old_line, new_line)
            html_parts.append(f'<div class="line changed">{label}{body}</div>\n')
        else:
            body = html.escape(old_line)
            html_parts.append(f'<div class="line context">{label}{body}</div>\n')
    return ''.join(html_parts)


def generate_html_footer():
    """Generate HTML footer closing the elements opened by the header."""
    return '''</div>
</body>
</html>
'''


def main():
    """Main entry point for the script.

    Reads the two input files named on the command line, writes the HTML
    comparison, and prints a summary.

    Returns:
        0 on success. Exits with status 1 on bad usage, missing input
        files, or read/write errors.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: python generate_word_diff.py <original_file> <corrected_file> [output_file]",
            file=sys.stderr,
        )
        print("\nExample:", file=sys.stderr)
        print(
            " python generate_word_diff.py original.md corrected.md comparison.html",
            file=sys.stderr,
        )
        sys.exit(1)

    original_path = Path(sys.argv[1])
    corrected_path = Path(sys.argv[2])

    # Determine output path
    if len(sys.argv) >= 4:
        output_path = Path(sys.argv[3])
    else:
        # Default: save next to original file with _对比_词语级.html suffix
        output_path = original_path.parent / f"{original_path.stem}_对比_词语级.html"

    # Validate input files
    if not original_path.exists():
        print(f"❌ Error: Original file not found: {original_path}", file=sys.stderr)
        sys.exit(1)
    if not corrected_path.exists():
        print(f"❌ Error: Corrected file not found: {corrected_path}", file=sys.stderr)
        sys.exit(1)

    # Read files
    try:
        with open(original_path, 'r', encoding='utf-8') as f:
            original_lines = f.readlines()
        with open(corrected_path, 'r', encoding='utf-8') as f:
            corrected_lines = f.readlines()
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading files: {e}", file=sys.stderr)
        sys.exit(1)

    # Count changes over the same padded pairing the diff body uses, so the
    # statistics always agree with what the report displays.
    pairs = _pair_lines(original_lines, corrected_lines)
    total_lines = len(pairs)
    total_changes = sum(1 for old, new in pairs if _is_changed(old, new))

    # Generate HTML
    html_content = ''.join([
        generate_html_header(total_lines, total_changes),
        generate_diff_content(original_lines, corrected_lines),
        generate_html_footer(),
    ])

    # Write output
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
    except (OSError, UnicodeError) as e:
        print(f"❌ Error writing output file: {e}", file=sys.stderr)
        sys.exit(1)

    # Summary is printed outside the try block so that a formatting problem
    # can never be misreported as a write failure; the rate helper also
    # guards against empty input.
    rate = _change_rate(total_changes, total_lines)
    print(f"✅ 词语级 diff HTML 已生成: {output_path}")
    print(f"📊 共修改了 {total_changes} 行，占总行数的 {rate:.1f}%")
    return 0


if __name__ == "__main__":
    sys.exit(main())