Release v1.8.0: Add transcript-fixer skill
## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
18
transcript-fixer/scripts/utils/diff_formats/__init__.py
Normal file
18
transcript-fixer/scripts/utils/diff_formats/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
Diff format generators for transcript comparison
|
||||
"""
|
||||
|
||||
from .unified_format import generate_unified_diff
|
||||
from .html_format import generate_html_diff
|
||||
from .inline_format import generate_inline_diff
|
||||
from .markdown_format import generate_markdown_report
|
||||
from .change_extractor import extract_changes, generate_change_summary
|
||||
|
||||
__all__ = [
|
||||
'generate_unified_diff',
|
||||
'generate_html_diff',
|
||||
'generate_inline_diff',
|
||||
'generate_markdown_report',
|
||||
'extract_changes',
|
||||
'generate_change_summary',
|
||||
]
|
||||
102
transcript-fixer/scripts/utils/diff_formats/change_extractor.py
Normal file
102
transcript-fixer/scripts/utils/diff_formats/change_extractor.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Change extraction and summarization
|
||||
|
||||
SINGLE RESPONSIBILITY: Extract and summarize changes between text versions
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def extract_changes(original: str, fixed: str) -> list[dict]:
    """
    Collect every non-equal diff region between two texts.

    Both texts are tokenized with ``split_into_words`` and compared with
    ``difflib.SequenceMatcher``; each ``replace``, ``delete`` and ``insert``
    opcode becomes one record.

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        List of dicts with the keys ``type``, ``original``, ``fixed``,
        ``context_before`` and ``context_after``. Context is at most five
        tokens on each side, taken from the fixed text for insertions and
        from the original text otherwise.
    """
    src_tokens = split_into_words(original)
    dst_tokens = split_into_words(fixed)
    matcher = difflib.SequenceMatcher(None, src_tokens, dst_tokens)

    records = []
    for op, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if op not in ('replace', 'delete', 'insert'):
            continue

        # Insertions have no span in the original, so their context is read
        # from the fixed token stream instead.
        if op == 'insert':
            tokens, lo, hi = dst_tokens, b_lo, b_hi
        else:
            tokens, lo, hi = src_tokens, a_lo, a_hi

        records.append({
            'type': op,
            # The original-side range is empty for 'insert' and the
            # fixed-side range is empty for 'delete', so these joins yield ''.
            'original': ''.join(src_tokens[a_lo:a_hi]),
            'fixed': ''.join(dst_tokens[b_lo:b_hi]),
            # max() keeps the start from going negative; the end of a slice
            # is clamped to the list length automatically.
            'context_before': ''.join(tokens[max(0, lo - 5):lo]),
            'context_after': ''.join(tokens[hi:hi + 5]),
        })

    return records
|
||||
|
||||
|
||||
def generate_change_summary(changes: list[dict]) -> str:
    """
    Render a list of change records as a plain-text summary.

    Args:
        changes: List of change dictionaries. Each needs a ``type`` key;
            ``original``, ``fixed``, ``context_before`` and
            ``context_after`` are read when present and treated as empty
            otherwise.

    Returns:
        Formatted summary string: a three-line banner with the change count,
        a blank line, then one numbered entry per change, each followed by a
        blank line.
    """
    type_labels = {
        'replace': '替换',
        'delete': '删除',
        'insert': '添加',
    }

    rule = "=" * 80
    lines = [rule, f"修改摘要 (共 {len(changes)} 处修改)", rule, ""]

    for index, change in enumerate(changes, 1):
        kind = change['type']
        # Unknown types are shown verbatim instead of raising KeyError.
        lines.append(f"[{index}] {type_labels.get(kind, kind)}")

        original_text = change.get('original', '')
        fixed_text = change.get('fixed', '')
        if original_text:
            lines.append(f" 原文: {original_text}")
        if fixed_text:
            lines.append(f" 修复: {fixed_text}")

        # Test the surrounding text itself: once the marker literal is
        # concatenated in, the combined string can never be blank.
        before = change.get('context_before', '')
        after = change.get('context_after', '')
        if (before + after).strip():
            lines.append(f" 上下文: ...{before}【修改处】{after}...")

        lines.append("")

    return '\n'.join(lines)
|
||||
37
transcript-fixer/scripts/utils/diff_formats/html_format.py
Normal file
37
transcript-fixer/scripts/utils/diff_formats/html_format.py
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate HTML side-by-side comparison
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
|
||||
def generate_html_diff(
    original: str,
    fixed: str,
    original_label: str = "原始版本",
    fixed_label: str = "修复版本"
) -> str:
    """
    Generate HTML format comparison report (side-by-side)

    The texts are compared line by line with ``difflib.HtmlDiff`` (wrapping
    at 80 columns) and only changed regions plus three lines of context are
    included.

    Args:
        original: Original text
        fixed: Fixed text
        original_label: Column header for the original version
        fixed_label: Column header for the fixed version

    Returns:
        Complete HTML document string with the side-by-side comparison
    """
    # Imported locally so the module's import block is left untouched.
    from html import escape

    differ = difflib.HtmlDiff(wrapcolumn=80)
    return differ.make_file(
        original.splitlines(keepends=True),
        fixed.splitlines(keepends=True),
        # HtmlDiff writes the descriptions into the table header verbatim,
        # so escape them here to keep caller-supplied labels from injecting
        # markup.
        fromdesc=escape(original_label),
        todesc=escape(fixed_label),
        context=True,
        numlines=3,
    )
|
||||
65
transcript-fixer/scripts/utils/diff_formats/inline_format.py
Normal file
65
transcript-fixer/scripts/utils/diff_formats/inline_format.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Inline diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate inline diff with change markers
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def generate_inline_diff(original: str, fixed: str) -> str:
    """
    Produce a token-level inline diff with bracketed change markers.

    Format:
    - Unchanged tokens are emitted as-is
    - Deletions: [-word-]
    - Additions: [+word+]

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        Inline diff string: a three-line banner, a blank line, then the
        marked-up tokens, with a line break inserted once the current output
        line exceeds 80 characters.
    """
    # Prefix/suffix placed around a token, keyed by the ndiff line marker.
    # '?' hint lines have no entry and are therefore skipped.
    wrappers = {
        ' ': ('', ''),
        '-': ('[-', '-]'),
        '+': ('[+', '+]'),
    }

    rule = "=" * 80
    output = [rule, "行内词语级别对比 (- 删除, + 添加, ? 修改标记)", rule, ""]

    pending = []
    pending_width = 0
    token_diff = difflib.ndiff(split_into_words(original), split_into_words(fixed))
    for entry in token_diff:
        wrapper = wrappers.get(entry[0])
        if wrapper is None:
            continue

        prefix, suffix = wrapper
        piece = f"{prefix}{entry[2:]}{suffix}"
        pending.append(piece)
        pending_width += len(piece)

        # Flush once the accumulated line has grown past 80 characters.
        if pending_width > 80:
            output.append(''.join(pending))
            pending = []
            pending_width = 0

    if pending:
        output.append(''.join(pending))

    return '\n'.join(output)
|
||||
104
transcript-fixer/scripts/utils/diff_formats/markdown_format.py
Normal file
104
transcript-fixer/scripts/utils/diff_formats/markdown_format.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Markdown report generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate detailed Markdown comparison report
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .change_extractor import extract_changes, generate_change_summary
|
||||
|
||||
|
||||
def generate_markdown_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    original: str,
    stage1: str,
    stage2: str
) -> str:
    """
    Build the Markdown comparison report for a two-stage correction run.

    Args:
        original_file: Original file path (only its base name is shown)
        stage1_file: Stage 1 file path (only its base name is shown)
        stage2_file: Stage 2 file path (only its base name is shown)
        original: Original text content
        stage1: Stage 1 text content
        stage2: Stage 2 text content

    Returns:
        Markdown report string containing file names, generation time, a
        per-stage change-count table, and the change summaries for
        original→stage1, stage1→stage2 and original→stage2.
    """
    original_name = Path(original_file).name
    stage1_name = Path(stage1_file).name
    stage2_name = Path(stage2_file).name

    # Diff each transition, then the end-to-end result.
    transitions = ((original, stage1), (stage1, stage2), (original, stage2))
    diffs = [extract_changes(before, after) for before, after in transitions]
    count_stage1, count_stage2, count_total = (len(diff) for diff in diffs)
    summary_stage1, summary_stage2, summary_total = (
        generate_change_summary(diff) for diff in diffs
    )

    generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    return f"""# 会议记录修复对比报告

## 文件信息

- **原始文件**: {original_name}
- **阶段1修复**: {stage1_name}
- **阶段2修复**: {stage2_name}
- **生成时间**: {generated_at}

## 修改统计

| 阶段 | 修改数量 | 说明 |
|------|---------|------|
| 阶段1: 词典修复 | {count_stage1} | 基于预定义词典的批量替换 |
| 阶段2: AI修复 | {count_stage2} | GLM-4.6智能纠错 |
| **总计** | **{count_total}** | **原始→最终版本** |

---

# 阶段1: 词典修复详情

{summary_stage1}

---

# 阶段2: AI智能修复详情

{summary_stage2}

---

# 总体修改详情 (原始→最终)

{summary_total}

---

## 使用说明

1. **查看修改**: 每处修改都包含上下文,便于理解修改原因
2. **人工审核**: 重点审核标记为"替换"的修改
3. **专业术语**: 特别注意公司名、人名、技术术语的修改

## 建议审核重点

- [ ] 专业术语(具身智能、机器人等)
- [ ] 人名和公司名
- [ ] 数字(金额、时间等)
- [ ] 上下文是否通顺
"""
|
||||
33
transcript-fixer/scripts/utils/diff_formats/text_splitter.py
Normal file
33
transcript-fixer/scripts/utils/diff_formats/text_splitter.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Text splitter utility for word-level diff generation
|
||||
|
||||
SINGLE RESPONSIBILITY: Split text into words while preserving structure
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def split_into_words(text: str) -> list[str]:
    """
    Tokenize text for word-level diffing without dropping any character.

    Every character of the input lands in exactly one token, so joining the
    tokens reproduces the input.

    Args:
        text: Input text to split

    Returns:
        List of tokens: runs of CJK characters in U+4E00–U+9FFF, runs of
        ASCII letters, runs of ASCII digits, and every other character
        (whitespace, punctuation, ...) as its own single-character token
    """
    # Alternatives in order: CJK run | ASCII letter run | digit run | any
    # single character outside those three classes.
    token_pattern = re.compile(r'[\u4e00-\u9fff]+|[a-zA-Z]+|[0-9]+|[^\u4e00-\u9fffa-zA-Z0-9]')
    return [match.group(0) for match in token_pattern.finditer(text)]
|
||||
|
||||
|
||||
def read_file(file_path: str) -> str:
    """
    Read a text file and return its contents as a string.

    Args:
        file_path: Path of the file to read

    Returns:
        The decoded file contents, without a leading byte-order mark
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged but drops a leading BOM, so
    # a file saved with one does not add a stray U+FEFF token to the diff.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()
|
||||
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unified diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate unified diff format output
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def generate_unified_diff(
    original: str,
    fixed: str,
    original_label: str = "原始版本",
    fixed_label: str = "修复版本"
) -> str:
    """
    Produce a unified diff computed over word tokens rather than lines.

    Args:
        original: Original text
        fixed: Fixed text
        original_label: Label for original version
        fixed_label: Label for fixed version

    Returns:
        Unified diff format string with one token per output line; header
        and hunk lines are emitted without a trailing line terminator and
        everything is joined with newlines
    """
    diff_lines = difflib.unified_diff(
        split_into_words(original),
        split_into_words(fixed),
        fromfile=original_label,
        tofile=fixed_label,
        lineterm='',
    )
    return '\n'.join(diff_lines)
|
||||
Reference in New Issue
Block a user