Release v1.8.0: Add transcript-fixer skill

## New Skill: transcript-fixer v1.0.0

Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning.

**Features:**
- Two-stage correction pipeline (dictionary + AI)
- Automatic pattern detection and learning
- Domain-specific dictionaries (general, embodied_ai, finance, medical)
- SQLite-based correction repository
- Team collaboration with import/export
- GLM API integration for AI corrections
- Cost optimization through dictionary promotion

**Use cases:**
- Correcting meeting notes, lecture recordings, or interview transcripts
- Fixing Chinese/English homophone errors and technical terminology
- Building domain-specific correction dictionaries
- Improving transcript accuracy through iterative learning

**Documentation:**
- Complete workflow guides in references/
- SQL query templates
- Troubleshooting guide
- Team collaboration patterns
- API setup instructions

**Marketplace updates:**
- Updated marketplace to v1.8.0
- Added transcript-fixer plugin (category: productivity)
- Updated README.md with skill description and use cases
- Updated CLAUDE.md with skill listing and counts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
daymade
2025-10-28 13:16:37 +08:00
parent d1041ac203
commit bd0aa12004
44 changed files with 7432 additions and 8 deletions

View File

@@ -0,0 +1,16 @@
"""
Utils Module - Utility Functions and Tools
This module contains utility functions:
- diff_generator: Multi-format diff report generation
- validation: Configuration validation
"""
from .diff_generator import generate_full_report
from .validation import validate_configuration, print_validation_summary
__all__ = [
'generate_full_report',
'validate_configuration',
'print_validation_summary',
]

View File

@@ -0,0 +1,18 @@
"""
Diff format generators for transcript comparison
"""
from .unified_format import generate_unified_diff
from .html_format import generate_html_diff
from .inline_format import generate_inline_diff
from .markdown_format import generate_markdown_report
from .change_extractor import extract_changes, generate_change_summary
__all__ = [
'generate_unified_diff',
'generate_html_diff',
'generate_inline_diff',
'generate_markdown_report',
'extract_changes',
'generate_change_summary',
]

View File

@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""
Change extraction and summarization
SINGLE RESPONSIBILITY: Extract and summarize changes between text versions
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def extract_changes(original: str, fixed: str) -> list[dict]:
    """
    Extract all changes and return change list

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        List of change dictionaries with type, context, and content
    """
    src_tokens = split_into_words(original)
    dst_tokens = split_into_words(fixed)
    matcher = difflib.SequenceMatcher(None, src_tokens, dst_tokens)

    def _context(tokens: list[str], start: int, end: int) -> tuple[str, str]:
        # Up to five tokens of surrounding text on each side of the edit.
        before = ''.join(tokens[max(0, start - 5):start])
        after = ''.join(tokens[end:min(len(tokens), end + 5)])
        return before, after

    changes: list[dict] = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            before, after = _context(src_tokens, i1, i2)
            changes.append({
                'type': 'replace',
                'original': ''.join(src_tokens[i1:i2]),
                'fixed': ''.join(dst_tokens[j1:j2]),
                'context_before': before,
                'context_after': after,
            })
        elif tag == 'delete':
            before, after = _context(src_tokens, i1, i2)
            changes.append({
                'type': 'delete',
                'original': ''.join(src_tokens[i1:i2]),
                'fixed': '',
                'context_before': before,
                'context_after': after,
            })
        elif tag == 'insert':
            # Context for insertions comes from the fixed-side token stream.
            before, after = _context(dst_tokens, j1, j2)
            changes.append({
                'type': 'insert',
                'original': '',
                'fixed': ''.join(dst_tokens[j1:j2]),
                'context_before': before if j1 > 0 else '',
                'context_after': after,
            })
    return changes
def generate_change_summary(changes: list[dict]) -> str:
"""
Generate change summary
Args:
changes: List of change dictionaries
Returns:
Formatted summary string
"""
result = []
result.append("=" * 80)
result.append(f"修改摘要 (共 {len(changes)} 处修改)")
result.append("=" * 80)
result.append("")
for i, change in enumerate(changes, 1):
change_type = {
'replace': '替换',
'delete': '删除',
'insert': '添加'
}[change['type']]
result.append(f"[{i}] {change_type}")
if change['original']:
result.append(f" 原文: {change['original']}")
if change['fixed']:
result.append(f" 修复: {change['fixed']}")
# Show context
context = change['context_before'] + "【修改处】" + change['context_after']
if context.strip():
result.append(f" 上下文: ...{context}...")
result.append("")
return '\n'.join(result)

View File

@@ -0,0 +1,37 @@
#!/usr/bin/env python3
"""
HTML diff format generator
SINGLE RESPONSIBILITY: Generate HTML side-by-side comparison
"""
from __future__ import annotations
import difflib
def generate_html_diff(original: str, fixed: str) -> str:
"""
Generate HTML format comparison report (side-by-side)
Args:
original: Original text
fixed: Fixed text
Returns:
HTML format string with side-by-side comparison
"""
original_lines = original.splitlines(keepends=True)
fixed_lines = fixed.splitlines(keepends=True)
differ = difflib.HtmlDiff(wrapcolumn=80)
html = differ.make_file(
original_lines,
fixed_lines,
fromdesc='原始版本',
todesc='修复版本',
context=True,
numlines=3
)
return html

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Inline diff format generator
SINGLE RESPONSIBILITY: Generate inline diff with change markers
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def generate_inline_diff(original: str, fixed: str) -> str:
    """
    Generate inline diff marking deletions and additions

    Format:
    - Normal words: unchanged
    - Deletions: [-word-]
    - Additions: [+word+]

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        Inline diff string with markers
    """
    divider = "=" * 80
    output = [divider, "行内词语级别对比 (- 删除, + 添加, ? 修改标记)", divider, ""]
    buffer: list[str] = []
    for entry in difflib.ndiff(split_into_words(original), split_into_words(fixed)):
        code, token = entry[0], entry[2:]
        if code == '?':
            # ndiff '?' hint lines carry no content of their own; drop them.
            continue
        if code == ' ':
            buffer.append(token)
        elif code == '-':
            buffer.append(f"[-{token}-]")
        elif code == '+':
            buffer.append(f"[+{token}+]")
        # Flush the accumulated line once it exceeds 80 characters.
        if len(''.join(buffer)) > 80:
            output.append(''.join(buffer))
            buffer = []
    if buffer:
        output.append(''.join(buffer))
    return '\n'.join(output)

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Markdown report generator
SINGLE RESPONSIBILITY: Generate detailed Markdown comparison report
"""
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from .change_extractor import extract_changes, generate_change_summary
def generate_markdown_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    original: str,
    stage1: str,
    stage2: str
) -> str:
    """
    Generate comprehensive Markdown comparison report

    Builds per-stage change lists via extract_changes() and renders them,
    with summary statistics, into a fixed Markdown template.

    Args:
        original_file: Original file path
        stage1_file: Stage 1 file path
        stage2_file: Stage 2 file path
        original: Original text content
        stage1: Stage 1 text content
        stage2: Stage 2 text content

    Returns:
        Formatted Markdown report string
    """
    # Paths are used only for their base file names in the report header.
    original_path = Path(original_file)
    stage1_path = Path(stage1_file)
    stage2_path = Path(stage2_file)
    # Extract changes for each stage: original->stage1 (dictionary),
    # stage1->stage2 (AI), and original->stage2 (overall).
    changes_stage1 = extract_changes(original, stage1)
    changes_stage2 = extract_changes(stage1, stage2)
    changes_total = extract_changes(original, stage2)
    # Generate human-readable summaries for each change list.
    summary_stage1 = generate_change_summary(changes_stage1)
    summary_stage2 = generate_change_summary(changes_stage2)
    summary_total = generate_change_summary(changes_total)
    # Build report (template text is user-facing Chinese; left verbatim).
    report = f"""# 会议记录修复对比报告
## 文件信息
- **原始文件**: {original_path.name}
- **阶段1修复**: {stage1_path.name}
- **阶段2修复**: {stage2_path.name}
- **生成时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## 修改统计
| 阶段 | 修改数量 | 说明 |
|------|---------|------|
| 阶段1: 词典修复 | {len(changes_stage1)} | 基于预定义词典的批量替换 |
| 阶段2: AI修复 | {len(changes_stage2)} | GLM-4.6智能纠错 |
| **总计** | **{len(changes_total)}** | **原始→最终版本** |
---
# 阶段1: 词典修复详情
{summary_stage1}
---
# 阶段2: AI智能修复详情
{summary_stage2}
---
# 总体修改详情 (原始→最终)
{summary_total}
---
## 使用说明
1. **查看修改**: 每处修改都包含上下文,便于理解修改原因
2. **人工审核**: 重点审核标记为"替换"的修改
3. **专业术语**: 特别注意公司名、人名、技术术语的修改
## 建议审核重点
- [ ] 专业术语(具身智能、机器人等)
- [ ] 人名和公司名
- [ ] 数字(金额、时间等)
- [ ] 上下文是否通顺
"""
    return report

View File

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""
Text splitter utility for word-level diff generation
SINGLE RESPONSIBILITY: Split text into words while preserving structure
"""
from __future__ import annotations
import re
def split_into_words(text: str) -> list[str]:
    """
    Split text into words, preserving whitespace and punctuation

    This enables word-level diff generation for Chinese and English text

    Args:
        text: Input text to split

    Returns:
        List of word tokens (Chinese words, English words, numbers, punctuation)
    """
    # Runs of CJK ideographs, runs of ASCII letters, runs of digits, and
    # every remaining character (punctuation, whitespace) as its own token.
    token_pattern = re.compile(
        r'[\u4e00-\u9fff]+|[a-zA-Z]+|[0-9]+|[^\u4e00-\u9fffa-zA-Z0-9]'
    )
    return token_pattern.findall(text)
def read_file(file_path: str) -> str:
    """Return the entire content of *file_path* decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
Unified diff format generator
SINGLE RESPONSIBILITY: Generate unified diff format output
"""
from __future__ import annotations
import difflib
from .text_splitter import split_into_words
def generate_unified_diff(
    original: str,
    fixed: str,
    original_label: str = "原始版本",
    fixed_label: str = "修复版本"
) -> str:
    """
    Generate unified format diff report

    Args:
        original: Original text
        fixed: Fixed text
        original_label: Label for original version
        fixed_label: Label for fixed version

    Returns:
        Unified diff format string
    """
    # Diff at word-token granularity rather than whole lines for finer output.
    diff_lines = difflib.unified_diff(
        split_into_words(original),
        split_into_words(fixed),
        fromfile=original_label,
        tofile=fixed_label,
        lineterm=''
    )
    return '\n'.join(diff_lines)

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Generate word-level correction comparison reports
Orchestrates multiple diff formats for visualization
SINGLE RESPONSIBILITY: Coordinate diff generation workflow
"""
from __future__ import annotations
import sys
from pathlib import Path
from .diff_formats import (
generate_unified_diff,
generate_html_diff,
generate_inline_diff,
generate_markdown_report,
)
from .diff_formats.text_splitter import read_file
def generate_full_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    output_dir: str | None = None
) -> None:
    """
    Generate comprehensive comparison report

    Creates 4 output files:
    1. Markdown format detailed report
    2. Unified diff format
    3. HTML side-by-side comparison
    4. Inline marked comparison

    Args:
        original_file: Path to original transcript
        stage1_file: Path to stage 1 (dictionary) corrected version
        stage2_file: Path to stage 2 (AI) corrected version
        output_dir: Optional output directory (defaults to original file location)
    """
    original_path = Path(original_file)
    # Determine output directory; create it only when explicitly requested.
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = original_path.parent
    # Output file names derive from the original file's stem.
    base_name = original_path.stem
    # Read files
    print("📖 读取文件...")
    original = read_file(original_file)
    stage1 = read_file(stage1_file)
    stage2 = read_file(stage2_file)
    # Generate reports
    print("📝 生成对比报告...")
    # 1. Markdown report (the only format that compares all three stages)
    print(" 生成Markdown报告...")
    md_report = generate_markdown_report(
        original_file, stage1_file, stage2_file,
        original, stage1, stage2
    )
    md_file = output_path / f"{base_name}_对比报告.md"
    md_file.write_text(md_report, encoding='utf-8')
    print(f" ✓ Markdown报告: {md_file.name}")
    # 2. Unified Diff (original vs. final only)
    print(" 生成Unified Diff...")
    diff_file = output_path / f"{base_name}_unified.diff"
    diff_file.write_text(generate_unified_diff(original, stage2), encoding='utf-8')
    print(f" ✓ Unified Diff: {diff_file.name}")
    # 3. HTML comparison (original vs. final only)
    print(" 生成HTML对比...")
    html_file = output_path / f"{base_name}_对比.html"
    html_file.write_text(generate_html_diff(original, stage2), encoding='utf-8')
    print(f" ✓ HTML对比: {html_file.name}")
    # 4. Inline diff (original vs. final only)
    print(" 生成行内diff...")
    inline_file = output_path / f"{base_name}_行内对比.txt"
    inline_file.write_text(generate_inline_diff(original, stage2), encoding='utf-8')
    print(f" ✓ 行内对比: {inline_file.name}")
    # Summary
    print("\n✅ 对比报告生成完成!")
    print(f"📂 输出目录: {output_path}")
    print("\n生成的文件:")
    print(f" 1. {md_file.name} - Markdown格式详细报告")
    print(f" 2. {diff_file.name} - Unified Diff格式")
    print(f" 3. {html_file.name} - HTML并排对比")
    print(f" 4. {inline_file.name} - 行内标记对比")
def main():
"""CLI entry point"""
if len(sys.argv) < 4:
print("用法: python generate_diff_report.py <原始文件> <阶段1文件> <阶段2文件> [输出目录]")
print()
print("示例:")
print(" python generate_diff_report.py \\")
print(" 原始.md \\")
print(" 原始_阶段1_词典修复.md \\")
print(" 原始_阶段2_AI修复.md")
sys.exit(1)
original_file = sys.argv[1]
stage1_file = sys.argv[2]
stage2_file = sys.argv[3]
output_dir = sys.argv[4] if len(sys.argv) > 4 else None
generate_full_report(original_file, stage1_file, stage2_file, output_dir)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Logging Configuration for Transcript Fixer
Provides structured logging with rotation, levels, and audit trails.
"""
import logging
import logging.handlers
import sys
from pathlib import Path
from typing import Optional
def setup_logging(
    log_dir: Optional[Path] = None,
    level: str = "INFO",
    enable_console: bool = True,
    enable_file: bool = True,
    enable_audit: bool = True
) -> None:
    """
    Configure logging for the application.

    Installs a console handler (at *level*), two rotating file handlers
    (full DEBUG log plus an errors-only log), and a dedicated,
    non-propagating 'audit' logger with its own rotating file.

    Args:
        log_dir: Directory for log files (default: ~/.transcript-fixer/logs)
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_console: Enable console output
        enable_file: Enable file logging
        enable_audit: Enable audit logging

    Example:
        >>> setup_logging(level="DEBUG")
        >>> logger = logging.getLogger(__name__)
        >>> logger.info("Application started")
    """
    # Default log directory
    if log_dir is None:
        log_dir = Path.home() / ".transcript-fixer" / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)

    # Root logger captures everything; individual handlers filter by level.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    # Clear existing handlers so repeated calls do not duplicate output.
    root_logger.handlers.clear()

    # Formatters
    detailed_formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    simple_formatter = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Console handler honours the requested level.
    if enable_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, level.upper()))
        console_handler.setFormatter(simple_formatter)
        root_logger.addHandler(console_handler)

    # File handlers: general rotating log plus an errors-only log.
    # (Merged from two separate `if enable_file:` blocks for clarity.)
    if enable_file:
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "transcript-fixer.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(file_handler)

        error_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "errors.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=3,
            encoding='utf-8'
        )
        error_handler.setLevel(logging.ERROR)
        error_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(error_handler)

    # Audit handler (separate audit trail on a dedicated logger)
    if enable_audit:
        audit_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "audit.log",
            maxBytes=50 * 1024 * 1024,  # 50MB
            backupCount=10,
            encoding='utf-8'
        )
        audit_handler.setLevel(logging.INFO)
        audit_handler.setFormatter(detailed_formatter)
        audit_logger = logging.getLogger('audit')
        audit_logger.setLevel(logging.INFO)
        audit_logger.addHandler(audit_handler)
        audit_logger.propagate = False  # Keep audit entries out of the root log

    # Lazy %-style args avoid building the message when INFO is disabled.
    logging.info("Logging configured: level=%s, log_dir=%s", level, log_dir)
def get_audit_logger() -> logging.Logger:
    """Return the dedicated audit-trail logger (the 'audit' namespace)."""
    # setup_logging() attaches the rotating file handler to this logger.
    audit_logger = logging.getLogger('audit')
    return audit_logger
# Example usage: run this module directly to exercise every log level.
if __name__ == "__main__":
    setup_logging(level="DEBUG")
    logger = logging.getLogger(__name__)
    logger.debug("Debug message")
    logger.info("Info message")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("Critical message")

    audit_logger = get_audit_logger()
    # NOTE(review): the original message read "'错误''正确'" with no separator,
    # which looks like a dropped arrow character; restored as '错误' -> '正确'.
    audit_logger.info("User 'admin' added correction: '错误' -> '正确'")

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Validation Utility - Configuration Health Checker
SINGLE RESPONSIBILITY: Validate transcript-fixer configuration and JSON files
Features:
- Check directory structure
- Validate JSON syntax in all config files
- Check environment variables
- Report statistics and health status
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
# Handle imports for both standalone and package usage
try:
from core import CorrectionRepository, CorrectionService
except ImportError:
# Fallback for when run from scripts directory directly
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from core import CorrectionRepository, CorrectionService
def validate_configuration() -> tuple[list[str], list[str]]:
    """
    Validate transcript-fixer configuration.

    Checks the config directory, the SQLite corrections database (including
    the expected table schema), and the GLM_API_KEY environment variable,
    printing progress as it goes.

    Returns:
        Tuple of (errors, warnings) as string lists
    """
    config_dir = Path.home() / ".transcript-fixer"
    db_path = config_dir / "corrections.db"
    errors: list[str] = []
    warnings: list[str] = []

    print("🔍 Validating transcript-fixer configuration...\n")

    # Check directory exists; nothing else can be validated without it.
    if not config_dir.exists():
        errors.append(f"Configuration directory not found: {config_dir}")
        # '❌ ' prefix restored to match the ✅/⚠️ convention used below.
        print(f"❌ {errors[-1]}")
        print("\n💡 Run: python fix_transcription.py --init")
        return errors, warnings
    print(f"✅ Configuration directory exists: {config_dir}")

    # Validate SQLite database
    if db_path.exists():
        try:
            repository = CorrectionRepository(db_path)
            service = CorrectionService(repository)
            # Query basic stats
            stats = service.get_statistics()
            print(f"✅ Database valid: {stats['total_corrections']} corrections")
            # Check tables exist.
            # NOTE(review): relies on the repository's private _get_connection();
            # consider exposing a public schema-inspection API instead.
            conn = repository._get_connection()
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]
            expected_tables = [
                'corrections', 'context_rules', 'correction_history',
                'correction_changes', 'learned_suggestions', 'suggestion_examples',
                'system_config', 'audit_log'
            ]
            missing_tables = [t for t in expected_tables if t not in tables]
            if missing_tables:
                errors.append(f"Database missing tables: {missing_tables}")
                print(f"❌ {errors[-1]}")
            else:
                print(f"✅ All {len(expected_tables)} tables present")
            service.close()
        except Exception as e:
            # Broad catch is deliberate: any failure here means "invalid DB".
            errors.append(f"Database validation failed: {e}")
            print(f"❌ {errors[-1]}")
    else:
        warnings.append("Database not found (will be created on first use)")
        print(f"⚠️ Database not found: {db_path}")

    # Check API key (only needed for the Stage 2 AI pass, hence a warning)
    api_key = os.getenv("GLM_API_KEY")
    if not api_key:
        warnings.append("GLM_API_KEY environment variable not set")
        print("⚠️ GLM_API_KEY not set (required for Stage 2 AI corrections)")
    else:
        print("✅ GLM_API_KEY is set")

    return errors, warnings
def print_validation_summary(errors: list[str], warnings: list[str]) -> int:
    """
    Print validation summary and return exit code.

    Args:
        errors: Error messages collected by validate_configuration()
        warnings: Warning messages collected by validate_configuration()

    Returns:
        0 if valid, 1 if errors found
    """
    print("\n" + "=" * 60)
    if errors:
        # '❌ ' prefix restored to match the ✅/⚠️ convention used below.
        print(f"❌ {len(errors)} error(s) found:")
        for err in errors:
            print(f" - {err}")
        print("\n💡 Fix errors and run --validate again")
        print("=" * 60)
        return 1
    elif warnings:
        print(f"⚠️ {len(warnings)} warning(s):")
        for warn in warnings:
            print(f" - {warn}")
        print("\n✅ Configuration is valid (with warnings)")
        print("=" * 60)
        return 0
    else:
        print("✅ All checks passed! Configuration is valid.")
        print("=" * 60)
        return 0
def main():
"""Run validation as standalone script"""
errors, warnings = validate_configuration()
exit_code = print_validation_summary(errors, warnings)
sys.exit(exit_code)
if __name__ == "__main__":
main()