Release v1.8.0: Add transcript-fixer skill
## New Skill: transcript-fixer v1.0.0 Correct speech-to-text (ASR/STT) transcription errors through dictionary-based rules and AI-powered corrections with automatic pattern learning. **Features:** - Two-stage correction pipeline (dictionary + AI) - Automatic pattern detection and learning - Domain-specific dictionaries (general, embodied_ai, finance, medical) - SQLite-based correction repository - Team collaboration with import/export - GLM API integration for AI corrections - Cost optimization through dictionary promotion **Use cases:** - Correcting meeting notes, lecture recordings, or interview transcripts - Fixing Chinese/English homophone errors and technical terminology - Building domain-specific correction dictionaries - Improving transcript accuracy through iterative learning **Documentation:** - Complete workflow guides in references/ - SQL query templates - Troubleshooting guide - Team collaboration patterns - API setup instructions **Marketplace updates:** - Updated marketplace to v1.8.0 - Added transcript-fixer plugin (category: productivity) - Updated README.md with skill description and use cases - Updated CLAUDE.md with skill listing and counts 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
16
transcript-fixer/scripts/utils/__init__.py
Normal file
16
transcript-fixer/scripts/utils/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
Utils Module - Utility Functions and Tools
|
||||
|
||||
This module contains utility functions:
|
||||
- diff_generator: Multi-format diff report generation
|
||||
- validation: Configuration validation
|
||||
"""
|
||||
|
||||
from .diff_generator import generate_full_report
|
||||
from .validation import validate_configuration, print_validation_summary
|
||||
|
||||
__all__ = [
|
||||
'generate_full_report',
|
||||
'validate_configuration',
|
||||
'print_validation_summary',
|
||||
]
|
||||
18
transcript-fixer/scripts/utils/diff_formats/__init__.py
Normal file
18
transcript-fixer/scripts/utils/diff_formats/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
Diff format generators for transcript comparison
|
||||
"""
|
||||
|
||||
from .unified_format import generate_unified_diff
|
||||
from .html_format import generate_html_diff
|
||||
from .inline_format import generate_inline_diff
|
||||
from .markdown_format import generate_markdown_report
|
||||
from .change_extractor import extract_changes, generate_change_summary
|
||||
|
||||
__all__ = [
|
||||
'generate_unified_diff',
|
||||
'generate_html_diff',
|
||||
'generate_inline_diff',
|
||||
'generate_markdown_report',
|
||||
'extract_changes',
|
||||
'generate_change_summary',
|
||||
]
|
||||
102
transcript-fixer/scripts/utils/diff_formats/change_extractor.py
Normal file
102
transcript-fixer/scripts/utils/diff_formats/change_extractor.py
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Change extraction and summarization
|
||||
|
||||
SINGLE RESPONSIBILITY: Extract and summarize changes between text versions
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def extract_changes(original: str, fixed: str) -> list[dict]:
    """
    Compare two texts word-by-word and collect every difference.

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        List of change dictionaries, each with a ``type`` of
        'replace'/'delete'/'insert', the affected text, and up to five
        surrounding word tokens of context on either side.
    """
    src = split_into_words(original)
    dst = split_into_words(fixed)

    matcher = difflib.SequenceMatcher(None, src, dst)
    collected: list[dict] = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'replace':
            collected.append({
                'type': 'replace',
                'original': ''.join(src[i1:i2]),
                'fixed': ''.join(dst[j1:j2]),
                'context_before': ''.join(src[max(0, i1 - 5):i1]),
                'context_after': ''.join(src[i2:min(len(src), i2 + 5)]),
            })
        elif tag == 'delete':
            collected.append({
                'type': 'delete',
                'original': ''.join(src[i1:i2]),
                'fixed': '',
                'context_before': ''.join(src[max(0, i1 - 5):i1]),
                'context_after': ''.join(src[i2:min(len(src), i2 + 5)]),
            })
        elif tag == 'insert':
            # Context for insertions comes from the fixed text, where
            # the newly added words actually live.
            collected.append({
                'type': 'insert',
                'original': '',
                'fixed': ''.join(dst[j1:j2]),
                'context_before': ''.join(dst[max(0, j1 - 5):j1]),
                'context_after': ''.join(dst[j2:min(len(dst), j2 + 5)]),
            })

    return collected
|
||||
|
||||
|
||||
def generate_change_summary(changes: list[dict]) -> str:
    """
    Render a human-readable summary of a change list.

    Args:
        changes: List of change dictionaries from extract_changes()

    Returns:
        Formatted multi-line summary string (Chinese labels)
    """
    # Map internal change tags to their display labels.
    type_labels = {
        'replace': '替换',
        'delete': '删除',
        'insert': '添加'
    }

    divider = "=" * 80
    lines = [divider, f"修改摘要 (共 {len(changes)} 处修改)", divider, ""]

    for index, change in enumerate(changes, 1):
        lines.append(f"[{index}] {type_labels[change['type']]}")

        # Empty strings (e.g. 'fixed' for a deletion) are skipped.
        if change['original']:
            lines.append(f"  原文: {change['original']}")
        if change['fixed']:
            lines.append(f"  修复: {change['fixed']}")

        # Show context
        context = change['context_before'] + "【修改处】" + change['context_after']
        if context.strip():
            lines.append(f"  上下文: ...{context}...")

        lines.append("")

    return '\n'.join(lines)
|
||||
37
transcript-fixer/scripts/utils/diff_formats/html_format.py
Normal file
37
transcript-fixer/scripts/utils/diff_formats/html_format.py
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate HTML side-by-side comparison
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
|
||||
def generate_html_diff(original: str, fixed: str) -> str:
    """
    Generate an HTML side-by-side comparison of two texts.

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        Complete HTML document produced by difflib.HtmlDiff
    """
    # HtmlDiff wraps long lines at 80 columns; with context=True it
    # shows only changed regions plus `numlines` lines of context.
    builder = difflib.HtmlDiff(wrapcolumn=80)
    return builder.make_file(
        original.splitlines(keepends=True),
        fixed.splitlines(keepends=True),
        fromdesc='原始版本',
        todesc='修复版本',
        context=True,
        numlines=3
    )
|
||||
65
transcript-fixer/scripts/utils/diff_formats/inline_format.py
Normal file
65
transcript-fixer/scripts/utils/diff_formats/inline_format.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Inline diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate inline diff with change markers
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def generate_inline_diff(original: str, fixed: str) -> str:
    """
    Generate an inline word-level diff with change markers.

    Format:
        - Unchanged words appear as-is
        - Deletions are wrapped as [-word-]
        - Additions are wrapped as [+word+]

    Args:
        original: Original text
        fixed: Fixed text

    Returns:
        Inline diff string with markers, wrapped near 80 characters
    """
    tokens_before = split_into_words(original)
    tokens_after = split_into_words(fixed)

    header = "=" * 80
    out = [header, "行内词语级别对比 (- 删除, + 添加, ? 修改标记)", header, ""]

    pending: list[str] = []
    for entry in difflib.ndiff(tokens_before, tokens_after):
        code, token = entry[0], entry[2:]

        if code == '?':
            # ndiff's intra-line hint rows carry no words; drop them
            # (and skip the wrap check, matching original behaviour).
            continue
        if code == ' ':
            pending.append(token)
        elif code == '-':
            pending.append(f"[-{token}-]")
        elif code == '+':
            pending.append(f"[+{token}+]")

        # Flush once the accumulated text grows past 80 characters.
        if len(''.join(pending)) > 80:
            out.append(''.join(pending))
            pending = []

    if pending:
        out.append(''.join(pending))

    return '\n'.join(out)
|
||||
104
transcript-fixer/scripts/utils/diff_formats/markdown_format.py
Normal file
104
transcript-fixer/scripts/utils/diff_formats/markdown_format.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Markdown report generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate detailed Markdown comparison report
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .change_extractor import extract_changes, generate_change_summary
|
||||
|
||||
|
||||
def generate_markdown_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    original: str,
    stage1: str,
    stage2: str
) -> str:
    """
    Generate comprehensive Markdown comparison report

    The report covers three diffs: original→stage1 (dictionary fixes),
    stage1→stage2 (AI fixes), and original→stage2 (overall), each with
    a per-change summary, plus a statistics table and review checklist.

    Args:
        original_file: Original file path
        stage1_file: Stage 1 file path
        stage2_file: Stage 2 file path
        original: Original text content
        stage1: Stage 1 text content
        stage2: Stage 2 text content

    Returns:
        Formatted Markdown report string
    """
    # Paths are only used for their basenames in the report header.
    original_path = Path(original_file)
    stage1_path = Path(stage1_file)
    stage2_path = Path(stage2_file)

    # Extract changes for each stage
    changes_stage1 = extract_changes(original, stage1)
    changes_stage2 = extract_changes(stage1, stage2)
    changes_total = extract_changes(original, stage2)

    # Generate summaries
    summary_stage1 = generate_change_summary(changes_stage1)
    summary_stage2 = generate_change_summary(changes_stage2)
    summary_total = generate_change_summary(changes_total)

    # Build report (template lines are at column 0 on purpose — they
    # are the literal Markdown output)
    report = f"""# 会议记录修复对比报告

## 文件信息

- **原始文件**: {original_path.name}
- **阶段1修复**: {stage1_path.name}
- **阶段2修复**: {stage2_path.name}
- **生成时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 修改统计

| 阶段 | 修改数量 | 说明 |
|------|---------|------|
| 阶段1: 词典修复 | {len(changes_stage1)} | 基于预定义词典的批量替换 |
| 阶段2: AI修复 | {len(changes_stage2)} | GLM-4.6智能纠错 |
| **总计** | **{len(changes_total)}** | **原始→最终版本** |

---

# 阶段1: 词典修复详情

{summary_stage1}

---

# 阶段2: AI智能修复详情

{summary_stage2}

---

# 总体修改详情 (原始→最终)

{summary_total}

---

## 使用说明

1. **查看修改**: 每处修改都包含上下文,便于理解修改原因
2. **人工审核**: 重点审核标记为"替换"的修改
3. **专业术语**: 特别注意公司名、人名、技术术语的修改

## 建议审核重点

- [ ] 专业术语(具身智能、机器人等)
- [ ] 人名和公司名
- [ ] 数字(金额、时间等)
- [ ] 上下文是否通顺
"""

    return report
|
||||
33
transcript-fixer/scripts/utils/diff_formats/text_splitter.py
Normal file
33
transcript-fixer/scripts/utils/diff_formats/text_splitter.py
Normal file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Text splitter utility for word-level diff generation
|
||||
|
||||
SINGLE RESPONSIBILITY: Split text into words while preserving structure
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def split_into_words(text: str) -> list[str]:
    """
    Tokenize text into word-level units for diffing.

    Chinese character runs, ASCII letter runs, and digit runs each
    become one token; every other character (punctuation, whitespace)
    becomes its own single-character token. This enables word-level
    diff generation for mixed Chinese/English text.

    Args:
        text: Input text to split

    Returns:
        List of word tokens (Chinese words, English words, numbers,
        punctuation/whitespace characters)
    """
    # CJK run | ASCII letter run | digit run | any single other char
    token_pattern = re.compile(
        r'[\u4e00-\u9fff]+|[a-zA-Z]+|[0-9]+|[^\u4e00-\u9fffa-zA-Z0-9]'
    )
    return token_pattern.findall(text)
|
||||
|
||||
|
||||
def read_file(file_path: str) -> str:
    """Return the entire contents of a UTF-8 text file."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
|
||||
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unified diff format generator
|
||||
|
||||
SINGLE RESPONSIBILITY: Generate unified diff format output
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
|
||||
from .text_splitter import split_into_words
|
||||
|
||||
|
||||
def generate_unified_diff(
    original: str,
    fixed: str,
    original_label: str = "原始版本",
    fixed_label: str = "修复版本"
) -> str:
    """
    Generate unified format diff report

    Args:
        original: Original text
        fixed: Fixed text
        original_label: Label for original version
        fixed_label: Label for fixed version

    Returns:
        Unified diff format string (one word token per diff line)
    """
    # Diff word tokens rather than whole lines so individual word
    # edits show up one per line.
    delta = difflib.unified_diff(
        split_into_words(original),
        split_into_words(fixed),
        fromfile=original_label,
        tofile=fixed_label,
        lineterm=''
    )
    return '\n'.join(delta)
|
||||
132
transcript-fixer/scripts/utils/diff_generator.py
Normal file
132
transcript-fixer/scripts/utils/diff_generator.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate word-level correction comparison reports
|
||||
Orchestrates multiple diff formats for visualization
|
||||
|
||||
SINGLE RESPONSIBILITY: Coordinate diff generation workflow
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from .diff_formats import (
|
||||
generate_unified_diff,
|
||||
generate_html_diff,
|
||||
generate_inline_diff,
|
||||
generate_markdown_report,
|
||||
)
|
||||
from .diff_formats.text_splitter import read_file
|
||||
|
||||
|
||||
def generate_full_report(
    original_file: str,
    stage1_file: str,
    stage2_file: str,
    output_dir: str | None = None
) -> None:
    """
    Generate comprehensive comparison report

    Creates 4 output files:
    1. Markdown format detailed report
    2. Unified diff format
    3. HTML side-by-side comparison
    4. Inline marked comparison

    Args:
        original_file: Path to original transcript
        stage1_file: Path to stage 1 (dictionary) corrected version
        stage2_file: Path to stage 2 (AI) corrected version
        output_dir: Optional output directory (defaults to original file location)
    """
    original_path = Path(original_file)
    # NOTE(review): stage1_path / stage2_path are never read below —
    # candidates for removal.
    stage1_path = Path(stage1_file)
    stage2_path = Path(stage2_file)

    # Determine output directory
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = original_path.parent

    # All four output files share the original file's stem.
    base_name = original_path.stem

    # Read files
    print(f"📖 读取文件...")
    original = read_file(original_file)
    stage1 = read_file(stage1_file)
    stage2 = read_file(stage2_file)

    # Generate reports
    print(f"📝 生成对比报告...")

    # 1. Markdown report (the only format that uses all three stages)
    print(f"  生成Markdown报告...")
    md_report = generate_markdown_report(
        original_file, stage1_file, stage2_file,
        original, stage1, stage2
    )
    md_file = output_path / f"{base_name}_对比报告.md"
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(md_report)
    print(f"  ✓ Markdown报告: {md_file.name}")

    # 2. Unified Diff — compares original directly against the final
    # stage-2 text (stage 1 is skipped for this format)
    print(f"  生成Unified Diff...")
    unified_diff = generate_unified_diff(original, stage2)
    diff_file = output_path / f"{base_name}_unified.diff"
    with open(diff_file, 'w', encoding='utf-8') as f:
        f.write(unified_diff)
    print(f"  ✓ Unified Diff: {diff_file.name}")

    # 3. HTML comparison (original vs final as well)
    print(f"  生成HTML对比...")
    html_diff = generate_html_diff(original, stage2)
    html_file = output_path / f"{base_name}_对比.html"
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(html_diff)
    print(f"  ✓ HTML对比: {html_file.name}")

    # 4. Inline diff (original vs final)
    print(f"  生成行内diff...")
    inline_diff = generate_inline_diff(original, stage2)
    inline_file = output_path / f"{base_name}_行内对比.txt"
    with open(inline_file, 'w', encoding='utf-8') as f:
        f.write(inline_diff)
    print(f"  ✓ 行内对比: {inline_file.name}")

    # Summary
    print(f"\n✅ 对比报告生成完成!")
    print(f"📂 输出目录: {output_path}")
    print(f"\n生成的文件:")
    print(f"  1. {md_file.name} - Markdown格式详细报告")
    print(f"  2. {diff_file.name} - Unified Diff格式")
    print(f"  3. {html_file.name} - HTML并排对比")
    print(f"  4. {inline_file.name} - 行内标记对比")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse argv and run full report generation."""
    argv = sys.argv[1:]
    if len(argv) < 3:
        # Not enough arguments: print usage and exit non-zero.
        print("用法: python generate_diff_report.py <原始文件> <阶段1文件> <阶段2文件> [输出目录]")
        print()
        print("示例:")
        print("  python generate_diff_report.py \\")
        print("    原始.md \\")
        print("    原始_阶段1_词典修复.md \\")
        print("    原始_阶段2_AI修复.md")
        sys.exit(1)

    original_file, stage1_file, stage2_file = argv[0], argv[1], argv[2]
    output_dir = argv[3] if len(argv) > 3 else None

    generate_full_report(original_file, stage1_file, stage2_file, output_dir)


if __name__ == "__main__":
    main()
|
||||
129
transcript-fixer/scripts/utils/logging_config.py
Normal file
129
transcript-fixer/scripts/utils/logging_config.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Logging Configuration for Transcript Fixer
|
||||
|
||||
Provides structured logging with rotation, levels, and audit trails.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def setup_logging(
    log_dir: Optional[Path] = None,
    level: str = "INFO",
    enable_console: bool = True,
    enable_file: bool = True,
    enable_audit: bool = True
) -> None:
    """
    Configure logging for the application.

    Installs console, rotating-file, error-file, and audit handlers.
    Safe to call more than once: previously installed handlers are
    removed first, so repeated calls do not duplicate output.

    Args:
        log_dir: Directory for log files (default: ~/.transcript-fixer/logs)
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_console: Enable console output
        enable_file: Enable file logging
        enable_audit: Enable audit logging

    Example:
        >>> setup_logging(level="DEBUG")
        >>> logger = logging.getLogger(__name__)
        >>> logger.info("Application started")
    """
    # Default log directory
    if log_dir is None:
        log_dir = Path.home() / ".transcript-fixer" / "logs"

    log_dir.mkdir(parents=True, exist_ok=True)

    # Root logger captures everything; individual handlers filter.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Clear existing handlers so repeated setup calls don't stack them.
    root_logger.handlers.clear()

    # Formatters
    detailed_formatter = logging.Formatter(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    simple_formatter = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Console handler — the only handler honouring the requested level
    if enable_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, level.upper()))
        console_handler.setFormatter(simple_formatter)
        root_logger.addHandler(console_handler)

    if enable_file:
        # Main rotating file handler (all levels)
        file_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "transcript-fixer.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(file_handler)

        # Error file handler (only errors)
        error_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "errors.log",
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=3,
            encoding='utf-8'
        )
        error_handler.setLevel(logging.ERROR)
        error_handler.setFormatter(detailed_formatter)
        root_logger.addHandler(error_handler)

    # Audit handler (separate audit trail, never propagated to root)
    if enable_audit:
        audit_handler = logging.handlers.RotatingFileHandler(
            filename=log_dir / "audit.log",
            maxBytes=50 * 1024 * 1024,  # 50MB
            backupCount=10,
            encoding='utf-8'
        )
        audit_handler.setLevel(logging.INFO)
        audit_handler.setFormatter(detailed_formatter)

        audit_logger = logging.getLogger('audit')
        # BUG FIX: the original cleared only the root logger's handlers,
        # so every repeated setup_logging() call leaked an extra audit
        # handler (duplicate audit lines + leaked file descriptors).
        audit_logger.handlers.clear()
        audit_logger.setLevel(logging.INFO)
        audit_logger.addHandler(audit_handler)
        audit_logger.propagate = False  # Don't propagate to root

    # Lazy %-style args avoid eager string formatting.
    logging.info("Logging configured: level=%s, log_dir=%s", level, log_dir)
|
||||
|
||||
|
||||
def get_audit_logger() -> logging.Logger:
    """Return the dedicated 'audit' logger used for the audit trail."""
    return logging.getLogger("audit")
|
||||
|
||||
|
||||
# Example usage
if __name__ == "__main__":
    # Smoke-test the logging setup: emit one record at every level,
    # then write a sample entry to the separate audit trail.
    setup_logging(level="DEBUG")
    logger = logging.getLogger(__name__)

    logger.debug("Debug message")
    logger.info("Info message")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("Critical message")

    # The audit logger writes to audit.log only (propagate=False).
    audit_logger = get_audit_logger()
    audit_logger.info("User 'admin' added correction: '错误' → '正确'")
|
||||
141
transcript-fixer/scripts/utils/validation.py
Normal file
141
transcript-fixer/scripts/utils/validation.py
Normal file
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation Utility - Configuration Health Checker
|
||||
|
||||
SINGLE RESPONSIBILITY: Validate transcript-fixer configuration and JSON files
|
||||
|
||||
Features:
|
||||
- Check directory structure
|
||||
- Validate JSON syntax in all config files
|
||||
- Check environment variables
|
||||
- Report statistics and health status
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Handle imports for both standalone and package usage
try:
    from core import CorrectionRepository, CorrectionService
except ImportError:
    # Fallback for when run from scripts directory directly:
    # put the scripts/ parent on sys.path so `core` resolves.
    import sys
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from core import CorrectionRepository, CorrectionService
|
||||
|
||||
|
||||
def validate_configuration() -> tuple[list[str], list[str]]:
    """
    Validate transcript-fixer configuration.

    Checks, in order: the ~/.transcript-fixer directory exists, the
    SQLite database opens and contains the expected tables, and the
    GLM_API_KEY environment variable is set.  Progress is printed to
    stdout as each check runs.

    Returns:
        Tuple of (errors, warnings) as string lists; errors are
        blocking problems, warnings are not.
    """
    config_dir = Path.home() / ".transcript-fixer"
    db_path = config_dir / "corrections.db"

    errors = []
    warnings = []

    print("🔍 Validating transcript-fixer configuration...\n")

    # Check directory exists — without it nothing else can be checked,
    # so bail out early with a hint on how to initialise.
    if not config_dir.exists():
        errors.append(f"Configuration directory not found: {config_dir}")
        print(f"❌ {errors[-1]}")
        print("\n💡 Run: python fix_transcription.py --init")
        return errors, warnings

    print(f"✅ Configuration directory exists: {config_dir}")

    # Validate SQLite database
    if db_path.exists():
        try:
            repository = CorrectionRepository(db_path)
            service = CorrectionService(repository)

            # Query basic stats
            stats = service.get_statistics()
            print(f"✅ Database valid: {stats['total_corrections']} corrections")

            # Check tables exist.
            # NOTE(review): reaches into the repository's private
            # _get_connection() — consider exposing a public schema check.
            conn = repository._get_connection()
            cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]

            expected_tables = [
                'corrections', 'context_rules', 'correction_history',
                'correction_changes', 'learned_suggestions', 'suggestion_examples',
                'system_config', 'audit_log'
            ]

            missing_tables = [t for t in expected_tables if t not in tables]
            if missing_tables:
                errors.append(f"Database missing tables: {missing_tables}")
                print(f"❌ {errors[-1]}")
            else:
                print(f"✅ All {len(expected_tables)} tables present")

            service.close()

        except Exception as e:
            # Broad catch is deliberate: any failure here just means
            # "database invalid"; the message carries the detail.
            errors.append(f"Database validation failed: {e}")
            print(f"❌ {errors[-1]}")
    else:
        # Missing DB is only a warning — it is created lazily on first use.
        warnings.append("Database not found (will be created on first use)")
        print(f"⚠️ Database not found: {db_path}")

    # Check API key (needed only for the Stage 2 AI correction pass)
    api_key = os.getenv("GLM_API_KEY")
    if not api_key:
        warnings.append("GLM_API_KEY environment variable not set")
        print("⚠️ GLM_API_KEY not set (required for Stage 2 AI corrections)")
    else:
        print("✅ GLM_API_KEY is set")

    return errors, warnings
|
||||
|
||||
|
||||
def print_validation_summary(errors: list[str], warnings: list[str]) -> int:
    """
    Print validation summary and return exit code.

    Args:
        errors: Blocking problems found during validation
        warnings: Non-blocking problems found during validation

    Returns:
        0 if valid, 1 if errors found
    """
    divider = "=" * 60
    print("\n" + divider)

    # Errors take precedence over warnings; either way the divider
    # closes the summary block.
    if errors:
        print(f"❌ {len(errors)} error(s) found:")
        for err in errors:
            print(f"  - {err}")
        print("\n💡 Fix errors and run --validate again")
        print(divider)
        return 1

    if warnings:
        print(f"⚠️ {len(warnings)} warning(s):")
        for warn in warnings:
            print(f"  - {warn}")
        print("\n✅ Configuration is valid (with warnings)")
        print(divider)
        return 0

    print("✅ All checks passed! Configuration is valid.")
    print(divider)
    return 0
|
||||
|
||||
|
||||
def main():
    """Run validation as standalone script and exit with its status."""
    errors, warnings = validate_configuration()
    sys.exit(print_validation_summary(errors, warnings))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user