feat(transcript-fixer): add timestamp repair and section splitting scripts
New scripts:
- fix_transcript_timestamps.py: Repair malformed timestamps (HH:MM:SS format)
- split_transcript_sections.py: Split transcript by keywords and rebase timestamps
- Automated tests for both scripts

Features:
- Timestamp validation and repair (handle missing colons, invalid ranges)
- Section splitting with custom names
- Rebase timestamps to 00:00:00 for each section
- Preserve speaker format and content integrity
- In-place editing with backup

Documentation updates:
- Add usage examples to SKILL.md
- Clarify dictionary iteration workflow (save stable patterns only)
- Update workflow guides with new script references
- Add script parameter documentation

Use cases:
- Fix ASR output with broken timestamps
- Split long meetings into focused sections
- Prepare sections for independent processing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Tests for transcript timestamp normalization and rebasing."""
|
||||
|
||||
import sys
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from fix_transcript_timestamps import repair_timestamps
|
||||
|
||||
|
||||
class TestFixTranscriptTimestamps(unittest.TestCase):
    """Checks on ``repair_timestamps`` using short speaker-labelled transcripts."""

    # Keyword arguments that every scenario below passes unchanged.
    _SHARED_OPTIONS = {
        "output_format": "hhmmss",
        "rollover_backjump_seconds": 15 * 60,
        "jitter_seconds": 5,
    }

    def _repair(self, transcript, *, rebase_to_zero):
        """Call ``repair_timestamps`` with the shared options plus the rebase flag."""
        return repair_timestamps(
            transcript,
            rebase_to_zero=rebase_to_zero,
            **self._SHARED_OPTIONS,
        )

    def test_rollover_fix(self):
        # The input stamps go 58:50 -> 59:58 -> 00:05; the third one is
        # expected to come back as 01:00:05 rather than 00:00:05.
        transcript = (
            "甲 58:50\n"
            "内容 A\n"
            "乙 59:58\n"
            "内容 B\n"
            "丙 00:05\n"
            "内容 C\n"
        )
        outcome = self._repair(transcript, rebase_to_zero=False)

        for expected_line in ("甲 00:58:50", "乙 00:59:58", "丙 01:00:05"):
            self.assertIn(expected_line, outcome.repaired_text)
        # This input is expected to produce no reported anomalies.
        self.assertEqual(len(outcome.anomalies), 0)

    def test_rebase_to_zero(self):
        # With rebasing on, the first stamp is expected at 00:00:00 and the
        # second keeps its two-second offset from the first.
        transcript = (
            "甲 01:31:10\n"
            "内容 A\n"
            "乙 01:31:12\n"
            "内容 B\n"
        )
        outcome = self._repair(transcript, rebase_to_zero=True)

        for expected_line in ("甲 00:00:00", "乙 00:00:02"):
            self.assertIn(expected_line, outcome.repaired_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Running this file directly hands control to unittest's command-line runner.
    unittest.main()
|
||||
Reference in New Issue
Block a user