feat(transcript-fixer): add timestamp repair and section splitting scripts

New scripts:
- fix_transcript_timestamps.py: Repair malformed timestamps and normalize them to HH:MM:SS format
- split_transcript_sections.py: Split transcript by keywords and rebase timestamps
- Automated tests for both scripts

Features:
- Timestamp validation and repair (handle missing colons, invalid ranges)
- Section splitting with custom names
- Rebase timestamps to 00:00:00 for each section
- Preserve speaker format and content integrity
- In-place editing with backup

Documentation updates:
- Add usage examples to SKILL.md
- Clarify dictionary iteration workflow (save stable patterns only)
- Update workflow guides with new script references
- Add script parameter documentation

Use cases:
- Fix ASR output with broken timestamps
- Split long meetings into focused sections
- Prepare sections for independent processing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
daymade
2026-03-11 13:59:36 +08:00
parent 29f85d27c3
commit 135a1873af
8 changed files with 688 additions and 4 deletions

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Tests for transcript timestamp normalization and rebasing."""
import sys
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from fix_transcript_timestamps import repair_timestamps
class TestFixTranscriptTimestamps(unittest.TestCase):
    """Checks on ``repair_timestamps`` for hour rollover and zero-rebasing."""

    def test_rollover_fix(self):
        # Speaker lines carry MM:SS stamps; the third one drops from 59:58
        # back to 00:05 and is expected to land in the next hour.
        text = "".join(
            (
                "甲 58:50\n",
                "内容 A\n",
                "乙 59:58\n",
                "内容 B\n",
                "丙 00:05\n",
                "内容 C\n",
            )
        )
        options = {
            "output_format": "hhmmss",
            "rollover_backjump_seconds": 15 * 60,
            "jitter_seconds": 5,
            "rebase_to_zero": False,
        }

        result = repair_timestamps(text, **options)

        for expected_line in ("甲 00:58:50", "乙 00:59:58", "丙 01:00:05"):
            self.assertIn(expected_line, result.repaired_text)
        # The rollover case must not be reported as an anomaly.
        anomaly_count = len(result.anomalies)
        self.assertEqual(anomaly_count, 0)

    def test_rebase_to_zero(self):
        # With rebasing enabled, the first stamp becomes 00:00:00 and the
        # two-second gap to the next stamp is kept.
        text = "".join(
            (
                "甲 01:31:10\n",
                "内容 A\n",
                "乙 01:31:12\n",
                "内容 B\n",
            )
        )
        options = {
            "output_format": "hhmmss",
            "rollover_backjump_seconds": 15 * 60,
            "jitter_seconds": 5,
            "rebase_to_zero": True,
        }

        result = repair_timestamps(text, **options)

        for expected_line in ("甲 00:00:00", "乙 00:00:02"):
            self.assertIn(expected_line, result.repaired_text)
# Let this test module be executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""Tests for transcript section splitting."""
import sys
import unittest
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from split_transcript_sections import (
SectionSpec,
sanitize_filename_component,
split_text_by_markers,
)
class TestSplitTranscriptSections(unittest.TestCase):
    """Checks on marker-based section splitting and filename sanitizing."""

    def test_split_text_by_markers(self):
        # Two marker lines divide the text into three named parts; each
        # marker line is expected to open the part it introduces.
        text = "".join(
            (
                "预热内容\n",
                "开始安装\n",
                "装环境内容\n",
                "我们复盘一下。\n",
                "复盘内容\n",
            )
        )
        section_specs = [
            SectionSpec(name="正式上课", marker="开始安装"),
            SectionSpec(name="课后复盘", marker="我们复盘一下。"),
        ]

        parts = split_text_by_markers(
            text,
            first_section_name="课前聊天",
            sections=section_specs,
        )

        # Each part is indexed as (section name, section text).
        expected_names = ("课前聊天", "正式上课", "课后复盘")
        for index, expected_name in enumerate(expected_names):
            self.assertEqual(parts[index][0], expected_name)
        for index, marker in ((1, "开始安装"), (2, "我们复盘一下。")):
            self.assertTrue(parts[index][1].startswith(marker))

    def test_sanitize_filename_component(self):
        # A path separator in a section name is expected to become a hyphen.
        cleaned = sanitize_filename_component("课后/复盘")
        self.assertEqual(cleaned, "课后-复盘")
# Let this test module be executed directly as a script.
if __name__ == "__main__":
    unittest.main()