From 135a1873af9e176f0ca5dcdfb2bef7a5c528ee8b Mon Sep 17 00:00:00 2001 From: daymade Date: Wed, 11 Mar 2026 13:59:36 +0800 Subject: [PATCH] feat(transcript-fixer): add timestamp repair and section splitting scripts New scripts: - fix_transcript_timestamps.py: Normalize speaker timestamps to HH:MM:SS and repair MM:SS hour rollovers - split_transcript_sections.py: Split transcript by marker phrases and optionally rebase timestamps - Automated tests for both scripts Features: - Timestamp normalization and repair (mixed MM:SS / HH:MM:SS, hour rollovers; suspicious backward jumps are reported, not rewritten) - Section splitting with custom names - Rebase timestamps to 00:00:00 for each section - Preserve speaker format and content integrity - Optional in-place editing via --in-place (overwrites the input; no backup is created) Documentation updates: - Add usage examples to SKILL.md - Clarify dictionary iteration workflow (save stable patterns only) - Update workflow guides with new script references - Add script parameter documentation Use cases: - Fix ASR output with broken timestamps - Split long meetings into focused sections - Prepare sections for independent processing Co-Authored-By: Claude Opus 4.6 (1M context) --- transcript-fixer/SKILL.md | 22 +- .../references/iteration_workflow.md | 13 +- .../references/script_parameters.md | 55 ++++ transcript-fixer/references/workflow_guide.md | 25 ++ .../scripts/fix_transcript_timestamps.py | 282 ++++++++++++++++++ .../scripts/split_transcript_sections.py | 196 ++++++++++++ .../tests/test_fix_transcript_timestamps.py | 54 ++++ .../tests/test_split_transcript_sections.py | 45 +++ 8 files changed, 688 insertions(+), 4 deletions(-) create mode 100644 transcript-fixer/scripts/fix_transcript_timestamps.py create mode 100644 transcript-fixer/scripts/split_transcript_sections.py create mode 100644 transcript-fixer/scripts/tests/test_fix_transcript_timestamps.py create mode 100644 transcript-fixer/scripts/tests/test_split_transcript_sections.py diff --git a/transcript-fixer/SKILL.md b/transcript-fixer/SKILL.md index 4830456..5785a47 100644 --- a/transcript-fixer/SKILL.md +++ 
b/transcript-fixer/SKILL.md @@ -44,6 +44,20 @@ The enhanced wrapper automatically: - Moves output files to specified directory - Opens HTML visual diff in browser for immediate feedback +**Timestamp repair**: +```bash +uv run scripts/fix_transcript_timestamps.py meeting.txt --in-place +``` + +**Split transcript into sections and rebase each section to `00:00:00`**: +```bash +uv run scripts/split_transcript_sections.py meeting.txt \ + --first-section-name "课前聊天" \ + --section "正式上课::好,无缝切换嘛。对。那个曹总连上了吗?那个网页。" \ + --section "课后复盘::我们复盘一下。" \ + --rebase-to-zero +``` + **Alternative: Use Core Script Directly**: ```bash @@ -117,13 +131,15 @@ See `references/workflow_guide.md` for detailed workflows, `references/script_pa ## Critical Workflow: Dictionary Iteration -**MUST save corrections after each fix.** This is the skill's core value. +**Save stable, reusable ASR patterns after each fix.** This is the skill's core value. -After fixing errors manually, immediately save to dictionary: +After fixing errors manually, immediately save stable corrections to dictionary: ```bash uv run scripts/fix_transcription.py --add "错误词" "正确词" --domain general ``` +Do **not** save one-off deletions, ambiguous context-only rewrites, or section-specific cleanup to the dictionary. + See `references/iteration_workflow.md` for complete iteration guide with checklist. 
## AI Fallback Strategy @@ -162,7 +178,9 @@ sqlite3 ~/.transcript-fixer/corrections.db "SELECT value FROM system_config WHER - `ensure_deps.py` - Initialize shared virtual environment (run once, optional) - `fix_transcript_enhanced.py` - Enhanced wrapper (recommended for interactive use) - `fix_transcription.py` - Core CLI (for automation) +- `fix_transcript_timestamps.py` - Normalize/repair speaker timestamps and optionally rebase to zero - `generate_word_diff.py` - Generate word-level diff HTML for reviewing corrections +- `split_transcript_sections.py` - Split a transcript by marker phrases and optionally rebase each section - `examples/bulk_import.py` - Bulk import example **References** (load as needed): diff --git a/transcript-fixer/references/iteration_workflow.md b/transcript-fixer/references/iteration_workflow.md index ba94a46..1738311 100644 --- a/transcript-fixer/references/iteration_workflow.md +++ b/transcript-fixer/references/iteration_workflow.md @@ -16,7 +16,7 @@ The core value of transcript-fixer is building a personalized correction diction └─────────────────────────────────────────────────┘ ``` -**Key principle**: Every correction you make should be saved to the dictionary. This transforms one-time work into permanent value. +**Key principle**: Every stable, reusable ASR correction you make should be saved to the dictionary. This transforms one-time work into permanent value without polluting the database. ## Workflow Checklist @@ -34,7 +34,7 @@ Correction Progress: ## Save Corrections Immediately -After fixing any transcript, save corrections: +After fixing any transcript, save stable corrections: ```bash # Single correction @@ -122,3 +122,12 @@ Patterns appearing ≥3 times at ≥80% confidence are suggested for review. 3. **Use domains**: Organize corrections by topic for better precision 4. **Verify**: Always run --list to confirm saves 5. 
**Review suggestions**: Periodically check --review-learned for auto-detected patterns + +## What NOT to Save to Dictionary + +Do **not** save these as reusable dictionary entries: + +- Full-sentence deletions +- One-off section headers or meeting-specific boilerplate +- Context-only disambiguations such as `cloud -> Claude` when `cloud` can also be legitimate +- File-local cleanup after section splitting or timestamp rebasing diff --git a/transcript-fixer/references/script_parameters.md b/transcript-fixer/references/script_parameters.md index a5537ab..6cbeab3 100644 --- a/transcript-fixer/references/script_parameters.md +++ b/transcript-fixer/references/script_parameters.md @@ -9,6 +9,8 @@ Detailed command-line parameters and usage examples for transcript-fixer Python - [Correction Management](#correction-management) - [Correction Workflow](#correction-workflow) - [Learning Commands](#learning-commands) +- [fix_transcript_timestamps.py](#fix_transcript_timestampspy) - Normalize/repair speaker timestamps +- [split_transcript_sections.py](#split_transcript_sectionspy) - Split transcript into named sections - [diff_generator.py](#diffgeneratorpy) - Generate comparison reports - [Common Workflows](#common-workflows) - [Exit Codes](#exit-codes) @@ -74,6 +76,59 @@ python scripts/fix_transcription.py --input meeting.md --stage 3 --output ./corr - `2` - GLM_API_KEY environment variable not set (Stage 2 or 3 only) - `3` - API request failed +## fix_transcript_timestamps.py + +Normalize speaker timestamp lines such as `天生 00:21` or `Speaker 7 01:31:10`. 
+ +### Syntax + +```bash +python scripts/fix_transcript_timestamps.py <input> [--output FILE | --in-place | --check] +``` + +### Key Parameters + +- `--format {hhmmss,preserve}`: output timestamp style +- `--rebase-to-zero`: shift all timestamps so the first detected speaker timestamp becomes `00:00:00` +- `--rollover-backjump-seconds`: threshold for treating `59:58 -> 00:05` as a new hour +- `--jitter-seconds`: tolerated small backward jitter before flagging anomaly + +### Usage Examples + +```bash +# Normalize mixed MM:SS / HH:MM:SS +python scripts/fix_transcript_timestamps.py meeting.txt --in-place + +# Rebase a split transcript so it starts at 00:00:00 +python scripts/fix_transcript_timestamps.py workshop-class.txt --in-place --rebase-to-zero + +# Only inspect anomalies, do not write +python scripts/fix_transcript_timestamps.py meeting.txt --check +``` + +## split_transcript_sections.py + +Split a transcript into named sections using marker phrases. Useful for workshop transcripts that include setup chat, class, and debrief in one file. + +### Syntax + +```bash +python scripts/split_transcript_sections.py <input> \ + --first-section-name <name> \ + --section "Name::Marker" \ + --section "Name::Marker" +``` + +### Usage Example + +```bash +python scripts/split_transcript_sections.py workshop.txt \ + --first-section-name "课前聊天" \ + --section "正式上课::好,无缝切换嘛。对。那个曹总连上了吗?那个网页。" \ + --section "课后复盘::我们复盘一下。" \ + --rebase-to-zero +``` + ## generate_diff_report.py Multi-format diff report generator for comparing correction stages. diff --git a/transcript-fixer/references/workflow_guide.md b/transcript-fixer/references/workflow_guide.md index b6132b6..5dbc7da 100644 --- a/transcript-fixer/references/workflow_guide.md +++ b/transcript-fixer/references/workflow_guide.md @@ -17,6 +17,7 @@ Detailed step-by-step workflows for transcript correction and management. - [5. Stage-by-Stage Execution](#5-stage-by-stage-execution) - [6. Context-Aware Rules](#6-context-aware-rules) - [7. 
Diff Report Generation](#7-diff-report-generation) + - [8. Workshop Transcript Split + Timestamp Rebase](#8-workshop-transcript-split--timestamp-rebase) - [Batch Processing](#batch-processing) - [Process Multiple Files](#process-multiple-files) - [Parallel Processing](#parallel-processing) @@ -400,6 +401,30 @@ See `file_formats.md` for context_rules schema. See `script_parameters.md` for advanced diff options. +### 8. Workshop Transcript Split + Timestamp Rebase + +**Goal**: Split a long workshop transcript into sections such as setup chat, class, and debrief, then make each section start from `00:00:00`. + +**Steps**: + +1. **Correct transcript text first** (dictionary + AI/manual review) +2. **Pick marker phrases** for each section boundary +3. **Split and rebase**: + +```bash +uv run scripts/split_transcript_sections.py workshop.txt \ + --first-section-name "课前聊天" \ + --section "正式上课::好,无缝切换嘛。对。那个曹总连上了吗?那个网页。" \ + --section "课后复盘::我们复盘一下。" \ + --rebase-to-zero +``` + +4. **If you already split the files**, rebase a single file directly: + +```bash +uv run scripts/fix_transcript_timestamps.py class.txt --in-place --rebase-to-zero +``` + ## Batch Processing ### Process Multiple Files diff --git a/transcript-fixer/scripts/fix_transcript_timestamps.py b/transcript-fixer/scripts/fix_transcript_timestamps.py new file mode 100644 index 0000000..17754d2 --- /dev/null +++ b/transcript-fixer/scripts/fix_transcript_timestamps.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +"""Normalize and repair speaker timestamp lines in ASR transcripts. 
+ +This script targets transcript lines shaped like: + + 天生 00:21 + Speaker 11 01:31:10 + +Common fixes: +- Normalize mixed `MM:SS` / `HH:MM:SS` into one format +- Repair hour rollovers for `MM:SS` timestamps after long recordings +- Optionally rebase a split transcript so it starts at 00:00:00 +- Report suspicious backward jumps instead of silently rewriting them +""" + +from __future__ import annotations + +import argparse +import re +import sys +from dataclasses import dataclass +from pathlib import Path + + +SPEAKER_TS_RE = re.compile(r"^(?P<prefix>.+?)\s+(?P<ts>\d{1,2}:\d{2}(?::\d{2})?)\s*$") + + +@dataclass +class TimestampEvent: + line_no: int + original: str + fixed: str + reason: str + + +@dataclass +class RepairResult: + changed_lines: list[TimestampEvent] + anomalies: list[TimestampEvent] + repaired_text: str + matched_count: int + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Repair speaker timestamp lines in transcript files." + ) + parser.add_argument("input", help="Input transcript file") + parser.add_argument( + "--output", + help="Output path. Defaults to <input-stem>_timestamps_fixed<suffix> when not using --check.", + ) + parser.add_argument( + "--in-place", + action="store_true", + help="Overwrite the input file", + ) + parser.add_argument( + "--check", + action="store_true", + help="Only analyze and report issues without writing a file", + ) + parser.add_argument( + "--format", + choices=("hhmmss", "preserve"), + default="hhmmss", + help="Timestamp output format. Default: hhmmss", + ) + parser.add_argument( + "--rollover-backjump-seconds", + type=int, + default=15 * 60, + help=( + "Backward jump threshold for treating MM:SS as a new hour rollover. " + "Default: 900" + ), + ) + parser.add_argument( + "--jitter-seconds", + type=int, + default=5, + help="Allowed small backward jitter before flagging an anomaly. 
Default: 5", + ) + parser.add_argument( + "--rebase-to-zero", + action="store_true", + help="Rebase the first detected speaker timestamp in the file to 00:00:00.", + ) + return parser.parse_args() + + +def to_seconds(parts: list[int]) -> int: + if len(parts) == 2: + minutes, seconds = parts + return minutes * 60 + seconds + hours, minutes, seconds = parts + return hours * 3600 + minutes * 60 + seconds + + +def format_timestamp(total_seconds: int, output_format: str) -> str: + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + + if output_format == "preserve" and hours == 0: + return f"{minutes:02d}:{seconds:02d}" + return f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + +def repair_timestamps( + text: str, + *, + output_format: str, + rollover_backjump_seconds: int, + jitter_seconds: int, + rebase_to_zero: bool, +) -> RepairResult: + lines = text.splitlines(keepends=True) + repaired_lines: list[str] = [] + changed_lines: list[TimestampEvent] = [] + anomalies: list[TimestampEvent] = [] + + current_hour = 0 + last_abs_seconds: int | None = None + first_abs_seconds: int | None = None + matched_count = 0 + + for line_no, line in enumerate(lines, start=1): + stripped = line.rstrip("\r\n") + newline = line[len(stripped) :] + match = SPEAKER_TS_RE.match(stripped) + + if not match: + repaired_lines.append(line) + continue + + matched_count += 1 + prefix = match.group("prefix") + raw_ts = match.group("ts") + parts = [int(part) for part in raw_ts.split(":")] + + if len(parts) == 3: + abs_seconds = to_seconds(parts) + current_hour = parts[0] + + if ( + last_abs_seconds is not None + and abs_seconds + jitter_seconds < last_abs_seconds + ): + anomalies.append( + TimestampEvent( + line_no=line_no, + original=raw_ts, + fixed=format_timestamp(abs_seconds, output_format), + reason="explicit_hhmmss_backwards", + ) + ) + else: + minutes, seconds = parts + candidate = current_hour * 3600 + minutes * 60 + seconds + + if 
last_abs_seconds is not None: + while candidate + rollover_backjump_seconds < last_abs_seconds: + current_hour += 1 + candidate = current_hour * 3600 + minutes * 60 + seconds + + if candidate + jitter_seconds < last_abs_seconds: + anomalies.append( + TimestampEvent( + line_no=line_no, + original=raw_ts, + fixed=format_timestamp(candidate, output_format), + reason="small_backjump_after_rollover_check", + ) + ) + + abs_seconds = candidate + + if first_abs_seconds is None: + first_abs_seconds = abs_seconds + + display_seconds = ( + abs_seconds - first_abs_seconds + if rebase_to_zero and first_abs_seconds is not None + else abs_seconds + ) + + fixed_ts = format_timestamp(display_seconds, output_format) + repaired_lines.append(f"{prefix} {fixed_ts}{newline}") + + if fixed_ts != raw_ts: + changed_lines.append( + TimestampEvent( + line_no=line_no, + original=raw_ts, + fixed=fixed_ts, + reason="normalized_or_rollover_fixed", + ) + ) + + last_abs_seconds = abs_seconds + + return RepairResult( + changed_lines=changed_lines, + anomalies=anomalies, + repaired_text="".join(repaired_lines), + matched_count=matched_count, + ) + + +def default_output_path(input_path: Path) -> Path: + return input_path.with_name( + f"{input_path.stem}_timestamps_fixed{input_path.suffix}" + ) + + +def print_summary(result: RepairResult) -> None: + print(f"Matched speaker timestamp lines: {result.matched_count}") + print(f"Rewritten timestamp lines: {len(result.changed_lines)}") + print(f"Anomalies flagged: {len(result.anomalies)}") + + if result.changed_lines: + print("\nChanged lines:") + for event in result.changed_lines[:20]: + print( + f" L{event.line_no}: {event.original} -> {event.fixed} " + f"({event.reason})" + ) + if len(result.changed_lines) > 20: + print(f" ... 
{len(result.changed_lines) - 20} more") + + if result.anomalies: + print("\nAnomalies:") + for event in result.anomalies[:20]: + print( + f" L{event.line_no}: {event.original} -> {event.fixed} " + f"({event.reason})" + ) + if len(result.anomalies) > 20: + print(f" ... {len(result.anomalies) - 20} more") + + +def main() -> int: + args = parse_args() + input_path = Path(args.input).expanduser().resolve() + + if not input_path.exists(): + print(f"Input file not found: {input_path}", file=sys.stderr) + return 1 + + if args.check and args.in_place: + print("--check and --in-place cannot be used together", file=sys.stderr) + return 1 + + text = input_path.read_text(encoding="utf-8") + result = repair_timestamps( + text, + output_format=args.format, + rollover_backjump_seconds=args.rollover_backjump_seconds, + jitter_seconds=args.jitter_seconds, + rebase_to_zero=args.rebase_to_zero, + ) + + print_summary(result) + + if args.check: + return 0 if not result.anomalies else 2 + + if args.in_place: + output_path = input_path + elif args.output: + output_path = Path(args.output).expanduser().resolve() + else: + output_path = default_output_path(input_path) + + output_path.write_text(result.repaired_text, encoding="utf-8") + print(f"\nWrote repaired transcript: {output_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/transcript-fixer/scripts/split_transcript_sections.py b/transcript-fixer/scripts/split_transcript_sections.py new file mode 100644 index 0000000..89733ba --- /dev/null +++ b/transcript-fixer/scripts/split_transcript_sections.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 +"""Split a transcript into named sections and optionally rebase timestamps. 
+ +Example: + uv run scripts/split_transcript_sections.py meeting.txt \ + --first-section-name "课前聊天" \ + --section "正式上课::好,无缝切换嘛。对。那个曹总连上了吗?那个网页。" \ + --section "课后复盘::我们复盘一下。" \ + --rebase-to-zero +""" + +from __future__ import annotations + +import argparse +import re +import sys +from dataclasses import dataclass +from pathlib import Path + +from fix_transcript_timestamps import repair_timestamps + + +INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]+') + + +@dataclass +class SectionSpec: + name: str + marker: str + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Split a transcript into named sections using marker phrases." + ) + parser.add_argument("input", help="Input transcript file") + parser.add_argument( + "--first-section-name", + default="part-1", + help="Name for the section before the first marker. Default: part-1", + ) + parser.add_argument( + "--section", + action="append", + required=True, + metavar="NAME::MARKER", + help=( + "Section boundary definition. Split starts at the first occurrence of MARKER " + "and names that segment NAME." + ), + ) + parser.add_argument( + "--output-dir", + help="Directory for output files. Defaults to the input file directory.", + ) + parser.add_argument( + "--rebase-to-zero", + action="store_true", + help="Rebase each output section so its first speaker timestamp becomes 00:00:00.", + ) + parser.add_argument( + "--format", + choices=("hhmmss", "preserve"), + default="hhmmss", + help="Timestamp output format when rebasing. Default: hhmmss", + ) + parser.add_argument( + "--rollover-backjump-seconds", + type=int, + default=15 * 60, + help="Backward jump threshold for MM:SS hour rollover repair. Default: 900", + ) + parser.add_argument( + "--jitter-seconds", + type=int, + default=5, + help="Allowed small backward jitter before flagging an anomaly. 
Default: 5", + ) + return parser.parse_args() + + +def parse_section_arg(value: str) -> SectionSpec: + if "::" not in value: + raise ValueError(f"Invalid --section value: {value!r}") + name, marker = value.split("::", 1) + name = name.strip() + marker = marker.strip() + if not name or not marker: + raise ValueError(f"Invalid --section value: {value!r}") + return SectionSpec(name=name, marker=marker) + + +def sanitize_filename_component(name: str) -> str: + return INVALID_FILENAME_CHARS.sub("-", name).strip().replace(" ", "-") + + +def split_text_by_markers( + text: str, + *, + first_section_name: str, + sections: list[SectionSpec], +) -> list[tuple[str, str]]: + boundaries: list[tuple[int, SectionSpec]] = [] + search_from = 0 + + for spec in sections: + idx = text.find(spec.marker, search_from) + if idx == -1: + raise ValueError(f"Marker not found for section {spec.name!r}: {spec.marker!r}") + boundaries.append((idx, spec)) + search_from = idx + 1 + + result: list[tuple[str, str]] = [] + prev_idx = 0 + prev_name = first_section_name + + for idx, spec in boundaries: + result.append((prev_name, text[prev_idx:idx].rstrip() + "\n")) + prev_idx = idx + prev_name = spec.name + + result.append((prev_name, text[prev_idx:].rstrip() + "\n")) + return result + + +def maybe_rebase( + text: str, + *, + rebase_to_zero: bool, + output_format: str, + rollover_backjump_seconds: int, + jitter_seconds: int, +) -> str: + if not rebase_to_zero: + return text + result = repair_timestamps( + text, + output_format=output_format, + rollover_backjump_seconds=rollover_backjump_seconds, + jitter_seconds=jitter_seconds, + rebase_to_zero=True, + ) + return result.repaired_text + + +def main() -> int: + args = parse_args() + input_path = Path(args.input).expanduser().resolve() + + if not input_path.exists(): + print(f"Input file not found: {input_path}", file=sys.stderr) + return 1 + + try: + sections = [parse_section_arg(value) for value in args.section] + except ValueError as exc: + 
print(str(exc), file=sys.stderr) + return 1 + + text = input_path.read_text(encoding="utf-8") + + try: + parts = split_text_by_markers( + text, + first_section_name=args.first_section_name, + sections=sections, + ) + except ValueError as exc: + print(str(exc), file=sys.stderr) + return 1 + + output_dir = ( + Path(args.output_dir).expanduser().resolve() + if args.output_dir + else input_path.parent + ) + output_dir.mkdir(parents=True, exist_ok=True) + + for idx, (name, content) in enumerate(parts, start=1): + content = maybe_rebase( + content, + rebase_to_zero=args.rebase_to_zero, + output_format=args.format, + rollover_backjump_seconds=args.rollover_backjump_seconds, + jitter_seconds=args.jitter_seconds, + ) + safe_name = sanitize_filename_component(name) + output_path = output_dir / f"{input_path.stem}-{idx:02d}-{safe_name}{input_path.suffix}" + output_path.write_text(content, encoding="utf-8") + print(f"{output_path}\t{len(content.splitlines())} lines") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/transcript-fixer/scripts/tests/test_fix_transcript_timestamps.py b/transcript-fixer/scripts/tests/test_fix_transcript_timestamps.py new file mode 100644 index 0000000..b7fd457 --- /dev/null +++ b/transcript-fixer/scripts/tests/test_fix_transcript_timestamps.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +"""Tests for transcript timestamp normalization and rebasing.""" + +import sys +import unittest +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from fix_transcript_timestamps import repair_timestamps + + +class TestFixTranscriptTimestamps(unittest.TestCase): + def test_rollover_fix(self): + text = ( + "甲 58:50\n" + "内容 A\n" + "乙 59:58\n" + "内容 B\n" + "丙 00:05\n" + "内容 C\n" + ) + result = repair_timestamps( + text, + output_format="hhmmss", + rollover_backjump_seconds=15 * 60, + jitter_seconds=5, + rebase_to_zero=False, + ) + self.assertIn("甲 00:58:50", result.repaired_text) + self.assertIn("乙 
00:59:58", result.repaired_text) + self.assertIn("丙 01:00:05", result.repaired_text) + self.assertEqual(len(result.anomalies), 0) + + def test_rebase_to_zero(self): + text = ( + "甲 01:31:10\n" + "内容 A\n" + "乙 01:31:12\n" + "内容 B\n" + ) + result = repair_timestamps( + text, + output_format="hhmmss", + rollover_backjump_seconds=15 * 60, + jitter_seconds=5, + rebase_to_zero=True, + ) + self.assertIn("甲 00:00:00", result.repaired_text) + self.assertIn("乙 00:00:02", result.repaired_text) + + +if __name__ == "__main__": + unittest.main() diff --git a/transcript-fixer/scripts/tests/test_split_transcript_sections.py b/transcript-fixer/scripts/tests/test_split_transcript_sections.py new file mode 100644 index 0000000..d896e2b --- /dev/null +++ b/transcript-fixer/scripts/tests/test_split_transcript_sections.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +"""Tests for transcript section splitting.""" + +import sys +import unittest +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from split_transcript_sections import ( + SectionSpec, + sanitize_filename_component, + split_text_by_markers, +) + + +class TestSplitTranscriptSections(unittest.TestCase): + def test_split_text_by_markers(self): + text = ( + "预热内容\n" + "开始安装\n" + "装环境内容\n" + "我们复盘一下。\n" + "复盘内容\n" + ) + parts = split_text_by_markers( + text, + first_section_name="课前聊天", + sections=[ + SectionSpec(name="正式上课", marker="开始安装"), + SectionSpec(name="课后复盘", marker="我们复盘一下。"), + ], + ) + self.assertEqual(parts[0][0], "课前聊天") + self.assertEqual(parts[1][0], "正式上课") + self.assertEqual(parts[2][0], "课后复盘") + self.assertTrue(parts[1][1].startswith("开始安装")) + self.assertTrue(parts[2][1].startswith("我们复盘一下。")) + + def test_sanitize_filename_component(self): + self.assertEqual(sanitize_filename_component("课后/复盘"), "课后-复盘") + + +if __name__ == "__main__": + unittest.main()