Files
daymade ee38ae41b8 feat: add asr-transcribe-to-text skill + optimize skill-creator with AskUserQuestion
New skill: asr-transcribe-to-text (v1.0.0)
- Transcribe audio/video via configurable ASR endpoint (Qwen3-ASR default)
- Persistent config in CLAUDE_PLUGIN_DATA (endpoint, model, proxy bypass)
- Single-request-first strategy (empirically proven: 55min in one request)
- Fallback overlap-merge script for very long audio (18min chunks, 2min overlap)
- AskUserQuestion at config init, health check failure, and output verification

skill-creator optimization (v1.5.1 → v1.6.0)
- Add AskUserQuestion best practices section (Re-ground/Simplify/Recommend/Options)
- Inject structured decision points at 8 key workflow stages
- Inspired by gstack's atomic question pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-23 00:03:45 +08:00

201 lines
7.8 KiB
Python

# /// script
# requires-python = ">=3.9"
# ///
"""
Overlap-merge transcription for long audio files.

Splits audio into overlapping chunks (18 minutes with a 2-minute overlap by
default), transcribes each chunk via a configurable ASR endpoint, then merges
the results using punctuation-stripped fuzzy matching to eliminate sentence
truncation at chunk boundaries.

Usage:
    python3 scripts/overlap_merge_transcribe.py INPUT_AUDIO OUTPUT.txt --config CONFIG.json
    python3 scripts/overlap_merge_transcribe.py INPUT_AUDIO OUTPUT.txt --endpoint URL --model MODEL
"""
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
def get_duration(audio_path: str) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        audio_path: Path to the audio/video file to probe.

    Returns:
        The container-level duration (ffprobe ``format=duration``) in seconds.

    Raises:
        ValueError: If ffprobe prints nothing that parses as a number, for
            example because the file is missing or unreadable. ffprobe's exit
            code and error output are written to stderr before the error
            propagates.
    """
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", audio_path],
        capture_output=True, text=True,
    )
    raw = result.stdout.strip()
    try:
        return float(raw)
    except ValueError:
        # A bare "could not convert string to float" hides the real cause,
        # so surface what ffprobe actually said before re-raising.
        detail = result.stderr.strip() or f"unexpected output {raw!r}"
        print(
            f"ffprobe could not read the duration of {audio_path} "
            f"(exit code {result.returncode}): {detail}",
            file=sys.stderr,
        )
        raise
def split_audio(audio_path: str, chunk_dir: str, chunk_duration: int, overlap: int) -> list[tuple[int, int, str]]:
    """Split audio into overlapping chunks with ffmpeg (audio stream copy).

    Consecutive chunks start ``chunk_duration - overlap`` seconds apart. The
    final chunk is pulled back so that it is full-length and ends at the end of
    the audio, which means it may overlap its predecessor by more than
    ``overlap`` seconds. Audio shorter than one chunk yields a single chunk
    covering the whole file.

    Args:
        audio_path: Input media file.
        chunk_dir: Existing directory that receives ``chunk_NN.mp3`` files.
        chunk_duration: Length of each chunk in seconds; must be positive.
        overlap: Seconds shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_duration``.

    Returns:
        List of ``(start_sec, duration_sec, chunk_path)`` in playback order.
        Empty when the input has zero duration.

    Raises:
        ValueError: If ``chunk_duration``/``overlap`` are out of range.
        RuntimeError: If ffmpeg exits with a non-zero status for a chunk.
    """
    # Validate first: a non-positive step would otherwise end the loop after
    # one chunk and silently drop the rest of the audio.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")
    if not 0 <= overlap < chunk_duration:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_duration, "
            f"got overlap={overlap}, chunk_duration={chunk_duration}"
        )

    seconds = get_duration(audio_path)
    total = int(seconds)
    if seconds > total:
        total += 1  # round up so the trailing fraction of a second is kept

    step = chunk_duration - overlap
    chunks = []
    start = 0
    while start < total:
        duration = min(chunk_duration, total - start)
        chunk_path = os.path.join(chunk_dir, f"chunk_{len(chunks):02d}.mp3")
        proc = subprocess.run(
            ["ffmpeg", "-i", audio_path, "-ss", str(start), "-t", str(duration),
             "-acodec", "copy", chunk_path, "-y"],
            capture_output=True,
        )
        if proc.returncode != 0:
            # Fail here rather than later with a confusing upload/JSON error.
            detail = proc.stderr.decode("utf-8", errors="replace").strip()
            raise RuntimeError(
                f"ffmpeg failed to extract {chunk_path} "
                f"(exit code {proc.returncode}): {detail[-500:]}"
            )
        chunks.append((start, duration, chunk_path))
        print(f" Chunk {len(chunks)-1}: {start//60}:{start%60:02d} - {(start+duration)//60}:{(start+duration)%60:02d}", file=sys.stderr)

        if start + duration >= total:
            break  # this chunk already reaches the end; nothing left to cover

        next_start = start + step
        if next_start + chunk_duration >= total:
            # Pull the last chunk back so it is full-length and ends at the end.
            next_start = total - chunk_duration
        start = next_start
    return chunks
def transcribe(audio_path: str, endpoint: str, model: str, noproxy: bool = True) -> str:
    """Upload an audio file to the ASR endpoint with curl and return its text.

    The file is posted as a multipart form with the fields ``file`` and
    ``model``; the response is expected to be a JSON object with a ``"text"``
    key. The request is limited to 600 seconds by curl's ``--max-time``.

    Args:
        audio_path: Path of the audio file to upload.
        endpoint: URL of the transcription endpoint.
        model: Model name sent in the ``model`` form field.
        noproxy: When true, pass ``--noproxy '*'`` so curl bypasses any proxy.

    Returns:
        The value of the ``"text"`` field of the JSON response.

    Raises:
        json.JSONDecodeError: If the response body is not JSON (this includes
            curl failing and printing nothing).
        KeyError: If the JSON object has no ``"text"`` field.
        TypeError: If the JSON response is not an object.

    A short diagnostic is written to stderr before any of these propagate.
    """
    # -S makes curl report transport errors on stderr even though -s is set.
    command = ["curl", "-s", "-S", "--max-time", "600"]
    if noproxy:
        command += ["--noproxy", "*"]
    command += [endpoint, "-F", f"file=@{audio_path}", "-F", f"model={model}"]

    result = subprocess.run(command, capture_output=True, text=True)

    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        detail = result.stderr.strip() or result.stdout.strip()[:500] or "empty response"
        print(
            f"ASR request to {endpoint} failed "
            f"(curl exit code {result.returncode}): {detail}",
            file=sys.stderr,
        )
        raise

    try:
        return data["text"]
    except (KeyError, TypeError):
        # Typically an error payload from the server; show it to the user.
        print(
            f"ASR response from {endpoint} has no 'text' field: "
            f"{result.stdout.strip()[:500]}",
            file=sys.stderr,
        )
        raise
def strip_punct(text: str) -> str:
    """Remove every character that is not a regex word character.

    Kept: Unicode letters (including CJK ideographs), digits and the
    underscore. Dropped: punctuation, whitespace and other symbols.
    """
    return re.sub(r'[^\w\u4e00-\u9fff]', '', text)


def _clean_positions(text: str) -> list[int]:
    """Return the indices in ``text`` of the characters strip_punct keeps."""
    return [idx for idx, ch in enumerate(text) if strip_punct(ch)]


def fuzzy_merge(text_a: str, text_b: str, search_chars: int = 600, min_match: int = 15) -> str:
    """Merge two overlapping transcription segments into one text.

    The ASR model produces slightly different punctuation for the same audio
    across runs, so exact string matching fails. Both texts are therefore
    compared with punctuation stripped: the longest run of at least
    ``min_match`` stripped characters shared by the tail of ``text_a`` and
    ``text_b`` is taken as the overlap.

    The result is ``text_a`` up to the start of that overlap followed by
    ``text_b`` from the start of the same overlap. The overlap therefore
    appears exactly once, in ``text_b``'s wording, because ``text_a`` truncates
    its final sentence while ``text_b`` has the complete version.

    Args:
        text_a: Earlier segment (or everything merged so far).
        text_b: Later segment whose beginning repeats the end of ``text_a``.
        search_chars: How many trailing raw characters of ``text_a`` to search.
        min_match: Minimum overlap length, in punctuation-stripped characters.

    Returns:
        The merged text, or ``text_a + text_b`` when no overlap of at least
        ``min_match`` characters is found (a warning is printed to stderr).
    """
    min_match = max(1, min_match)
    # Clamp at 0 so a text_a shorter than search_chars maps back correctly.
    tail_offset = max(0, len(text_a) - max(0, search_chars))
    tail_a = text_a[tail_offset:]

    # Position maps translate stripped-text indices back to raw-text indices.
    a_positions = _clean_positions(tail_a)
    b_positions = _clean_positions(text_b)
    tail_a_clean = "".join(tail_a[i] for i in a_positions)
    text_b_clean = "".join(text_b[i] for i in b_positions)

    best_match_len = 0
    best_a_clean_start = 0
    best_b_clean_start = 0
    # Seed with each min_match-long window of the tail, then extend greedily.
    for start in range(len(tail_a_clean) - min_match + 1):
        pos = text_b_clean.find(tail_a_clean[start:start + min_match])
        if pos < 0:
            continue
        match_len = min_match
        while (start + match_len < len(tail_a_clean)
               and pos + match_len < len(text_b_clean)
               and tail_a_clean[start + match_len] == text_b_clean[pos + match_len]):
            match_len += 1
        if match_len > best_match_len:
            best_match_len = match_len
            best_a_clean_start = start
            best_b_clean_start = pos

    if best_match_len < min_match:
        print(f" Warning: no overlap found ({best_match_len} chars), concatenating directly", file=sys.stderr)
        return text_a + text_b

    # Cut both texts at the START of the shared run so it is kept exactly once.
    a_cut_pos = tail_offset + a_positions[best_a_clean_start]
    b_start_pos = b_positions[best_b_clean_start]
    print(f" Merged: {best_match_len} chars matched (punct-stripped)", file=sys.stderr)
    return text_a[:a_cut_pos] + text_b[b_start_pos:]
def main():
    """Command-line entry point: split, transcribe, merge, and save.

    Steps:
      1. Parse arguments; values found in the ``--config`` JSON file
         (``endpoint``, ``model``, ``noproxy``) replace the command-line ones.
      2. Split the input into overlapping chunks inside a temporary directory.
      3. Transcribe every chunk through the ASR endpoint.
      4. Fold the chunk texts together with fuzzy_merge and write the result
         to the output file as UTF-8.

    Progress and warnings go to stderr. Exits with an error message when the
    chunk settings are invalid or no chunk could be produced.
    """
    parser = argparse.ArgumentParser(description="Overlap-merge ASR transcription")
    parser.add_argument("input", help="Input audio/video file")
    parser.add_argument("output", help="Output text file")
    parser.add_argument("--config", help="Path to config.json (from CLAUDE_PLUGIN_DATA)")
    parser.add_argument("--endpoint", default="http://workstation-4090-wsl:8002/v1/audio/transcriptions", help="ASR endpoint URL")
    parser.add_argument("--model", default="Qwen/Qwen3-ASR-1.7B", help="Model name")
    # BooleanOptionalAction keeps --noproxy working and adds --no-noproxy, so
    # the default-on flag can actually be turned off from the command line.
    parser.add_argument("--noproxy", action=argparse.BooleanOptionalAction, default=True,
                        help="Use --noproxy with curl (default: on; disable with --no-noproxy)")
    parser.add_argument("--chunk-duration", type=int, default=1080, help="Chunk duration in seconds (default: 1080 = 18min)")
    parser.add_argument("--overlap", type=int, default=120, help="Overlap duration in seconds (default: 120 = 2min)")
    args = parser.parse_args()

    if args.chunk_duration <= 0 or not 0 <= args.overlap < args.chunk_duration:
        parser.error("--chunk-duration must be > 0 and --overlap must be >= 0 and smaller than --chunk-duration")

    # Values present in the config file take precedence over CLI values.
    if args.config:
        if os.path.exists(args.config):
            with open(args.config, encoding="utf-8") as f:
                cfg = json.load(f)
            args.endpoint = cfg.get("endpoint", args.endpoint)
            args.model = cfg.get("model", args.model)
            args.noproxy = cfg.get("noproxy", args.noproxy)
        else:
            print(f"Warning: config file not found: {args.config}; using command-line values", file=sys.stderr)

    print(f"Input: {args.input}", file=sys.stderr)
    total_duration = get_duration(args.input)
    print(f"Duration: {total_duration:.0f}s ({total_duration/60:.1f}min)", file=sys.stderr)

    with tempfile.TemporaryDirectory() as chunk_dir:
        # Split
        print(f"\nSplitting into {args.chunk_duration}s chunks with {args.overlap}s overlap...", file=sys.stderr)
        chunks = split_audio(args.input, chunk_dir, args.chunk_duration, args.overlap)
        print(f"Created {len(chunks)} chunks\n", file=sys.stderr)
        if not chunks:
            # Nothing to transcribe; avoid an IndexError on texts[0] below.
            sys.exit(f"Error: no chunks were produced from {args.input}")

        # Transcribe each chunk while the temporary files still exist
        texts = []
        for i, (start, dur, path) in enumerate(chunks):
            print(f"Transcribing chunk {i} ({start//60}:{start%60:02d})...", end=" ", file=sys.stderr, flush=True)
            text = transcribe(path, args.endpoint, args.model, args.noproxy)
            texts.append(text)
            print(f"{len(text)} chars", file=sys.stderr)

    # Merge left to right so each new chunk is matched against the merged tail
    print(f"\nMerging {len(texts)} segments...", file=sys.stderr)
    merged = texts[0]
    for i, text in enumerate(texts[1:], start=1):
        print(f" Merging chunk {i-1} + {i}:", file=sys.stderr)
        merged = fuzzy_merge(merged, text)

    # Save
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(merged)
    print(f"\nDone! {len(merged)} chars saved to {args.output}", file=sys.stderr)


if __name__ == "__main__":
    main()