#!/usr/bin/env bash
#
# Basic Audio Transcription Example
# Demonstrates how to use the audio-transcriber skill manually.
#
# Usage:
#   <this script> <audio_file>
#
# Environment:
#   MODEL          Model name handed to the transcription engine (default: base).
#   OUTPUT_FORMAT  Requested output format (default: markdown).

set -euo pipefail

# Configuration
AUDIO_FILE="${1:-}"
MODEL="${MODEL:-base}" # Options: tiny, base, small, medium, large
# NOTE(review): OUTPUT_FORMAT is assigned here but nothing in the visible
# script reads it afterwards - confirm whether other formats are planned.
# shellcheck disable=SC2034
OUTPUT_FORMAT="${OUTPUT_FORMAT:-markdown}" # Options: markdown, txt, srt, vtt, json

# Colors for output. Stored as backslash escapes and expanded by printf's %b.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Helper functions
#
# Each helper prints "$1" wrapped in a color. The message itself is printed
# with %s so that backslashes in it (e.g. from a file name) are shown verbatim
# instead of being interpreted as escape sequences.

# Print an error message to stderr and terminate the script with status 1.
error() {
  printf '%b❌ Error: %s%b\n' "$RED" "$1" "$NC" >&2
  exit 1
}

# Print a success message to stdout.
success() {
  printf '%b✅ %s%b\n' "$GREEN" "$1" "$NC"
}

# Print an informational message to stdout.
info() {
  printf '%bℹ️ %s%b\n' "$BLUE" "$1" "$NC"
}

# Print a warning message to stdout.
warn() {
  printf '%b⚠️ %s%b\n' "$YELLOW" "$1" "$NC"
}

# Check if audio file is provided
if [[ -z "$AUDIO_FILE" ]]; then
  error "Usage: $0 <audio_file>"
fi

# Verify file exists
if [[ ! -f "$AUDIO_FILE" ]]; then
  error "File not found: $AUDIO_FILE"
fi

# Step 0: Discovery - Check for transcription tools
info "Step 0: Discovering transcription tools..."

# faster-whisper is tried first; plain whisper is the fallback. The import
# probes run inside `if` conditions, so a failing probe does not trip `set -e`.
TRANSCRIBER=""
if python3 -c "import faster_whisper" 2>/dev/null; then
  TRANSCRIBER="faster-whisper"
  success "Faster-Whisper detected (optimized)"
elif python3 -c "import whisper" 2>/dev/null; then
  TRANSCRIBER="whisper"
  success "OpenAI Whisper detected"
else
  error "No transcription tool found. Install with: pip install faster-whisper"
fi

# Check for ffmpeg (its absence only produces a warning, not a failure)
if command -v ffmpeg &>/dev/null; then
  success "ffmpeg available (format conversion enabled)"
else
  warn "ffmpeg not found (limited format support)"
fi

# Step 1: Extract metadata
info "Step 1: Extracting audio metadata..."
# Human-readable size of the input file (first column of `du -h`).
FILE_SIZE=$(du -h "$AUDIO_FILE" | cut -f1)
info "File size: $FILE_SIZE"

# Get duration if ffprobe is available.
# DURATION holds the raw value printed by ffprobe ("0" when it cannot be
# determined); DURATION_HMS is the HH:MM:SS rendering, or "Unknown".
DURATION="0"
DURATION_HMS="Unknown"
if command -v ffprobe &>/dev/null; then
  if DURATION=$(ffprobe -v error -show_entries format=duration \
    -of default=noprint_wrappers=1:nokey=1 "$AUDIO_FILE" 2>/dev/null); then
    # Drop the fractional part and format with shell arithmetic. This avoids
    # the platform-specific `date` flags and does not wrap at 24 hours.
    whole_seconds="${DURATION%.*}"
    if [[ "$whole_seconds" =~ ^[0-9]+$ ]]; then
      # 10# forces base 10 so a value with leading zeros is not read as octal.
      whole_seconds=$((10#$whole_seconds))
      printf -v DURATION_HMS '%02d:%02d:%02d' \
        $((whole_seconds / 3600)) \
        $((whole_seconds % 3600 / 60)) \
        $((whole_seconds % 60))
    fi
  else
    # ffprobe failed: keep DURATION_HMS as "Unknown" rather than "00:00:00".
    DURATION="0"
  fi
  info "Duration: $DURATION_HMS"
else
  warn "ffprobe not found - cannot extract duration"
fi

# Check file size warning: ask for confirmation above 25 MB.
SIZE_MB=$(du -m "$AUDIO_FILE" | cut -f1)
if ((SIZE_MB > 25)); then
  warn "Large file ($FILE_SIZE) - processing may take several minutes"
  CONTINUE=""
  # `read` returns non-zero on EOF (e.g. stdin is not a terminal); `|| true`
  # keeps `set -e` from aborting, and an empty answer means the default "Y".
  read -r -p "Continue? [Y/n]: " CONTINUE || true
  if [[ "$CONTINUE" =~ ^[Nn] ]]; then
    info "Transcription cancelled"
    exit 0
  fi
fi

# Step 2: Transcribe using Python
info "Step 2: Transcribing audio..."
# Derive the report path by swapping the audio file's extension for ".md".
# The extension is only stripped when the file NAME has one; otherwise a dot
# elsewhere in the path (e.g. "./audio") would be mistaken for it.
audio_name="${AUDIO_FILE##*/}"
if [[ "$audio_name" == ?*.* ]]; then
  OUTPUT_FILE="${AUDIO_FILE%.*}.md"
else
  OUTPUT_FILE="${AUDIO_FILE}.md"
fi

# Intermediate JSON handed from the transcription step to the report step.
# mktemp gives an unpredictable name; the EXIT trap removes the file on every
# exit path, including failures.
TEMP_JSON=$(mktemp "${TMPDIR:-/tmp}/transcription.XXXXXX") \
  || error "Could not create temporary file"
trap 'rm -f -- "$TEMP_JSON"' EXIT

# The heredoc delimiter is quoted so the shell expands nothing inside the
# Python source. Values are passed as arguments instead, so file names that
# contain quotes or backslashes cannot break or alter the Python code.
transcribe_status=0
python3 - "$TRANSCRIBER" "$MODEL" "$AUDIO_FILE" "$TEMP_JSON" <<'PYEOF' || transcribe_status=$?
import json
import sys

# Arguments: engine name, model name, input audio path, output JSON path.
transcriber, model_name, audio_path, json_path = sys.argv[1:5]

try:
    if transcriber == "faster-whisper":
        from faster_whisper import WhisperModel

        model = WhisperModel(model_name, device="cpu", compute_type="int8")
        segments, info = model.transcribe(audio_path, language=None, vad_filter=True)

        data = {
            "language": info.language,
            "language_probability": round(info.language_probability, 2),
            "duration": info.duration,
            "segments": [],
        }

        for segment in segments:
            data["segments"].append({
                "start": round(segment.start, 2),
                "end": round(segment.end, 2),
                "text": segment.text.strip(),
            })
    else:
        import whisper

        model = whisper.load_model(model_name)
        result = model.transcribe(audio_path)

        data = {
            "language": result["language"],
            # Duration is taken from the end of the last segment, 0 if none.
            "duration": result["segments"][-1]["end"] if result["segments"] else 0,
            "segments": result["segments"],
        }

    with open(json_path, "w") as f:
        json.dump(data, f)

    print(f"✅ Language detected: {data['language']}")
    print(f"📝 Transcribed {len(data['segments'])} segments")

except Exception as e:
    print(f"❌ Error: {e}", file=sys.stderr)
    sys.exit(1)
PYEOF

# Check if transcription succeeded: Python must exit 0 and must have written
# a non-empty JSON file (mktemp already created the file, so -f is not enough).
if ((transcribe_status != 0)) || [[ ! -s "$TEMP_JSON" ]]; then
  error "Transcription failed"
fi

# Step 3: Generate Markdown output
info "Step 3: Generating Markdown report..."
# Render the Markdown report from the intermediate JSON.
# The heredoc delimiter stays quoted so the shell leaves the Python source
# untouched; because of that, shell values cannot be written as ${VAR} inside
# it and are passed as command-line arguments instead.
report_status=0
python3 - "$TEMP_JSON" "$AUDIO_FILE" "$FILE_SIZE" "$DURATION_HMS" \
  "$TRANSCRIBER" "$MODEL" "$OUTPUT_FILE" <<'PYEOF' || report_status=$?
import json
import os
import sys
from datetime import datetime

# Arguments, in order: transcription JSON path, audio path, human-readable
# file size, HH:MM:SS duration, engine name, model name, report output path.
(json_path, audio_path, file_size, duration_hms,
 engine, model, output_path) = sys.argv[1:8]

# Load transcription data
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

# Prepare metadata
filename = os.path.basename(audio_path)
language = data["language"]
process_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
num_segments = len(data["segments"])

# Generate Markdown
markdown = f"""# Audio Transcription Report

## 📊 Metadata

| Field | Value |
|-------|-------|
| **File Name** | {filename} |
| **File Size** | {file_size} |
| **Duration** | {duration_hms} |
| **Language** | {language.upper()} |
| **Processed Date** | {process_date} |
| **Segments** | {num_segments} |
| **Transcription Engine** | {engine} (model: {model}) |

---

## 🎙️ Full Transcription

"""

# Add transcription with timestamps (MM:SS, minutes are not capped at 59)
for seg in data["segments"]:
    start_time = f"{int(seg['start'] // 60):02d}:{int(seg['start'] % 60):02d}"
    end_time = f"{int(seg['end'] // 60):02d}:{int(seg['end'] % 60):02d}"
    # Two trailing spaces before the newline form a Markdown hard line break.
    markdown += f"**[{start_time} → {end_time}]**  \n{seg['text']}\n\n"

markdown += f"""---

## 📝 Summary

*Automatic summary generation requires AI integration (Claude/GPT).*
*For now, review the full transcription above.*

---

*Generated by audio-transcriber skill example script*
*Transcription engine: {engine} | Model: {model}*
"""

# Write to file; UTF-8 is explicit because the report contains emoji.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(markdown)

print(f"✅ Markdown report saved: {output_path}")
PYEOF

# Clean up (runs whether or not report generation succeeded)
rm -f -- "$TEMP_JSON"

if ((report_status != 0)); then
  error "Markdown report generation failed"
fi

# Step 4: Display summary
success "Transcription complete!"
echo ""
echo "📊 Results:"
echo " Output file: $OUTPUT_FILE"
echo " Transcription engine: $TRANSCRIBER"
echo " Model: $MODEL"
echo ""
info "Next steps:"
echo " 1. Review the transcription: cat $OUTPUT_FILE"
echo " 2. Edit if needed: vim $OUTPUT_FILE"
echo " 3. Share with team or archive"