// transcribe.mjs
#!/usr/bin/env node
/**
 * Download MP3s and transcribe episodes using mlx_whisper.
 * Optionally adds speaker diarization via pyannote.
 * Saves JSON (source of truth) alongside markdown transcript.
 *
 * Usage:
 *   node transcribe.mjs               # all episodes without transcripts
 *   node transcribe.mjs --limit 5     # first 5 only
 *   node transcribe.mjs --diarize     # add speaker diarization
 *   node transcribe.mjs --keep-audio  # don't delete audio after transcribing
 */

import { parseArgs } from 'node:util';
import {
  readFileSync,
  writeFileSync,
  readdirSync,
  existsSync,
  unlinkSync,
  mkdirSync,
  statSync,
} from 'node:fs';
import { join, basename } from 'node:path';
import { execFileSync } from 'node:child_process';

const EPISODES_DIR = join(import.meta.dirname, 'episodes');
const AUDIO_DIR = join(import.meta.dirname, 'audio');
const JSON_DIR = join(import.meta.dirname, 'transcripts-json');
const WHISPER_MODEL = 'mlx-community/whisper-large-v3-turbo';

const { values: flags } = parseArgs({
  options: {
    limit: { type: 'string', short: 'l' },
    diarize: { type: 'boolean', short: 'd' },
    'keep-audio': { type: 'boolean', short: 'k' },
  },
});

/**
 * Pull the episode's audio URL out of its markdown file.
 * Expects a line like: **Audio**: https://...
 * @returns {string|null} the URL, or null if none found
 */
function extractAudioUrl(md) {
  const m = md.match(/\*\*Audio\*\*:\s*(https?:\/\/\S+)/);
  return m ? m[1] : null;
}

/** An episode that still contains the placeholder has not been transcribed. */
function hasTranscript(md) {
  return !md.includes('_Not yet transcribed._');
}

/** Human-readable elapsed time for progress logs, e.g. 83 -> "1m23s". */
function formatTime(seconds) {
  const m = Math.floor(seconds / 60);
  const s = Math.floor(seconds % 60);
  return `${m}m${s}s`;
}

/** Transcript timestamp: "m:ss", or "h:mm:ss" once past the one-hour mark. */
function formatTimestamp(secs) {
  const h = Math.floor(secs / 3600);
  const m = Math.floor((secs % 3600) / 60);
  const s = Math.floor(secs % 60);
  return h > 0
    ? `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`
    : `${m}:${String(s).padStart(2, '0')}`;
}

/**
 * Download an episode MP3 to destPath via curl (follows redirects; browser
 * User-Agent because some podcast hosts reject generic clients).
 * Arguments are passed as an array — no shell — so a hostile URL in an
 * episode file cannot inject commands.
 * @throws on curl failure or timeout (10 minutes)
 */
async function downloadMp3(url, destPath) {
  console.log(`  Downloading audio...`);
  execFileSync(
    'curl',
    [
      '-L',
      '-A',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
      '-o',
      destPath,
      url,
    ],
    { stdio: 'pipe', timeout: 600000 }
  );
  // statSync is portable; the previous `stat -f %z` shell-out is BSD/macOS-only.
  const mb = (statSync(destPath).size / 1024 / 1024).toFixed(1);
  console.log(`  Downloaded: ${mb}MB`);
}

/**
 * Transcribe an audio file with mlx_whisper and return the parsed JSON
 * (segments with word-level timestamps).
 * Writes into a tmp dir under AUDIO_DIR which is always emptied afterwards,
 * so a failed run cannot feed a stale transcript to the next episode.
 * @throws if mlx_whisper emits no JSON output
 */
function transcribeWithWhisper(audioPath) {
  console.log(`  Transcribing with mlx_whisper...`);
  const start = Date.now();

  const tmpDir = join(AUDIO_DIR, 'tmp-transcript');
  mkdirSync(tmpDir, { recursive: true });

  execFileSync(
    'mlx_whisper',
    [
      audioPath,
      '--model', WHISPER_MODEL,
      '--language', 'en',
      '--output-format', 'json',
      '--output-dir', tmpDir,
      '--condition-on-previous-text', 'False',
      '--hallucination-silence-threshold', '2',
      '--word-timestamps', 'True',
    ],
    { stdio: 'pipe', timeout: 7200000 }
  );

  const elapsed = (Date.now() - start) / 1000;

  let whisperJson;
  try {
    // Find the output json file
    const jsonFiles = readdirSync(tmpDir).filter((f) => f.endsWith('.json'));
    if (jsonFiles.length === 0) throw new Error('No transcript output found');
    whisperJson = JSON.parse(readFileSync(join(tmpDir, jsonFiles[0]), 'utf-8'));
  } finally {
    // Cleanup tmp even when reading/parsing throws
    for (const f of readdirSync(tmpDir)) unlinkSync(join(tmpDir, f));
  }

  const wordCount = whisperJson.segments.reduce(
    (n, s) => n + s.text.trim().split(/\s+/).length,
    0
  );
  console.log(`  Transcribed in ${formatTime(elapsed)} (${wordCount} words)`);

  return whisperJson;
}

/**
 * Convert an MP3 to 16 kHz mono WAV (the input pyannote expects).
 * @returns {string} path of the WAV written next to the MP3
 */
function convertToWav(mp3Path) {
  const wavPath = mp3Path.replace(/\.mp3$/, '.wav');
  console.log(`  Converting to WAV for diarization...`);
  execFileSync(
    'ffmpeg',
    ['-i', mp3Path, '-ar', '16000', '-ac', '1', wavPath, '-y'],
    { stdio: 'pipe', timeout: 600000 }
  );
  return wavPath;
}

/**
 * Run pyannote speaker diarization on a WAV file.
 * Writes a throwaway Python helper, runs it through `uv` (which provisions
 * Python 3.12 + pyannote + torch), and parses its JSON output.
 * @returns {Array<{start:number,end:number,speaker:string}>} speaker turns
 * @throws if no valid JSON can be extracted from the helper's output
 */
function runDiarization(wavPath) {
  console.log(`  Running speaker diarization...`);
  const start = Date.now();

  // Write a temp Python script that outputs JSON
  const pyScript = join(AUDIO_DIR, '_diarize.py');
  writeFileSync(pyScript, `
import json, sys, torch
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-community-1", token=True)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
pipeline.to(device)

result = pipeline(sys.argv[1])
sd = result.speaker_diarization

segments = []
for turn, _, speaker in sd.itertracks(yield_label=True):
    segments.append({"start": turn.start, "end": turn.end, "speaker": speaker})

print(json.dumps(segments))
`);

  let output;
  try {
    output = execFileSync(
      'uv',
      [
        'run',
        '--python', '3.12',
        '--with', 'pyannote-audio>=4.0',
        '--with', 'torch',
        '--with', 'torchaudio',
        'python3', pyScript, wavPath,
      ],
      { encoding: 'utf-8', timeout: 7200000, maxBuffer: 50 * 1024 * 1024 }
    ).trim();
  } finally {
    // Remove the helper even when uv/pyannote fails, so reruns start clean.
    unlinkSync(pyScript);
  }

  // The JSON output may be preceded by warnings/logs — extract the last line
  // that parses as valid JSON.
  const lines = output.split('\n');
  let diarization = null;
  for (let i = lines.length - 1; i >= 0; i--) {
    try {
      diarization = JSON.parse(lines[i]);
      break;
    } catch {}
  }

  if (!diarization) throw new Error('Failed to parse diarization output');

  const elapsed = (Date.now() - start) / 1000;
  const speakers = [...new Set(diarization.map((s) => s.speaker))];
  console.log(
    `  Diarized in ${formatTime(elapsed)} (${speakers.length} speakers, ${diarization.length} segments)`
  );

  return diarization;
}

/**
 * Label each whisper segment with the diarization speaker that overlaps it
 * the longest (by total overlap duration), or 'UNKNOWN' if none overlaps.
 * Mutates and returns whisperJson.
 */
function assignSpeakers(whisperJson, diarization) {
  for (const seg of whisperJson.segments) {
    const speakers = {};
    for (const d of diarization) {
      const overlapStart = Math.max(d.start, seg.start);
      const overlapEnd = Math.min(d.end, seg.end);
      if (overlapStart < overlapEnd) {
        speakers[d.speaker] = (speakers[d.speaker] || 0) + (overlapEnd - overlapStart);
      }
    }
    seg.speaker = Object.keys(speakers).length > 0
      ? Object.entries(speakers).sort((a, b) => b[1] - a[1])[0][0]
      : 'UNKNOWN';
  }
  return whisperJson;
}

/**
 * Render whisper segments as markdown. With speakers, emits a
 * "**SPEAKER** [timestamp]" heading whenever the speaker changes; without,
 * each line is "[timestamp] text". Empty segments are skipped.
 */
function renderMarkdown(whisperJson, hasSpeakers) {
  const lines = [];
  let lastSpeaker = null;

  for (const seg of whisperJson.segments) {
    const ts = formatTimestamp(seg.start);
    const text = seg.text.trim();
    if (!text) continue;

    if (hasSpeakers) {
      const speaker = seg.speaker;
      if (speaker !== lastSpeaker) {
        if (lastSpeaker !== null) lines.push('');
        lines.push(`**${speaker}** [${ts}]`);
        lastSpeaker = speaker;
      }
      lines.push(text);
    } else {
      lines.push(`[${ts}] ${text}`);
    }
  }

  return lines.join('\n');
}

/**
 * Entry point: find episodes lacking transcripts, then for each one
 * download → transcribe → (optionally) diarize → write JSON + markdown.
 * A failure on one episode is logged and the batch continues.
 */
async function main() {
  mkdirSync(AUDIO_DIR, { recursive: true });
  mkdirSync(JSON_DIR, { recursive: true });

  const files = readdirSync(EPISODES_DIR).filter((f) => f.endsWith('.md')).sort();

  // Filter to episodes needing transcription
  const todo = [];
  for (const file of files) {
    const md = readFileSync(join(EPISODES_DIR, file), 'utf-8');
    if (!hasTranscript(md)) {
      const url = extractAudioUrl(md);
      if (url) todo.push({ file, url, md });
    }
  }

  const limit = flags.limit ? Number.parseInt(flags.limit, 10) : todo.length;
  const batch = todo.slice(0, limit);

  console.log(`Found ${todo.length} episodes needing transcription, processing ${batch.length}`);
  if (flags.diarize) console.log(`Speaker diarization: enabled`);
  console.log();

  for (let i = 0; i < batch.length; i++) {
    const { file, url, md } = batch[i];
    const name = basename(file, '.md');
    console.log(`[${i + 1}/${batch.length}] ${name}`);

    const audioPath = join(AUDIO_DIR, name + '.mp3');

    try {
      // Download
      await downloadMp3(url, audioPath);

      // Transcribe (JSON with word timestamps)
      let whisperJson = transcribeWithWhisper(audioPath);

      // Diarize if requested
      let hasSpeakers = false;
      if (flags.diarize) {
        const wavPath = convertToWav(audioPath);
        const diarization = runDiarization(wavPath);
        whisperJson = assignSpeakers(whisperJson, diarization);
        hasSpeakers = true;
        // Cleanup wav
        if (existsSync(wavPath)) unlinkSync(wavPath);
      }

      // Save JSON (source of truth)
      writeFileSync(join(JSON_DIR, name + '.json'), JSON.stringify(whisperJson, null, 2));

      // Render markdown transcript
      const transcript = renderMarkdown(whisperJson, hasSpeakers);

      // Replace placeholder in markdown
      const updated = md.replace('_Not yet transcribed._', transcript);
      writeFileSync(join(EPISODES_DIR, file), updated);
      console.log(`  ✓ Written: ${file} + ${name}.json\n`);

      // Cleanup audio unless --keep-audio (kept on failure to allow retries)
      if (!flags['keep-audio'] && existsSync(audioPath)) {
        unlinkSync(audioPath);
      }
    } catch (err) {
      console.error(`  ✗ Failed: ${err.message}\n`);
    }
  }

  console.log('Done.');
}

main().catch((err) => { console.error(err); process.exit(1); });