// transcribe.mjs
  1  #!/usr/bin/env node
  2  /**
  3   * Download MP3s and transcribe episodes using mlx_whisper.
  4   * Optionally adds speaker diarization via pyannote.
  5   * Saves JSON (source of truth) alongside markdown transcript.
  6   *
  7   * Usage:
  8   *   node transcribe.mjs                     # all episodes without transcripts
  9   *   node transcribe.mjs --limit 5           # first 5 only
 10   *   node transcribe.mjs --diarize           # add speaker diarization
 11   *   node transcribe.mjs --keep-audio        # don't delete audio after transcribing
 12   */
 13  
import { execFileSync, execSync } from 'node:child_process';
import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, unlinkSync, writeFileSync } from 'node:fs';
import { basename, join } from 'node:path';
import { parseArgs } from 'node:util';
 18  
 19  const EPISODES_DIR = join(import.meta.dirname, 'episodes');
 20  const AUDIO_DIR = join(import.meta.dirname, 'audio');
 21  const JSON_DIR = join(import.meta.dirname, 'transcripts-json');
 22  const WHISPER_MODEL = 'mlx-community/whisper-large-v3-turbo';
 23  
 24  const { values: flags } = parseArgs({
 25    options: {
 26      limit: { type: 'string', short: 'l' },
 27      diarize: { type: 'boolean', short: 'd' },
 28      'keep-audio': { type: 'boolean', short: 'k' },
 29    },
 30  });
 31  
 32  function extractAudioUrl(md) {
 33    const m = md.match(/\*\*Audio\*\*:\s*(https?:\/\/\S+)/);
 34    return m ? m[1] : null;
 35  }
 36  
 37  function hasTranscript(md) {
 38    return !md.includes('_Not yet transcribed._');
 39  }
 40  
 41  function formatTime(seconds) {
 42    const m = Math.floor(seconds / 60);
 43    const s = Math.floor(seconds % 60);
 44    return `${m}m${s}s`;
 45  }
 46  
 47  function formatTimestamp(secs) {
 48    const h = Math.floor(secs / 3600);
 49    const m = Math.floor((secs % 3600) / 60);
 50    const s = Math.floor(secs % 60);
 51    return h > 0
 52      ? `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`
 53      : `${m}:${String(s).padStart(2, '0')}`;
 54  }
 55  
 56  async function downloadMp3(url, destPath) {
 57    console.log(`    Downloading audio...`);
 58    execSync(
 59      `curl -L -A "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" -o "${destPath}" "${url}"`,
 60      { stdio: 'pipe', timeout: 600000 }
 61    );
 62    const stat = execSync(`stat -f %z "${destPath}"`, { encoding: 'utf-8' }).trim();
 63    const mb = (parseInt(stat) / 1024 / 1024).toFixed(1);
 64    console.log(`    Downloaded: ${mb}MB`);
 65  }
 66  
 67  function transcribeWithWhisper(audioPath, jsonOutPath) {
 68    console.log(`    Transcribing with mlx_whisper...`);
 69    const start = Date.now();
 70  
 71    const tmpDir = join(AUDIO_DIR, 'tmp-transcript');
 72    mkdirSync(tmpDir, { recursive: true });
 73  
 74    execSync(
 75      `mlx_whisper "${audioPath}" --model ${WHISPER_MODEL} --language en --output-format json --output-dir "${tmpDir}" --condition-on-previous-text False --hallucination-silence-threshold 2 --word-timestamps True`,
 76      { stdio: 'pipe', timeout: 7200000 }
 77    );
 78  
 79    const elapsed = (Date.now() - start) / 1000;
 80  
 81    // Find the output json file
 82    const jsonFiles = readdirSync(tmpDir).filter(f => f.endsWith('.json'));
 83    if (jsonFiles.length === 0) throw new Error('No transcript output found');
 84  
 85    const whisperJson = JSON.parse(readFileSync(join(tmpDir, jsonFiles[0]), 'utf-8'));
 86  
 87    // Cleanup tmp
 88    for (const f of readdirSync(tmpDir)) unlinkSync(join(tmpDir, f));
 89  
 90    const wordCount = whisperJson.segments.reduce((n, s) => n + s.text.trim().split(/\s+/).length, 0);
 91    console.log(`    Transcribed in ${formatTime(elapsed)} (${wordCount} words)`);
 92  
 93    return whisperJson;
 94  }
 95  
 96  function convertToWav(mp3Path) {
 97    const wavPath = mp3Path.replace(/\.mp3$/, '.wav');
 98    console.log(`    Converting to WAV for diarization...`);
 99    execSync(
100      `ffmpeg -i "${mp3Path}" -ar 16000 -ac 1 "${wavPath}" -y`,
101      { stdio: 'pipe', timeout: 600000 }
102    );
103    return wavPath;
104  }
105  
106  function runDiarization(wavPath) {
107    console.log(`    Running speaker diarization...`);
108    const start = Date.now();
109  
110    // Write a temp Python script that outputs JSON
111    const pyScript = join(AUDIO_DIR, '_diarize.py');
112    writeFileSync(pyScript, `
113  import json, sys, torch
114  from pyannote.audio import Pipeline
115  
116  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-community-1", token=True)
117  device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
118  pipeline.to(device)
119  
120  result = pipeline(sys.argv[1])
121  sd = result.speaker_diarization
122  
123  segments = []
124  for turn, _, speaker in sd.itertracks(yield_label=True):
125      segments.append({"start": turn.start, "end": turn.end, "speaker": speaker})
126  
127  print(json.dumps(segments))
128  `);
129  
130    const output = execSync(
131      `uv run --python 3.12 --with "pyannote-audio>=4.0" --with "torch" --with "torchaudio" python3 "${pyScript}" "${wavPath}"`,
132      { encoding: 'utf-8', timeout: 7200000, maxBuffer: 50 * 1024 * 1024 }
133    ).trim();
134  
135    unlinkSync(pyScript);
136  
137    // The JSON output may be preceded by warnings/logs on stderr — extract last line that's valid JSON
138    const lines = output.split('\n');
139    let diarization = null;
140    for (let i = lines.length - 1; i >= 0; i--) {
141      try {
142        diarization = JSON.parse(lines[i]);
143        break;
144      } catch {}
145    }
146  
147    if (!diarization) throw new Error('Failed to parse diarization output');
148  
149    const elapsed = (Date.now() - start) / 1000;
150    const speakers = [...new Set(diarization.map(s => s.speaker))];
151    console.log(`    Diarized in ${formatTime(elapsed)} (${speakers.length} speakers, ${diarization.length} segments)`);
152  
153    return diarization;
154  }
155  
156  function assignSpeakers(whisperJson, diarization) {
157    for (const seg of whisperJson.segments) {
158      const speakers = {};
159      for (const d of diarization) {
160        const overlapStart = Math.max(d.start, seg.start);
161        const overlapEnd = Math.min(d.end, seg.end);
162        if (overlapStart < overlapEnd) {
163          speakers[d.speaker] = (speakers[d.speaker] || 0) + (overlapEnd - overlapStart);
164        }
165      }
166      seg.speaker = Object.keys(speakers).length > 0
167        ? Object.entries(speakers).sort((a, b) => b[1] - a[1])[0][0]
168        : 'UNKNOWN';
169    }
170    return whisperJson;
171  }
172  
173  function renderMarkdown(whisperJson, hasSpeakers) {
174    const lines = [];
175    let lastSpeaker = null;
176  
177    for (const seg of whisperJson.segments) {
178      const ts = formatTimestamp(seg.start);
179      const text = seg.text.trim();
180      if (!text) continue;
181  
182      if (hasSpeakers) {
183        const speaker = seg.speaker;
184        if (speaker !== lastSpeaker) {
185          if (lastSpeaker !== null) lines.push('');
186          lines.push(`**${speaker}** [${ts}]`);
187          lastSpeaker = speaker;
188        }
189        lines.push(text);
190      } else {
191        lines.push(`[${ts}] ${text}`);
192      }
193    }
194  
195    return lines.join('\n');
196  }
197  
198  async function main() {
199    mkdirSync(AUDIO_DIR, { recursive: true });
200    mkdirSync(JSON_DIR, { recursive: true });
201  
202    const files = readdirSync(EPISODES_DIR).filter(f => f.endsWith('.md')).sort();
203  
204    // Filter to episodes needing transcription
205    const todo = [];
206    for (const file of files) {
207      const md = readFileSync(join(EPISODES_DIR, file), 'utf-8');
208      if (!hasTranscript(md)) {
209        const url = extractAudioUrl(md);
210        if (url) todo.push({ file, url, md });
211      }
212    }
213  
214    const limit = flags.limit ? parseInt(flags.limit) : todo.length;
215    const batch = todo.slice(0, limit);
216  
217    console.log(`Found ${todo.length} episodes needing transcription, processing ${batch.length}`);
218    if (flags.diarize) console.log(`Speaker diarization: enabled`);
219    console.log();
220  
221    for (let i = 0; i < batch.length; i++) {
222      const { file, url, md } = batch[i];
223      const name = basename(file, '.md');
224      console.log(`[${i + 1}/${batch.length}] ${name}`);
225  
226      const audioPath = join(AUDIO_DIR, name + '.mp3');
227  
228      try {
229        // Download
230        await downloadMp3(url, audioPath);
231  
232        // Transcribe (JSON with word timestamps)
233        let whisperJson = transcribeWithWhisper(audioPath);
234  
235        // Diarize if requested
236        let hasSpeakers = false;
237        if (flags.diarize) {
238          const wavPath = convertToWav(audioPath);
239          const diarization = runDiarization(wavPath);
240          whisperJson = assignSpeakers(whisperJson, diarization);
241          hasSpeakers = true;
242          // Cleanup wav
243          if (existsSync(wavPath)) unlinkSync(wavPath);
244        }
245  
246        // Save JSON (source of truth)
247        writeFileSync(join(JSON_DIR, name + '.json'), JSON.stringify(whisperJson, null, 2));
248  
249        // Render markdown transcript
250        const transcript = renderMarkdown(whisperJson, hasSpeakers);
251  
252        // Replace placeholder in markdown
253        const updated = md.replace('_Not yet transcribed._', transcript);
254        writeFileSync(join(EPISODES_DIR, file), updated);
255        console.log(`    ✓ Written: ${file} + ${name}.json\n`);
256  
257        // Cleanup audio unless --keep-audio
258        if (!flags['keep-audio'] && existsSync(audioPath)) {
259          unlinkSync(audioPath);
260        }
261      } catch (err) {
262        console.error(`    ✗ Failed: ${err.message}\n`);
263      }
264    }
265  
266    console.log('Done.');
267  }
268  
269  main().catch(err => { console.error(err); process.exit(1); });