// transcript.js
/**
 * YouTube transcript — uses InnerTube player API with Android client context.
 *
 * The Web client's caption URLs require a PoToken (proof of origin) generated
 * by BotGuard at runtime. The Android client returns caption URLs that work
 * without PoToken — same approach used by youtube-transcript-api (Python).
 *
 * Modes:
 *   --mode grouped (default): sentences merged, speaker detection, chapters
 *   --mode raw: every caption segment as-is with precise timestamps
 */
import { cli, Strategy } from '@jackwener/opencli/registry';
import { parseVideoId, prepareYoutubeApiPage } from './utils.js';
import { groupTranscriptSegments, formatGroupedTranscript } from './transcript-group.js';
import { CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';

cli({
  site: 'youtube',
  name: 'transcript',
  description: 'Get YouTube video transcript/subtitles',
  domain: 'www.youtube.com',
  strategy: Strategy.COOKIE,
  args: [
    { name: 'url', required: true, positional: true, help: 'YouTube video URL or video ID' },
    { name: 'lang', required: false, help: 'Language code (e.g. en, zh-Hans). Omit to auto-select' },
    { name: 'mode', required: false, default: 'grouped', help: 'Output mode: grouped (readable paragraphs) or raw (every segment)' },
  ],
  // columns intentionally omitted — raw and grouped modes return different schemas,
  // so we let the renderer auto-detect columns from the data keys.
  func: async (page, kwargs) => {
    const videoId = parseVideoId(kwargs.url);
    await prepareYoutubeApiPage(page);
    const lang = kwargs.lang || '';
    const mode = kwargs.mode || 'grouped';

    // Step 1: Get caption track URL via Android InnerTube API.
    // Runs in the page context so YouTube cookies/config are available.
    const captionData = await page.evaluate(`
      (async () => {
        const cfg = window.ytcfg?.data_ || {};
        const apiKey = cfg.INNERTUBE_API_KEY;
        if (!apiKey) return { error: 'INNERTUBE_API_KEY not found on page' };

        const resp = await fetch('/youtubei/v1/player?key=' + apiKey + '&prettyPrint=false', {
          method: 'POST',
          credentials: 'include',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
            videoId: ${JSON.stringify(videoId)}
          })
        });

        if (!resp.ok) return { error: 'InnerTube player API returned HTTP ' + resp.status };
        const data = await resp.json();

        const renderer = data.captions?.playerCaptionsTracklistRenderer;
        if (!renderer?.captionTracks?.length) {
          return { error: 'No captions available for this video' };
        }

        const tracks = renderer.captionTracks;
        const available = tracks.map(t => t.languageCode + (t.kind === 'asr' ? ' (auto)' : ''));

        const langPref = ${JSON.stringify(lang)};
        let track = null;
        if (langPref) {
          // Exact language match wins; otherwise accept a prefix match
          // (e.g. --lang zh matches zh-Hans).
          track = tracks.find(t => t.languageCode === langPref)
            || tracks.find(t => t.languageCode.startsWith(langPref));
        }
        if (!track) {
          // Prefer a manually-authored track over auto-generated (asr) captions.
          track = tracks.find(t => t.kind !== 'asr') || tracks[0];
        }

        return {
          captionUrl: track.baseUrl,
          language: track.languageCode,
          kind: track.kind || 'manual',
          available,
          requestedLang: langPref || null,
          langMatched: !!(langPref && track.languageCode === langPref),
          langPrefixMatched: !!(langPref && track.languageCode !== langPref && track.languageCode.startsWith(langPref))
        };
      })()
    `);

    if (!captionData || typeof captionData === 'string') {
      throw new CommandExecutionError(`Failed to get caption info: ${typeof captionData === 'string' ? captionData : 'null response'}`);
    }
    if (captionData.error) {
      throw new CommandExecutionError(`${captionData.error}${captionData.available ? ' (available: ' + captionData.available.join(', ') + ')' : ''}`);
    }
    // Warn if --lang was specified but not matched (neither exactly nor by prefix).
    if (captionData.requestedLang && !captionData.langMatched && !captionData.langPrefixMatched) {
      console.error(`Warning: --lang "${captionData.requestedLang}" not found. Using "${captionData.language}" instead. \nAvailable: ${captionData.available.join(', ')}`);
    }

    // Step 2: Fetch caption XML and parse segments in the page context.
    // Handles both srv1-style (<text start= dur=>) and srv3-style (<p t= d=>)
    // payloads with a simple string scanner (no DOMParser dependency).
    const segments = await page.evaluate(`
      (async () => {
        const resp = await fetch(${JSON.stringify(captionData.captionUrl)});
        const xml = await resp.text();

        if (!xml?.length) {
          return { error: 'Caption URL returned empty response' };
        }

        // Extract the value of attribute \`name\` from a raw attribute string.
        function getAttr(tag, name) {
          const needle = name + '="';
          const idx = tag.indexOf(needle);
          if (idx === -1) return '';
          const valStart = idx + needle.length;
          const valEnd = tag.indexOf('"', valStart);
          if (valEnd === -1) return '';
          return tag.substring(valStart, valEnd);
        }

        // Decode the XML entities caption payloads emit.
        // '&amp;' must be decoded LAST so '&amp;lt;' yields '&lt;', not '<'.
        function decodeEntities(s) {
          return s
            .replaceAll('&lt;', '<')
            .replaceAll('&gt;', '>')
            .replaceAll('&quot;', '"')
            .replaceAll('&#39;', "'")
            .replaceAll('&amp;', '&');
        }

        const isFormat3 = xml.includes('<p t="');
        const marker = isFormat3 ? '<p ' : '<text ';
        const endMarker = isFormat3 ? '</p>' : '</text>';
        const results = [];
        let pos = 0;

        while (true) {
          const tagStart = xml.indexOf(marker, pos);
          if (tagStart === -1) break;
          let contentStart = xml.indexOf('>', tagStart);
          if (contentStart === -1) break;
          contentStart += 1;
          const tagEnd = xml.indexOf(endMarker, contentStart);
          if (tagEnd === -1) break;

          const attrStr = xml.substring(tagStart + marker.length, contentStart - 1);
          const content = xml.substring(contentStart, tagEnd);

          let startSec, durSec;
          if (isFormat3) {
            // srv3 uses millisecond attributes: t = start, d = duration.
            startSec = (parseFloat(getAttr(attrStr, 't')) || 0) / 1000;
            durSec = (parseFloat(getAttr(attrStr, 'd')) || 0) / 1000;
          } else {
            startSec = parseFloat(getAttr(attrStr, 'start')) || 0;
            durSec = parseFloat(getAttr(attrStr, 'dur')) || 0;
          }

          // Strip inner tags (e.g. <s> in srv3 format), decode entities, and
          // collapse all whitespace (including newlines) to single spaces.
          const text = decodeEntities(content.replace(/<[^>]+>/g, '')).replace(/\\s+/g, ' ').trim();
          if (text) {
            results.push({ start: startSec, end: startSec + durSec, text });
          }

          pos = tagEnd + endMarker.length;
        }

        if (results.length === 0) {
          return { error: 'Parsed 0 segments from caption XML' };
        }

        return results;
      })()
    `);

    if (!Array.isArray(segments)) {
      throw new CommandExecutionError(segments?.error || 'Failed to parse caption segments');
    }
    if (segments.length === 0) {
      throw new EmptyResultError('youtube transcript');
    }

    // Step 3: Fetch chapters (used by grouped mode only; strictly best-effort).
    let chapters = [];
    if (mode === 'grouped') {
      try {
        const chapterData = await page.evaluate(`
          (async () => {
            const cfg = window.ytcfg?.data_ || {};
            const apiKey = cfg.INNERTUBE_API_KEY;
            if (!apiKey) return [];

            const resp = await fetch('/youtubei/v1/next?key=' + apiKey + '&prettyPrint=false', {
              method: 'POST',
              credentials: 'include',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({
                context: { client: { clientName: 'WEB', clientVersion: '2.20240101.00.00' } },
                videoId: ${JSON.stringify(videoId)}
              })
            });
            if (!resp.ok) return [];
            const data = await resp.json();

            const chapters = [];

            // Try chapterRenderer markers from the player progress bar first.
            const panels = data.playerOverlays?.playerOverlayRenderer
              ?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer
              ?.playerBar?.multiMarkersPlayerBarRenderer?.markersMap;

            if (Array.isArray(panels)) {
              for (const panel of panels) {
                const markers = panel.value?.chapters;
                if (!Array.isArray(markers)) continue;
                for (const marker of markers) {
                  const ch = marker.chapterRenderer;
                  if (!ch) continue;
                  const title = ch.title?.simpleText || '';
                  const startMs = ch.timeRangeStartMillis;
                  if (title && typeof startMs === 'number') {
                    chapters.push({ title, start: startMs / 1000 });
                  }
                }
              }
            }
            if (chapters.length > 0) return chapters;

            // Fallback: macroMarkersListItemRenderer from engagement panels;
            // parse the "h:mm:ss" / "m:ss" time description into seconds.
            const engPanels = data.engagementPanels;
            if (!Array.isArray(engPanels)) return [];
            for (const ep of engPanels) {
              const content = ep.engagementPanelSectionListRenderer?.content;
              const items = content?.macroMarkersListRenderer?.contents;
              if (!Array.isArray(items)) continue;
              for (const item of items) {
                const renderer = item.macroMarkersListItemRenderer;
                if (!renderer) continue;
                const t = renderer.title?.simpleText || '';
                const ts = renderer.timeDescription?.simpleText || '';
                if (!t || !ts) continue;
                const parts = ts.split(':').map(Number);
                let secs = null;
                if (parts.length === 3 && parts.every(n => !isNaN(n))) secs = parts[0] * 3600 + parts[1] * 60 + parts[2];
                else if (parts.length === 2 && parts.every(n => !isNaN(n))) secs = parts[0] * 60 + parts[1];
                if (secs !== null) chapters.push({ title: t, start: secs });
              }
            }
            return chapters;
          })()
        `);
        if (Array.isArray(chapterData)) {
          chapters = chapterData;
        }
      } catch {
        // Chapters are optional — proceed without them.
      }
    }

    // Step 4: Format output based on mode.
    if (mode === 'raw') {
      // Precise timestamps in seconds with decimals, matching bilibili/subtitle format.
      return segments.map((seg, i) => ({
        index: i + 1,
        start: Number(seg.start).toFixed(2) + 's',
        end: Number(seg.end).toFixed(2) + 's',
        text: seg.text,
      }));
    }

    // Grouped mode: merge sentences, detect speakers, insert chapters.
    const grouped = groupTranscriptSegments(segments.map(s => ({ start: s.start, text: s.text })));
    const { rows } = formatGroupedTranscript(grouped, chapters);
    return rows;
  },
});