/ clis / youtube / transcript.js
transcript.js
  1  /**
  2   * YouTube transcript — uses InnerTube player API with Android client context.
  3   *
  4   * The Web client's caption URLs require a PoToken (proof of origin) generated
  5   * by BotGuard at runtime. The Android client returns caption URLs that work
  6   * without PoToken — same approach used by youtube-transcript-api (Python).
  7   *
  8   * Modes:
  9   *   --mode grouped (default): sentences merged, speaker detection, chapters
 10   *   --mode raw: every caption segment as-is with precise timestamps
 11   */
 12  import { cli, Strategy } from '@jackwener/opencli/registry';
 13  import { parseVideoId, prepareYoutubeApiPage } from './utils.js';
 14  import { groupTranscriptSegments, formatGroupedTranscript, } from './transcript-group.js';
 15  import { CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
 16  cli({
 17      site: 'youtube',
 18      name: 'transcript',
 19      description: 'Get YouTube video transcript/subtitles',
 20      domain: 'www.youtube.com',
 21      strategy: Strategy.COOKIE,
 22      args: [
 23          { name: 'url', required: true, positional: true, help: 'YouTube video URL or video ID' },
 24          { name: 'lang', required: false, help: 'Language code (e.g. en, zh-Hans). Omit to auto-select' },
 25          { name: 'mode', required: false, default: 'grouped', help: 'Output mode: grouped (readable paragraphs) or raw (every segment)' },
 26      ],
 27      // columns intentionally omitted — raw and grouped modes return different schemas,
 28      // so we let the renderer auto-detect columns from the data keys.
 29      func: async (page, kwargs) => {
 30          const videoId = parseVideoId(kwargs.url);
 31          await prepareYoutubeApiPage(page);
 32          const lang = kwargs.lang || '';
 33          const mode = kwargs.mode || 'grouped';
 34          // Step 1: Get caption track URL via Android InnerTube API
 35          const captionData = await page.evaluate(`
 36        (async () => {
 37          const cfg = window.ytcfg?.data_ || {};
 38          const apiKey = cfg.INNERTUBE_API_KEY;
 39          if (!apiKey) return { error: 'INNERTUBE_API_KEY not found on page' };
 40  
 41          const resp = await fetch('/youtubei/v1/player?key=' + apiKey + '&prettyPrint=false', {
 42            method: 'POST',
 43            credentials: 'include',
 44            headers: { 'Content-Type': 'application/json' },
 45            body: JSON.stringify({
 46              context: { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } },
 47              videoId: ${JSON.stringify(videoId)}
 48            })
 49          });
 50  
 51          if (!resp.ok) return { error: 'InnerTube player API returned HTTP ' + resp.status };
 52          const data = await resp.json();
 53  
 54          const renderer = data.captions?.playerCaptionsTracklistRenderer;
 55          if (!renderer?.captionTracks?.length) {
 56            return { error: 'No captions available for this video' };
 57          }
 58  
 59          const tracks = renderer.captionTracks;
 60          const available = tracks.map(t => t.languageCode + (t.kind === 'asr' ? ' (auto)' : ''));
 61  
 62          const langPref = ${JSON.stringify(lang)};
 63          let track = null;
 64          if (langPref) {
 65            track = tracks.find(t => t.languageCode === langPref)
 66              || tracks.find(t => t.languageCode.startsWith(langPref));
 67          }
 68          if (!track) {
 69            track = tracks.find(t => t.kind !== 'asr') || tracks[0];
 70          }
 71  
 72          return {
 73            captionUrl: track.baseUrl,
 74            language: track.languageCode,
 75            kind: track.kind || 'manual',
 76            available,
 77            requestedLang: langPref || null,
 78            langMatched: !!(langPref && track.languageCode === langPref),
 79            langPrefixMatched: !!(langPref && track.languageCode !== langPref && track.languageCode.startsWith(langPref))
 80          };
 81        })()
 82      `);
 83          if (!captionData || typeof captionData === 'string') {
 84              throw new CommandExecutionError(`Failed to get caption info: ${typeof captionData === 'string' ? captionData : 'null response'}`);
 85          }
 86          if (captionData.error) {
 87              throw new CommandExecutionError(`${captionData.error}${captionData.available ? ' (available: ' + captionData.available.join(', ') + ')' : ''}`);
 88          }
 89          // Warn if --lang was specified but not matched
 90          if (captionData.requestedLang && !captionData.langMatched && !captionData.langPrefixMatched) {
 91              console.error(`Warning: --lang "${captionData.requestedLang}" not found. Using "${captionData.language}" instead. Available: ${captionData.available.join(', ')}`);
 92          }
 93          // Step 2: Fetch caption XML and parse segments
 94          const segments = await page.evaluate(`
 95        (async () => {
 96          const resp = await fetch(${JSON.stringify(captionData.captionUrl)});
 97          const xml = await resp.text();
 98  
 99          if (!xml?.length) {
100            return { error: 'Caption URL returned empty response' };
101          }
102  
103          function getAttr(tag, name) {
104            const needle = name + '="';
105            const idx = tag.indexOf(needle);
106            if (idx === -1) return '';
107            const valStart = idx + needle.length;
108            const valEnd = tag.indexOf('"', valStart);
109            if (valEnd === -1) return '';
110            return tag.substring(valStart, valEnd);
111          }
112  
113          function decodeEntities(s) {
114            return s
115              .replaceAll('&', '&')
116              .replaceAll('&lt;', '<')
117              .replaceAll('&gt;', '>')
118              .replaceAll('&quot;', '"')
119              .replaceAll('&#39;', "'");
120          }
121  
122          const isFormat3 = xml.includes('<p t="');
123          const marker = isFormat3 ? '<p ' : '<text ';
124          const endMarker = isFormat3 ? '</p>' : '</text>';
125          const results = [];
126          let pos = 0;
127  
128          while (true) {
129            const tagStart = xml.indexOf(marker, pos);
130            if (tagStart === -1) break;
131            let contentStart = xml.indexOf('>', tagStart);
132            if (contentStart === -1) break;
133            contentStart += 1;
134            const tagEnd = xml.indexOf(endMarker, contentStart);
135            if (tagEnd === -1) break;
136  
137            const attrStr = xml.substring(tagStart + marker.length, contentStart - 1);
138            const content = xml.substring(contentStart, tagEnd);
139  
140            let startSec, durSec;
141            if (isFormat3) {
142              startSec = (parseFloat(getAttr(attrStr, 't')) || 0) / 1000;
143              durSec = (parseFloat(getAttr(attrStr, 'd')) || 0) / 1000;
144            } else {
145              startSec = parseFloat(getAttr(attrStr, 'start')) || 0;
146              durSec = parseFloat(getAttr(attrStr, 'dur')) || 0;
147            }
148  
149            // Strip inner tags (e.g. <s> in srv3 format) and decode entities
150            const text = decodeEntities(content.replace(/<[^>]+>/g, '')).split('\\\\n').join(' ').trim();
151            if (text) {
152              results.push({ start: startSec, end: startSec + durSec, text });
153            }
154  
155            pos = tagEnd + endMarker.length;
156          }
157  
158          if (results.length === 0) {
159            return { error: 'Parsed 0 segments from caption XML' };
160          }
161  
162          return results;
163        })()
164      `);
165          if (!Array.isArray(segments)) {
166              throw new CommandExecutionError(segments?.error || 'Failed to parse caption segments');
167          }
168          if (segments.length === 0) {
169              throw new EmptyResultError('youtube transcript');
170          }
171          // Step 3: Fetch chapters (for grouped mode)
172          let chapters = [];
173          if (mode === 'grouped') {
174              try {
175                  const chapterData = await page.evaluate(`
176            (async () => {
177              const cfg = window.ytcfg?.data_ || {};
178              const apiKey = cfg.INNERTUBE_API_KEY;
179              if (!apiKey) return [];
180  
181              const resp = await fetch('/youtubei/v1/next?key=' + apiKey + '&prettyPrint=false', {
182                method: 'POST',
183                credentials: 'include',
184                headers: { 'Content-Type': 'application/json' },
185                body: JSON.stringify({
186                  context: { client: { clientName: 'WEB', clientVersion: '2.20240101.00.00' } },
187                  videoId: ${JSON.stringify(videoId)}
188                })
189              });
190              if (!resp.ok) return [];
191              const data = await resp.json();
192  
193              const chapters = [];
194  
195              // Try chapterRenderer from player bar
196              const panels = data.playerOverlays?.playerOverlayRenderer
197                ?.decoratedPlayerBarRenderer?.decoratedPlayerBarRenderer
198                ?.playerBar?.multiMarkersPlayerBarRenderer?.markersMap;
199  
200              if (Array.isArray(panels)) {
201                for (const panel of panels) {
202                  const markers = panel.value?.chapters;
203                  if (!Array.isArray(markers)) continue;
204                  for (const marker of markers) {
205                    const ch = marker.chapterRenderer;
206                    if (!ch) continue;
207                    const title = ch.title?.simpleText || '';
208                    const startMs = ch.timeRangeStartMillis;
209                    if (title && typeof startMs === 'number') {
210                      chapters.push({ title, start: startMs / 1000 });
211                    }
212                  }
213                }
214              }
215              if (chapters.length > 0) return chapters;
216  
217              // Fallback: macroMarkersListItemRenderer from engagement panels
218              const engPanels = data.engagementPanels;
219              if (!Array.isArray(engPanels)) return [];
220              for (const ep of engPanels) {
221                const content = ep.engagementPanelSectionListRenderer?.content;
222                const items = content?.macroMarkersListRenderer?.contents;
223                if (!Array.isArray(items)) continue;
224                for (const item of items) {
225                  const renderer = item.macroMarkersListItemRenderer;
226                  if (!renderer) continue;
227                  const t = renderer.title?.simpleText || '';
228                  const ts = renderer.timeDescription?.simpleText || '';
229                  if (!t || !ts) continue;
230                  const parts = ts.split(':').map(Number);
231                  let secs = null;
232                  if (parts.length === 3 && parts.every(n => !isNaN(n))) secs = parts[0]*3600 + parts[1]*60 + parts[2];
233                  else if (parts.length === 2 && parts.every(n => !isNaN(n))) secs = parts[0]*60 + parts[1];
234                  if (secs !== null) chapters.push({ title: t, start: secs });
235                }
236              }
237              return chapters;
238            })()
239          `);
240                  if (Array.isArray(chapterData)) {
241                      chapters = chapterData;
242                  }
243              }
244              catch {
245                  // Chapters are optional — proceed without them
246              }
247          }
248          // Step 4: Format output based on mode
249          if (mode === 'raw') {
250              // Precise timestamps in seconds with decimals, matching bilibili/subtitle format
251              return segments.map((seg, i) => ({
252                  index: i + 1,
253                  start: Number(seg.start).toFixed(2) + 's',
254                  end: Number(seg.end).toFixed(2) + 's',
255                  text: seg.text,
256              }));
257          }
258          // Grouped mode: merge sentences, detect speakers, insert chapters
259          const grouped = groupTranscriptSegments(segments.map(s => ({ start: s.start, text: s.text })));
260          const { rows } = formatGroupedTranscript(grouped, chapters);
261          return rows;
262      },
263  });