/ clis / youtube / transcript-group.js
transcript-group.js
  1  /**
  2   * Transcript grouping: sentence merging, speaker detection, and chapter support.
  3   * Ported and simplified from Defuddle's YouTube extractor.
  4   *
  5   * Raw segments (2-3 second fragments) are grouped into readable paragraphs:
  6   * - Sentence boundaries: merge until sentence-ending punctuation (.!?)
  7   * - Speaker turns: detect ">>" markers from YouTube auto-captions
  8   * - Chapters: optional chapter headings inserted at appropriate timestamps
  9   */
 10  // Include CJK sentence-ending punctuation: 。!? (fullwidth: .!?)
 11  const SENTENCE_END = /[.!?\u3002\uFF01\uFF1F\uFF0E]["'\u2019\u201D)]*\s*$/;
 12  const QUESTION_END = /[?\uFF1F]["'\u2019\u201D)]*\s*$/;
 13  const TRANSCRIPT_GROUP_GAP_SECONDS = 20;
 14  const TURN_MERGE_MAX_WORDS = 80;
 15  const TURN_MERGE_MAX_SPAN_SECONDS = 45;
 16  const SHORT_UTTERANCE_MAX_WORDS = 3;
 17  const FIRST_GROUP_MERGE_MIN_WORDS = 8;
 18  function countWords(text) {
 19      return text.split(/\s+/).filter(Boolean).length;
 20  }
 21  /**
 22   * Group raw transcript segments into readable blocks.
 23   * If speaker markers (>>) are present, groups by speaker turn.
 24   * Otherwise, groups by sentence boundaries.
 25   */
 26  export function groupTranscriptSegments(segments) {
 27      if (segments.length === 0)
 28          return [];
 29      const hasSpeakerMarkers = segments.some(s => /^>>/.test(s.text));
 30      return hasSpeakerMarkers ? groupBySpeaker(segments) : groupBySentence(segments);
 31  }
 32  /**
 33   * Format grouped segments + chapters into a final text output.
 34   */
 35  export function formatGroupedTranscript(segments, chapters = []) {
 36      const sortedChapters = [...chapters].sort((a, b) => a.start - b.start);
 37      let chapterIdx = 0;
 38      const rows = [];
 39      const textParts = [];
 40      for (const segment of segments) {
 41          // Insert chapter headings
 42          while (chapterIdx < sortedChapters.length && sortedChapters[chapterIdx].start <= segment.start) {
 43              const title = sortedChapters[chapterIdx].title;
 44              rows.push({ timestamp: fmtTime(sortedChapters[chapterIdx].start), speaker: '', text: `[Chapter] ${title}` });
 45              if (textParts.length > 0)
 46                  textParts.push('');
 47              textParts.push(`### ${title}`);
 48              textParts.push('');
 49              chapterIdx++;
 50          }
 51          const timestamp = fmtTime(segment.start);
 52          const speaker = segment.speaker !== undefined ? `Speaker ${segment.speaker + 1}` : '';
 53          rows.push({ timestamp, speaker, text: segment.text });
 54          if (segment.speakerChange && textParts.length > 0) {
 55              textParts.push('');
 56          }
 57          textParts.push(`${timestamp} ${segment.text}`);
 58      }
 59      return { rows, plainText: textParts.join('\n') };
 60  }
 61  function fmtTime(sec) {
 62      const h = Math.floor(sec / 3600);
 63      const m = Math.floor((sec % 3600) / 60);
 64      const s = Math.floor(sec % 60);
 65      if (h > 0) {
 66          return `${h}:${String(m).padStart(2, '0')}:${String(s).padStart(2, '0')}`;
 67      }
 68      return `${m}:${String(s).padStart(2, '0')}`;
 69  }
 70  // ── Sentence grouping ─────────────────────────────────────────────────────
 71  // Max time span (seconds) for a single group when no sentence boundaries are found.
 72  // Prevents unbounded merging for languages without punctuation (Chinese, etc.).
 73  const MAX_GROUP_SPAN_SECONDS = 30;
 74  function groupBySentence(segments) {
 75      const groups = [];
 76      let buffer = '';
 77      let bufferStart = 0;
 78      let lastStart = 0;
 79      const flush = () => {
 80          if (buffer.trim()) {
 81              groups.push({ start: bufferStart, text: buffer.trim(), speakerChange: false });
 82              buffer = '';
 83          }
 84      };
 85      for (const seg of segments) {
 86          // Large gap between segments — always flush
 87          if (buffer && seg.start - lastStart > TRANSCRIPT_GROUP_GAP_SECONDS) {
 88              flush();
 89          }
 90          // Time-based flush: prevent unbounded groups for unpunctuated languages
 91          if (buffer && seg.start - bufferStart > MAX_GROUP_SPAN_SECONDS) {
 92              flush();
 93          }
 94          if (!buffer)
 95              bufferStart = seg.start;
 96          buffer += (buffer ? ' ' : '') + seg.text;
 97          lastStart = seg.start;
 98          if (SENTENCE_END.test(seg.text))
 99              flush();
100      }
101      flush();
102      return groups;
103  }
// ── Speaker grouping ──────────────────────────────────────────────────────
/**
 * Group segments into speaker turns, then into sentence groups within each turn.
 *
 * A segment whose text starts with ">>" opens a new turn only when the
 * previous cleaned segment was empty or ended a sentence; otherwise the marker
 * is treated as a mid-sentence artifact and the text joins the current turn.
 * Speaker indices alternate between 0 and 1 on every accepted marker. Segments
 * seen before the first accepted marker form a turn without a speaker.
 *
 * The ">>" marker and a leading "- " are stripped from the text. Turns that
 * have a speaker additionally get their sentence groups merged via
 * mergeSentenceGroupsWithinTurn.
 *
 * @param {Array<{start: number, text: string}>} segments
 * @returns {Array<{start: number, text: string, speakerChange: boolean, speaker: (number|undefined)}>}
 */
function groupBySpeaker(segments) {
    const turns = [];
    let activeTurn = null;
    let speaker = -1;
    let previousText = '';
    for (const { start, text } of segments) {
        const spoken = text.replace(/^>>\s*/, '').replace(/^-\s+/, '');
        // The previous segment "ended" if there was none (or it was empty), or
        // if it finished with sentence punctuation and not a trailing comma.
        const previousEnded = previousText === ''
            || (SENTENCE_END.test(previousText) && !/,\s*$/.test(previousText));
        const opensNewTurn = /^>>/.test(text) && previousEnded;
        if (opensNewTurn) {
            if (activeTurn !== null) {
                turns.push(activeTurn);
            }
            speaker = (speaker + 1) % 2;
            activeTurn = { start, segments: [], speakerChange: true, speaker };
        }
        else if (activeTurn === null) {
            // Leading material before any speaker marker: a turn with no speaker.
            activeTurn = { start, segments: [], speakerChange: false };
        }
        activeTurn.segments.push({ start, text: spoken });
        previousText = spoken;
    }
    if (activeTurn !== null) {
        turns.push(activeTurn);
    }
    splitAffirmativeTurns(turns);
    return turns.flatMap((turn) => {
        const sentences = groupBySentence(turn.segments);
        const blocks = turn.speaker === undefined
            ? sentences
            : mergeSentenceGroupsWithinTurn(sentences);
        // Only the first block of a turn carries the turn's speaker-change flag.
        return blocks.map((block, position) => ({
            ...block,
            speakerChange: position === 0 && turn.speakerChange,
            speaker: turn.speaker,
        }));
    });
}
/**
 * Split turns that open with a short affirmative ("Yeah.", "Okay", ...) followed
 * by a long remainder into two turns, in place.
 *
 * A turn is split when it has a speaker, its first segment starts with one of
 * the listed affirmatives followed by whitespace (not ending in a comma), and
 * the rest of the turn contains at least 30 words. The affirmative keeps the
 * turn's speaker and speaker-change flag; the remainder becomes a new turn
 * attributed to the other speaker index (0 <-> 1) and flagged as a speaker change.
 *
 * @param {Array<{start: number, segments: Array<{start: number, text: string}>, speakerChange: boolean, speaker?: number}>} turns
 *   Mutated in place; nothing is returned.
 */
function splitAffirmativeTurns(turns) {
    const leadingAffirmative = /^(mhm|yeah|yes|yep|right|okay|ok|absolutely|sure|exactly|uh-huh|mm-hmm)[.!,]?\s+/i;
    const minWordsAfterAffirmative = 30;
    const wordsIn = (segs) => segs.reduce((total, seg) => total + countWords(seg.text), 0);
    let index = 0;
    while (index < turns.length) {
        const position = index;
        const turn = turns[position];
        index += 1;
        if (turn.speaker === undefined || turn.segments.length === 0) {
            continue;
        }
        const [head, ...tail] = turn.segments;
        const opener = leadingAffirmative.exec(head.text)?.[0];
        // No affirmative, or "Yeah, ..." where the comma signals a continuation.
        if (opener === undefined || /,\s*$/.test(opener)) {
            continue;
        }
        const trailing = head.text.slice(opener.length).trim();
        if (countWords(trailing) + wordsIn(tail) < minWordsAfterAffirmative) {
            continue;
        }
        const followUpSegments = trailing
            ? [{ start: head.start, text: trailing }, ...tail]
            : tail;
        const affirmativeTurn = {
            start: turn.start,
            segments: [{ start: head.start, text: opener.trimEnd() }],
            speakerChange: turn.speakerChange,
            speaker: turn.speaker,
        };
        const followUpTurn = {
            start: followUpSegments[0].start,
            segments: followUpSegments,
            speakerChange: true,
            speaker: turn.speaker === 0 ? 1 : 0,
        };
        turns.splice(position, 1, affirmativeTurn, followUpTurn);
        // Step over the follow-up turn that was just inserted.
        index += 1;
    }
}
/**
 * Fold adjacent sentence groups of one speaker turn into larger paragraphs.
 *
 * Walks the groups left to right; whenever shouldMergeSentenceGroups approves,
 * the next group's text is appended (space-separated) to the group being
 * built, which keeps its original `start`. Input objects are copied, not
 * mutated. With zero or one group the input array itself is returned.
 *
 * @param {Array<{start: number, text: string}>} groups
 * @returns {Array<{start: number, text: string}>}
 */
function mergeSentenceGroupsWithinTurn(groups) {
    if (groups.length <= 1) {
        return groups;
    }
    const [head, ...rest] = groups;
    const merged = [{ ...head }];
    for (const candidate of rest) {
        const tail = merged[merged.length - 1];
        // The tail is still the turn's opening group until something is appended after it.
        const tailIsFirstInTurn = merged.length === 1;
        if (shouldMergeSentenceGroups(tail, candidate, tailIsFirstInTurn)) {
            tail.text = `${tail.text} ${candidate.text}`;
        }
        else {
            merged.push({ ...candidate });
        }
    }
    return merged;
}
/**
 * Decide whether `next` may be appended to `current` within one speaker turn.
 *
 * Merging is refused when any of these holds:
 * - either group is a short standalone utterance;
 * - `current` opens the turn and has fewer than FIRST_GROUP_MERGE_MIN_WORDS words;
 * - either group ends with a question mark;
 * - the combined word count exceeds TURN_MERGE_MAX_WORDS;
 * - `next` starts more than TURN_MERGE_MAX_SPAN_SECONDS after `current` starts.
 *
 * @param {{start: number, text: string}} current
 * @param {{start: number, text: string}} next
 * @param {boolean} currentIsFirstInTurn
 * @returns {boolean}
 */
function shouldMergeSentenceGroups(current, next, currentIsFirstInTurn) {
    const wordsInCurrent = countWords(current.text);
    const wordsInNext = countWords(next.text);
    // Evaluated lazily, in order; the first veto that fires settles the answer.
    const vetoes = [
        () => isShortStandaloneUtterance(current.text, wordsInCurrent),
        () => isShortStandaloneUtterance(next.text, wordsInNext),
        () => currentIsFirstInTurn && wordsInCurrent < FIRST_GROUP_MERGE_MIN_WORDS,
        () => QUESTION_END.test(current.text),
        () => QUESTION_END.test(next.text),
        () => wordsInCurrent + wordsInNext > TURN_MERGE_MAX_WORDS,
        () => next.start - current.start > TURN_MERGE_MAX_SPAN_SECONDS,
    ];
    return !vetoes.some((isVetoed) => isVetoed());
}
/**
 * Tell whether `text` is a brief, complete utterance: between one and
 * SHORT_UTTERANCE_MAX_WORDS words and ending with sentence punctuation.
 *
 * @param {string} text
 * @param {number} [words] Precomputed word count; counted from `text` when
 *   omitted (undefined or null).
 * @returns {boolean}
 */
function isShortStandaloneUtterance(text, words) {
    const wordCount = words === undefined || words === null ? countWords(text) : words;
    const isBrief = wordCount > 0 && wordCount <= SHORT_UTTERANCE_MAX_WORDS;
    if (!isBrief) {
        return false;
    }
    return SENTENCE_END.test(text);
}
226  }