transcript-group.js
/**
 * Transcript grouping: sentence merging, speaker detection, and chapter support.
 * Ported and simplified from Defuddle's YouTube extractor.
 *
 * Raw segments (2-3 second fragments) are grouped into readable paragraphs:
 * - Sentence boundaries: merge until sentence-ending punctuation (.!?)
 * - Speaker turns: detect ">>" markers from YouTube auto-captions
 * - Chapters: optional chapter headings inserted at appropriate timestamps
 */

// Sentence-ending punctuation, optionally followed by closing quotes/parens and
// trailing whitespace. Besides ASCII . ! ? this also matches the CJK full stop
// (U+3002) and the fullwidth forms of ! ? . (U+FF01, U+FF1F, U+FF0E).
const SENTENCE_END = /[.!?\u3002\uFF01\uFF1F\uFF0E]["'\u2019\u201D)]*\s*$/;
// Same shape as SENTENCE_END, but only for question marks (ASCII and fullwidth).
const QUESTION_END = /[?\uFF1F]["'\u2019\u201D)]*\s*$/;
const TRANSCRIPT_GROUP_GAP_SECONDS = 20;
const TURN_MERGE_MAX_WORDS = 80;
const TURN_MERGE_MAX_SPAN_SECONDS = 45;
const SHORT_UTTERANCE_MAX_WORDS = 3;
const FIRST_GROUP_MERGE_MIN_WORDS = 8;

/**
 * Count whitespace-separated words in a string.
 *
 * @param {string} text
 * @returns {number} Number of non-empty whitespace-delimited tokens.
 */
function countWords(text) {
  return text.split(/\s+/).filter(Boolean).length;
}

/**
 * Group raw transcript segments into readable blocks.
 * If speaker markers (>>) are present, groups by speaker turn.
 * Otherwise, groups by sentence boundaries.
 *
 * @param {Array<{start: number, text: string}>} segments - Raw caption fragments.
 * @returns {Array<object>} Grouped segments; an empty array for empty or missing input.
 */
export function groupTranscriptSegments(segments) {
  // A missing list is treated like an empty one instead of throwing on `.length`.
  if (!segments || segments.length === 0) {
    return [];
  }
  const hasSpeakerMarkers = segments.some((s) => /^>>/.test(s.text));
  return hasSpeakerMarkers ? groupBySpeaker(segments) : groupBySentence(segments);
}

/**
 * Format grouped segments + chapters into a final text output.
 *
 * @param {Array<{start: number, text: string, speakerChange?: boolean, speaker?: number}>} segments
 *   Grouped segments, in playback order.
 * @param {Array<{start: number, title: string}>} [chapters=[]]
 *   Optional chapters; they are sorted by start time on a copy, the input is not mutated.
 * @returns {{rows: Array<{timestamp: string, speaker: string, text: string}>, plainText: string}}
 *   `rows` is a tabular view (chapter headings appear as `[Chapter] title` rows);
 *   `plainText` is the newline-joined text with `### title` chapter headings.
 */
export function formatGroupedTranscript(segments, chapters = []) {
  const sortedChapters = [...chapters].sort((a, b) => a.start - b.start);
  let chapterIdx = 0;
  const rows = [];
  const textParts = [];
  for (const segment of segments) {
    // Emit every chapter heading that starts at or before this segment.
    while (chapterIdx < sortedChapters.length && sortedChapters[chapterIdx].start <= segment.start) {
      const chapter = sortedChapters[chapterIdx];
      rows.push({ timestamp: fmtTime(chapter.start), speaker: '', text: `[Chapter] ${chapter.title}` });
      if (textParts.length > 0) {
        textParts.push('');
      }
      textParts.push(`### ${chapter.title}`);
      textParts.push('');
      chapterIdx++;
    }
    const timestamp = fmtTime(segment.start);
    // `== null` covers both undefined and null; a null speaker must not be
    // labelled "Speaker 1" (null + 1 === 1).
    const speaker = segment.speaker == null ? '' : `Speaker ${segment.speaker + 1}`;
    rows.push({ timestamp, speaker, text: segment.text });
    // Blank line between speaker turns in the plain-text view.
    if (segment.speakerChange && textParts.length > 0) {
      textParts.push('');
    }
    textParts.push(`${timestamp} ${segment.text}`);
  }
  return { rows, plainText: textParts.join('\n') };
}

/**
 * Format a time offset in seconds as `m:ss`, or `h:mm:ss` once it reaches an hour.
 * Fractional seconds are truncated. Negative, missing, or non-numeric offsets
 * are rendered as `0:00` rather than producing `-1:-1` or `NaN:NaN`.
 *
 * @param {number} sec - Offset in seconds.
 * @returns {string}
 */
function fmtTime(sec) {
  const value = Number(sec);
  const total = Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
  const h = Math.floor(total / 3600);
  const m = Math.floor((total % 3600) / 60);
  const s = total % 60;
  const ss = String(s).padStart(2, '0');
  if (h > 0) {
    return `${h}:${String(m).padStart(2, '0')}:${ss}`;
  }
  return `${m}:${ss}`;
}

// ── Sentence grouping ─────────────────────────────────────────────────────
// Max time span (seconds) for a single group when no sentence boundaries are found.
// Prevents unbounded merging for languages without punctuation (Chinese, etc.).
const MAX_GROUP_SPAN_SECONDS = 30;

/**
 * Merge consecutive segments into sentence-sized groups.
 *
 * A group is closed when a segment ends with sentence punctuation, when the gap
 * to the previous segment exceeds TRANSCRIPT_GROUP_GAP_SECONDS, or when the
 * group already spans more than MAX_GROUP_SPAN_SECONDS.
 *
 * @param {Array<{start: number, text: string}>} segments
 * @returns {Array<{start: number, text: string, speakerChange: boolean}>}
 *   Groups with trimmed text; `speakerChange` is always false here.
 */
function groupBySentence(segments) {
  const groups = [];
  let buffer = '';
  let bufferStart = 0;
  let lastStart = 0;
  const flush = () => {
    const text = buffer.trim();
    if (text) {
      groups.push({ start: bufferStart, text, speakerChange: false });
    }
    // Always reset, even when nothing was emitted: a whitespace-only buffer
    // would otherwise stay truthy and pin bufferStart to a stale timestamp.
    buffer = '';
  };
  for (const seg of segments) {
    // Large gap between segments — always flush
    if (buffer && seg.start - lastStart > TRANSCRIPT_GROUP_GAP_SECONDS) {
      flush();
    }
    // Time-based flush: prevent unbounded groups for unpunctuated languages
    if (buffer && seg.start - bufferStart > MAX_GROUP_SPAN_SECONDS) {
      flush();
    }
    // A buffer holding only whitespace has no content yet, so the group
    // starts at this segment rather than at an earlier blank one.
    if (!buffer.trim()) {
      buffer = '';
      bufferStart = seg.start;
    }
    buffer += (buffer ? ' ' : '') + seg.text;
    lastStart = seg.start;
    if (SENTENCE_END.test(seg.text)) {
      flush();
    }
  }
  flush();
  return groups;
}

// ── Speaker grouping ──────────────────────────────────────────────────────

/**
 * Group segments into speaker turns using ">>" markers, then into sentence
 * groups within each turn.
 *
 * A ">>" marker only starts a new turn when the previous segment ended a
 * sentence (or there is no previous text) and did not end with a comma.
 * Speakers alternate between index 0 and 1. Segments seen before the first
 * accepted marker form a turn with no `speaker`.
 *
 * @param {Array<{start: number, text: string}>} segments
 * @returns {Array<{start: number, text: string, speakerChange: boolean, speaker: (number|undefined)}>}
 */
function groupBySpeaker(segments) {
  const turns = [];
  let currentTurn = null;
  let speakerIndex = -1;
  let prevSegText = '';
  for (const seg of segments) {
    const isSpeakerChange = /^>>/.test(seg.text);
    // Strip the ">>" marker and a leading "- " dash from the displayed text.
    const cleanText = seg.text.replace(/^>>\s*/, '').replace(/^-\s+/, '');
    const prevEndsWithComma = /,\s*$/.test(prevSegText);
    const prevEndedSentence = (SENTENCE_END.test(prevSegText) || !prevSegText) && !prevEndsWithComma;
    const isRealSpeakerChange = isSpeakerChange && prevEndedSentence;
    if (isRealSpeakerChange) {
      if (currentTurn) {
        turns.push(currentTurn);
      }
      speakerIndex = (speakerIndex + 1) % 2;
      currentTurn = {
        start: seg.start,
        segments: [{ start: seg.start, text: cleanText }],
        speakerChange: true,
        speaker: speakerIndex,
      };
    } else {
      if (!currentTurn) {
        currentTurn = { start: seg.start, segments: [], speakerChange: false };
      }
      currentTurn.segments.push({ start: seg.start, text: cleanText });
    }
    prevSegText = cleanText;
  }
  if (currentTurn) {
    turns.push(currentTurn);
  }
  splitAffirmativeTurns(turns);
  const groups = [];
  for (const turn of turns) {
    // Turns with a known speaker additionally get adjacent sentences merged.
    const sentenceGroups = turn.speaker === undefined
      ? groupBySentence(turn.segments)
      : mergeSentenceGroupsWithinTurn(groupBySentence(turn.segments));
    for (let i = 0; i < sentenceGroups.length; i++) {
      groups.push({
        ...sentenceGroups[i],
        // Only the first group of a turn carries the speaker-change flag.
        speakerChange: i === 0 && turn.speakerChange,
        speaker: turn.speaker,
      });
    }
  }
  return groups;
}

/**
 * Split turns that open with a short affirmative ("Yeah.", "Right!", ...)
 * followed by at least 30 more words into two turns: the affirmative alone,
 * and the remainder attributed to the other speaker. Mutates `turns` in place.
 *
 * @param {Array<object>} turns - Speaker turns as built by groupBySpeaker.
 * @returns {void}
 */
function splitAffirmativeTurns(turns) {
  const affirmativePattern = /^(mhm|yeah|yes|yep|right|okay|ok|absolutely|sure|exactly|uh-huh|mm-hmm)[.!,]?\s+/i;
  for (let i = 0; i < turns.length; i++) {
    const turn = turns[i];
    if (turn.speaker === undefined || turn.segments.length === 0) {
      continue;
    }
    const firstSeg = turn.segments[0];
    const match = affirmativePattern.exec(firstSeg.text);
    if (!match) {
      continue;
    }
    // "Yeah, ..." is left intact; only affirmatives not followed by a comma are split off.
    if (/,\s*$/.test(match[0])) {
      continue;
    }
    const remainder = firstSeg.text.slice(match[0].length).trim();
    const restSegments = turn.segments.slice(1);
    const restWords = countWords(remainder)
      + restSegments.reduce((sum, s) => sum + countWords(s.text), 0);
    if (restWords < 30) {
      continue;
    }
    const affirmativeText = match[0].trimEnd();
    // restWords >= 30 guarantees this list is non-empty.
    const newRestSegments = remainder
      ? [{ start: firstSeg.start, text: remainder }, ...restSegments]
      : restSegments;
    turns.splice(i, 1, {
      start: turn.start,
      segments: [{ start: firstSeg.start, text: affirmativeText }],
      speakerChange: turn.speakerChange,
      speaker: turn.speaker,
    }, {
      start: newRestSegments[0].start,
      segments: newRestSegments,
      speakerChange: true,
      speaker: turn.speaker === 0 ? 1 : 0,
    });
    // Skip over the remainder turn that was just inserted.
    i++;
  }
}

/**
 * Merge adjacent sentence groups of a single speaker turn into longer
 * paragraphs wherever shouldMergeSentenceGroups allows it.
 *
 * @param {Array<{start: number, text: string}>} groups
 * @returns {Array<{start: number, text: string}>} The input array itself when it
 *   has at most one group; otherwise a new array of (copied) merged groups.
 */
function mergeSentenceGroupsWithinTurn(groups) {
  if (groups.length <= 1) {
    return groups;
  }
  const merged = [];
  let current = { ...groups[0] };
  let currentIsFirstInTurn = true;
  for (let i = 1; i < groups.length; i++) {
    const next = groups[i];
    if (shouldMergeSentenceGroups(current, next, currentIsFirstInTurn)) {
      current.text = `${current.text} ${next.text}`;
      continue;
    }
    merged.push(current);
    current = { ...next };
    currentIsFirstInTurn = false;
  }
  merged.push(current);
  return merged;
}

/**
 * Decide whether `next` may be appended to `current` within one speaker turn.
 *
 * Merging is refused when either side is a short standalone utterance, when the
 * turn's opening group is shorter than FIRST_GROUP_MERGE_MIN_WORDS, when either
 * side ends with a question mark, or when the merged result would exceed
 * TURN_MERGE_MAX_WORDS words or TURN_MERGE_MAX_SPAN_SECONDS between starts.
 *
 * @param {{start: number, text: string}} current
 * @param {{start: number, text: string}} next
 * @param {boolean} currentIsFirstInTurn
 * @returns {boolean}
 */
function shouldMergeSentenceGroups(current, next, currentIsFirstInTurn) {
  const currentWords = countWords(current.text);
  const nextWords = countWords(next.text);
  if (isShortStandaloneUtterance(current.text, currentWords)
    || isShortStandaloneUtterance(next.text, nextWords)) {
    return false;
  }
  if (currentIsFirstInTurn && currentWords < FIRST_GROUP_MERGE_MIN_WORDS) {
    return false;
  }
  if (QUESTION_END.test(current.text) || QUESTION_END.test(next.text)) {
    return false;
  }
  if (currentWords + nextWords > TURN_MERGE_MAX_WORDS) {
    return false;
  }
  if (next.start - current.start > TURN_MERGE_MAX_SPAN_SECONDS) {
    return false;
  }
  return true;
}

/**
 * True for a complete sentence of at most SHORT_UTTERANCE_MAX_WORDS words.
 *
 * @param {string} text
 * @param {number} [words] - Precomputed word count; computed from `text` when omitted.
 * @returns {boolean}
 */
function isShortStandaloneUtterance(text, words) {
  const w = words ?? countWords(text);
  return w > 0 && w <= SHORT_UTTERANCE_MAX_WORDS && SENTENCE_END.test(text);
}