// transcript-group.test.js
import { describe, it, expect } from 'vitest';
import { groupTranscriptSegments, formatGroupedTranscript } from './transcript-group.js';

/**
 * Asserts, index by index, that `items[i].text` strictly equals `expectedTexts[i]`.
 * The length of `items` is NOT checked here; tests that care assert it separately.
 *
 * @param {Array<{ text: string }>} items - Grouped segments or formatted rows.
 * @param {string[]} expectedTexts - Expected `text` values, in order.
 */
const expectTextsInOrder = (items, expectedTexts) => {
  expectedTexts.forEach((expected, index) => {
    expect(items[index].text).toBe(expected);
  });
};

describe('groupTranscriptSegments', () => {
  it('groups segments by sentence boundaries', () => {
    // The last two captions together form a single sentence.
    const grouped = groupTranscriptSegments([
      { start: 0, text: 'Hello there.' },
      { start: 2, text: 'How are you doing today?' },
      { start: 5, text: 'I am' },
      { start: 6, text: 'doing well.' },
    ]);

    expect(grouped).toHaveLength(3);
    expectTextsInOrder(grouped, [
      'Hello there.',
      'How are you doing today?',
      'I am doing well.',
    ]);
  });

  it('flushes on large time gaps', () => {
    // No punctuation anywhere: only the jump from 2s to 25s can split the groups.
    const grouped = groupTranscriptSegments([
      { start: 0, text: 'First part' },
      { start: 2, text: 'still first' },
      { start: 25, text: 'second part after gap' },
    ]);

    expect(grouped).toHaveLength(2);
    expectTextsInOrder(grouped, ['First part still first', 'second part after gap']);
  });

  it('respects 30s max group span for unpunctuated text', () => {
    // Stand-in for captions that carry no punctuation (e.g. CJK captions).
    const SEGMENT_COUNT = 20;
    const SECONDS_PER_SEGMENT = 2;
    // At 2s per segment with a 30s cap, a group should hold at most ~16 segments.
    const MAX_SEGMENTS_PER_GROUP = 16;

    const unpunctuated = Array.from({ length: SEGMENT_COUNT }, (_, index) => ({
      start: index * SECONDS_PER_SEGMENT,
      text: `segment${index}`,
    }));
    const grouped = groupTranscriptSegments(unpunctuated);

    // 20 segments * 2s = 40s in total, so at least two groups are expected.
    expect(grouped.length).toBeGreaterThanOrEqual(2);

    // Each input text is one space-free token, so the space-separated token
    // count of a group is used here as the number of segments merged into it.
    for (const { text } of grouped) {
      const tokens = text.split(' ');
      expect(tokens.length).toBeLessThanOrEqual(MAX_SEGMENTS_PER_GROUP);
    }
  });

  it('detects speaker changes via >> markers', () => {
    const grouped = groupTranscriptSegments([
      { start: 0, text: '>> How are you?' },
      { start: 3, text: '>> I am fine.' },
    ]);

    const hasSpeakerChange = grouped.some(({ speakerChange }) => speakerChange);
    const hasSpeakerId = grouped.some(({ speaker }) => speaker !== undefined);

    expect(hasSpeakerChange).toBe(true);
    expect(hasSpeakerId).toBe(true);
  });

  it('recognizes CJK sentence-ending punctuation', () => {
    // The ideographic full stop ends the first sentence; the next two
    // captions are expected to merge into the second one.
    const grouped = groupTranscriptSegments([
      { start: 0, text: '你好世界。' },
      { start: 2, text: '这是测试' },
      { start: 4, text: '内容。' },
    ]);

    expect(grouped).toHaveLength(2);
    expectTextsInOrder(grouped, ['你好世界。', '这是测试 内容。']);
  });

  it('returns empty array for empty input', () => {
    const grouped = groupTranscriptSegments([]);
    expect(grouped).toEqual([]);
  });
});

describe('formatGroupedTranscript', () => {
  it('formats timestamps correctly', () => {
    // 65s is expected as m:ss, 3661s as h:mm:ss.
    const formatted = formatGroupedTranscript([
      { start: 65, text: 'One minute five.', speakerChange: false },
      { start: 3661, text: 'One hour one minute.', speakerChange: false },
    ]);
    const [underAnHour, overAnHour] = formatted.rows;

    expect(underAnHour.timestamp).toBe('1:05');
    expect(overAnHour.timestamp).toBe('1:01:01');
  });

  it('inserts chapter headings at correct positions', () => {
    const groups = [
      { start: 0, text: 'Intro text.', speakerChange: false },
      { start: 60, text: 'Chapter content.', speakerChange: false },
    ];
    // The second chapter starts at 50s, before the group that begins at 60s.
    const chapterList = [
      { title: 'Introduction', start: 0 },
      { title: 'Main', start: 50 },
    ];

    const formatted = formatGroupedTranscript(groups, chapterList);

    expectTextsInOrder(formatted.rows, [
      '[Chapter] Introduction',
      'Intro text.',
      '[Chapter] Main',
      'Chapter content.',
    ]);
  });

  it('labels speakers', () => {
    // Input speaker ids 0 and 1 are expected to be labelled starting from 1.
    const formatted = formatGroupedTranscript([
      { start: 0, text: 'Hello.', speakerChange: true, speaker: 0 },
      { start: 5, text: 'Hi there.', speakerChange: true, speaker: 1 },
    ]);
    const [firstRow, secondRow] = formatted.rows;

    expect(firstRow.speaker).toBe('Speaker 1');
    expect(secondRow.speaker).toBe('Speaker 2');
  });
});