extract.test.ts
import { describe, expect, it } from 'vitest';
import { buildExtractHtmlJs, chunkMarkdown, runExtractFromHtml } from './extract.js';

/**
 * Requests the chunk that begins at cursor 0 of `content`.
 *
 * Most `chunkMarkdown` cases below only vary the content and the requested
 * chunk size, so the fixed `start: 0` is factored out here.
 */
const firstChunk = (content: string, chunkSize: number) =>
  chunkMarkdown({ content, start: 0, chunkSize });

describe('chunkMarkdown', () => {
  it('returns the full content when it fits in one chunk', () => {
    const text = 'short body';
    const chunk = firstChunk(text, 20000);

    expect(chunk.content).toBe(text);
    expect(chunk.start).toBe(0);
    expect(chunk.end).toBe(text.length);
    expect(chunk.nextStartChar).toBeNull();
  });

  it('emits next_start_char when more content remains', () => {
    // Three 400-char paragraphs (1204 chars in total) cannot fit in a
    // 500-char chunk, so a continuation cursor has to be reported.
    const paragraph = 'p'.repeat(400);
    const text = `${paragraph}\n\n${paragraph}\n\n${paragraph}`;
    const { nextStartChar } = firstChunk(text, 500);

    expect(nextStartChar).not.toBeNull();
    expect(nextStartChar).toBeGreaterThan(0);
    expect(nextStartChar).toBeLessThan(text.length);
  });

  it('prefers to break at a paragraph boundary inside the boundary window', () => {
    // With chunkSize=500 and a 15% window, candidate cuts live in [425, 500).
    // The blank line starts at offset 450, i.e. inside that window, so the
    // cut is expected to snap back to it instead of landing at 500.
    const text = 'a'.repeat(450) + '\n\n' + 'b'.repeat(400);
    const chunk = firstChunk(text, 500);

    expect(chunk.content.endsWith('\n\n')).toBe(true);
    expect(chunk.nextStartChar).toBe(chunk.end);
    expect(text.slice(chunk.end).startsWith('b')).toBe(true);
  });

  it('falls back to a single newline when no paragraph boundary is in window', () => {
    // Six 90-char lines joined by `\n` put newlines at 90, 181, 272, 363
    // and 454. Only the one at 454 falls inside the [425, 500) window.
    const text = new Array<string>(6).fill('l'.repeat(90)).join('\n');
    const chunk = firstChunk(text, 500);

    expect(chunk.content.endsWith('\n')).toBe(true);
    expect(text.slice(chunk.end).startsWith('l')).toBe(true);
  });

  it('hard-cuts when no boundary is found within the window', () => {
    // A run of identical characters offers no newline to snap to.
    const chunk = firstChunk('x'.repeat(5000), 500);

    expect(chunk.end).toBe(500);
    expect(chunk.content).toHaveLength(500);
    expect(chunk.nextStartChar).toBe(500);
  });

  it('handles start >= content.length with an empty final chunk', () => {
    // Cursor 5 sits exactly at the end of the 5-char input.
    const chunk = chunkMarkdown({ content: 'hello', start: 5, chunkSize: 100 });

    expect(chunk.content).toBe('');
    expect(chunk.nextStartChar).toBeNull();
  });

  it('resumes from a provided start cursor until the stream terminates', () => {
    const text = ['a', 'b', 'c'].map((ch) => ch.repeat(100)).join('\n\n');
    const chunkAt = (start: number) => chunkMarkdown({ content: text, start, chunkSize: 110 });

    const first = chunkAt(0);
    expect(first.nextStartChar).not.toBeNull();

    const second = chunkAt(first.nextStartChar!);
    expect(second.start).toBe(first.nextStartChar);
    expect(second.content.length).toBeGreaterThan(0);

    // Keep following the cursor. The step budget turns a chunker that never
    // reaches the end into a failing assertion rather than a hung test.
    let cursor: number | null = second.nextStartChar;
    for (let budget = 20; cursor !== null && budget > 0; budget -= 1) {
      cursor = chunkAt(cursor).nextStartChar;
    }
    expect(cursor).toBeNull();
  });

  it('clamps chunk size to the configured minimum', () => {
    const chunk = firstChunk('a'.repeat(2000), 1);

    // MIN_CHUNK_SIZE is 100, so asking for 1 must still yield >= 100 chars.
    expect(chunk.end).toBeGreaterThanOrEqual(100);
  });
});

describe('runExtractFromHtml', () => {
  it('converts HTML to markdown and wraps it in the chunking envelope', () => {
    const envelope = runExtractFromHtml({
      html: '<article><h1>Title</h1><p>Hello <strong>world</strong>.</p></article>',
      url: 'https://example.com/a',
      title: 'Example',
      selector: 'article',
      start: 0,
      chunkSize: 20000,
    });

    // Request metadata is echoed back unchanged.
    expect(envelope.url).toBe('https://example.com/a');
    expect(envelope.title).toBe('Example');
    expect(envelope.selector).toBe('article');

    // The heading and the bold run show up as markdown.
    expect(envelope.content).toContain('# Title');
    expect(envelope.content).toContain('**world**');

    // Everything fits in one chunk, so the cursor fields describe the whole
    // document and there is no continuation.
    expect(envelope.start).toBe(0);
    expect(envelope.end).toBe(envelope.content.length);
    expect(envelope.total_chars).toBe(envelope.content.length);
    expect(envelope.next_start_char).toBeNull();
  });

  it('reports total_chars and chunk_size against the final markdown', () => {
    // Thirty paragraphs with 200 filler chars each comfortably exceed the
    // 500-char chunk requested below.
    const filler = 'x'.repeat(200);
    let paragraphs = '';
    for (let i = 0; i < 30; i += 1) {
      paragraphs += `<p>paragraph ${i} ${filler}</p>`;
    }

    const envelope = runExtractFromHtml({
      html: `<main>${paragraphs}</main>`,
      url: 'https://example.com/b',
      title: 't',
      selector: 'main',
      start: 0,
      chunkSize: 500,
    });

    expect(envelope.total_chars).toBeGreaterThan(envelope.end);
    expect(envelope.chunk_size).toBe(envelope.end - envelope.start);
    expect(envelope.next_start_char).toBe(envelope.end);
  });
});

describe('buildExtractHtmlJs', () => {
  it('embeds the selector as a JSON literal', () => {
    const script = buildExtractHtmlJs('main.article');

    expect(script).toContain('"main.article"');
  });

  it('uses null when no selector given', () => {
    const script = buildExtractHtmlJs(null);

    // The generated expression binds `sel` and compares it against null.
    expect(script).toContain('const sel = null;');
  });

  it('includes the denoise selector list', () => {
    const script = buildExtractHtmlJs(null);

    // Spot-check a few entries of the denoise list, each in its quoted form.
    const expectedEntries = ["'script'", "'nav'", "'iframe'", "'[aria-hidden=\"true\"]'"];
    for (const entry of expectedEntries) {
      expect(script).toContain(entry);
    }
  });
});