/ src / browser / extract.test.ts
extract.test.ts
  1  import { describe, expect, it } from 'vitest';
  2  import { buildExtractHtmlJs, chunkMarkdown, runExtractFromHtml } from './extract.js';
  3  
  4  describe('chunkMarkdown', () => {
  5      it('returns the full content when it fits in one chunk', () => {
  6          const content = 'short body';
  7          const r = chunkMarkdown({ content, start: 0, chunkSize: 20000 });
  8          expect(r.content).toBe(content);
  9          expect(r.start).toBe(0);
 10          expect(r.end).toBe(content.length);
 11          expect(r.nextStartChar).toBeNull();
 12      });
 13  
 14      it('emits next_start_char when more content remains', () => {
 15          // Build content long enough that chunkSize cuts it mid-stream.
 16          const para = 'p'.repeat(400);
 17          const content = [para, para, para].join('\n\n');
 18          const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
 19          expect(r.nextStartChar).not.toBeNull();
 20          expect(r.nextStartChar).toBeGreaterThan(0);
 21          expect(r.nextStartChar).toBeLessThan(content.length);
 22      });
 23  
 24      it('prefers to break at a paragraph boundary inside the boundary window', () => {
 25          // chunkSize=500, window=15% → [425, 500). Place `\n\n` at 450 so it lands
 26          // inside the window; the chunker should snap the cut back to it.
 27          const a = 'a'.repeat(450);
 28          const b = 'b'.repeat(400);
 29          const content = `${a}\n\n${b}`;
 30          const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
 31          expect(r.content.endsWith('\n\n')).toBe(true);
 32          expect(r.nextStartChar).toBe(r.end);
 33          expect(content.slice(r.end).startsWith('b')).toBe(true);
 34      });
 35  
 36      it('falls back to a single newline when no paragraph boundary is in window', () => {
 37          // 6 lines × 90 chars joined by `\n` → `\n` at 90, 181, 272, 363, 454.
 38          // chunkSize=500 with window [425, 500) catches the `\n` at 454.
 39          const line = 'l'.repeat(90);
 40          const content = Array.from({ length: 6 }, () => line).join('\n');
 41          const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
 42          expect(r.content.endsWith('\n')).toBe(true);
 43          expect(content.slice(r.end).startsWith('l')).toBe(true);
 44      });
 45  
 46      it('hard-cuts when no boundary is found within the window', () => {
 47          const content = 'x'.repeat(5000);
 48          const r = chunkMarkdown({ content, start: 0, chunkSize: 500 });
 49          expect(r.end).toBe(500);
 50          expect(r.content).toHaveLength(500);
 51          expect(r.nextStartChar).toBe(500);
 52      });
 53  
 54      it('handles start >= content.length with an empty final chunk', () => {
 55          const content = 'hello';
 56          const r = chunkMarkdown({ content, start: 5, chunkSize: 100 });
 57          expect(r.content).toBe('');
 58          expect(r.nextStartChar).toBeNull();
 59      });
 60  
 61      it('resumes from a provided start cursor until the stream terminates', () => {
 62          const content = `${'a'.repeat(100)}\n\n${'b'.repeat(100)}\n\n${'c'.repeat(100)}`;
 63          const first = chunkMarkdown({ content, start: 0, chunkSize: 110 });
 64          expect(first.nextStartChar).not.toBeNull();
 65          const second = chunkMarkdown({ content, start: first.nextStartChar!, chunkSize: 110 });
 66          expect(second.start).toBe(first.nextStartChar);
 67          expect(second.content.length).toBeGreaterThan(0);
 68          let cursor: number | null = second.nextStartChar;
 69          let safety = 20;
 70          while (cursor !== null && safety-- > 0) {
 71              const step = chunkMarkdown({ content, start: cursor, chunkSize: 110 });
 72              cursor = step.nextStartChar;
 73          }
 74          expect(cursor).toBeNull();
 75      });
 76  
 77      it('clamps chunk size to the configured minimum', () => {
 78          const content = 'a'.repeat(2000);
 79          const r = chunkMarkdown({ content, start: 0, chunkSize: 1 });
 80          // MIN_CHUNK_SIZE is 100 — requesting 1 should still produce >= 100 chars.
 81          expect(r.end).toBeGreaterThanOrEqual(100);
 82      });
 83  });
 84  
 85  describe('runExtractFromHtml', () => {
 86      it('converts HTML to markdown and wraps it in the chunking envelope', () => {
 87          const html = '<article><h1>Title</h1><p>Hello <strong>world</strong>.</p></article>';
 88          const r = runExtractFromHtml({
 89              html,
 90              url: 'https://example.com/a',
 91              title: 'Example',
 92              selector: 'article',
 93              start: 0,
 94              chunkSize: 20000,
 95          });
 96          expect(r.url).toBe('https://example.com/a');
 97          expect(r.title).toBe('Example');
 98          expect(r.selector).toBe('article');
 99          expect(r.content).toContain('# Title');
100          expect(r.content).toContain('**world**');
101          expect(r.start).toBe(0);
102          expect(r.end).toBe(r.content.length);
103          expect(r.total_chars).toBe(r.content.length);
104          expect(r.next_start_char).toBeNull();
105      });
106  
107      it('reports total_chars and chunk_size against the final markdown', () => {
108          const body = Array.from({ length: 30 }, (_, i) => `<p>paragraph ${i} ${'x'.repeat(200)}</p>`).join('');
109          const r = runExtractFromHtml({
110              html: `<main>${body}</main>`,
111              url: 'https://example.com/b',
112              title: 't',
113              selector: 'main',
114              start: 0,
115              chunkSize: 500,
116          });
117          expect(r.total_chars).toBeGreaterThan(r.end);
118          expect(r.chunk_size).toBe(r.end - r.start);
119          expect(r.next_start_char).toBe(r.end);
120      });
121  });
122  
// Tests for `buildExtractHtmlJs`: the generated script text should carry the
// selector literal and the denoise selectors checked below.
describe('buildExtractHtmlJs', () => {
    it('embeds the selector as a JSON literal', () => {
        const script = buildExtractHtmlJs('main.article');
        expect(script).toContain('"main.article"');
    });

    it('uses null when no selector given', () => {
        const script = buildExtractHtmlJs(null);
        // Without a selector the script should declare `sel` as the literal null.
        expect(script).toContain('const sel = null;');
    });

    it('includes the denoise selector list', () => {
        const script = buildExtractHtmlJs(null);
        // Each of these single-quoted selectors must appear in the generated source.
        const expectedSelectors = ["'script'", "'nav'", "'iframe'", "'[aria-hidden=\"true\"]'"];
        for (const quotedSelector of expectedSelectors) {
            expect(script).toContain(quotedSelector);
        }
    });
});