article-extract.test.ts
import { describe, expect, it } from 'vitest';
import {
  buildExtractArticleJs,
  extractArticle,
  DEFAULT_FALLBACK_SELECTORS,
  type ExtractedArticle,
  type PageLike,
} from './article-extract.js';

/**
 * Builds a minimal `PageLike` test double.
 *
 * `evaluate` stores the script it was handed on the returned object's
 * `lastJs` property and then resolves with `response`, so a test can control
 * what the "page" answers and inspect which script was sent to it.
 *
 * @param response - Value every `evaluate` call resolves with.
 * @returns The stub page; `lastJs` is `null` until `evaluate` is first called.
 */
function fakePage(response: unknown): PageLike & { lastJs: string | null } {
  // Close over `page` instead of writing through `this`: recording keeps
  // working when `evaluate` is invoked detached from the object, and no type
  // assertion is needed to reach `lastJs`.
  const page: PageLike & { lastJs: string | null } = {
    lastJs: null,
    async evaluate(js: string) {
      page.lastJs = js;
      return response;
    },
  };
  return page;
}

// Assertions on the script text produced by buildExtractArticleJs.
describe('buildExtractArticleJs', () => {
  it('embeds Readability + Readerable sources once per evaluation', () => {
    const js = buildExtractArticleJs();
    // Both libs should be inlined (matched by identifying strings from the
    // upstream @mozilla/readability sources).
    expect(js).toContain('function Readability(doc, options)');
    expect(js).toContain('function isProbablyReaderable');
  });

  it('serializes caller-supplied options into the evaluated JS', () => {
    const js = buildExtractArticleJs({
      cleanSelectors: ['.ads', '#banner'],
      fallbackSelectors: ['article', 'body'],
      force: true,
    });
    expect(js).toContain('[".ads","#banner"]');
    expect(js).toContain('["article","body"]');
    expect(js).toContain('const force = true;');
  });

  it('uses the default fallback chain when none is supplied', () => {
    const js = buildExtractArticleJs();
    for (const sel of DEFAULT_FALLBACK_SELECTORS) {
      expect(js).toContain(JSON.stringify(sel));
    }
  });

  it('runs fallback selection against the cleaned clone', () => {
    const js = buildExtractArticleJs({ cleanSelectors: ['.noise'] });
    expect(js).toContain('el = cloneDoc.querySelector(sel);');
    expect(js).not.toContain('el = document.querySelector(sel);');
  });

  it('produces syntactically valid JavaScript', () => {
    // Parsing via the Function constructor rejects any syntax error in the
    // generated code — including accidental template-literal break-outs from
    // the embedded Readability sources.
    expect(() => new Function(buildExtractArticleJs())).not.toThrow();
    expect(() => new Function(buildExtractArticleJs({ force: true }))).not.toThrow();
    expect(
      () =>
        new Function(
          buildExtractArticleJs({
            cleanSelectors: ['.a', '.b'],
            fallbackSelectors: ['main', 'body'],
          }),
        ),
    ).not.toThrow();
  });
});

// Assertions on how extractArticle maps the value returned by page.evaluate.
describe('extractArticle (host-side)', () => {
  it('returns a normalized ExtractedArticle when the page responds with one', async () => {
    const page = fakePage({
      source: 'readability',
      html: '<p>hello</p>',
      title: 'Hello',
      byline: 'Alice',
      publishedTime: '2026-04-22',
      siteName: 'Example',
    });
    const res = await extractArticle(page);
    expect(res).toEqual<ExtractedArticle>({
      source: 'readability',
      html: '<p>hello</p>',
      title: 'Hello',
      byline: 'Alice',
      publishedTime: '2026-04-22',
      siteName: 'Example',
    });
  });

  it('drops undefined optional fields cleanly', async () => {
    const page = fakePage({ source: 'fallback', html: '<main>x</main>', title: 't' });
    const res = await extractArticle(page);
    expect(res).toEqual({ source: 'fallback', html: '<main>x</main>', title: 't' });
    expect(res).not.toHaveProperty('byline');
    expect(res).not.toHaveProperty('publishedTime');
  });

  it('returns null on a missing body or malformed payload', async () => {
    expect(await extractArticle(fakePage(null))).toBeNull();
    expect(await extractArticle(fakePage('oops'))).toBeNull();
    expect(await extractArticle(fakePage({ source: 'readability' }))).toBeNull();
    expect(await extractArticle(fakePage({ html: '<p>x</p>' }))).toBeNull();
  });

  it('defaults title to empty string when the page omits it', async () => {
    const page = fakePage({ source: 'pre', html: '<body><pre>x</pre></body>' });
    const res = await extractArticle(page);
    expect(res?.title).toBe('');
    expect(res?.source).toBe('pre');
  });
});