Cradicle Explorer

/ src / browser / article-extract.test.ts
article-extract.test.ts
  1  import { describe, expect, it } from 'vitest';
  2  import {
  3    buildExtractArticleJs,
  4    extractArticle,
  5    DEFAULT_FALLBACK_SELECTORS,
  6    type ExtractedArticle,
  7    type PageLike,
  8  } from './article-extract.js';
  9  
 10  function fakePage(response: unknown): PageLike & { lastJs: string | null } {
 11    const state = { lastJs: null as string | null };
 12    return {
 13      lastJs: null,
 14      async evaluate(js: string) {
 15        state.lastJs = js;
 16        Object.assign(this as unknown as { lastJs: string | null }, state);
 17        return response;
 18      },
 19    };
 20  }
 21  
 22  describe('buildExtractArticleJs', () => {
 23    it('embeds Readability + Readerable sources once per evaluation', () => {
 24      const js = buildExtractArticleJs();
 25      // Both libs should be inlined (matched by identifying strings from the
 26      // upstream @mozilla/readability sources).
 27      expect(js).toContain('function Readability(doc, options)');
 28      expect(js).toContain('function isProbablyReaderable');
 29    });
 30  
 31    it('serializes caller-supplied options into the evaluated JS', () => {
 32      const js = buildExtractArticleJs({
 33        cleanSelectors: ['.ads', '#banner'],
 34        fallbackSelectors: ['article', 'body'],
 35        force: true,
 36      });
 37      expect(js).toContain('[".ads","#banner"]');
 38      expect(js).toContain('["article","body"]');
 39      expect(js).toContain('const force = true;');
 40    });
 41  
 42    it('uses the default fallback chain when none is supplied', () => {
 43      const js = buildExtractArticleJs();
 44      for (const sel of DEFAULT_FALLBACK_SELECTORS) {
 45        expect(js).toContain(JSON.stringify(sel));
 46      }
 47    });
 48  
 49    it('runs fallback selection against the cleaned clone', () => {
 50      const js = buildExtractArticleJs({ cleanSelectors: ['.noise'] });
 51      expect(js).toContain('el = cloneDoc.querySelector(sel);');
 52      expect(js).not.toContain('el = document.querySelector(sel);');
 53    });
 54  
 55    it('produces syntactically valid JavaScript', () => {
 56      // Parsing via the Function constructor rejects any syntax error in the
 57      // generated code — including accidental template-literal break-outs from
 58      // the embedded Readability sources.
 59      expect(() => new Function(buildExtractArticleJs())).not.toThrow();
 60      expect(() => new Function(buildExtractArticleJs({ force: true }))).not.toThrow();
 61      expect(() => new Function(buildExtractArticleJs({
 62        cleanSelectors: ['.a', '.b'],
 63        fallbackSelectors: ['main', 'body'],
 64      }))).not.toThrow();
 65    });
 66  });
 67  
 68  describe('extractArticle (host-side)', () => {
 69    it('returns a normalized ExtractedArticle when the page responds with one', async () => {
 70      const page = fakePage({
 71        source: 'readability',
 72        html: '<p>hello</p>',
 73        title: 'Hello',
 74        byline: 'Alice',
 75        publishedTime: '2026-04-22',
 76        siteName: 'Example',
 77      });
 78      const res = await extractArticle(page);
 79      expect(res).toEqual<ExtractedArticle>({
 80        source: 'readability',
 81        html: '<p>hello</p>',
 82        title: 'Hello',
 83        byline: 'Alice',
 84        publishedTime: '2026-04-22',
 85        siteName: 'Example',
 86      });
 87    });
 88  
 89    it('drops undefined optional fields cleanly', async () => {
 90      const page = fakePage({ source: 'fallback', html: '<main>x</main>', title: 't' });
 91      const res = await extractArticle(page);
 92      expect(res).toEqual({ source: 'fallback', html: '<main>x</main>', title: 't' });
 93      expect(res).not.toHaveProperty('byline');
 94      expect(res).not.toHaveProperty('publishedTime');
 95    });
 96  
 97    it('returns null on a missing body or malformed payload', async () => {
 98      expect(await extractArticle(fakePage(null))).toBeNull();
 99      expect(await extractArticle(fakePage('oops'))).toBeNull();
100      expect(await extractArticle(fakePage({ source: 'readability' }))).toBeNull();
101      expect(await extractArticle(fakePage({ html: '<p>x</p>' }))).toBeNull();
102    });
103  
104    it('defaults title to empty string when the page omits it', async () => {
105      const page = fakePage({ source: 'pre', html: '<body><pre>x</pre></body>' });
106      const res = await extractArticle(page);
107      expect(res?.title).toBe('');
108      expect(res?.source).toBe('pre');
109    });
110  });