Cradicle Explorer

/ src / browser / article-extract.e2e.test.ts
article-extract.e2e.test.ts
  1  import { afterEach, describe, expect, it } from 'vitest';
  2  import { JSDOM } from 'jsdom';
  3  import * as fs from 'node:fs';
  4  import * as os from 'node:os';
  5  import * as path from 'node:path';
  6  import { fileURLToPath } from 'node:url';
  7  import { buildExtractArticleJs, type ExtractArticleOptions, type ExtractedArticle } from './article-extract.js';
  8  import { downloadArticle } from '../download/article-download.js';
  9  
 10  const __dirname = path.dirname(fileURLToPath(import.meta.url));
 11  const fixturesDir = path.join(__dirname, '__fixtures__', 'article-extract');
 12  const tempDirs: string[] = [];
 13  
 14  afterEach(() => {
 15    for (const dir of tempDirs) fs.rmSync(dir, { recursive: true, force: true });
 16    tempDirs.length = 0;
 17  });
 18  
 19  function loadFixture(name: string): string {
 20    return fs.readFileSync(path.join(fixturesDir, name), 'utf8');
 21  }
 22  
 23  function escapeHtml(text: string): string {
 24    return text.replace(/[&<>]/g, ch => ({ '&': '&amp;', '<': '&lt;', '>': '&gt;' }[ch]!));
 25  }
 26  
 27  function runExtract(
 28    html: string,
 29    url: string,
 30    options: ExtractArticleOptions = {},
 31    contentType?: string,
 32  ): ExtractedArticle | null {
 33    const dom = new JSDOM(html, {
 34      url,
 35      contentType: 'text/html',
 36      pretendToBeVisual: true,
 37      runScripts: 'outside-only',
 38    });
 39    if (contentType) {
 40      Object.defineProperty(dom.window.document, 'contentType', {
 41        value: contentType,
 42        configurable: true,
 43      });
 44    }
 45    return dom.window.eval(buildExtractArticleJs(options)) as ExtractedArticle | null;
 46  }
 47  
 48  async function renderMarkdown(
 49    article: ExtractedArticle,
 50    url: string,
 51    options: { cleanSelectors?: string[] } = {},
 52  ): Promise<string> {
 53    const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-e2e-'));
 54    tempDirs.push(tempDir);
 55    const result = await downloadArticle({
 56      title: article.title || 'untitled',
 57      contentHtml: article.html,
 58      sourceUrl: url,
 59    }, {
 60      output: tempDir,
 61      downloadImages: false,
 62      cleanSelectors: options.cleanSelectors,
 63    });
 64    expect(result[0].status).toBe('success');
 65    return fs.readFileSync(result[0].saved, 'utf8');
 66  }
 67  
 68  describe('article extract → markdown e2e fixtures', () => {
 69    it('extracts a Wikipedia article fixture and keeps infobox/reference noise out of markdown', async () => {
 70      const url = 'https://en.wikipedia.org/wiki/Markdown';
 71      const cleanSelectors = ['.infobox', '.navbox', '.reference', '.mw-editsection', '.metadata'];
 72      const article = runExtract(loadFixture('wikipedia-markdown.html'), url, { cleanSelectors });
 73      expect(article?.source).toBe('readability');
 74      expect(article?.title).toBe('Markdown');
 75      if (!article) throw new Error('expected extracted article');
 76  
 77      const md = await renderMarkdown(article, url, { cleanSelectors });
 78      expect(md).toContain('lightweight markup language');
 79      expect(md).toContain('John Gruber');
 80      expect(md).not.toContain('Syntax description');
 81      expect(md).not.toContain('Standard file extension');
 82    });
 83  
 84    it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
 85      const url = 'https://deno.com/blog/v2.0';
 86      const article = runExtract(loadFixture('deno-v2.html'), url);
 87      expect(article?.source).toBe('readability');
 88      expect(article?.title).toBe('Announcing Deno 2 | Deno');
 89      if (!article) throw new Error('expected extracted article');
 90  
 91      const md = await renderMarkdown(article, url);
 92      expect(md).toContain('## Announcing Deno 2');
 93      expect(md).toContain('The web is humanity’s largest software platform');
 94      expect(md).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
 95      expect(md).not.toContain('Skip to main content');
 96    });
 97  
 98    it('short-circuits non-HTML raw text pages end-to-end', async () => {
 99      const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
100      const text = loadFixture('openai-cookbook-readme.txt');
101      const html = `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
102      const article = runExtract(html, url, {}, 'text/plain');
103      expect(article?.source).toBe('raw-text');
104      if (!article) throw new Error('expected extracted article');
105  
106      const md = await renderMarkdown(article, url);
107      expect(md).toContain('OPENAI\\_API\\_KEY');
108      expect(md).toContain('Example code and guides for accomplishing common tasks');
109    });
110  
111    it('short-circuits a single-pre document end-to-end', async () => {
112      const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
113      const text = loadFixture('openai-cookbook-readme.txt');
114      const html = `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
115      const article = runExtract(html, url);
116      expect(article?.source).toBe('pre');
117      if (!article) throw new Error('expected extracted article');
118  
119      const md = await renderMarkdown(article, url);
120      expect(md).toContain('OPENAI\\_API\\_KEY');
121      expect(md).toContain('Most code examples are written in Python');
122    });
123  });