article-extract.e2e.test.ts
import { afterEach, describe, expect, it } from 'vitest';
import { JSDOM } from 'jsdom';
import * as fs from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import { buildExtractArticleJs, type ExtractArticleOptions, type ExtractedArticle } from './article-extract.js';
import { downloadArticle } from '../download/article-download.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const fixturesDir = path.join(__dirname, '__fixtures__', 'article-extract');

/** Temp directories created by `renderMarkdown`; removed again after every test. */
const tempDirs: string[] = [];

/** Replacement text for the characters that are significant inside HTML element content. */
const HTML_ESCAPES: Readonly<Record<string, string>> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
};

afterEach(() => {
  for (const dir of tempDirs) fs.rmSync(dir, { recursive: true, force: true });
  tempDirs.length = 0;
});

/** Reads a fixture file from the `__fixtures__/article-extract` directory as UTF-8 text. */
function loadFixture(name: string): string {
  return fs.readFileSync(path.join(fixturesDir, name), 'utf8');
}

/**
 * Escapes `&`, `<` and `>` so that `text` can be embedded as literal element
 * content without being parsed as markup or character references.
 */
function escapeHtml(text: string): string {
  return text.replace(/[&<>]/g, ch => HTML_ESCAPES[ch] ?? ch);
}

/** Wraps plain text in a minimal HTML document whose body is a single `<pre>` element. */
function buildPreDocument(text: string): string {
  return `<html><head><title>OpenAI Cookbook README</title></head><body><pre>${escapeHtml(text)}</pre></body></html>`;
}

/**
 * Evaluates the script produced by `buildExtractArticleJs` inside a JSDOM window.
 *
 * @param html - Markup loaded into the JSDOM document.
 * @param url - URL the JSDOM document is created with.
 * @param options - Options forwarded to `buildExtractArticleJs`.
 * @param contentType - When given, overrides the value reported by `document.contentType`.
 * @returns Whatever the evaluated script yields, typed as the extracted article or `null`.
 */
function runExtract(
  html: string,
  url: string,
  options: ExtractArticleOptions = {},
  contentType?: string,
): ExtractedArticle | null {
  // The document is always created as text/html so the markup is parsed as HTML.
  const dom = new JSDOM(html, {
    url,
    contentType: 'text/html',
    pretendToBeVisual: true,
    runScripts: 'outside-only',
  });
  if (contentType) {
    // Only the reported content type changes; the document itself was parsed above.
    Object.defineProperty(dom.window.document, 'contentType', {
      value: contentType,
      configurable: true,
    });
  }
  return dom.window.eval(buildExtractArticleJs(options)) as ExtractedArticle | null;
}

/**
 * Passes an extracted article to `downloadArticle`, writing into a fresh temp
 * directory, and returns the contents of the first saved file.
 *
 * The temp directory is recorded in `tempDirs` so the `afterEach` hook deletes it.
 * Asserts that the first result reports `status === 'success'`.
 *
 * @param article - Article whose `title` and `html` are handed to `downloadArticle`.
 * @param url - Passed to `downloadArticle` as `sourceUrl`.
 * @param options - Optional `cleanSelectors` forwarded to `downloadArticle`.
 */
async function renderMarkdown(
  article: ExtractedArticle,
  url: string,
  options: { cleanSelectors?: string[] } = {},
): Promise<string> {
  const tempDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-article-e2e-'));
  tempDirs.push(tempDir);
  const result = await downloadArticle({
    title: article.title || 'untitled',
    contentHtml: article.html,
    sourceUrl: url,
  }, {
    output: tempDir,
    downloadImages: false,
    cleanSelectors: options.cleanSelectors,
  });
  expect(result[0].status).toBe('success');
  return fs.readFileSync(result[0].saved, 'utf8');
}

describe('article extract → markdown e2e fixtures', () => {
  it('extracts a Wikipedia article fixture and keeps infobox/reference noise out of markdown', async () => {
    const url = 'https://en.wikipedia.org/wiki/Markdown';
    const cleanSelectors = ['.infobox', '.navbox', '.reference', '.mw-editsection', '.metadata'];
    const article = runExtract(loadFixture('wikipedia-markdown.html'), url, { cleanSelectors });
    expect(article?.source).toBe('readability');
    expect(article?.title).toBe('Markdown');
    if (!article) throw new Error('expected extracted article');

    const md = await renderMarkdown(article, url, { cleanSelectors });
    expect(md).toContain('lightweight markup language');
    expect(md).toContain('John Gruber');
    expect(md).not.toContain('Syntax description');
    expect(md).not.toContain('Standard file extension');
  });

  it('extracts a Deno blog fixture, preserves embedded iframes as markdown links, and drops page chrome', async () => {
    const url = 'https://deno.com/blog/v2.0';
    const article = runExtract(loadFixture('deno-v2.html'), url);
    expect(article?.source).toBe('readability');
    expect(article?.title).toBe('Announcing Deno 2 | Deno');
    if (!article) throw new Error('expected extracted article');

    const md = await renderMarkdown(article, url);
    expect(md).toContain('## Announcing Deno 2');
    expect(md).toContain('The web is humanity’s largest software platform');
    expect(md).toMatch(/\]\(https:\/\/www\.youtube(?:-nocookie)?\.com\/embed\/[^)]+\)/);
    expect(md).not.toContain('Skip to main content');
  });

  it('short-circuits non-HTML raw text pages end-to-end', async () => {
    const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
    const html = buildPreDocument(loadFixture('openai-cookbook-readme.txt'));
    // Same markup as the single-pre case below, but the document reports text/plain.
    const article = runExtract(html, url, {}, 'text/plain');
    expect(article?.source).toBe('raw-text');
    if (!article) throw new Error('expected extracted article');

    const md = await renderMarkdown(article, url);
    expect(md).toContain('OPENAI\\_API\\_KEY');
    expect(md).toContain('Example code and guides for accomplishing common tasks');
  });

  it('short-circuits a single-pre document end-to-end', async () => {
    const url = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/README.md';
    const html = buildPreDocument(loadFixture('openai-cookbook-readme.txt'));
    const article = runExtract(html, url);
    expect(article?.source).toBe('pre');
    if (!article) throw new Error('expected extracted article');

    const md = await renderMarkdown(article, url);
    expect(md).toContain('OPENAI\\_API\\_KEY');
    expect(md).toContain('Most code examples are written in Python');
  });
});