article-download-pipeline.test.ts
/**
 * E2E regression tests for the HTML → Markdown article pipeline.
 *
 * Drives real pages through `opencli web read` and asserts the hardened
 * converter's invariants hold on the produced file:
 *   - no base64 `data:image/…` leaks
 *   - no <script> / <style> leakage
 *   - no runs of 3+ consecutive newlines (i.e. at most one blank line in a row)
 *   - no lone `-` / `·` residue lines
 *   - no trailing-whitespace lines
 *   - no NBSP residue
 *
 * Sites are picked to cover the features the pipeline claims to support:
 *   example.com      — baseline / tiny article
 *   Wikipedia        — GFM tables, many headings, long content
 *   MDN              — meta-tag extracted author + published_time
 *   GitHub README    — fenced code blocks, dense chrome
 *   Vercel blog      — JS-heavy SSR, publish_time from schema.org
 *   Ruan Yifeng blog — CJK, inline images, multi-link paragraphs
 *
 * Each run exits cleanly with `status: 'success'` or is skipped on transient
 * / bot-detection failures (mirroring `browser-public.test.ts` patterns).
 * Note that "skipped" here means the test body returns early after logging a
 * warning, so the test is reported as passed rather than as skipped.
 */

import * as fs from 'node:fs';
import * as os from 'node:os';
import * as path from 'node:path';
import { afterEach, describe, expect, it } from 'vitest';
import { runCli, parseJsonOutput } from './helpers.js';

/** Shape of one row of `web read --format json` output, as consumed by these tests. */
interface WebReadResult {
  title: string;
  author: string;
  publish_time: string;
  /** Compared against the literal `'success'`; any other value causes a skip. */
  status: string;
  size: string;
  /** Read back from disk as the produced markdown file path. */
  saved: string;
}

/** Temp output directories created during the current test; emptied in `afterEach`. */
const tempDirs: string[] = [];

afterEach(() => {
  for (const dir of tempDirs) {
    try {
      fs.rmSync(dir, { recursive: true, force: true });
    } catch {
      /* best-effort cleanup: a leftover temp dir must not fail the test */
    }
  }
  tempDirs.length = 0;
});

/**
 * Reports whether CLI output contains one of the known flaky-failure
 * signatures that justify a single retry.
 *
 * @param text Combined stderr/stdout of a failed CLI run.
 * @returns `true` when any known signature matches (case-insensitive).
 */
function isTransient(text: string): boolean {
  return /Detached while handling command|No tab with id|Debugger is not attached|Browser Bridge.*not connected|net::ERR_/i.test(text);
}

/**
 * Runs `web read` for `url` into a fresh temp directory and returns the first
 * result row.
 *
 * A failed run whose output looks transient is retried once. Every remaining
 * failure mode (non-zero exit, non-JSON output, empty result array, status
 * other than `'success'`) logs a warning and yields `null`, so the caller can
 * bail out instead of failing on conditions outside the pipeline's control.
 *
 * @param url   Page to fetch.
 * @param label Human-readable site name used as the prefix of warnings.
 * @returns The first result row, or `null` when the run should be skipped.
 */
async function runWebReadOrSkip(
  url: string,
  label: string,
): Promise<WebReadResult | null> {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-article-e2e-'));
  // Register immediately so cleanup happens even if the CLI call throws.
  tempDirs.push(tempDir);

  const args = ['web', 'read', '--url', url, '--output', tempDir, '--download-images', 'false', '--format', 'json'];
  let result = await runCli(args, { timeout: 90_000 });
  if (result.code !== 0 && isTransient(result.stderr + result.stdout)) {
    // Exactly one retry for known-flaky failures.
    result = await runCli(args, { timeout: 90_000 });
  }

  if (result.code !== 0) {
    console.warn(`${label}: skipped — CLI failed (likely bot detection / network): ${result.stderr.slice(0, 200)}`);
    return null;
  }

  let parsed: WebReadResult[];
  try {
    parsed = parseJsonOutput(result.stdout);
  } catch {
    console.warn(`${label}: skipped — CLI output was not JSON`);
    return null;
  }
  if (!Array.isArray(parsed) || parsed.length === 0) {
    console.warn(`${label}: skipped — empty result array`);
    return null;
  }
  const row = parsed[0];
  if (row.status !== 'success') {
    console.warn(`${label}: skipped — status=${row.status} (${row.saved})`);
    return null;
  }
  return row;
}

/**
 * Asserts the site-independent invariants on a produced markdown document.
 * Every assertion carries `label` so a failure identifies the offending site.
 *
 * @param md    Contents of the produced markdown file.
 * @param label Human-readable site name used as the prefix of failure messages.
 */
function assertPipelineInvariants(md: string, label: string): void {
  expect(md.length, `${label}: markdown should be non-trivial`).toBeGreaterThan(200);

  expect(md.match(/data:image/g) ?? [], `${label}: no base64 data-URI images`).toHaveLength(0);
  expect(md.match(/<script[\s>]/gi) ?? [], `${label}: no leaked <script> tags`).toHaveLength(0);
  expect(md.match(/<style[\s>]/gi) ?? [], `${label}: no leaked <style> tags`).toHaveLength(0);

  // Post-processing is expected to collapse newline runs to at most two,
  // i.e. never more than one blank line in a row.
  expect(md, `${label}: no runs of 3+ consecutive newlines`).not.toMatch(/\n{3,}/);

  // Lone dash / middle-dot residue from lost list bullets.
  expect(md, `${label}: no lone '-' residue lines`).not.toMatch(/^[ \t]*-[ \t]*$/m);
  expect(md, `${label}: no lone '·' residue lines`).not.toMatch(/^[ \t]*·[ \t]*$/m);

  // Trailing whitespace should never appear (stripped in post-processing).
  expect(md, `${label}: no trailing whitespace before a newline`).not.toMatch(/[ \t]+\n/);

  // NBSP should be normalized to a regular space.
  expect(md.match(/\u00a0/g) ?? [], `${label}: NBSP should be normalized`).toHaveLength(0);
}

/** One real-world page to push through the pipeline. */
interface SiteCase {
  url: string;
  label: string;
  /** Optional site-specific assertions, run after the shared invariants. */
  extra?: (md: string) => void;
}

const SITES: SiteCase[] = [
  {
    url: 'https://example.com/',
    label: 'example.com (baseline)',
  },
  {
    url: 'https://en.wikipedia.org/wiki/Markdown',
    label: 'Wikipedia — Markdown (GFM tables + headings)',
    extra: (md) => {
      // Wikipedia's Markdown article contains several tables — the hardened
      // converter with turndown-plugin-gfm should produce real `|---|---|` rows.
      expect(md.match(/^\|.*---.*\|/gm) ?? [], 'GFM table separator rows preserved').not.toHaveLength(0);
      expect(md.match(/^## /gm) ?? [], 'level-2 headings preserved').not.toHaveLength(0);
    },
  },
  {
    url: 'https://developer.mozilla.org/en-US/docs/Web/HTML/Element/table',
    label: 'MDN — <table> element',
  },
  {
    url: 'https://github.com/mozilla/readability',
    label: 'GitHub — mozilla/readability README',
    extra: (md) => {
      // README uses fenced code blocks extensively; ensure a few survived.
      expect(md.match(/^```/gm) ?? [], 'fenced code blocks preserved').not.toHaveLength(0);
    },
  },
  {
    url: 'https://vercel.com/blog/vercel-ship-2024',
    label: 'Vercel blog — Ship 2024 recap (JS-heavy SSR)',
  },
  {
    url: 'https://www.ruanyifeng.com/blog/2024/07/weekly-issue-309.html',
    label: 'Ruan Yifeng blog — CJK + image-dense',
    extra: (md) => {
      // Chinese site with many inline images. The extractor should preserve
      // both CJK text and remote image URLs (not drop them like base64 would).
      expect(md, 'CJK text preserved').toMatch(/[\u4e00-\u9fff]/); // contains at least one CJK char
      expect(md.match(/^!\[.*?\]\(https?:\/\//gm) ?? [], 'remote image links preserved').not.toHaveLength(0);
    },
  },
];

describe('web read — hardened article pipeline (real-site regression)', () => {
  for (const site of SITES) {
    it(`${site.label} survives the hardened pipeline`, async () => {
      const row = await runWebReadOrSkip(site.url, site.label);
      // Skip condition already logged by the helper; the test passes vacuously.
      if (!row) return;

      expect(row.saved, `${site.label}: saved path present`).toBeTruthy();
      expect(fs.existsSync(row.saved), `${site.label}: saved file exists on disk`).toBe(true);

      const md = fs.readFileSync(row.saved, 'utf8');
      assertPipelineInvariants(md, site.label);
      if (site.extra) site.extra(md);
    }, 120_000);
  }
});