Cradicle Explorer

/ tests / e2e / article-download-pipeline.test.ts
article-download-pipeline.test.ts
  1  /**
  2   * E2E regression tests for the HTML → Markdown article pipeline.
  3   *
  4   * Drives real pages through `opencli web read` and asserts the hardened
  5   * converter's invariants hold on the produced file:
  6   *   - no base64 `data:image/…` leaks
  7   *   - no <script> / <style> leakage
  8   *   - no runs of 3+ consecutive newlines (at most one blank line in a row)
  9   *   - no lone `-` / `·` residue lines
 10   *   - no trailing-whitespace lines
 11   *   - no NBSP residue
 12   *
 13   * Sites are picked to cover the features the pipeline claims to support:
 14   *   example.com      — baseline / tiny article
 15   *   Wikipedia        — GFM tables, many headings, long content
 16   *   MDN              — meta-tag extracted author + published_time
 17   *   GitHub README    — fenced code blocks, dense chrome
 18   *   Vercel blog      — JS-heavy SSR, publish_time from schema.org
 19   *   Ruan Yifeng blog — CJK, inline images, multi-link paragraphs
 20   *
 21   * Each run exits cleanly with `status: 'success'` or is skipped on transient
 22   * / bot-detection failures (mirroring `browser-public.test.ts` patterns).
 23   */
 24  
 25  import * as fs from 'node:fs';
 26  import * as os from 'node:os';
 27  import * as path from 'node:path';
 28  import { afterEach, describe, expect, it } from 'vitest';
 29  import { runCli, parseJsonOutput } from './helpers.js';
 30  
 31  interface WebReadResult {
 32    title: string;
 33    author: string;
 34    publish_time: string;
 35    status: string;
 36    size: string;
 37    saved: string;
 38  }
 39  
 40  const tempDirs: string[] = [];
 41  
 42  afterEach(() => {
 43    for (const dir of tempDirs) {
 44      try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* */ }
 45    }
 46    tempDirs.length = 0;
 47  });
 48  
 49  function isTransient(text: string): boolean {
 50    return /Detached while handling command|No tab with id|Debugger is not attached|Browser Bridge.*not connected|net::ERR_/i.test(text);
 51  }
 52  
 53  async function runWebReadOrSkip(
 54    url: string,
 55    label: string,
 56  ): Promise<WebReadResult | null> {
 57    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-article-e2e-'));
 58    tempDirs.push(tempDir);
 59  
 60    const args = ['web', 'read', '--url', url, '--output', tempDir, '--download-images', 'false', '--format', 'json'];
 61    let result = await runCli(args, { timeout: 90_000 });
 62    if (result.code !== 0 && isTransient(result.stderr + result.stdout)) {
 63      result = await runCli(args, { timeout: 90_000 });
 64    }
 65  
 66    if (result.code !== 0) {
 67      console.warn(`${label}: skipped — CLI failed (likely bot detection / network): ${result.stderr.slice(0, 200)}`);
 68      return null;
 69    }
 70  
 71    let parsed: WebReadResult[];
 72    try {
 73      parsed = parseJsonOutput(result.stdout);
 74    } catch {
 75      console.warn(`${label}: skipped — CLI output was not JSON`);
 76      return null;
 77    }
 78    if (!Array.isArray(parsed) || parsed.length === 0) {
 79      console.warn(`${label}: skipped — empty result array`);
 80      return null;
 81    }
 82    const row = parsed[0];
 83    if (row.status !== 'success') {
 84      console.warn(`${label}: skipped — status=${row.status} (${row.saved})`);
 85      return null;
 86    }
 87    return row;
 88  }
 89  
 90  function assertPipelineInvariants(md: string, label: string) {
 91    expect(md.length, `${label}: markdown should be non-trivial`).toBeGreaterThan(200);
 92  
 93    expect(md.match(/data:image/g) ?? [], `${label}: no base64 data-URI images`).toHaveLength(0);
 94    expect(md.match(/<script[\s>]/gi) ?? [], `${label}: no leaked <script> tags`).toHaveLength(0);
 95    expect(md.match(/<style[\s>]/gi) ?? [], `${label}: no leaked <style> tags`).toHaveLength(0);
 96  
 97    // The post-processing pipeline guarantees blank-line collapse to at most 2.
 98    expect(md).not.toMatch(/\n{3,}/);
 99  
100    // Lone dash / middle-dot residue from lost list bullets.
101    expect(md).not.toMatch(/^[ \t]*-[ \t]*$/m);
102    expect(md).not.toMatch(/^[ \t]*·[ \t]*$/m);
103  
104    // Trailing whitespace should never appear (stripped in post-processing).
105    expect(md).not.toMatch(/[ \t]+\n/);
106  
107    // NBSP should be normalized to a regular space.
108    expect(md.match(/\u00a0/g) ?? [], `${label}: NBSP should be normalized`).toHaveLength(0);
109  }
110  
/** One real-world page exercised by the regression suite. */
interface SiteCase {
  /** Name used in the test title and in assertion / skip messages. */
  label: string;
  /** Page handed to `opencli web read --url`. */
  url: string;
  /** Optional site-specific checks run on the produced Markdown after the shared invariants. */
  extra?: (md: string) => void;
}
116  
/**
 * Wikipedia's Markdown article contains several tables — the hardened
 * converter with turndown-plugin-gfm should produce real `|---|---|` rows,
 * and the article's sections should surface as `## ` headings.
 */
const expectTablesAndHeadings = (md: string): void => {
  const separatorRows = md.match(/^\|.*---.*\|/gm) ?? [];
  expect(separatorRows).not.toHaveLength(0);

  const secondLevelHeadings = md.match(/^## /gm) ?? [];
  expect(secondLevelHeadings).not.toHaveLength(0);
};

/** The README uses fenced code blocks extensively; ensure at least one fence survived. */
const expectFencedCode = (md: string): void => {
  const fences = md.match(/^```/gm) ?? [];
  expect(fences, 'fenced code blocks preserved').not.toHaveLength(0);
};

/**
 * Chinese site with many inline images. The extractor should preserve both
 * CJK text and remote image URLs (not drop them like base64 would).
 */
const expectCjkAndRemoteImages = (md: string): void => {
  expect(md).toMatch(/[\u4e00-\u9fff]/); // contains at least one CJK char

  const remoteImages = md.match(/^!\[.*?\]\(https?:\/\//gm) ?? [];
  expect(remoteImages).not.toHaveLength(0);
};

/** Pages driven through the pipeline, in the order their tests are registered. */
const SITES: SiteCase[] = [
  { label: 'example.com (baseline)', url: 'https://example.com/' },
  {
    label: 'Wikipedia — Markdown (GFM tables + headings)',
    url: 'https://en.wikipedia.org/wiki/Markdown',
    extra: expectTablesAndHeadings,
  },
  {
    label: 'MDN — <table> element',
    url: 'https://developer.mozilla.org/en-US/docs/Web/HTML/Element/table',
  },
  {
    label: 'GitHub — mozilla/readability README',
    url: 'https://github.com/mozilla/readability',
    extra: expectFencedCode,
  },
  {
    label: 'Vercel blog — Ship 2024 recap (JS-heavy SSR)',
    url: 'https://vercel.com/blog/vercel-ship-2024',
  },
  {
    label: 'Ruan Yifeng blog — CJK + image-dense',
    url: 'https://www.ruanyifeng.com/blog/2024/07/weekly-issue-309.html',
    extra: expectCjkAndRemoteImages,
  },
];
159  
// Registers one regression test per entry in SITES. Each test reads the page
// through the CLI, then checks the shared invariants plus any site-specific
// extras against the Markdown file that was written.
describe('web read — hardened article pipeline (real-site regression)', () => {
  SITES.forEach(({ url, label, extra }) => {
    it(`${label} survives the hardened pipeline`, async () => {
      const row = await runWebReadOrSkip(url, label);
      // No row means the run was skipped (the helper already logged why).
      if (!row) return;

      expect(row.saved, `${label}: saved path present`).toBeTruthy();
      expect(fs.existsSync(row.saved)).toBe(true);

      const markdown = fs.readFileSync(row.saved, 'utf8');
      assertPipelineInvariants(markdown, label);
      extra?.(markdown);
    }, 120_000);
  });
});