Cradicle Explorer

/ src / browser / article-extract.ts
article-extract.ts
  1  /**
  2   * Article extraction via Readability — generic `page → article HTML` pipeline.
  3   *
  4   * Complements `src/browser/extract.ts`: that one takes a caller-supplied
  5   * selector. This one works with zero configuration on arbitrary article pages
  6   * (blogs, news, docs) by running `@mozilla/readability` inside the page
  7   * context via CDP evaluate.
  8   *
  9   * Pipeline:
 10   *   1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
 11   *      renderer wrapping a plain-text file would pollute the DOM pipeline.
 12   *   2. Short-circuit the "body is a single <pre>" case, which browsers use
 13   *      when loading *.txt / *.md over file:// or raw.githubusercontent.com.
 14   *   3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
 15   *      clone (preserves live page state for subsequent snapshot/click).
 16   *   4. Inject Readability + isProbablyReaderable sources into the page,
 17   *      parse on the clone. `isProbablyReaderable` gates the parse unless
 18   *      `force: true`.
 19   *   5. On Readability miss, walk a fallback selector chain
 20   *      (main → [role="main"] → #main-content → … → body) and return the
 21   *      first root with at least 80 characters of trimmed text.
 22   *
 23   * Readability runs in the page's own window because it needs real DOM APIs
 24   * (getComputedStyle, TreeWalker). Running it Node-side would require jsdom —
 25   * a heavy dep the rest of OpenCLI doesn't need.
 26   */
 27  
 28  import * as fs from 'node:fs';
 29  import { createRequire } from 'node:module';
 30  
 31  const requireFromHere = createRequire(import.meta.url);
 32  
 33  let cachedSources: { readability: string; readerable: string } | null = null;
 34  
 35  function readabilitySources(): { readability: string; readerable: string } {
 36    if (cachedSources) return cachedSources;
 37    const readabilityPath = requireFromHere.resolve('@mozilla/readability/Readability.js');
 38    const readerablePath = requireFromHere.resolve('@mozilla/readability/Readability-readerable.js');
 39    cachedSources = {
 40      readability: fs.readFileSync(readabilityPath, 'utf8'),
 41      readerable: fs.readFileSync(readerablePath, 'utf8'),
 42    };
 43    return cachedSources;
 44  }
 45  
 46  export interface ExtractArticleOptions {
 47    /** CSS selectors removed from the cloned document before Readability runs. */
 48    cleanSelectors?: string[];
 49    /** Fallback chain when Readability fails. Defaults to the common structural ids. */
 50    fallbackSelectors?: string[];
 51    /** Bypass `isProbablyReaderable` and always attempt a parse. */
 52    force?: boolean;
 53  }
 54  
 55  export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';
 56  
 57  export interface ExtractedArticle {
 58    html: string;
 59    title: string;
 60    byline?: string;
 61    publishedTime?: string;
 62    siteName?: string;
 63    source: ExtractSource;
 64  }
 65  
 66  export const DEFAULT_FALLBACK_SELECTORS: string[] = [
 67    'main',
 68    '[role="main"]',
 69    '#main-content',
 70    '#main',
 71    '#content',
 72    '.content',
 73    'article',
 74    'body',
 75  ];
 76  
 77  const MIN_FALLBACK_TEXT_LENGTH = 80;
 78  
 79  /**
 80   * Build the JS expression evaluated in-page to extract the article. Exported
 81   * for testability — callers on the host side should use `extractArticle`.
 82   */
 83  export function buildExtractArticleJs(options: ExtractArticleOptions = {}): string {
 84    const { readability, readerable } = readabilitySources();
 85    const cleanSelectors = options.cleanSelectors ?? [];
 86    const fallbackSelectors = options.fallbackSelectors ?? DEFAULT_FALLBACK_SELECTORS;
 87    const force = !!options.force;
 88  
 89    // Library sources contain backticks and ${...} fragments, so we embed them
 90    // as JSON-encoded string literals and eval them inside a Function() scope.
 91    // This isolates their var declarations from the outer IIFE without polluting
 92    // window globals.
 93    const readabilityLit = JSON.stringify(readability);
 94    const readerableLit = JSON.stringify(readerable);
 95    const cleanLit = JSON.stringify(cleanSelectors);
 96    const fallbackLit = JSON.stringify(fallbackSelectors);
 97    const forceLit = JSON.stringify(force);
 98  
 99    return [
100      '(() => {',
101      '  const cleanSelectors = ' + cleanLit + ';',
102      '  const fallbackSelectors = ' + fallbackLit + ';',
103      '  const force = ' + forceLit + ';',
104      '  const minFallbackText = ' + MIN_FALLBACK_TEXT_LENGTH + ';',
105      '  const readabilitySrc = ' + readabilityLit + ';',
106      '  const readerableSrc = ' + readerableLit + ';',
107      '',
108      '  function escapeHtml(s) {',
109      '    return String(s).replace(/[&<>]/g, c => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[c]));',
110      '  }',
111      '',
112      '  // Short-circuit 1: non-HTML document',
113      '  const ct = document.contentType || "";',
114      '  if (ct && ct !== "text/html" && ct !== "application/xhtml+xml") {',
115      '    const body = document.body ? (document.body.textContent || "") : "";',
116      '    return { source: "raw-text", html: "<pre>" + escapeHtml(body) + "</pre>", title: document.title || "" };',
117      '  }',
118      '',
119      '  // Short-circuit 2: body is a single <pre>',
120      '  if (document.body) {',
121      '    const kids = document.body.children;',
122      '    if (kids.length === 1 && kids[0] && kids[0].tagName === "PRE") {',
123      '      return { source: "pre", html: document.body.outerHTML, title: document.title || "" };',
124      '    }',
125      '  }',
126      '',
127      '  // Deep-clone + adapter-supplied dirty-node removal',
128      '  const cloneDoc = document.cloneNode(true);',
129      '  for (const sel of cleanSelectors) {',
130      '    try { for (const n of cloneDoc.querySelectorAll(sel)) n.remove(); }',
131      '    catch (e) { /* ignore invalid selector */ }',
132      '  }',
133      '',
134      '  // Inject Readability into an isolated Function scope and extract the',
135      '  // constructors we need. Library sources use their own module.exports',
136      '  // guard (if typeof module === "object"), which is falsy here.',
137      '  const libs = (new Function(',
138      '    readabilitySrc + "\\n" + readerableSrc + "\\nreturn {" +',
139      '    " Readability: typeof Readability !== \\"undefined\\" ? Readability : null," +',
140      '    " isProbablyReaderable: typeof isProbablyReaderable !== \\"undefined\\" ? isProbablyReaderable : null" +',
141      '    " };"',
142      '  ))();',
143      '  const Readability = libs.Readability;',
144      '  const isProbablyReaderable = libs.isProbablyReaderable;',
145      '',
146      '  const readerableOk = force || (typeof isProbablyReaderable === "function" ? isProbablyReaderable(cloneDoc) : true);',
147      '  let article = null;',
148      '  if (readerableOk && typeof Readability === "function") {',
149      '    try { article = new Readability(cloneDoc).parse(); } catch (e) { article = null; }',
150      '  }',
151      '  if (article && article.content) {',
152      '    return {',
153      '      source: "readability",',
154      '      html: article.content,',
155      '      title: article.title || document.title || "",',
156      '      byline: article.byline || undefined,',
157      '      publishedTime: article.publishedTime || undefined,',
158      '      siteName: article.siteName || undefined,',
159      '    };',
160      '  }',
161      '',
162      '  // Fallback chain',
163      '  for (const sel of fallbackSelectors) {',
164      '    let el = null;',
165      '    try { el = cloneDoc.querySelector(sel); } catch (e) { continue; }',
166      '    if (!el) continue;',
167      '    const text = (el.textContent || "").trim();',
168      '    if (text.length < minFallbackText) continue;',
169      '    return { source: "fallback", html: el.outerHTML, title: document.title || "" };',
170      '  }',
171      '',
172      '  return null;',
173      '})()',
174    ].join('\n');
175  }
176  
/**
 * Minimal page abstraction the extractor depends on: anything able to evaluate
 * a JavaScript expression and resolve with its result.
 */
export interface PageLike {
  /**
   * Evaluate `js` and resolve with whatever the expression produces.
   *
   * @param js JavaScript source text to evaluate.
   */
  evaluate(js: string): Promise<unknown>;
}
180  
/** Every `source` tag a well-formed extraction result may carry. */
const KNOWN_EXTRACT_SOURCES: readonly string[] = ['readability', 'fallback', 'raw-text', 'pre'];

/** Narrow an untrusted value to `ExtractSource` by checking it against the known tags. */
function isExtractSource(value: unknown): value is ExtractSource {
  return typeof value === 'string' && KNOWN_EXTRACT_SOURCES.includes(value);
}

/** Return `value` when it is a non-empty string, otherwise `undefined`. */
function nonEmptyString(value: unknown): string | undefined {
  return typeof value === 'string' && value !== '' ? value : undefined;
}

/**
 * Run the extract pipeline on the given page. Returns `null` when no usable
 * content is found (Readability miss + empty fallback chain).
 *
 * The value coming back from `page.evaluate` is untrusted, so it is validated
 * rather than cast: `html` must be a string, `source` must be one of the
 * `ExtractSource` tags, and the optional metadata fields are copied only when
 * they are non-empty strings. A result that fails validation yields `null`.
 *
 * @param page Page-like object used to evaluate the extraction script.
 * @param options Extraction options forwarded to `buildExtractArticleJs`.
 * @returns The extracted article, or `null` when nothing usable was produced.
 */
export async function extractArticle(
  page: PageLike,
  options: ExtractArticleOptions = {},
): Promise<ExtractedArticle | null> {
  const raw = await page.evaluate(buildExtractArticleJs(options));
  if (raw === null || typeof raw !== 'object') return null;

  const record = raw as Record<string, unknown>;
  const { html, source } = record;
  if (typeof html !== 'string' || !isExtractSource(source)) return null;

  const byline = nonEmptyString(record.byline);
  const publishedTime = nonEmptyString(record.publishedTime);
  const siteName = nonEmptyString(record.siteName);

  // Key order matches the interface so serialized output stays stable.
  return {
    html,
    title: typeof record.title === 'string' ? record.title : '',
    ...(byline !== undefined && { byline }),
    ...(publishedTime !== undefined && { publishedTime }),
    ...(siteName !== undefined && { siteName }),
    source,
  };
}