article-extract.ts
/**
 * Article extraction via Readability — generic `page → article HTML` pipeline.
 *
 * Complements `src/browser/extract.ts`: that one takes a caller-supplied
 * selector. This one works with zero configuration on arbitrary article pages
 * (blogs, news, docs) by running `@mozilla/readability` inside the page
 * context via CDP evaluate.
 *
 * Pipeline:
 *   1. Short-circuit non-HTML documents (`text/plain`, JSON, XML) — a page
 *      renderer wrapping a plain-text file would pollute the DOM pipeline.
 *      The text is HTML-escaped and wrapped in a single `<pre>`.
 *   2. Short-circuit the "body is a single <pre>" case, which browsers use
 *      when loading *.txt / *.md over file:// or raw.githubusercontent.com.
 *   3. Deep-clone the document, apply caller-supplied `cleanSelectors` to the
 *      clone (preserves live page state for subsequent snapshot/click).
 *   4. Inject Readability + isProbablyReaderable sources into the page,
 *      parse on the clone. `isProbablyReaderable` gates the parse unless
 *      `force: true`.
 *   5. On Readability miss, walk a fallback selector chain
 *      (main → [role="main"] → #main-content → … → body) and return the
 *      first root with at least 80 characters of trimmed text.
 *
 * Readability runs in the page's own window because it needs real DOM APIs
 * (getComputedStyle, TreeWalker). Running it Node-side would require jsdom —
 * a heavy dep the rest of OpenCLI doesn't need.
 */

import * as fs from 'node:fs';
import { createRequire } from 'node:module';

const requireFromHere = createRequire(import.meta.url);

/** Library sources, read from disk once and reused for every later call. */
let cachedSources: { readability: string; readerable: string } | null = null;

/**
 * Load the raw JS source text of Readability and its `isProbablyReaderable`
 * companion from the installed `@mozilla/readability` package.
 *
 * @returns The two source files as UTF-8 strings (memoised after first read).
 * @throws Whatever `require.resolve` / `fs.readFileSync` throw when the
 *   package or one of its files cannot be found or read.
 */
function readabilitySources(): { readability: string; readerable: string } {
  if (cachedSources) return cachedSources;
  const readabilityPath = requireFromHere.resolve('@mozilla/readability/Readability.js');
  const readerablePath = requireFromHere.resolve('@mozilla/readability/Readability-readerable.js');
  cachedSources = {
    readability: fs.readFileSync(readabilityPath, 'utf8'),
    readerable: fs.readFileSync(readerablePath, 'utf8'),
  };
  return cachedSources;
}

export interface ExtractArticleOptions {
  /** CSS selectors removed from the cloned document before Readability runs. */
  cleanSelectors?: string[];
  /** Fallback chain when Readability fails. Defaults to the common structural ids. */
  fallbackSelectors?: string[];
  /** Bypass `isProbablyReaderable` and always attempt a parse. */
  force?: boolean;
}

/** Which stage of the pipeline produced the returned HTML. */
export type ExtractSource = 'readability' | 'fallback' | 'raw-text' | 'pre';

export interface ExtractedArticle {
  html: string;
  title: string;
  byline?: string;
  publishedTime?: string;
  siteName?: string;
  source: ExtractSource;
}

/** Selectors tried in order when Readability yields nothing; `body` is the last resort. */
export const DEFAULT_FALLBACK_SELECTORS: string[] = [
  'main',
  '[role="main"]',
  '#main-content',
  '#main',
  '#content',
  '.content',
  'article',
  'body',
];

/** A fallback root is accepted only if its trimmed text has at least this many characters. */
const MIN_FALLBACK_TEXT_LENGTH = 80;

/** Every `source` tag the in-page script can emit; used to validate evaluate results. */
const EXTRACT_SOURCES: readonly ExtractSource[] = ['readability', 'fallback', 'raw-text', 'pre'];

/** Runtime guard: is `value` one of the known `ExtractSource` tags? */
function isExtractSource(value: unknown): value is ExtractSource {
  return typeof value === 'string' && (EXTRACT_SOURCES as readonly string[]).includes(value);
}

/** Return `value` when it is a non-empty string, otherwise `undefined`. */
function nonEmptyString(value: unknown): string | undefined {
  return typeof value === 'string' && value !== '' ? value : undefined;
}

/**
 * Build the JS expression evaluated in-page to extract the article. Exported
 * for testability — callers on the host side should use `extractArticle`.
 *
 * The expression is an IIFE that evaluates to either `null` or a plain object
 * `{ source, html, title, byline?, publishedTime?, siteName? }`.
 *
 * NOTE(review): the fallback chain queries the same clone that Readability
 * was asked to parse. Readability documents that `parse()` modifies the DOM it
 * is given, so after a failed parse the fallback may see an altered tree —
 * confirm whether a fresh clone is warranted.
 *
 * @param options Clean/fallback selectors and the `force` flag; all optional.
 * @returns JavaScript source text suitable for `page.evaluate`.
 */
export function buildExtractArticleJs(options: ExtractArticleOptions = {}): string {
  const { readability, readerable } = readabilitySources();
  const cleanSelectors = options.cleanSelectors ?? [];
  const fallbackSelectors = options.fallbackSelectors ?? DEFAULT_FALLBACK_SELECTORS;
  const force = !!options.force;

  // Library sources contain backticks and ${...} fragments, so we embed them
  // as JSON-encoded string literals and eval them inside a Function() scope.
  // This isolates their var declarations from the outer IIFE without polluting
  // window globals.
  const readabilityLit = JSON.stringify(readability);
  const readerableLit = JSON.stringify(readerable);
  const cleanLit = JSON.stringify(cleanSelectors);
  const fallbackLit = JSON.stringify(fallbackSelectors);
  const forceLit = JSON.stringify(force);

  return [
    '(() => {',
    '  const cleanSelectors = ' + cleanLit + ';',
    '  const fallbackSelectors = ' + fallbackLit + ';',
    '  const force = ' + forceLit + ';',
    '  const minFallbackText = ' + MIN_FALLBACK_TEXT_LENGTH + ';',
    '  const readabilitySrc = ' + readabilityLit + ';',
    '  const readerableSrc = ' + readerableLit + ';',
    '',
    // The replacements MUST be the entity forms; mapping each character to
    // itself would let raw text be interpreted as markup inside the <pre>.
    '  function escapeHtml(s) {',
    '    return String(s).replace(/[&<>]/g, c => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[c]));',
    '  }',
    '',
    '  // Short-circuit 1: non-HTML document',
    '  const ct = document.contentType || "";',
    '  if (ct && ct !== "text/html" && ct !== "application/xhtml+xml") {',
    '    const body = document.body ? (document.body.textContent || "") : "";',
    '    return { source: "raw-text", html: "<pre>" + escapeHtml(body) + "</pre>", title: document.title || "" };',
    '  }',
    '',
    '  // Short-circuit 2: body is a single <pre>',
    '  if (document.body) {',
    '    const kids = document.body.children;',
    '    if (kids.length === 1 && kids[0] && kids[0].tagName === "PRE") {',
    '      return { source: "pre", html: document.body.outerHTML, title: document.title || "" };',
    '    }',
    '  }',
    '',
    '  // Deep-clone + adapter-supplied dirty-node removal',
    '  const cloneDoc = document.cloneNode(true);',
    '  for (const sel of cleanSelectors) {',
    '    try { for (const n of cloneDoc.querySelectorAll(sel)) n.remove(); }',
    '    catch (e) { /* ignore invalid selector */ }',
    '  }',
    '',
    '  // Inject Readability into an isolated Function scope and extract the',
    '  // constructors we need. Library sources use their own module.exports',
    '  // guard (if typeof module === "object"), which is falsy here.',
    '  const libs = (new Function(',
    '    readabilitySrc + "\\n" + readerableSrc + "\\nreturn {" +',
    '    " Readability: typeof Readability !== \\"undefined\\" ? Readability : null," +',
    '    " isProbablyReaderable: typeof isProbablyReaderable !== \\"undefined\\" ? isProbablyReaderable : null" +',
    '    " };"',
    '  ))();',
    '  const Readability = libs.Readability;',
    '  const isProbablyReaderable = libs.isProbablyReaderable;',
    '',
    '  const readerableOk = force || (typeof isProbablyReaderable === "function" ? isProbablyReaderable(cloneDoc) : true);',
    '  let article = null;',
    '  if (readerableOk && typeof Readability === "function") {',
    '    try { article = new Readability(cloneDoc).parse(); } catch (e) { article = null; }',
    '  }',
    '  if (article && article.content) {',
    '    return {',
    '      source: "readability",',
    '      html: article.content,',
    '      title: article.title || document.title || "",',
    '      byline: article.byline || undefined,',
    '      publishedTime: article.publishedTime || undefined,',
    '      siteName: article.siteName || undefined,',
    '    };',
    '  }',
    '',
    '  // Fallback chain',
    '  for (const sel of fallbackSelectors) {',
    '    let el = null;',
    '    try { el = cloneDoc.querySelector(sel); } catch (e) { continue; }',
    '    if (!el) continue;',
    '    const text = (el.textContent || "").trim();',
    '    if (text.length < minFallbackText) continue;',
    '    return { source: "fallback", html: el.outerHTML, title: document.title || "" };',
    '  }',
    '',
    '  return null;',
    '})()',
  ].join('\n');
}

/** Minimal page contract: evaluate a JS expression in the page and resolve with its value. */
export interface PageLike {
  evaluate(js: string): Promise<unknown>;
}

/**
 * Run the extract pipeline on the given page. Returns `null` when no usable
 * content is found (Readability miss + empty fallback chain).
 *
 * The value coming back from `page.evaluate` is untyped, so it is validated
 * rather than asserted: `html` must be a string, `source` must be a known
 * `ExtractSource`, and optional metadata is kept only when it is a non-empty
 * string. A malformed result yields `null`.
 *
 * @param page Anything that can evaluate a JS expression in the target page.
 * @param options Forwarded to `buildExtractArticleJs`.
 * @returns The extracted article, or `null` if nothing usable was produced.
 */
export async function extractArticle(
  page: PageLike,
  options: ExtractArticleOptions = {},
): Promise<ExtractedArticle | null> {
  const raw = await page.evaluate(buildExtractArticleJs(options));
  if (raw === null || typeof raw !== 'object') return null;

  const record = raw as Record<string, unknown>;
  const { html, source } = record;
  if (typeof html !== 'string' || !isExtractSource(source)) return null;

  const byline = nonEmptyString(record.byline);
  const publishedTime = nonEmptyString(record.publishedTime);
  const siteName = nonEmptyString(record.siteName);

  return {
    html,
    title: typeof record.title === 'string' ? record.title : '',
    ...(byline !== undefined ? { byline } : {}),
    ...(publishedTime !== undefined ? { publishedTime } : {}),
    ...(siteName !== undefined ? { siteName } : {}),
    source,
  };
}