read.js
/**
 * Generic web page reader — fetch any URL and export as Markdown.
 *
 * Uses browser-side DOM heuristics to extract the main content:
 * 1. <article> element
 * 2. [role="main"] element
 * 3. <main> element
 * 4. Largest text-dense block as fallback
 *
 * Pipes through the shared article-download pipeline (Turndown + image download).
 *
 * Usage:
 *   opencli web read --url "https://www.anthropic.com/research/..." --output ./articles
 *   opencli web read --url "https://..." --download-images false
 */
import { cli, Strategy } from '@jackwener/opencli/registry';
import { downloadArticle } from '@jackwener/opencli/download/article-download';

const command = cli({
  site: 'web',
  name: 'read',
  description: 'Fetch any web page and export as Markdown',
  strategy: Strategy.COOKIE,
  navigateBefore: false, // we handle navigation ourselves
  args: [
    { name: 'url', required: true, help: 'Any web page URL' },
    { name: 'output', default: './web-articles', help: 'Output directory' },
    { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
    { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
    { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
  ],
  columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
  func: async (page, kwargs) => {
    const url = kwargs.url;
    const waitSeconds = kwargs.wait ?? 3;

    // Navigate to the target URL
    await page.goto(url);
    await page.wait(waitSeconds);

    // Extract article content using browser-side heuristics
    const data = await page.evaluate(`
      (() => {
        const result = {
          title: '',
          author: '',
          publishTime: '',
          contentHtml: '',
          imageUrls: []
        };

        // --- Title extraction ---
        // Priority: og:title > <title> > first <h1>
        const ogTitle = document.querySelector('meta[property="og:title"]');
        if (ogTitle) {
          result.title = ogTitle.getAttribute('content')?.trim() || '';
        }
        if (!result.title) {
          result.title = document.title?.trim() || '';
        }
        if (!result.title) {
          const h1 = document.querySelector('h1');
          result.title = h1?.textContent?.trim() || 'untitled';
        }
        // Strip site suffix (e.g. " | Anthropic", " - Blog")
        result.title = result.title.replace(/\\s*[|\\-–—]\\s*[^|\\-–—]{1,30}$/, '').trim();

        // --- Author extraction ---
        const authorMeta = document.querySelector(
          'meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]'
        );
        result.author = authorMeta?.getAttribute('content')?.trim() || '';

        // --- Publish time extraction ---
        const timeMeta = document.querySelector(
          'meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]'
        );
        if (timeMeta) {
          result.publishTime = timeMeta.getAttribute('content')
            || timeMeta.getAttribute('datetime')
            || timeMeta.textContent?.trim()
            || '';
        }

        // --- Content extraction ---
        // Strategy: try semantic elements first, then fall back to largest text block
        let contentEl = null;

        // 1. <article>
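        // Listing pages and nested comment threads can contain several
        // <article> elements; trust a lone match, otherwise fall back to
        // comparing text length below.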
        const articles = document.querySelectorAll('article');
        if (articles.length === 1) {
          contentEl = articles[0];
        } else if (articles.length > 1) {
          // Pick the largest article by text length
          let maxLen = 0;
          articles.forEach(a => {
            const len = a.textContent?.length || 0;
            if (len > maxLen) { maxLen = len; contentEl = a; }
          });
        }

        // 2. [role="main"]
        if (!contentEl) {
          contentEl = document.querySelector('[role="main"]');
        }

        // 3. <main>
        if (!contentEl) {
          contentEl = document.querySelector('main');
        }

        // 4. Largest text-dense block fallback
        if (!contentEl) {
          const candidates = document.querySelectorAll(
            'div[class*="content"], div[class*="article"], div[class*="post"], ' +
            'div[class*="entry"], div[class*="body"], div[id*="content"], ' +
            'div[id*="article"], div[id*="post"], section'
          );
          let maxLen = 0;
          candidates.forEach(c => {
            const len = c.textContent?.length || 0;
            if (len > maxLen) { maxLen = len; contentEl = c; }
          });
        }

        // 5. Last resort: document.body
        if (!contentEl || (contentEl.textContent?.length || 0) < 200) {
          contentEl = document.body;
        }

        // Clean up noise elements before extraction
        const clone = contentEl.cloneNode(true);
        const noise = 'nav, header, footer, aside, .sidebar, .nav, .menu, .footer, ' +
          '.header, .comments, .comment, .ad, .ads, .advertisement, .social-share, ' +
          '.related-posts, .newsletter, .cookie-banner, script, style, noscript, iframe';
        clone.querySelectorAll(noise).forEach(el => el.remove());

        // Deduplicate: some sites (e.g. Anthropic) render each paragraph twice
        // (a visible version + a line-broken animation version with missing spaces).
        // Compare by stripping ALL whitespace so "Hello world" matches "Helloworld".
        const stripWS = (s) => (s || '').replace(/\\s+/g, '');
        const dedup = (parent) => {
          const children = Array.from(parent.children || []);
          for (let i = children.length - 1; i >= 1; i--) {
            const curRaw = children[i].textContent || '';
            const prevRaw = children[i - 1].textContent || '';
            const cur = stripWS(curRaw);
            const prev = stripWS(prevRaw);
            if (cur.length < 20 || prev.length < 20) continue;
            // Exact match after whitespace strip, or >80% containment overlap
            if (cur === prev) {
              // Keep the one with more proper spacing (more spaces = better formatted)
              const curSpaces = (curRaw.match(/ /g) || []).length;
              const prevSpaces = (prevRaw.match(/ /g) || []).length;
              if (curSpaces >= prevSpaces) children[i - 1].remove();
              else children[i].remove();
            } else if (prev.includes(cur) && cur.length / prev.length > 0.8) {
              children[i].remove();
            } else if (cur.includes(prev) && prev.length / cur.length > 0.8) {
              children[i - 1].remove();
            }
          }
        };
        dedup(clone);
        clone.querySelectorAll('section, div').forEach(el => {
          if (el.children && el.children.length > 2) dedup(el);
        });

        // --- Lazy-load image src rewrite ---
        // Many sites render <img src="placeholder.gif" data-src="real.jpg">.
        // Promote the real URL onto src so both the markdown body and the
        // image download list reference the same URL.
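        // data-srcset holds "url1 400w, url2 800w, ..."; the first candidate
        // URL is taken as a last-resort source.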
        clone.querySelectorAll('img').forEach(img => {
          const srcset = img.getAttribute('data-srcset') || '';
          const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
          const real = img.getAttribute('data-src')
            || img.getAttribute('data-original')
            || img.getAttribute('data-lazy-src')
            || srcsetFirst;
          if (real) img.setAttribute('src', real);
        });

        result.contentHtml = clone.innerHTML;

        // --- Image extraction ---
        const seen = new Set();
        clone.querySelectorAll('img').forEach(img => {
          const src = img.getAttribute('src') || '';
          if (src && !src.startsWith('data:') && !seen.has(src)) {
            seen.add(src);
            result.imageUrls.push(src);
          }
        });

        return result;
      })()
    `);

    // Determine Referer from URL for image downloads
    let referer = '';
    try {
      const parsed = new URL(url);
      referer = parsed.origin + '/';
    } catch { /* ignore */ }

    const result = await downloadArticle({
      title: data?.title || 'untitled',
      author: data?.author,
      publishTime: data?.publishTime,
      sourceUrl: url,
      contentHtml: data?.contentHtml || '',
      imageUrls: data?.imageUrls,
    }, {
      output: kwargs.output,
      downloadImages: kwargs['download-images'],
      imageHeaders: referer ? { Referer: referer } : undefined,
      stdout: kwargs.stdout,
    });

    // `--stdout` is a content-streaming mode. The markdown body already went
    // to process.stdout inside downloadArticle(), so returning rows here
    // would make Commander append table/JSON output to the same stdout
    // stream and break piping.
    return kwargs.stdout ? null : result;
  },
});

export const __test__ = { command };
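
// --- Example: exercising the command from a test (sketch) ---
// The __test__ export exists so the command can be driven without the CLI.
// The sketch below is hypothetical: it assumes cli() hands back its config
// with `func` still attached (an assumption about the registry API, not
// something this file guarantees), and the stub page implements only the
// three methods func() actually calls. Note that func() still invokes the
// real downloadArticle(), so point `output` at a throwaway directory (or
// mock the module) when running it.
//
//   import { __test__ } from './read.js';
//
//   const stubPage = {
//     goto: async () => {},       // pretend navigation succeeded
//     wait: async () => {},       // skip the post-load delay
//     evaluate: async () => ({    // canned extraction result
//       title: 'Stub', author: '', publishTime: '',
//       contentHtml: '<p>hello</p>', imageUrls: [],
//     }),
//   };
//
//   await __test__.command.func(stubPage, {
//     url: 'https://example.com/post', output: './tmp-out',
//     'download-images': false, wait: 0, stdout: false,
//   });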