/ clis / web / read.js
read.js
  1  /**
  2   * Generic web page reader — fetch any URL and export as Markdown.
  3   *
  4   * Uses browser-side DOM heuristics to extract the main content:
  5   *   1. <article> element
  6   *   2. [role="main"] element
  7   *   3. <main> element
  8   *   4. Largest text-dense block as fallback
  9   *
 10   * Pipes through the shared article-download pipeline (Turndown + image download).
 11   *
 12   * Usage:
 13   *   opencli web read --url "https://www.anthropic.com/research/..." --output ./articles
 14   *   opencli web read --url "https://..." --download-images false
 15   */
 16  import { cli, Strategy } from '@jackwener/opencli/registry';
 17  import { downloadArticle } from '@jackwener/opencli/download/article-download';
 18  const command = cli({
 19      site: 'web',
 20      name: 'read',
 21      description: 'Fetch any web page and export as Markdown',
 22      strategy: Strategy.COOKIE,
 23      navigateBefore: false, // we handle navigation ourselves
 24      args: [
 25          { name: 'url', required: true, help: 'Any web page URL' },
 26          { name: 'output', default: './web-articles', help: 'Output directory' },
 27          { name: 'download-images', type: 'boolean', default: true, help: 'Download images locally' },
 28          { name: 'wait', type: 'int', default: 3, help: 'Seconds to wait after page load' },
 29          { name: 'stdout', type: 'boolean', default: false, help: 'Print markdown to stdout instead of saving to a file' },
 30      ],
 31      columns: ['title', 'author', 'publish_time', 'status', 'size', 'saved'],
 32      func: async (page, kwargs) => {
 33          const url = kwargs.url;
 34          const waitSeconds = kwargs.wait ?? 3;
 35          // Navigate to the target URL
 36          await page.goto(url);
 37          await page.wait(waitSeconds);
 38          // Extract article content using browser-side heuristics
 39          const data = await page.evaluate(`
 40        (() => {
 41          const result = {
 42            title: '',
 43            author: '',
 44            publishTime: '',
 45            contentHtml: '',
 46            imageUrls: []
 47          };
 48  
 49          // --- Title extraction ---
 50          // Priority: og:title > <title> > first <h1>
 51          const ogTitle = document.querySelector('meta[property="og:title"]');
 52          if (ogTitle) {
 53            result.title = ogTitle.getAttribute('content')?.trim() || '';
 54          }
 55          if (!result.title) {
 56            result.title = document.title?.trim() || '';
 57          }
 58          if (!result.title) {
 59            const h1 = document.querySelector('h1');
 60            result.title = h1?.textContent?.trim() || 'untitled';
 61          }
 62          // Strip site suffix (e.g. " | Anthropic", " - Blog")
 63          result.title = result.title.replace(/\\s*[|\\-–—]\\s*[^|\\-–—]{1,30}$/, '').trim();
 64  
 65          // --- Author extraction ---
 66          const authorMeta = document.querySelector(
 67            'meta[name="author"], meta[property="article:author"], meta[name="twitter:creator"]'
 68          );
 69          result.author = authorMeta?.getAttribute('content')?.trim() || '';
 70  
 71          // --- Publish time extraction ---
 72          const timeMeta = document.querySelector(
 73            'meta[property="article:published_time"], meta[name="date"], meta[name="publishdate"], time[datetime]'
 74          );
 75          if (timeMeta) {
 76            result.publishTime = timeMeta.getAttribute('content')
 77              || timeMeta.getAttribute('datetime')
 78              || timeMeta.textContent?.trim()
 79              || '';
 80          }
 81  
 82          // --- Content extraction ---
 83          // Strategy: try semantic elements first, then fall back to largest text block
 84          let contentEl = null;
 85  
 86          // 1. <article>
 87          const articles = document.querySelectorAll('article');
 88          if (articles.length === 1) {
 89            contentEl = articles[0];
 90          } else if (articles.length > 1) {
 91            // Pick the largest article by text length
 92            let maxLen = 0;
 93            articles.forEach(a => {
 94              const len = a.textContent?.length || 0;
 95              if (len > maxLen) { maxLen = len; contentEl = a; }
 96            });
 97          }
 98  
 99          // 2. [role="main"]
100          if (!contentEl) {
101            contentEl = document.querySelector('[role="main"]');
102          }
103  
104          // 3. <main>
105          if (!contentEl) {
106            contentEl = document.querySelector('main');
107          }
108  
109          // 4. Largest text-dense block fallback
110          if (!contentEl) {
111            const candidates = document.querySelectorAll(
112              'div[class*="content"], div[class*="article"], div[class*="post"], ' +
113              'div[class*="entry"], div[class*="body"], div[id*="content"], ' +
114              'div[id*="article"], div[id*="post"], section'
115            );
116            let maxLen = 0;
117            candidates.forEach(c => {
118              const len = c.textContent?.length || 0;
119              if (len > maxLen) { maxLen = len; contentEl = c; }
120            });
121          }
122  
123          // 5. Last resort: document.body
124          if (!contentEl || (contentEl.textContent?.length || 0) < 200) {
125            contentEl = document.body;
126          }
127  
128          // Clean up noise elements before extraction
129          const clone = contentEl.cloneNode(true);
130          const noise = 'nav, header, footer, aside, .sidebar, .nav, .menu, .footer, ' +
131            '.header, .comments, .comment, .ad, .ads, .advertisement, .social-share, ' +
132            '.related-posts, .newsletter, .cookie-banner, script, style, noscript, iframe';
133          clone.querySelectorAll(noise).forEach(el => el.remove());
134  
135          // Deduplicate: some sites (e.g. Anthropic) render each paragraph twice
136          // (a visible version + a line-broken animation version with missing spaces).
137          // Compare by stripping ALL whitespace so "Hello world" matches "Helloworld".
138          const stripWS = (s) => (s || '').replace(/\\s+/g, '');
139          const dedup = (parent) => {
140            const children = Array.from(parent.children || []);
141            for (let i = children.length - 1; i >= 1; i--) {
142              const curRaw = children[i].textContent || '';
143              const prevRaw = children[i - 1].textContent || '';
144              const cur = stripWS(curRaw);
145              const prev = stripWS(prevRaw);
146              if (cur.length < 20 || prev.length < 20) continue;
147              // Exact match after whitespace strip, or >90% overlap
148              if (cur === prev) {
149                // Keep the one with more proper spacing (more spaces = better formatted)
150                const curSpaces = (curRaw.match(/ /g) || []).length;
151                const prevSpaces = (prevRaw.match(/ /g) || []).length;
152                if (curSpaces >= prevSpaces) children[i - 1].remove();
153                else children[i].remove();
154              } else if (prev.includes(cur) && cur.length / prev.length > 0.8) {
155                children[i].remove();
156              } else if (cur.includes(prev) && prev.length / cur.length > 0.8) {
157                children[i - 1].remove();
158              }
159            }
160          };
161          dedup(clone);
162          clone.querySelectorAll('section, div').forEach(el => {
163            if (el.children && el.children.length > 2) dedup(el);
164          });
165  
166          // --- Lazy-load image src rewrite ---
167          // Many sites render <img src="placeholder.gif" data-src="real.jpg">.
168          // Promote the real URL onto src so both the markdown body and the
169          // image download list reference the same URL.
170          clone.querySelectorAll('img').forEach(img => {
171            const srcset = img.getAttribute('data-srcset') || '';
172            const srcsetFirst = srcset.split(',')[0]?.trim().split(' ')[0] || '';
173            const real = img.getAttribute('data-src')
174              || img.getAttribute('data-original')
175              || img.getAttribute('data-lazy-src')
176              || srcsetFirst;
177            if (real) img.setAttribute('src', real);
178          });
179  
180          result.contentHtml = clone.innerHTML;
181  
182          // --- Image extraction ---
183          const seen = new Set();
184          clone.querySelectorAll('img').forEach(img => {
185            const src = img.getAttribute('src') || '';
186            if (src && !src.startsWith('data:') && !seen.has(src)) {
187              seen.add(src);
188              result.imageUrls.push(src);
189            }
190          });
191  
192          return result;
193        })()
194      `);
195          // Determine Referer from URL for image downloads
196          let referer = '';
197          try {
198              const parsed = new URL(url);
199              referer = parsed.origin + '/';
200          }
201          catch { /* ignore */ }
202          const result = await downloadArticle({
203              title: data?.title || 'untitled',
204              author: data?.author,
205              publishTime: data?.publishTime,
206              sourceUrl: url,
207              contentHtml: data?.contentHtml || '',
208              imageUrls: data?.imageUrls,
209          }, {
210              output: kwargs.output,
211              downloadImages: kwargs['download-images'],
212              imageHeaders: referer ? { Referer: referer } : undefined,
213              stdout: kwargs.stdout,
214          });
215          // `--stdout` is a content-streaming mode. The markdown body already went
216          // to process.stdout inside downloadArticle(), so returning rows here
217          // would make Commander append table/JSON output to the same stdout
218          // stream and break piping.
219          return kwargs.stdout ? null : result;
220      },
221  });
222  export const __test__ = { command };