// utils.js
import { CommandExecutionError } from '@jackwener/opencli/errors';

// Anchors that link to a post. The same selectors are used both to wait for
// the page to render and to collect links inside the page script, so the two
// steps cannot drift apart.
const FEED_POST_LINK_SELECTOR = 'a[href*="/home/post/"], a[href*="/p/"]';
const ARCHIVE_POST_LINK_SELECTOR = 'a[href*="/p/"]';

// Upper bound on the number of posts returned by a single scrape.
const MAX_POSTS = 50;

// Pause inside the page script before reading the DOM.
const SETTLE_DELAY_MS = 3000;

/**
 * Clamp a requested post count to the range [1, MAX_POSTS].
 *
 * A missing or non-numeric request makes `Math.min`/`Math.max` return NaN;
 * interpolated into the page script, NaN turns `posts.length >= limit` into a
 * comparison that is never true and silently removes the cap. Such requests
 * fall back to MAX_POSTS instead.
 *
 * @param {*} limit - Requested number of posts.
 * @returns {number} A number between 1 and MAX_POSTS.
 */
function clampLimit(limit) {
  const requested = Number(limit);
  if (Number.isNaN(requested)) {
    return MAX_POSTS;
  }
  return Math.max(1, Math.min(requested, MAX_POSTS));
}

/**
 * Build the Substack URL for a browse category.
 *
 * @param {string} [category] - Category slug; `'tech'` is mapped to
 *   `'technology'`. A falsy value or `'all'` selects the home page.
 * @returns {string} The Substack home page URL or a `/browse/<slug>` URL.
 */
export function buildSubstackBrowseUrl(category) {
  if (!category || category === 'all') {
    return 'https://substack.com/';
  }
  const slug = category === 'tech' ? 'technology' : category;
  return `https://substack.com/browse/${slug}`;
}

/**
 * Build the script evaluated in the page to scrape feed posts.
 *
 * The script walks every anchor matching FEED_POST_LINK_SELECTOR, splits its
 * inner text into normalized lines, and keeps only anchors that have a line
 * containing "read", "watch" or "listen" (the meta line). The meta line is
 * split on '∙' into author and read time; a line shaped like "JAN 5" is taken
 * as the date; the remaining lines (minus UI labels such as "Save"/"More")
 * supply the title and description. Relative hrefs are prefixed with
 * https://substack.com and each URL is reported once.
 *
 * @param {*} limit - Requested number of posts (clamped by clampLimit).
 * @returns {string} Source of an async IIFE that resolves to an array of posts.
 */
function buildFeedScript(limit) {
  return `
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, ${SETTLE_DELAY_MS}));
      const limit = ${clampLimit(limit)};
      const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const posts = [];
      const seen = new Set();

      const allLinks = Array.from(document.querySelectorAll(${JSON.stringify(FEED_POST_LINK_SELECTOR)}));

      for (const linkEl of allLinks) {
        let postUrl = linkEl.getAttribute('href') || '';
        if (!postUrl) continue;
        if (!postUrl.startsWith('http')) postUrl = 'https://substack.com' + postUrl;
        if (seen.has(postUrl)) continue;

        const lines = (linkEl.innerText || '')
          .split('\\n')
          .map((line) => normalize(line))
          .filter(Boolean);

        const readMeta = lines.find((line) => /\\b(read|watch|listen)\\b/i.test(line)) || '';
        if (!readMeta) continue;

        const date = lines.find((line) => /^[A-Z]{3}\\s+\\d{1,2}$/i.test(line)) || '';
        const contentLines = lines.filter((line) =>
          line &&
          line !== date &&
          line !== readMeta &&
          line.toLowerCase() !== 'save' &&
          line.toLowerCase() !== 'more' &&
          !/^(sign in|create account|get app)$/i.test(line),
        );

        const metaParts = readMeta.split('∙').map((part) => normalize(part));
        const author = metaParts[0] || '';
        const readTime = metaParts.slice(1).join(' ∙ ') || readMeta;
        const title = contentLines.length >= 2 ? contentLines[1] : (contentLines[0] || '');
        const description = contentLines.length >= 3 ? contentLines.slice(2).join(' ') : '';
        if (!title) continue;

        seen.add(postUrl);
        posts.push({
          rank: posts.length + 1,
          title,
          author,
          date,
          readTime,
          description: description.slice(0, 150),
          url: postUrl,
        });

        if (posts.length >= limit) break;
      }

      return posts;
    })()
  `;
}

/**
 * Build the script evaluated in the page to scrape a publication's archive.
 *
 * The script groups the text of every anchor matching
 * ARCHIVE_POST_LINK_SELECTOR by resolved URL, skipping the upgrade link,
 * navigation labels and purely numeric texts. For each URL the shortest text
 * longer than three characters becomes the title and the next distinct text
 * the description. The date is the first "Jan 5" / "JAN 5" shaped match found
 * in the text of the anchor's closest article/section/div container.
 *
 * @param {string} origin - Publication base URL without a trailing slash;
 *   prepended to hrefs that do not start with "http".
 * @param {*} limit - Requested number of posts (clamped by clampLimit).
 * @returns {string} Source of an async IIFE that resolves to an array of posts.
 */
function buildArchiveScript(origin, limit) {
  return `
    (async () => {
      await new Promise((resolve) => setTimeout(resolve, ${SETTLE_DELAY_MS}));
      const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const limit = ${clampLimit(limit)};
      const grouped = new Map();

      for (const link of Array.from(document.querySelectorAll(${JSON.stringify(ARCHIVE_POST_LINK_SELECTOR)}))) {
        const rawHref = link.getAttribute('href') || '';
        if (!rawHref || rawHref === '/p/upgrade') continue;

        const url = rawHref.startsWith('http') ? rawHref : ${JSON.stringify(origin)} + rawHref;
        const text = normalize(link.textContent);
        if (!text) continue;
        if (/^(subscribe|paid|home|about|latest|top|discussions)$/i.test(text)) continue;
        if (/^[\\d,]+$/.test(text)) continue;

        const entry = grouped.get(url) || { texts: new Set(), date: '' };
        entry.texts.add(text);

        const container = link.closest('article, section, div') || link.parentElement || link;
        const containerText = normalize(container.textContent);
        if (!entry.date) {
          entry.date = containerText.match(/\\b(?:[A-Z]{3}\\s+\\d{1,2}|[A-Z][a-z]{2}\\s+\\d{1,2})\\b/)?.[0] || '';
        }

        grouped.set(url, entry);
      }

      const posts = [];
      for (const [url, entry] of Array.from(grouped.entries())) {
        const texts = Array.from(entry.texts).map((text) => normalize(text)).filter((text) => text.length > 3).sort((a, b) => a.length - b.length);
        const title = texts[0] || '';
        const description = texts.find((text) => text !== title) || '';
        if (!title) continue;
        posts.push({
          rank: posts.length + 1,
          title,
          date: entry.date,
          description: description.slice(0, 150),
          url,
        });
        if (posts.length >= limit) break;
      }

      return posts;
    })()
  `;
}

/**
 * Open a Substack feed page and scrape the posts it lists.
 *
 * @param {object} page - Browser page handle; its `goto`, `wait` and
 *   `evaluate` methods are called, in that order.
 * @param {string} url - Feed URL to open.
 * @param {number} [limit] - Maximum number of posts, clamped to 1..50; a
 *   missing or non-numeric value is treated as 50.
 * @returns {Promise<Array<object>>} Posts with `rank`, `title`, `author`,
 *   `date`, `readTime`, `description` and `url`; an empty array when the page
 *   script does not return an array.
 * @throws {CommandExecutionError} When `page` is falsy.
 */
export async function loadSubstackFeed(page, url, limit) {
  if (!page) {
    throw new CommandExecutionError('Browser session required for substack feed');
  }
  await page.goto(url);
  // NOTE(review): the unit of `timeout` is defined by the page API — presumably seconds; confirm.
  await page.wait({ selector: FEED_POST_LINK_SELECTOR, timeout: 5 });
  const data = await page.evaluate(buildFeedScript(limit));
  return Array.isArray(data) ? data : [];
}

/**
 * Open a publication's `/archive` page and scrape the posts it lists.
 *
 * @param {object} page - Browser page handle; its `goto`, `wait` and
 *   `evaluate` methods are called, in that order.
 * @param {string} baseUrl - Publication base URL. Trailing slashes are
 *   ignored so the archive URL and resolved post URLs never contain `//`.
 * @param {number} [limit] - Maximum number of posts, clamped to 1..50; a
 *   missing or non-numeric value is treated as 50.
 * @returns {Promise<Array<object>>} Posts with `rank`, `title`, `date`,
 *   `description` and `url`; an empty array when the page script does not
 *   return an array.
 * @throws {CommandExecutionError} When `page` is falsy.
 */
export async function loadSubstackArchive(page, baseUrl, limit) {
  if (!page) {
    throw new CommandExecutionError('Browser session required for substack archive');
  }
  const origin = String(baseUrl).replace(/\/+$/, '');
  await page.goto(`${origin}/archive`);
  // NOTE(review): the unit of `timeout` is defined by the page API — presumably seconds; confirm.
  await page.wait({ selector: ARCHIVE_POST_LINK_SELECTOR, timeout: 5 });
  const data = await page.evaluate(buildArchiveScript(origin, limit));
  return Array.isArray(data) ? data : [];
}

export const __test__ = {
  FEED_POST_LINK_SELECTOR,
  ARCHIVE_POST_LINK_SELECTOR,
};