// clis/substack/utils.js
  1  import { CommandExecutionError } from '@jackwener/opencli/errors';
  2  const FEED_POST_LINK_SELECTOR = 'a[href*="/home/post/"], a[href*="/p/"]';
  3  const ARCHIVE_POST_LINK_SELECTOR = 'a[href*="/p/"]';
  4  export function buildSubstackBrowseUrl(category) {
  5      if (!category || category === 'all')
  6          return 'https://substack.com/';
  7      const slug = category === 'tech' ? 'technology' : category;
  8      return `https://substack.com/browse/${slug}`;
  9  }
 10  export async function loadSubstackFeed(page, url, limit) {
 11      if (!page)
 12          throw new CommandExecutionError('Browser session required for substack feed');
 13      await page.goto(url);
 14      await page.wait({ selector: FEED_POST_LINK_SELECTOR, timeout: 5 });
 15      const data = await page.evaluate(`
 16      (async () => {
 17        await new Promise((resolve) => setTimeout(resolve, 3000));
 18        const limit = ${Math.max(1, Math.min(limit, 50))};
 19        const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
 20        const posts = [];
 21        const seen = new Set();
 22  
 23        const allLinks = Array.from(document.querySelectorAll('a')).filter((link) => {
 24          const href = link.getAttribute('href') || '';
 25          return href.includes('/home/post/') || href.includes('/p/');
 26        });
 27  
 28        for (const linkEl of allLinks) {
 29          let postUrl = linkEl.getAttribute('href') || '';
 30          if (!postUrl) continue;
 31          if (!postUrl.startsWith('http')) postUrl = 'https://substack.com' + postUrl;
 32          if (seen.has(postUrl)) continue;
 33  
 34          const lines = (linkEl.innerText || '')
 35            .split('\\n')
 36            .map((line) => normalize(line))
 37            .filter(Boolean);
 38  
 39          const readMeta = lines.find((line) => /\\b(read|watch|listen)\\b/i.test(line)) || '';
 40          if (!readMeta) continue;
 41  
 42          const date = lines.find((line) => /^[A-Z]{3}\\s+\\d{1,2}$/i.test(line)) || '';
 43          const contentLines = lines.filter((line) =>
 44            line &&
 45            line !== date &&
 46            line !== readMeta &&
 47            line.toLowerCase() !== 'save' &&
 48            line.toLowerCase() !== 'more' &&
 49            !/^(sign in|create account|get app)$/i.test(line),
 50          );
 51  
 52          const metaParts = readMeta.split('∙').map((part) => normalize(part));
 53          const author = metaParts[0] || '';
 54          const readTime = metaParts.slice(1).join(' ∙ ') || readMeta;
 55          const title = contentLines.length >= 2 ? contentLines[1] : (contentLines[0] || '');
 56          const description = contentLines.length >= 3 ? contentLines.slice(2).join(' ') : '';
 57          if (!title) continue;
 58  
 59          seen.add(postUrl);
 60          posts.push({
 61            rank: posts.length + 1,
 62            title,
 63            author,
 64            date,
 65            readTime,
 66            description: description.slice(0, 150),
 67            url: postUrl,
 68          });
 69  
 70          if (posts.length >= limit) break;
 71        }
 72  
 73        return posts;
 74      })()
 75    `);
 76      return Array.isArray(data) ? data : [];
 77  }
 78  export async function loadSubstackArchive(page, baseUrl, limit) {
 79      if (!page)
 80          throw new CommandExecutionError('Browser session required for substack archive');
 81      await page.goto(`${baseUrl}/archive`);
 82      await page.wait({ selector: ARCHIVE_POST_LINK_SELECTOR, timeout: 5 });
 83      const data = await page.evaluate(`
 84      (async () => {
 85        await new Promise((resolve) => setTimeout(resolve, 3000));
 86        const normalize = (value) => (value || '').replace(/\\s+/g, ' ').trim();
 87        const limit = ${Math.max(1, Math.min(limit, 50))};
 88        const grouped = new Map();
 89  
 90        for (const link of Array.from(document.querySelectorAll('a[href*="/p/"]'))) {
 91          const rawHref = link.getAttribute('href') || '';
 92          if (!rawHref || rawHref === '/p/upgrade') continue;
 93  
 94          const url = rawHref.startsWith('http') ? rawHref : ${JSON.stringify(baseUrl)} + rawHref;
 95          const text = normalize(link.textContent);
 96          if (!text) continue;
 97          if (/^(subscribe|paid|home|about|latest|top|discussions)$/i.test(text)) continue;
 98          if (/^[\\d,]+$/.test(text)) continue;
 99  
100          const entry = grouped.get(url) || { texts: new Set(), date: '' };
101          entry.texts.add(text);
102  
103          const container = link.closest('article, section, div') || link.parentElement || link;
104          const containerText = normalize(container.textContent);
105          if (!entry.date) {
106            entry.date = containerText.match(/\\b(?:[A-Z]{3}\\s+\\d{1,2}|[A-Z][a-z]{2}\\s+\\d{1,2})\\b/)?.[0] || '';
107          }
108  
109          grouped.set(url, entry);
110        }
111  
112        const posts = [];
113        for (const [url, entry] of Array.from(grouped.entries())) {
114          const texts = Array.from(entry.texts).map((text) => normalize(text)).filter((text) => text.length > 3).sort((a, b) => a.length - b.length);
115          const title = texts[0] || '';
116          const description = texts.find((text) => text !== title) || '';
117          if (!title) continue;
118          posts.push({
119            rank: posts.length + 1,
120            title,
121            date: entry.date,
122            description: description.slice(0, 150),
123            url,
124          });
125          if (posts.length >= limit) break;
126        }
127  
128        return posts;
129      })()
130    `);
131      return Array.isArray(data) ? data : [];
132  }
// Bundle of module-private selector constants exported under one name
// (presumably so tests can reach them, per the `__test__` name — confirm with callers).
const __test__ = {
    FEED_POST_LINK_SELECTOR: FEED_POST_LINK_SELECTOR,
    ARCHIVE_POST_LINK_SELECTOR: ARCHIVE_POST_LINK_SELECTOR,
};
export { __test__ };