/ clis / band / post.js
post.js
  1  import { AuthRequiredError, EmptyResultError } from '@jackwener/opencli/errors';
  2  import { formatCookieHeader } from '@jackwener/opencli/download';
  3  import { downloadMedia } from '@jackwener/opencli/download/media-download';
  4  import { cli, Strategy } from '@jackwener/opencli/registry';
  5  /**
  6   * band post — Export full content of a Band post: body, comments, and optional photo download.
  7   *
  8   * Navigates directly to the post URL and extracts everything from the DOM.
  9   * No XHR interception needed — Band renders the full post for logged-in users.
 10   *
 11   * Output rows:
 12   *   type=post    → the post itself (author, date, body text)
 13   *   type=comment → top-level comment
 14   *   type=reply   → reply to a comment (nested under its parent)
 15   *
 16   * Photo thumbnail URLs carry a ?type=sNNN suffix; stripping it yields full-res.
 17   */
 18  cli({
 19      site: 'band',
 20      name: 'post',
 21      description: 'Export full content of a post including comments',
 22      domain: 'www.band.us',
 23      strategy: Strategy.COOKIE,
 24      navigateBefore: false,
 25      browser: true,
 26      args: [
 27          { name: 'band_no', positional: true, required: true, type: 'int', help: 'Band number' },
 28          { name: 'post_no', positional: true, required: true, type: 'int', help: 'Post number' },
 29          { name: 'output', type: 'str', default: '', help: 'Directory to save attached photos' },
 30          { name: 'comments', type: 'bool', default: true, help: 'Include comments (default: true)' },
 31      ],
 32      columns: ['type', 'author', 'date', 'text'],
 33      func: async (page, kwargs) => {
 34          const bandNo = Number(kwargs.band_no);
 35          const postNo = Number(kwargs.post_no);
 36          const outputDir = kwargs.output;
 37          const withComments = kwargs.comments;
 38          await page.goto(`https://www.band.us/band/${bandNo}/post/${postNo}`);
 39          const cookies = await page.getCookies({ domain: 'band.us' });
 40          const isLoggedIn = cookies.some(c => c.name === 'band_session');
 41          if (!isLoggedIn)
 42              throw new AuthRequiredError('band.us', 'Not logged in to Band');
 43          const data = await page.evaluate(`
 44        (async () => {
 45          const withComments = ${withComments};
 46          const sleep = ms => new Promise(r => setTimeout(r, ms));
 47          const norm = s => (s || '').replace(/\\s+/g, ' ').trim();
 48          // Band embeds <band:mention>, <band:sticker>, etc. in content — strip to plain text.
 49          const stripTags = s => s.replace(/<\\/?band:[^>]+>/g, '');
 50  
 51          // Wait up to 9 s for the post content to render (poll for the author link,
 52          // which appears after React hydration fills the post header).
 53          for (let i = 0; i < 30; i++) {
 54            if (document.querySelector('._postWrapper a.text')) break;
 55            await sleep(300);
 56          }
 57  
 58          const postCard = document.querySelector('._postWrapper');
 59          const commentSection = postCard?.querySelector('.dPostCommentMainView');
 60  
 61          // Author and date live in the post header, above the comment section.
 62          // Exclude any matches inside the comment section to avoid picking up comment authors.
 63          let author = '', date = '';
 64          for (const el of (postCard?.querySelectorAll('a.text') || [])) {
 65            if (!commentSection?.contains(el)) { author = norm(el.textContent); break; }
 66          }
 67          for (const el of (postCard?.querySelectorAll('time.time') || [])) {
 68            if (!commentSection?.contains(el)) { date = norm(el.textContent); break; }
 69          }
 70  
 71          const bodyEl = postCard?.querySelector('.postText._postText');
 72          const text = bodyEl ? stripTags(norm(bodyEl.innerText || bodyEl.textContent)) : '';
 73  
 74          // Photo thumbnails have a ?type=sNNN query param; strip it for full-res URL.
 75          // Use location.href as base so protocol-relative or relative URLs resolve correctly.
 76          const photos = Array.from(postCard?.querySelectorAll('img._imgRecentPhoto, img._imgPhoto') || [])
 77            .map(img => {
 78              const src = img.getAttribute('src') || '';
 79              if (!src) return '';
 80              try { const u = new URL(src, location.href); return u.origin + u.pathname; }
 81              catch { return ''; }
 82            })
 83            .filter(Boolean);
 84  
 85          if (!withComments) return { author, date, text, photos, comments: [] };
 86  
 87          // Wait up to 6 s for the comment list container to render.
 88          // Wait for the container itself (not .cComment) so posts with zero comments
 89          // don't incur a fixed 6s delay waiting for an element that never appears.
 90          for (let i = 0; i < 20; i++) {
 91            if (postCard?.querySelector('.sCommentList._heightDetectAreaForComment')) break;
 92            await sleep(300);
 93          }
 94  
 95          // Recursively collect comments and their replies.
 96          // Replies live in .sReplyList > .sCommentList, not in ._replyRegion.
 97          function extractComments(container, depth) {
 98            const results = [];
 99            for (const el of container.querySelectorAll(':scope > .cComment')) {
100              results.push({
101                depth,
102                author: norm(el.querySelector('strong.name')?.textContent),
103                date:   norm(el.querySelector('time.time')?.textContent),
104                text:   stripTags(norm(el.querySelector('p.txt._commentContent')?.innerText || '')),
105              });
106              const replyList = el.querySelector('.sReplyList .sCommentList._heightDetectAreaForComment');
107              if (replyList) results.push(...extractComments(replyList, depth + 1));
108            }
109            return results;
110          }
111  
112          const commentList = postCard?.querySelector('.sCommentList._heightDetectAreaForComment');
113          const comments = commentList ? extractComments(commentList, 0) : [];
114  
115          return { author, date, text, photos, comments };
116        })()
117      `);
118          if (!data?.text && !data?.comments?.length && !data?.photos?.length) {
119              throw new EmptyResultError('band post', 'Post not found or not accessible');
120          }
121          const photos = data.photos ?? [];
122          // Download photos when --output is specified, using the shared downloadMedia utility
123          // which handles redirects, timeouts, and stream errors correctly.
124          // Pass browser cookies so Band's login-protected photo URLs don't fail with 401/403.
125          if (outputDir && photos.length > 0) {
126              // Only send Band cookies to Band-hosted URLs; avoid leaking auth cookies to third-party CDNs.
127              // Use a global index across both batches so filenames don't collide (photo_1, photo_2, ...).
128              const cookieHeader = formatCookieHeader(await page.getCookies({ url: 'https://www.band.us' }));
129              const isBandUrl = (u) => { try {
130                  const h = new URL(u).hostname;
131                  return h === 'band.us' || h.endsWith('.band.us');
132              }
133              catch {
134                  return false;
135              } };
136              // Derive extension from URL path so downloaded files have correct extensions (e.g. photo_1.jpg).
137              const urlExt = (u) => { try {
138                  return new URL(u).pathname.match(/\.(\w+)$/)?.[1] ?? 'jpg';
139              }
140              catch {
141                  return 'jpg';
142              } };
143              let globalIndex = 1;
144              const bandPhotos = photos.filter(isBandUrl);
145              const otherPhotos = photos.filter(u => !isBandUrl(u));
146              if (bandPhotos.length > 0) {
147                  await downloadMedia(bandPhotos.map(url => ({ type: 'image', url, filename: `photo_${globalIndex++}.${urlExt(url)}` })), { output: outputDir, verbose: false, cookies: cookieHeader });
148              }
149              if (otherPhotos.length > 0) {
150                  await downloadMedia(otherPhotos.map(url => ({ type: 'image', url, filename: `photo_${globalIndex++}.${urlExt(url)}` })), { output: outputDir, verbose: false });
151              }
152          }
153          const rows = [];
154          // Post row — append photo URLs inline when not downloading to disk.
155          rows.push({
156              type: 'post',
157              author: data.author ?? '',
158              date: data.date ?? '',
159              text: [
160                  data.text ?? '',
161                  ...(outputDir ? [] : photos.map((u, i) => `[photo${i + 1}] ${u}`)),
162              ].filter(Boolean).join('\n'),
163          });
164          // Comment rows — depth=0 → type 'comment', depth≥1 → type 'reply'.
165          for (const c of data.comments ?? []) {
166              rows.push({
167                  type: c.depth === 0 ? 'comment' : 'reply',
168                  author: c.author ?? '',
169                  date: c.date ?? '',
170                  text: c.depth > 0 ? '  '.repeat(c.depth) + '└ ' + (c.text ?? '') : (c.text ?? ''),
171              });
172          }
173          return rows;
174      },
175  });