Cradicle Explorer

read.js
  1  import { EmptyResultError } from '@jackwener/opencli/errors';
  2  import { cli, Strategy } from '@jackwener/opencli/registry';
  3  import { buildTiebaReadItems } from './utils.js';
  4  function getThreadUrl(kwargs) {
  5      const threadId = String(kwargs.id || '');
  6      const pageNumber = Math.max(1, Number(kwargs.page || 1));
  7      return `https://tieba.baidu.com/p/${encodeURIComponent(threadId)}?pn=${pageNumber}`;
  8  }
  9  /**
 10   * Ensure the browser actually landed on the requested thread page before we trust the DOM.
 11   */
 12  function assertTiebaReadTargetPage(raw, kwargs) {
 13      const expectedThreadId = String(kwargs.id || '').trim();
 14      const expectedPageNumber = Math.max(1, Number(kwargs.page || 1));
 15      const pathname = String(raw.pageMeta?.pathname || '').trim();
 16      const actualThreadId = pathname.match(/^\/p\/(\d+)/)?.[1] || '';
 17      const actualPn = String(raw.pageMeta?.pn || '').trim();
 18      if (!actualThreadId || actualThreadId !== expectedThreadId) {
 19          throw new EmptyResultError('tieba read', 'Tieba did not land on the requested thread page');
 20      }
 21      if (expectedPageNumber > 1 && actualPn !== String(expectedPageNumber)) {
 22          throw new EmptyResultError('tieba read', 'Tieba did not land on the requested page');
 23      }
 24  }
/**
 * Build the source text of the extraction script that is handed to `page.evaluate`.
 *
 * The returned string is an async IIFE that:
 *  - polls every 100 ms, for at most 4000 ms, until the title wrap, the content wrap or a
 *    `.pb-comment-item` node exists, and then scrapes whether or not the wait succeeded;
 *  - reads both Vue component props (through `element.__vue__`) and visible DOM text;
 *  - resolves to `{ pageMeta, mainPost, replies }`, where `pageMeta` holds `pathname` and `pn`,
 *    `mainPost` holds `title`, `author`, `fallbackAuthor`, `contentText`, `structuredText`,
 *    `visibleTime`, `structuredTime` and `hasMedia`, and each reply holds `floor`, `author`,
 *    `content` and `time`.
 *
 * Each reply's `time` is the whole whitespace-collapsed `.comment-desc-left` text (the same text
 * the floor number is parsed from), not an isolated timestamp.
 *
 * Regex escapes inside the script are written with doubled backslashes (`\\s`, `\\d`) because the
 * script lives in a template literal; a single backslash would be consumed before the browser
 * ever sees the regex.
 *
 * @returns {string} JavaScript source for an async IIFE.
 */
 25  function buildExtractReadEvaluate() {
 26      return `
 27      (async () => {
 28        const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
 29        const waitFor = async (predicate, timeoutMs = 4000) => {
 30          const start = Date.now();
 31          while (Date.now() - start < timeoutMs) {
 32            if (predicate()) return true;
 33            await wait(100);
 34          }
 35          return false;
 36        };
 37        const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
 38        const getVueProps = (element) => {
 39          const vue = element && element.__vue__ ? element.__vue__ : null;
 40          return vue ? (vue._props || vue.$props || {}) : {};
 41        };
 42        const extractStructuredText = (content) => {
 43          if (!Array.isArray(content)) return '';
 44          return content
 45            .map((part) => (part && typeof part === 'object' && typeof part.text === 'string') ? part.text : '')
 46            .join('')
 47            .replace(/\\s+/g, ' ')
 48            .trim();
 49        };
 50        const parseFloor = (text) => {
 51          const match = (text || '').match(/第(\\d+)楼/);
 52          return match ? parseInt(match[1], 10) : 0;
 53        };
 54  
 55        await waitFor(() => {
 56          const hasMainTree = document.querySelector('.pb-title-wrap.pc-pb-title') || document.querySelector('.pb-content-wrap');
 57          return Boolean(hasMainTree || document.querySelector('.pb-comment-item'));
 58        });
 59  
 60        const titleNode = document.querySelector('.pb-title-wrap.pc-pb-title');
 61        const titleProps = getVueProps(titleNode);
 62        const mainUser = document.querySelector('.head-line.user-info:not(.no-extra-margin)');
 63        const mainUserProps = getVueProps(mainUser);
 64        const contentWrap = document.querySelector('.pb-content-wrap');
 65        const contentProps = getVueProps(contentWrap);
 66        const structuredContent = Array.isArray(contentProps.content) ? contentProps.content : [];
 67        const visibleContent = normalizeText(
 68          contentWrap?.querySelector('.pb-content-item .text')?.textContent
 69          || contentWrap?.querySelector('.text')?.textContent
 70          || contentWrap?.textContent
 71        );
 72  
 73        return {
 74          pageMeta: {
 75            pathname: window.location.pathname || '',
 76            pn: new URLSearchParams(window.location.search).get('pn') || '',
 77          },
 78          mainPost: {
 79            title: typeof titleProps.title === 'string' && titleProps.title.trim()
 80              ? titleProps.title.trim()
 81              : normalizeText(titleNode?.textContent).replace(/-百度贴吧$/, '').trim(),
 82            author: normalizeText(
 83              mainUser?.querySelector('.head-name')?.textContent
 84              || mainUser?.querySelector('.name-info .head-name')?.textContent
 85              || ''
 86            ),
 87            fallbackAuthor: mainUserProps?.userShowInfo?.[0]?.text?.text || '',
 88            contentText: visibleContent,
 89            structuredText: extractStructuredText(structuredContent),
 90            visibleTime: (() => {
 91              const userText = normalizeText(mainUser?.textContent);
 92              const match = userText.match(/(刚刚|昨天|前天|\\d+\\s*(?:分钟|小时|天)前|\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2})?|\\d{4}-\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2})?)/);
 93              return match ? match[1].trim() : '';
 94            })(),
 95            structuredTime: mainUserProps?.descInfo?.time || 0,
 96            hasMedia: structuredContent.length > 0 && !extractStructuredText(structuredContent),
 97          },
 98          replies: Array.from(document.querySelectorAll('.pb-comment-item')).map((item) => {
 99            const meta = item.querySelector('.comment-desc-left')?.textContent?.replace(/\\s+/g, ' ').trim() || '';
100            return {
101              floor: parseFloor(meta),
102              author: item.querySelector('.head-name')?.textContent?.trim() || '',
103              content: item.querySelector('.comment-content .pb-content-item .text')?.textContent?.replace(/\\s+/g, ' ').trim() || '',
104              time: meta,
105            };
106          }),
107        };
108      })()
109    `;
110  }
// Register the `tieba read` command: open a thread page in the browser, run the
// in-page extraction script, verify the landing page, and return the rows built
// by `buildTiebaReadItems`.
cli({
    site: 'tieba',
    name: 'read',
    description: 'Read a tieba thread',
    domain: 'tieba.baidu.com',
    strategy: Strategy.COOKIE,
    browser: true,
    navigateBefore: false,
    args: [
        { name: 'id', positional: true, required: true, type: 'string', help: 'Thread ID' },
        { name: 'page', type: 'int', default: 1, help: 'Page number' },
        { name: 'limit', type: 'int', default: 30, help: 'Number of replies to return' },
    ],
    columns: ['floor', 'author', 'content', 'time'],
    func: async (page, kwargs) => {
        // Normalise the page once and hand the same value to every consumer, so the
        // URL, the landing check and `includeMainPost` cannot disagree (a non-numeric
        // page used to yield NaN here and silently drop the main post).
        const requestedPage = Math.trunc(Number(kwargs.page));
        const pageNumber = Number.isFinite(requestedPage) && requestedPage >= 1 ? requestedPage : 1;
        const target = { ...kwargs, page: pageNumber };
        // Use the browser's normal settle path so we do not scrape stale DOM from the previous tab state.
        await page.goto(getThreadUrl(target));
        const raw = (await page.evaluate(buildExtractReadEvaluate()) || {});
        assertTiebaReadTargetPage(raw, target);
        const items = buildTiebaReadItems(raw, {
            limit: kwargs.limit,
            includeMainPost: pageNumber === 1,
        });
        if (!items.length) {
            throw new EmptyResultError('tieba read', 'Tieba may have blocked the thread page, or the DOM structure may have changed');
        }
        return items;
    },
});