// read.js
import { EmptyResultError } from '@jackwener/opencli/errors';
import { cli, Strategy } from '@jackwener/opencli/registry';
import { buildTiebaReadItems } from './utils.js';

/**
 * Normalize the user-supplied thread id.
 *
 * The same trimmed value is used both to build the URL and to verify the
 * landing page, so the two can never disagree about surrounding whitespace.
 *
 * @param {{ id?: unknown }} kwargs - Parsed CLI arguments.
 * @returns {string} The trimmed thread id, or '' when none was given.
 */
function normalizeThreadId(kwargs) {
    return String(kwargs.id || '').trim();
}

/**
 * Normalize the requested page number to an integer >= 1.
 *
 * Missing, non-numeric, non-finite, zero, or negative input falls back to 1.
 * Without the finiteness check, `Math.max(1, NaN)` would yield NaN and leak
 * into the URL as `?pn=NaN`.
 *
 * @param {{ page?: unknown }} kwargs - Parsed CLI arguments.
 * @returns {number} A finite integer page number, at least 1.
 */
function normalizePageNumber(kwargs) {
    const parsed = Math.trunc(Number(kwargs.page || 1));
    return Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
}

/**
 * Build the thread URL for the requested id and page.
 *
 * @param {{ id?: unknown, page?: unknown }} kwargs - Parsed CLI arguments.
 * @returns {string} URL of the form `https://tieba.baidu.com/p/<id>?pn=<page>`.
 */
function getThreadUrl(kwargs) {
    const threadId = normalizeThreadId(kwargs);
    const pageNumber = normalizePageNumber(kwargs);
    return `https://tieba.baidu.com/p/${encodeURIComponent(threadId)}?pn=${pageNumber}`;
}

/**
 * Ensure the browser actually landed on the requested thread page before we trust the DOM.
 *
 * @param {{ pageMeta?: { pathname?: string, pn?: string } }} raw - Result of the in-page extraction script.
 * @param {{ id?: unknown, page?: unknown }} kwargs - Parsed CLI arguments.
 * @throws {EmptyResultError} When the landed pathname does not carry the expected
 *   thread id, or (for pages after the first) the `pn` query value differs.
 */
function assertTiebaReadTargetPage(raw, kwargs) {
    const expectedThreadId = normalizeThreadId(kwargs);
    const expectedPageNumber = normalizePageNumber(kwargs);
    const pathname = String(raw.pageMeta?.pathname || '').trim();
    const actualThreadId = pathname.match(/^\/p\/(\d+)/)?.[1] || '';
    const actualPn = String(raw.pageMeta?.pn || '').trim();
    if (!actualThreadId || actualThreadId !== expectedThreadId) {
        throw new EmptyResultError('tieba read', 'Tieba did not land on the requested thread page');
    }
    // The `pn` value is only enforced for pages after the first; on page 1 any
    // (or no) `pn` value is accepted.
    if (expectedPageNumber > 1 && actualPn !== String(expectedPageNumber)) {
        throw new EmptyResultError('tieba read', 'Tieba did not land on the requested page');
    }
}

/**
 * Build the source of the script that is evaluated inside the thread page.
 *
 * The script polls (every 100 ms, up to 4000 ms) for the title/content/comment
 * nodes, then returns a plain object with:
 *   - `pageMeta`: `{ pathname, pn }` taken from `window.location`;
 *   - `mainPost`: title, author, fallbackAuthor, contentText, structuredText,
 *     visibleTime, structuredTime and hasMedia, read from the DOM and from the
 *     `__vue__` props attached to the title, user and content nodes;
 *   - `replies`: one `{ floor, author, content, time }` entry per `.pb-comment-item`.
 *
 * Backslashes in the embedded regular expressions are doubled because the
 * script lives inside a template literal.
 *
 * @returns {string} JavaScript source of an async IIFE.
 */
function buildExtractReadEvaluate() {
    return `
    (async () => {
      const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
      const waitFor = async (predicate, timeoutMs = 4000) => {
        const start = Date.now();
        while (Date.now() - start < timeoutMs) {
          if (predicate()) return true;
          await wait(100);
        }
        return false;
      };
      const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const getVueProps = (element) => {
        const vue = element && element.__vue__ ? element.__vue__ : null;
        return vue ? (vue._props || vue.$props || {}) : {};
      };
      const extractStructuredText = (content) => {
        if (!Array.isArray(content)) return '';
        return content
          .map((part) => (part && typeof part === 'object' && typeof part.text === 'string') ? part.text : '')
          .join('')
          .replace(/\\s+/g, ' ')
          .trim();
      };
      const parseFloor = (text) => {
        const match = (text || '').match(/第(\\d+)楼/);
        return match ? parseInt(match[1], 10) : 0;
      };

      await waitFor(() => {
        const hasMainTree = document.querySelector('.pb-title-wrap.pc-pb-title') || document.querySelector('.pb-content-wrap');
        return Boolean(hasMainTree || document.querySelector('.pb-comment-item'));
      });

      const titleNode = document.querySelector('.pb-title-wrap.pc-pb-title');
      const titleProps = getVueProps(titleNode);
      const mainUser = document.querySelector('.head-line.user-info:not(.no-extra-margin)');
      const mainUserProps = getVueProps(mainUser);
      const contentWrap = document.querySelector('.pb-content-wrap');
      const contentProps = getVueProps(contentWrap);
      const structuredContent = Array.isArray(contentProps.content) ? contentProps.content : [];
      const visibleContent = normalizeText(
        contentWrap?.querySelector('.pb-content-item .text')?.textContent
          || contentWrap?.querySelector('.text')?.textContent
          || contentWrap?.textContent
      );

      return {
        pageMeta: {
          pathname: window.location.pathname || '',
          pn: new URLSearchParams(window.location.search).get('pn') || '',
        },
        mainPost: {
          title: typeof titleProps.title === 'string' && titleProps.title.trim()
            ? titleProps.title.trim()
            : normalizeText(titleNode?.textContent).replace(/-百度贴吧$/, '').trim(),
          author: normalizeText(
            mainUser?.querySelector('.head-name')?.textContent
              || mainUser?.querySelector('.name-info .head-name')?.textContent
              || ''
          ),
          fallbackAuthor: mainUserProps?.userShowInfo?.[0]?.text?.text || '',
          contentText: visibleContent,
          structuredText: extractStructuredText(structuredContent),
          visibleTime: (() => {
            const userText = normalizeText(mainUser?.textContent);
            const match = userText.match(/(刚刚|昨天|前天|\\d+\\s*(?:分钟|小时|天)前|\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2})?|\\d{4}-\\d{2}-\\d{2}(?:\\s+\\d{2}:\\d{2})?)/);
            return match ? match[1].trim() : '';
          })(),
          structuredTime: mainUserProps?.descInfo?.time || 0,
          hasMedia: structuredContent.length > 0 && !extractStructuredText(structuredContent),
        },
        replies: Array.from(document.querySelectorAll('.pb-comment-item')).map((item) => {
          const meta = item.querySelector('.comment-desc-left')?.textContent?.replace(/\\s+/g, ' ').trim() || '';
          return {
            floor: parseFloor(meta),
            author: item.querySelector('.head-name')?.textContent?.trim() || '',
            content: item.querySelector('.comment-content .pb-content-item .text')?.textContent?.replace(/\\s+/g, ' ').trim() || '',
            time: meta,
          };
        }),
      };
    })()
  `;
}

// Register the `tieba read` command. The handler navigates to the thread page
// itself, runs the extraction script, verifies the landing page, and returns
// the rows built by `buildTiebaReadItems`.
cli({
    site: 'tieba',
    name: 'read',
    description: 'Read a tieba thread',
    domain: 'tieba.baidu.com',
    strategy: Strategy.COOKIE,
    browser: true,
    // NOTE(review): presumably disables framework-driven navigation, since the
    // handler below calls page.goto() itself — confirm against the registry docs.
    navigateBefore: false,
    args: [
        { name: 'id', positional: true, required: true, type: 'string', help: 'Thread ID' },
        { name: 'page', type: 'int', default: 1, help: 'Page number' },
        { name: 'limit', type: 'int', default: 30, help: 'Number of replies to return' },
    ],
    columns: ['floor', 'author', 'content', 'time'],
    func: async (page, kwargs) => {
        const pageNumber = normalizePageNumber(kwargs);
        // Use the browser's normal settle path so we do not scrape stale DOM from the previous tab state.
        await page.goto(getThreadUrl(kwargs));
        // A falsy evaluate result is treated as an empty payload so the landing
        // assertion below reports the failure as an EmptyResultError.
        const raw = (await page.evaluate(buildExtractReadEvaluate()) || {});
        assertTiebaReadTargetPage(raw, kwargs);
        const items = buildTiebaReadItems(raw, {
            limit: kwargs.limit,
            // The main post is only requested when reading the first page.
            includeMainPost: pageNumber === 1,
        });
        if (!items.length) {
            throw new EmptyResultError('tieba read', 'Tieba may have blocked the thread page, or the DOM structure may have changed');
        }
        return items;
    },
});