search.js
/**
 * PowerChina search — queries the JSON announcement list API first and falls
 * back to browser DOM extraction with multi-entry URL probing when the API
 * call fails.
 */
import { cli, Strategy } from '@jackwener/opencli/registry';
import { AuthRequiredError } from '@jackwener/opencli/errors';
import {
  cleanText,
  normalizeDate,
  toProcurementSearchRecords,
} from '../jianyu/shared/procurement-contract.js';
import { searchRowsFromEntries } from '../jianyu/shared/china-bid-search.js';

// Entry pages probed by the DOM fallback when no keyword-specific URL works.
const SEARCH_ENTRIES = [
  'https://bid.powerchina.cn/search',
  'https://bid.powerchina.cn/',
];
const API_LIST_ENDPOINT = 'https://bid.powerchina.cn/newcbs/recpro-newmember/BidAnnouncementSummary/list';
const API_DETAIL_ENDPOINT = 'https://bid.powerchina.cn/newcbs/recpro-newmember/BidAnnouncementSummary/getInfo';
const API_DEFAULT_ANNOUNCEMENT_TYPE = '招采公告';

// Titles matching this look like procurement notices and are never dropped as navigation.
const PROCUREMENT_TITLE_HINT = /(公告|招标|采购|中标|成交|项目|notice|tender|bidding)/i;
// Exact-match titles of language switchers / portal links.
const NAVIGATION_TITLE_HINT = /^(english|中文|chinese|language|home|首页|搜索|search)$/i;
// Error messages that indicate the browser context went away mid-command.
const RETRYABLE_SEARCH_ERROR_HINT = /(detached while handling command|execution context was destroyed|target closed|cannot find context with specified id)/i;

/**
 * Build the ordered list of URLs the DOM fallback should probe for a query.
 *
 * @param {string} query - Raw search keyword; nullish or non-string values are
 *   coerced to a string instead of throwing.
 * @returns {string[]} Keyword-specific search URLs followed by the generic
 *   entry pages, or only the entry pages when the keyword is blank.
 */
export function buildSearchCandidates(query) {
  const keyword = String(query ?? '').trim();
  if (!keyword) return [...SEARCH_ENTRIES];
  const encoded = encodeURIComponent(keyword);
  return [
    `https://bid.powerchina.cn/search?keyword=${encoded}`,
    `https://bid.powerchina.cn/search?keywords=${encoded}`,
    `https://bid.powerchina.cn/search?q=${encoded}`,
    ...SEARCH_ENTRIES,
  ];
}

/**
 * Remove duplicate candidates, keeping the first occurrence of each
 * (title, url) pair and preserving input order.
 *
 * @param {Array<{title: string, url: string}>} items - Candidate rows.
 * @returns {Array<object>} New array without duplicates.
 */
function dedupeCandidates(items) {
  const deduped = [];
  const seen = new Set();
  for (const item of items) {
    const key = `${item.title}\t${item.url}`;
    if (seen.has(key)) continue;
    seen.add(key);
    deduped.push(item);
  }
  return deduped;
}

/**
 * Decide whether a URL points at a portal/navigation page rather than an
 * individual announcement.
 *
 * @param {string} rawUrl - URL text to inspect.
 * @returns {boolean} True for blank or unparseable URLs, the site root/index,
 *   the search page, the /old, /en and /zh sections, and hash routes for the
 *   index or search views.
 */
function isLikelyNavigationUrl(rawUrl) {
  const urlText = cleanText(rawUrl);
  if (!urlText) return true;
  try {
    const parsed = new URL(urlText);
    // Strip trailing slashes so "/search/" and "/search" compare equal.
    const pathname = parsed.pathname.toLowerCase().replace(/\/+$/, '') || '/';
    const hash = cleanText(parsed.hash).toLowerCase();
    if (pathname === '/' || pathname === '/index') return true;
    if (pathname === '/search') return true;
    if (pathname === '/old' || pathname.startsWith('/old/')) return true;
    if (pathname === '/en' || pathname.startsWith('/en/')) return true;
    if (pathname === '/zh' || pathname.startsWith('/zh/')) return true;
    if (hash === '#/' || hash === '#/index' || hash.startsWith('#/search')) return true;
    return false;
  } catch {
    // A URL that cannot be parsed cannot be a usable result link.
    return true;
  }
}

/**
 * Decide whether a link title looks like a navigation label (language
 * switcher, home, search) rather than an announcement title.
 *
 * @param {string} rawTitle - Link text to inspect.
 * @returns {boolean} True for blank titles and known navigation labels.
 */
function isLikelyNavigationTitle(rawTitle) {
  const title = cleanText(rawTitle);
  if (!title) return true;
  const normalized = title.toLowerCase();
  if (NAVIGATION_TITLE_HINT.test(normalized)) return true;
  if (normalized.length <= 10 && (normalized === 'en' || normalized === 'zh' || normalized.includes('english'))) {
    return true;
  }
  return false;
}

/**
 * Drop rows that are missing a title or URL, or that look like
 * navigation/portal links.
 *
 * @param {Array<{title: string, url: string}>} items - Candidate rows.
 * @returns {Array<object>} Rows that survive the navigation filters.
 */
function filterNavigationRows(items) {
  return items.filter((item) => {
    const title = cleanText(item.title);
    const url = cleanText(item.url);
    if (!url || !title) return false;
    if (isLikelyNavigationUrl(url)) return false;
    // A navigation-looking title is kept if it also reads like a procurement notice.
    if (isLikelyNavigationTitle(title) && !PROCUREMENT_TITLE_HINT.test(title)) return false;
    return true;
  });
}

/**
 * Build the detail endpoint URL for an announcement id.
 *
 * @param {string} id - Announcement id as returned by the list API.
 * @returns {string} Detail URL with the id percent-encoded, or '' when the id
 *   is blank after cleaning.
 */
export function buildApiDetailUrl(id) {
  const normalizedId = cleanText(id);
  if (!normalizedId) return '';
  return `${API_DETAIL_ENDPOINT}/${encodeURIComponent(normalizedId)}`;
}

/**
 * Convert one row of the list API response into a search candidate.
 *
 * @param {object} row - Raw API row; reads id, title, announcementType,
 *   titleTypeName, source, publishTime, registrationDeadline,
 *   submissionDeadline and bidOpenTime.
 * @returns {{title: string, url: string, date: *, contextText: string} | null}
 *   The candidate, or null when the row is not an object or lacks an id/title.
 */
function toApiCandidate(row) {
  // One malformed entry must not abort mapping of the whole response.
  if (!row || typeof row !== 'object') return null;

  const id = cleanText(row.id);
  const title = cleanText(row.title);
  if (!id || !title) return null;

  const url = buildApiDetailUrl(id);
  if (!url) return null;

  const contextText = cleanText([
    row.announcementType,
    row.titleTypeName,
    row.source,
    row.publishTime,
    row.registrationDeadline,
    row.submissionDeadline,
    row.bidOpenTime,
  ].filter(Boolean).join(' | '));

  // Prefer the publish time; fall back to the other timestamps when it is absent.
  const date = normalizeDate(cleanText(row.publishTime || row.bidOpenTime || row.submissionDeadline || ''));
  return {
    title,
    url,
    date,
    contextText,
  };
}

/**
 * Query the announcement list API and map its rows to search candidates.
 *
 * @param {string} query - Search keyword; omitted from the payload when blank.
 * @param {number} limit - Maximum number of candidates to return.
 * @returns {Promise<Array<object>>} Deduplicated candidates, at most `limit`.
 * @throws {Error} When the HTTP status is not OK, the body is not a JSON
 *   object, or the body carries a `code` other than 200.
 */
async function searchRowsFromApi(query, limit) {
  const keyword = cleanText(query);
  // Over-fetch (3x) so that rows dropped later still leave enough results; clamp to [20, 100].
  const pageSize = Math.max(20, Math.min(100, limit * 3));
  const payload = {
    pageNum: 1,
    pageSize,
    announcementType: API_DEFAULT_ANNOUNCEMENT_TYPE,
    companyType: '3',
    time: Date.now(),
  };
  if (keyword) payload.keyWords = keyword;

  const response = await fetch(API_LIST_ENDPOINT, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json;charset=utf-8',
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`[taxonomy=relay_unavailable] site=powerchina command=search api HTTP ${response.status}`);
  }

  const data = await response.json();
  // A null/primitive body is an API failure, not an empty result set.
  if (!data || typeof data !== 'object') {
    throw new Error('[taxonomy=relay_unavailable] site=powerchina command=search api returned non-object payload');
  }
  // A missing code is treated as success.
  if ((data.code ?? 200) !== 200) {
    throw new Error(`[taxonomy=relay_unavailable] site=powerchina command=search api code=${data.code ?? 'unknown'} msg=${cleanText(data.msg)}`);
  }

  const rows = Array.isArray(data.rows) ? data.rows : [];
  const mapped = rows
    .map((row) => toApiCandidate(row))
    .filter(Boolean);
  return dedupeCandidates(mapped).slice(0, limit);
}

cli({
  site: 'powerchina',
  name: 'search',
  description: '搜索中国电建阳光采购公告',
  domain: 'bid.powerchina.cn',
  strategy: Strategy.COOKIE,
  browser: true,
  args: [
    { name: 'query', required: true, positional: true, help: 'Search keyword, e.g. "procurement"' },
    { name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' },
  ],
  columns: ['rank', 'content_type', 'title', 'publish_time', 'project_code', 'budget_or_limit', 'url'],
  /**
   * Run the search: API first, DOM probing as a fallback, then filter out
   * navigation rows and convert to procurement records.
   */
  func: async (page, kwargs) => {
    const query = cleanText(kwargs.query);
    // Missing, zero or non-numeric limits fall back to 20; the result is clamped to [1, 50].
    const limit = Math.max(1, Math.min(Number(kwargs.limit) || 20, 50));
    let extractedRows = [];
    let apiFailure = null;
    let apiSucceeded = false;

    try {
      const apiRows = await searchRowsFromApi(query, limit);
      extractedRows = apiRows;
      apiSucceeded = true;
    } catch (error) {
      // Remember why the API failed; it is reported only if the fallback also finds nothing.
      apiFailure = cleanText(error instanceof Error ? error.message : String(error || ''));
    }

    // A successful API call with no rows is a genuine empty result.
    if (apiSucceeded && extractedRows.length === 0) {
      return [];
    }

    if (!apiSucceeded) {
      try {
        extractedRows = await searchRowsFromEntries(page, {
          query,
          candidateUrls: buildSearchCandidates(query),
          allowedHostFragments: ['bid.powerchina.cn', 'powerchina.cn'],
          limit,
        });
      } catch (error) {
        const message = cleanText(error instanceof Error ? error.message : String(error || ''));
        if (RETRYABLE_SEARCH_ERROR_HINT.test(message)) {
          // Keep the original error attached so its stack is not lost.
          throw new Error(
            `[taxonomy=relay_unavailable] site=powerchina command=search detached browser context: ${message}`,
            { cause: error },
          );
        }
        throw error;
      }
    }

    const rows = filterNavigationRows(
      dedupeCandidates(extractedRows).map((item) => ({
        title: cleanText(item.title),
        url: cleanText(item.url),
        date: normalizeDate(cleanText(item.date)),
        contextText: cleanText(item.contextText),
      })),
    );

    if (rows.length === 0 && extractedRows.length > 0) {
      throw new Error('[taxonomy=empty_result] site=powerchina command=search extracted only navigation/portal rows');
    }

    if (rows.length === 0) {
      // Nothing was extracted: check whether the page is asking for login or a captcha.
      const pageText = cleanText(await page.evaluate('document.body ? document.body.innerText : ""'));
      if (/(请先登录|未登录|登录后|验证码|人机验证)/.test(pageText)) {
        throw new AuthRequiredError(
          'bid.powerchina.cn',
          '[taxonomy=selector_drift] site=powerchina command=search login required or human verification',
        );
      }
      if (apiFailure) {
        throw new Error(`[taxonomy=empty_result] site=powerchina command=search api/dom yielded no result: ${apiFailure}`);
      }
    }

    return toProcurementSearchRecords(rows, {
      site: 'powerchina',
      query,
      limit,
    });
  },
});

// Internal helpers exposed for unit tests only.
export const __test__ = {
  buildSearchCandidates,
  normalizeDate,
  dedupeCandidates,
  filterNavigationRows,
  isLikelyNavigationUrl,
  isLikelyNavigationTitle,
  buildApiDetailUrl,
  toApiCandidate,
};