// search.js
import { CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
import { cli, Strategy } from '@jackwener/opencli/registry';
import { FACTORY_BADGE_PATTERNS, SERVICE_BADGE_PATTERNS, assertAuthenticatedState, buildProvenance, buildSearchUrl, canonicalizeItemUrl, canonicalizeSellerUrl, cleanText, extractBadges, extractLocation, extractMemberId, extractOfferId, extractShopId, gotoAndReadState, parseMoqText, parsePriceText, SEARCH_LIMIT_DEFAULT, SEARCH_LIMIT_MAX, parseSearchLimit, uniqueNonEmpty, } from './shared.js';

// Substrings that identify an anchor href as a product detail page.
// The list is serialized into the in-page script in readSearchPayload.
const SEARCH_ITEM_URL_PATTERNS = [
    'detail.1688.com/offer/',
    'detail.m.1688.com/page/index.html?offerId=',
];

// Hard cap on the number of result pages visited by collectSearchRows.
const MAX_SEARCH_PAGES = 12;

/**
 * Convert one raw candidate scraped from the results page into an output row.
 *
 * Text fields are cleaned/normalized and then handed to the parsing helpers
 * imported from ./shared.js (price, MOQ, badges, location, ids). The exact
 * return shapes of those helpers are not visible here; this function only
 * reads the properties referenced below.
 *
 * @param {object} candidate - Raw candidate produced by the in-page script
 *   (item_url, title, container_text, desc_rows, price_text, sales_text,
 *   hover_price_text, moq_text, tag_items, hover_items, seller_name, seller_url).
 * @param {string} sourceUrl - URL of the page the candidate was read from.
 * @returns {object} Row with `rank` set to 0; the final rank is assigned by
 *   collectSearchRows.
 */
function normalizeSearchCandidate(candidate, sourceUrl) {
    const canonicalItemUrl = canonicalizeItemUrl(cleanText(candidate.item_url));
    const containerText = cleanText(candidate.container_text);
    // Prefer the dedicated price cell, then a currency-prefixed amount found in the hover text.
    const priceText = firstNonEmpty([
        normalizeInlineText(candidate.price_text),
        normalizeInlineText(extractPriceText(candidate.hover_price_text)),
    ]);
    // Fall back to the whole container text when no dedicated price text was found.
    const priceRange = parsePriceText(priceText || containerText);
    const moq = parseMoqText(firstNonEmpty([
        normalizeInlineText(candidate.moq_text),
        normalizeInlineText(extractMoqText(containerText)),
    ]));
    const canonicalSellerUrl = canonicalizeSellerUrl(cleanText(candidate.seller_url));
    // All visible text fragments, joined, serve as the input for badge matching.
    const evidenceText = uniqueNonEmpty([
        containerText,
        ...(candidate.desc_rows ?? []),
        ...(candidate.tag_items ?? []),
        ...(candidate.hover_items ?? []),
    ]).join('\n');
    const badges = extractBadges(evidenceText, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]);
    const salesText = firstNonEmpty([
        extractSalesText(candidate.sales_text),
        extractSalesText(containerText),
    ]);
    const returnRateText = extractReturnRateText([...(candidate.tag_items ?? []), ...(candidate.hover_items ?? [])]);
    const provenance = buildProvenance(sourceUrl);
    return {
        rank: 0,
        offer_id: extractOfferId(canonicalItemUrl ?? '') ?? null,
        member_id: extractMemberId(canonicalSellerUrl ?? '') ?? null,
        shop_id: extractShopId(canonicalSellerUrl ?? '') ?? null,
        title: cleanText(candidate.title) || firstWord(containerText) || null,
        item_url: canonicalItemUrl,
        seller_name: cleanText(candidate.seller_name) || null,
        seller_url: canonicalSellerUrl,
        price_text: priceRange.price_text || null,
        price_min: priceRange.price_min,
        price_max: priceRange.price_max,
        currency: priceRange.currency,
        moq_text: moq.moq_text || null,
        moq_value: moq.moq_value,
        location: extractLocation(containerText),
        badges,
        sales_text: salesText || null,
        return_rate_text: returnRateText,
        source_url: provenance.source_url,
        fetched_at: provenance.fetched_at,
        strategy: provenance.strategy,
    };
}

/**
 * Find the first minimum-order-quantity fragment in free text.
 *
 * Patterns are tried in this order: "<n><unit>起批", "≥<n>[unit]",
 * "<n>~<m><unit>" (separator may be ~, -, 至 or 到).
 *
 * @param {string} text - Free text to scan.
 * @returns {string} The matched fragment, or '' when nothing matches.
 */
function extractMoqText(text) {
    const normalized = normalizeInlineText(text);
    return normalized.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/i)?.[0]
        ?? normalized.match(/≥\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)?/i)?.[0]
        ?? normalized.match(/\d+(?:\.\d+)?\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)/i)?.[0]
        ?? '';
}

/**
 * Find the first currency-prefixed amount (¥, $ or €) in free text.
 *
 * @param {string} text - Free text to scan.
 * @returns {string} The matched amount including its symbol, or ''.
 */
function extractPriceText(text) {
    const normalized = normalizeInlineText(text);
    return normalized.match(/[¥$€]\s*\d+(?:\.\d+)?/)?.[0] ?? '';
}

/**
 * Extract a sales-volume fragment from text.
 *
 * A string that consists solely of "<n>[+]<unit>" is returned as-is;
 * otherwise the first "已售/销量/售 <n>[+][unit]" fragment is returned.
 *
 * @param {string} text - Dedicated sales cell text or whole container text.
 * @returns {string} The sales fragment, or '' when none is found.
 */
function extractSalesText(text) {
    const normalized = normalizeInlineText(text);
    if (!normalized)
        return '';
    if (/^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized)) {
        return normalized;
    }
    const match = normalized.match(/(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/);
    return match ? cleanText(match[0]) : '';
}

/**
 * Return the first whitespace-delimited token of `text`, or '' if there is none.
 *
 * @param {string} text
 * @returns {string}
 */
function firstWord(text) {
    return text.split(/\s+/).find(Boolean) ?? '';
}

/**
 * Return the first value that is still truthy after cleanText, or ''.
 *
 * @param {Array<*>} values
 * @returns {string}
 */
function firstNonEmpty(values) {
    return values.map((value) => cleanText(value)).find(Boolean) ?? '';
}

/**
 * Clean text and tighten spacing that would otherwise break the regex
 * extractors: whitespace after a currency symbol, around a decimal point,
 * and around "~" / "-" separators is removed.
 *
 * @param {*} text
 * @returns {string}
 */
function normalizeInlineText(text) {
    return cleanText(text)
        .replace(/([¥$€])\s+(?=\d)/g, '$1')
        .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2')
        .replace(/\s*([~-])\s*/g, '$1')
        .trim();
}

/**
 * Pick the first entry that is exactly a "回头率 <n>%" (repeat-purchase rate) label.
 *
 * @param {Array<*>} values - Tag / hover text fragments.
 * @returns {string|null} The normalized label, or null when absent.
 */
function extractReturnRateText(values) {
    return uniqueNonEmpty(values.map((value) => normalizeInlineText(value)))
        .find((value) => /^回头率\s*\d+(?:\.\d+)?%$/.test(value))
        ?? null;
}

/**
 * Build the key used to de-duplicate rows across pages.
 *
 * @param {{offer_id?: *, item_url?: *}} row
 * @returns {string|null} "offer:<id>", else "url:<item_url>", else null when
 *   the row has neither (such rows are skipped by collectSearchRows).
 */
function buildDedupeKey(row) {
    if (row.offer_id)
        return `offer:${row.offer_id}`;
    if (row.item_url)
        return `url:${row.item_url}`;
    return null;
}

/**
 * Load one search results page and scrape it with an in-page script.
 *
 * The script collects one candidate per distinct item link (see
 * SEARCH_ITEM_URL_PATTERNS) and looks for a "next page" link.
 *
 * @param {object} page - Browser page handle; only `page.evaluate` is used
 *   directly here, navigation is delegated to gotoAndReadState.
 * @param {string} url - Search results URL to open.
 * @returns {Promise<{href: string, title: string, bodyText: string, next_url: string, candidates: object[]}>}
 * @throws {CommandExecutionError} When the evaluated script does not return an object.
 *   assertAuthenticatedState may also throw; its behavior is defined in ./shared.js.
 */
async function readSearchPayload(page, url) {
    const state = await gotoAndReadState(page, url, 2500, 'search');
    assertAuthenticatedState(state, 'search');
    // NOTE: the script below is a template literal evaluated in the page, so
    // regex backslashes are doubled and it must not contain backticks.
    const payload = await page.evaluate(`
    (() => {
      const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const normalizeUrl = (href) => {
        if (!href) return '';
        try {
          const url = new URL(href, window.location.href);
          // Pagination controls are often placeholders such as "javascript:;".
          // Those parse as valid URLs but are not navigable result pages, so
          // only http(s) targets are reported as a next page.
          if (url.protocol !== 'http:' && url.protocol !== 'https:') return '';
          return url.toString();
        } catch {
          return '';
        }
      };
      const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}
        .some((pattern) => (href || '').includes(pattern));
      const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))];
      const collectTexts = (root, selector) => uniqueTexts(
        Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''),
      );
      const firstText = (root, selectors) => {
        for (const selector of selectors) {
          const node = root.querySelector(selector);
          const value = normalizeText(node ? node.innerText || node.textContent || '' : '');
          if (value) return value;
        }
        return '';
      };
      const findMoqText = (values, priceText) => {
        const moqPattern = /(≥\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)?)|(\\d+(?:\\.\\d+)?\\s*(?:~|-|至|到)\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只))|(\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)\\s*起批)/i;
        return values.find((value) => moqPattern.test(value))
          || normalizeText(priceText).match(moqPattern)?.[0]
          || '';
      };
      const isSellerHref = (href) => {
        if (!href) return false;
        try {
          const url = new URL(href, window.location.href);
          const host = url.hostname || '';
          if (!host.endsWith('.1688.com')) return false;
          if (
            host === 's.1688.com'
            || host === 'r.1688.com'
            || host === 'air.1688.com'
            || host === 'detail.1688.com'
            || host === 'detail.m.1688.com'
            || host === 'dj.1688.com'
          ) {
            return false;
          }
          return true;
        } catch {
          return false;
        }
      };
      const pickContainer = (anchor) => {
        let node = anchor;
        while (node && node !== document.body) {
          const text = normalizeText(node.innerText || node.textContent || '');
          if (text.length >= 40 && text.length <= 2000) {
            return node;
          }
          node = node.parentElement;
        }
        return anchor;
      };
      const collectCandidates = () => {
        const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || ''));
        const seen = new Set();
        const items = [];
        for (const anchor of anchors) {
          const href = anchor.href || '';
          if (!href || seen.has(href)) continue;
          seen.add(href);

          const container = pickContainer(anchor);
          const tagItems = collectTexts(container, '.offer-tag-row .offer-desc-item');
          const hoverItems = collectTexts(container, '.offer-hover-wrapper .offer-desc-item');
          const sellerAnchor = Array.from(container.querySelectorAll('a'))
            .find((link) => isSellerHref(link.href || ''));
          const hoverPriceText = firstText(container, [
            '.offer-hover-wrapper .hover-price-item',
            '.offer-hover-wrapper .price-item',
          ]);

          items.push({
            item_url: href,
            title: firstText(container, ['.offer-title-row .title-text', '.offer-title-row'])
              || normalizeText(anchor.innerText || anchor.textContent || ''),
            container_text: normalizeText(container.innerText || container.textContent || ''),
            desc_rows: collectTexts(container, '.offer-desc-row'),
            price_text: firstText(container, ['.offer-price-row .price-item']),
            sales_text: firstText(container, ['.offer-price-row .col-desc_after', '.offer-desc-row .col-desc_after']),
            hover_price_text: hoverPriceText,
            moq_text: findMoqText(hoverItems, hoverPriceText),
            tag_items: tagItems,
            hover_items: hoverItems,
            seller_name: sellerAnchor ? normalizeText(sellerAnchor.innerText || sellerAnchor.textContent || '') : null,
            seller_url: sellerAnchor ? sellerAnchor.href : null,
          });
        }
        return items;
      };
      const findNextUrl = () => {
        const selectors = [
          'a.fui-next:not(.disabled)',
          'a.next-pagination-item:not(.disabled)',
          'a[rel="next"]:not(.disabled)',
          'a[data-role="next"]:not(.disabled)',
        ];
        for (const selector of selectors) {
          const node = document.querySelector(selector);
          if (!node) continue;
          const href = normalizeUrl(node.getAttribute('href') || node.href || '');
          if (href) return href;
        }
        const textBased = Array.from(document.querySelectorAll('a'))
          .find((node) => /下一页|next/i.test(normalizeText(node.textContent || '')));
        if (!textBased) return '';
        return normalizeUrl(textBased.getAttribute('href') || textBased.href || '');
      };

      return {
        href: window.location.href,
        title: document.title || '',
        bodyText: document.body ? document.body.innerText || '' : '',
        next_url: findNextUrl(),
        candidates: collectCandidates(),
      };
    })()
  `);
    if (!payload || typeof payload !== 'object') {
        throw new CommandExecutionError('1688 search page did not return a readable payload', 'Open the same query in Chrome and verify the page is fully loaded before retrying.');
    }
    return payload;
}

/**
 * Walk search result pages until `limit` unique rows are collected, no next
 * page is reported, a page URL repeats, or MAX_SEARCH_PAGES is reached.
 *
 * @param {object} page - Browser page handle passed through to readSearchPayload.
 * @param {string} query - Search keywords.
 * @param {number} limit - Maximum number of rows to return.
 * @returns {Promise<object[]>} Unique rows in discovery order with 1-based `rank`.
 * @throws {EmptyResultError} When no row could be extracted from any page.
 */
async function collectSearchRows(page, query, limit) {
    const rowsByKey = new Map();
    const seenPages = new Set();
    let nextUrl = buildSearchUrl(query);
    let pageCount = 0;
    while (nextUrl && rowsByKey.size < limit && pageCount < MAX_SEARCH_PAGES) {
        // Guard against pagination loops.
        if (seenPages.has(nextUrl))
            break;
        seenPages.add(nextUrl);
        pageCount += 1;
        const payload = await readSearchPayload(page, nextUrl);
        // Prefer the URL the browser actually ended up on (after any redirect).
        const sourceUrl = cleanText(payload.href) || nextUrl;
        const candidates = Array.isArray(payload.candidates) ? payload.candidates : [];
        for (const candidate of candidates) {
            const row = normalizeSearchCandidate(candidate, sourceUrl);
            const dedupeKey = buildDedupeKey(row);
            if (!dedupeKey || rowsByKey.has(dedupeKey))
                continue;
            rowsByKey.set(dedupeKey, row);
            if (rowsByKey.size >= limit)
                break;
        }
        const candidateNextUrl = cleanText(payload.next_url);
        if (!candidateNextUrl || candidateNextUrl === sourceUrl)
            break;
        nextUrl = candidateNextUrl;
    }
    if (rowsByKey.size === 0) {
        throw new EmptyResultError('1688 search', 'No visible results were extracted. Retry with a different query or open the same search page in Chrome first.');
    }
    return [...rowsByKey.values()]
        .slice(0, limit)
        .map((row, index) => ({ ...row, rank: index + 1 }));
}

// Register the `1688 search` command.
cli({
    site: '1688',
    name: 'search',
    description: '1688 商品搜索(结果候选、卖家链接、价格/MOQ/销量文本)',
    domain: 'www.1688.com',
    strategy: Strategy.COOKIE,
    navigateBefore: false,
    args: [
        {
            name: 'query',
            required: true,
            positional: true,
            help: '搜索关键词,如 "置物架"',
        },
        {
            name: 'limit',
            type: 'int',
            default: SEARCH_LIMIT_DEFAULT,
            help: `结果数量上限(默认 ${SEARCH_LIMIT_DEFAULT},最大 ${SEARCH_LIMIT_MAX})`,
        },
    ],
    columns: ['rank', 'title', 'price_text', 'moq_text', 'seller_name', 'location'],
    func: async (page, kwargs) => {
        const query = String(kwargs.query ?? '');
        const limit = parseSearchLimit(kwargs.limit);
        return collectSearchRows(page, query, limit);
    },
});

// Internal helpers exposed for unit tests only.
export const __test__ = {
    normalizeSearchCandidate,
    extractMoqText,
    extractSalesText,
    firstWord,
    buildDedupeKey,
};