/ clis / 1688 / search.js
search.js
  1  import { CommandExecutionError, EmptyResultError } from '@jackwener/opencli/errors';
  2  import { cli, Strategy } from '@jackwener/opencli/registry';
  3  import { FACTORY_BADGE_PATTERNS, SERVICE_BADGE_PATTERNS, assertAuthenticatedState, buildProvenance, buildSearchUrl, canonicalizeItemUrl, canonicalizeSellerUrl, cleanText, extractBadges, extractLocation, extractMemberId, extractOfferId, extractShopId, gotoAndReadState, parseMoqText, parsePriceText, SEARCH_LIMIT_DEFAULT, SEARCH_LIMIT_MAX, parseSearchLimit, uniqueNonEmpty, } from './shared.js';
  4  const SEARCH_ITEM_URL_PATTERNS = [
  5      'detail.1688.com/offer/',
  6      'detail.m.1688.com/page/index.html?offerId=',
  7  ];
  8  const MAX_SEARCH_PAGES = 12;
  9  function normalizeSearchCandidate(candidate, sourceUrl) {
 10      const canonicalItemUrl = canonicalizeItemUrl(cleanText(candidate.item_url));
 11      const containerText = cleanText(candidate.container_text);
 12      const priceText = firstNonEmpty([
 13          normalizeInlineText(candidate.price_text),
 14          normalizeInlineText(extractPriceText(candidate.hover_price_text)),
 15      ]);
 16      const priceRange = parsePriceText(priceText || containerText);
 17      const moq = parseMoqText(firstNonEmpty([
 18          normalizeInlineText(candidate.moq_text),
 19          normalizeInlineText(extractMoqText(containerText)),
 20      ]));
 21      const canonicalSellerUrl = canonicalizeSellerUrl(cleanText(candidate.seller_url));
 22      const evidenceText = uniqueNonEmpty([
 23          containerText,
 24          ...(candidate.desc_rows ?? []),
 25          ...(candidate.tag_items ?? []),
 26          ...(candidate.hover_items ?? []),
 27      ]).join('\n');
 28      const badges = extractBadges(evidenceText, [...FACTORY_BADGE_PATTERNS, ...SERVICE_BADGE_PATTERNS]);
 29      const salesText = firstNonEmpty([
 30          extractSalesText(candidate.sales_text),
 31          extractSalesText(containerText),
 32      ]);
 33      const returnRateText = extractReturnRateText([...(candidate.tag_items ?? []), ...(candidate.hover_items ?? [])]);
 34      const provenance = buildProvenance(sourceUrl);
 35      return {
 36          rank: 0,
 37          offer_id: extractOfferId(canonicalItemUrl ?? '') ?? null,
 38          member_id: extractMemberId(canonicalSellerUrl ?? '') ?? null,
 39          shop_id: extractShopId(canonicalSellerUrl ?? '') ?? null,
 40          title: cleanText(candidate.title) || firstWord(containerText) || null,
 41          item_url: canonicalItemUrl,
 42          seller_name: cleanText(candidate.seller_name) || null,
 43          seller_url: canonicalSellerUrl,
 44          price_text: priceRange.price_text || null,
 45          price_min: priceRange.price_min,
 46          price_max: priceRange.price_max,
 47          currency: priceRange.currency,
 48          moq_text: moq.moq_text || null,
 49          moq_value: moq.moq_value,
 50          location: extractLocation(containerText),
 51          badges,
 52          sales_text: salesText || null,
 53          return_rate_text: returnRateText,
 54          source_url: provenance.source_url,
 55          fetched_at: provenance.fetched_at,
 56          strategy: provenance.strategy,
 57      };
 58  }
 59  function extractMoqText(text) {
 60      const normalized = normalizeInlineText(text);
 61      return normalized.match(/\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)\s*起批/i)?.[0]
 62          ?? normalized.match(/≥\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)?/i)?.[0]
 63          ?? normalized.match(/\d+(?:\.\d+)?\s*(?:~|-|至|到)\s*\d+(?:\.\d+)?\s*(件|个|套|箱|包|双|台|把|只)/i)?.[0]
 64          ?? '';
 65  }
 66  function extractPriceText(text) {
 67      const normalized = normalizeInlineText(text);
 68      return normalized.match(/[¥$€]\s*\d+(?:\.\d+)?/)?.[0] ?? '';
 69  }
 70  function extractSalesText(text) {
 71      const normalized = normalizeInlineText(text);
 72      if (!normalized)
 73          return '';
 74      if (/^\d+(?:\.\d+)?\+?\s*(件|套|个|单)$/.test(normalized)) {
 75          return normalized;
 76      }
 77      const match = normalized.match(/(?:已售|销量|售)\s*\d+(?:\.\d+)?\+?\s*(件|套|个|单)?/);
 78      return match ? cleanText(match[0]) : '';
 79  }
 80  function firstWord(text) {
 81      return text.split(/\s+/).find(Boolean) ?? '';
 82  }
 83  function firstNonEmpty(values) {
 84      return values.map((value) => cleanText(value)).find(Boolean) ?? '';
 85  }
 86  function normalizeInlineText(text) {
 87      return cleanText(text)
 88          .replace(/([¥$€])\s+(?=\d)/g, '$1')
 89          .replace(/(\d)\s*\.\s*(\d)/g, '$1.$2')
 90          .replace(/\s*([~-])\s*/g, '$1')
 91          .trim();
 92  }
 93  function extractReturnRateText(values) {
 94      return uniqueNonEmpty(values.map((value) => normalizeInlineText(value)))
 95          .find((value) => /^回头率\s*\d+(?:\.\d+)?%$/.test(value))
 96          ?? null;
 97  }
 98  function buildDedupeKey(row) {
 99      if (row.offer_id)
100          return `offer:${row.offer_id}`;
101      if (row.item_url)
102          return `url:${row.item_url}`;
103      return null;
104  }
/**
 * Build the self-contained script that is evaluated inside the search results
 * page. It runs in the page, so it can only use DOM APIs and the values
 * interpolated into the string here (the item URL patterns).
 *
 * Backslashes in regular expressions are doubled because the script lives in a
 * template literal.
 *
 * The script returns `{ href, title, bodyText, next_url, candidates }`, where
 * each candidate carries the raw texts of one result card.
 *
 * @returns {string} JavaScript source of an immediately-invoked function.
 */
function buildSearchPageScript() {
    return `
    (() => {
      const normalizeText = (value) => (value || '').replace(/\\s+/g, ' ').trim();
      const normalizeUrl = (href) => {
        if (!href) return '';
        try {
          const url = new URL(href, window.location.href);
          // Only http(s) targets can be a next results page; placeholder links
          // such as "javascript:;" must not be returned for navigation.
          if (url.protocol !== 'http:' && url.protocol !== 'https:') return '';
          return url.toString();
        } catch {
          return '';
        }
      };
      const isItemHref = (href) => ${JSON.stringify(SEARCH_ITEM_URL_PATTERNS)}
        .some((pattern) => (href || '').includes(pattern));
      const uniqueTexts = (values) => [...new Set(values.map((value) => normalizeText(value)).filter(Boolean))];
      const collectTexts = (root, selector) => uniqueTexts(
        Array.from(root.querySelectorAll(selector)).map((node) => node.innerText || node.textContent || ''),
      );
      const firstText = (root, selectors) => {
        for (const selector of selectors) {
          const node = root.querySelector(selector);
          const value = normalizeText(node ? node.innerText || node.textContent || '' : '');
          if (value) return value;
        }
        return '';
      };
      const findMoqText = (values, priceText) => {
        const moqPattern = /(≥\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)?)|(\\d+(?:\\.\\d+)?\\s*(?:~|-|至|到)\\s*\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只))|(\\d+(?:\\.\\d+)?\\s*(件|个|套|箱|包|双|台|把|只)\\s*起批)/i;
        return values.find((value) => moqPattern.test(value))
          || normalizeText(priceText).match(moqPattern)?.[0]
          || '';
      };
      const isSellerHref = (href) => {
        if (!href) return false;
        try {
          const url = new URL(href, window.location.href);
          const host = url.hostname || '';
          if (!host.endsWith('.1688.com')) return false;
          if (
            host === 's.1688.com'
            || host === 'r.1688.com'
            || host === 'air.1688.com'
            || host === 'detail.1688.com'
            || host === 'detail.m.1688.com'
            || host === 'dj.1688.com'
          ) {
            return false;
          }
          return true;
        } catch {
          return false;
        }
      };
      const pickContainer = (anchor) => {
        let node = anchor;
        while (node && node !== document.body) {
          const text = normalizeText(node.innerText || node.textContent || '');
          if (text.length >= 40 && text.length <= 2000) {
            return node;
          }
          node = node.parentElement;
        }
        return anchor;
      };
      const collectCandidates = () => {
        const anchors = Array.from(document.querySelectorAll('a')).filter((anchor) => isItemHref(anchor.href || ''));
        const seen = new Set();
        const items = [];
        for (const anchor of anchors) {
          const href = anchor.href || '';
          if (!href || seen.has(href)) continue;
          seen.add(href);

          const container = pickContainer(anchor);
          const tagItems = collectTexts(container, '.offer-tag-row .offer-desc-item');
          const hoverItems = collectTexts(container, '.offer-hover-wrapper .offer-desc-item');
          const sellerAnchor = Array.from(container.querySelectorAll('a'))
            .find((link) => isSellerHref(link.href || ''));
          const hoverPriceText = firstText(container, [
            '.offer-hover-wrapper .hover-price-item',
            '.offer-hover-wrapper .price-item',
          ]);

          items.push({
            item_url: href,
            title: firstText(container, ['.offer-title-row .title-text', '.offer-title-row'])
              || normalizeText(anchor.innerText || anchor.textContent || ''),
            container_text: normalizeText(container.innerText || container.textContent || ''),
            desc_rows: collectTexts(container, '.offer-desc-row'),
            price_text: firstText(container, ['.offer-price-row .price-item']),
            sales_text: firstText(container, ['.offer-price-row .col-desc_after', '.offer-desc-row .col-desc_after']),
            hover_price_text: hoverPriceText,
            moq_text: findMoqText(hoverItems, hoverPriceText),
            tag_items: tagItems,
            hover_items: hoverItems,
            seller_name: sellerAnchor ? normalizeText(sellerAnchor.innerText || sellerAnchor.textContent || '') : null,
            seller_url: sellerAnchor ? sellerAnchor.href : null,
          });
        }
        return items;
      };
      const findNextUrl = () => {
        const selectors = [
          'a.fui-next:not(.disabled)',
          'a.next-pagination-item:not(.disabled)',
          'a[rel="next"]:not(.disabled)',
          'a[data-role="next"]:not(.disabled)',
        ];
        for (const selector of selectors) {
          const node = document.querySelector(selector);
          if (!node) continue;
          const href = normalizeUrl(node.getAttribute('href') || node.href || '');
          if (href) return href;
        }
        // Text fallback; disabled links are skipped, same as the selectors above.
        const textBased = Array.from(document.querySelectorAll('a:not(.disabled)'))
          .find((node) => /下一页|next/i.test(normalizeText(node.textContent || '')));
        if (!textBased) return '';
        return normalizeUrl(textBased.getAttribute('href') || textBased.href || '');
      };

      return {
        href: window.location.href,
        title: document.title || '',
        bodyText: document.body ? document.body.innerText || '' : '',
        next_url: findNextUrl(),
        candidates: collectCandidates(),
      };
    })()
  `;
}

/**
 * Open one search results page and scrape its raw payload.
 *
 * The page is loaded through gotoAndReadState, the resulting state is checked
 * with assertAuthenticatedState, and the extraction script is then evaluated in
 * the page.
 *
 * @param {object} page - Browser page handle; must provide `evaluate(script)`.
 * @param {string} url - Search results URL to open.
 * @returns {Promise<object>} `{ href, title, bodyText, next_url, candidates }`.
 *   `next_url` is '' when no usable http(s) next-page link is found.
 * @throws {CommandExecutionError} When the page does not return an object payload.
 */
async function readSearchPayload(page, url) {
    // NOTE(review): 2500 is presumably a settle/wait time in ms; confirm in shared.js.
    const state = await gotoAndReadState(page, url, 2500, 'search');
    assertAuthenticatedState(state, 'search');
    const payload = await page.evaluate(buildSearchPageScript());
    if (!payload || typeof payload !== 'object') {
        throw new CommandExecutionError('1688 search page did not return a readable payload', 'Open the same query in Chrome and verify the page is fully loaded before retrying.');
    }
    return payload;
}
/**
 * Walk the search result pages for a query and collect up to `limit` unique rows.
 *
 * Paging stops when the limit is reached, when MAX_SEARCH_PAGES pages have been
 * loaded, when there is no next link, or when the next link points at a page
 * that was already visited.
 *
 * @param {object} page - Browser page handle passed through to readSearchPayload.
 * @param {string} query - Search keywords.
 * @param {number} limit - Maximum number of rows to return.
 * @returns {Promise<object[]>} Rows in discovery order with `rank` set to 1..n.
 * @throws {EmptyResultError} When no row could be extracted at all.
 */
async function collectSearchRows(page, query, limit) {
    const rowsByKey = new Map();
    const seenPages = new Set();
    let nextUrl = buildSearchUrl(query);
    let pageCount = 0;
    while (nextUrl && rowsByKey.size < limit && pageCount < MAX_SEARCH_PAGES) {
        if (seenPages.has(nextUrl)) {
            break;
        }
        seenPages.add(nextUrl);
        pageCount += 1;
        const payload = await readSearchPayload(page, nextUrl);
        const sourceUrl = cleanText(payload.href) || nextUrl;
        // The browser may end up on a different URL than the one requested.
        // Remember the resolved URL too, so a later "next" link pointing back
        // at it is recognized as already visited instead of being re-fetched.
        seenPages.add(sourceUrl);
        const candidates = Array.isArray(payload.candidates) ? payload.candidates : [];
        for (const candidate of candidates) {
            const row = normalizeSearchCandidate(candidate, sourceUrl);
            const dedupeKey = buildDedupeKey(row);
            // Rows without a key cannot be de-duplicated and are dropped.
            if (!dedupeKey || rowsByKey.has(dedupeKey)) {
                continue;
            }
            rowsByKey.set(dedupeKey, row);
            if (rowsByKey.size >= limit) {
                break;
            }
        }
        const candidateNextUrl = cleanText(payload.next_url);
        if (!candidateNextUrl || candidateNextUrl === sourceUrl) {
            break;
        }
        nextUrl = candidateNextUrl;
    }
    if (rowsByKey.size === 0) {
        throw new EmptyResultError('1688 search', 'No visible results were extracted. Retry with a different query or open the same search page in Chrome first.');
    }
    return [...rowsByKey.values()]
        .slice(0, limit)
        .map((row, index) => ({ ...row, rank: index + 1 }));
}
// Register the `search` command for the 1688 site. The handler coerces the
// query to a string, validates the limit via parseSearchLimit, and delegates to
// collectSearchRows.
cli({
    site: '1688',
    name: 'search',
    description: '1688 商品搜索(结果候选、卖家链接、价格/MOQ/销量文本)',
    domain: 'www.1688.com',
    strategy: Strategy.COOKIE,
    navigateBefore: false,
    args: [
        { name: 'query', required: true, positional: true, help: '搜索关键词,如 "置物架"' },
        {
            name: 'limit',
            type: 'int',
            default: SEARCH_LIMIT_DEFAULT,
            help: `结果数量上限(默认 ${SEARCH_LIMIT_DEFAULT},最大 ${SEARCH_LIMIT_MAX})`,
        },
    ],
    columns: ['rank', 'title', 'price_text', 'moq_text', 'seller_name', 'location'],
    func: async (page, kwargs) => collectSearchRows(
        page,
        String(kwargs.query ?? ''),
        parseSearchLimit(kwargs.limit),
    ),
});

// Internal helpers exposed for unit tests only.
export const __test__ = { normalizeSearchCandidate, extractMoqText, extractSalesText, firstWord, buildDedupeKey };