/ clis / powerchina / search.js
search.js
/**
 * PowerChina search — queries the announcement list API first, then falls back
 * to browser DOM extraction with multi-entry URL probing when the API call fails.
 */
  4  import { cli, Strategy } from '@jackwener/opencli/registry';
  5  import { AuthRequiredError } from '@jackwener/opencli/errors';
  6  import {
  7    cleanText,
  8    normalizeDate,
  9    toProcurementSearchRecords,
 10  } from '../jianyu/shared/procurement-contract.js';
 11  import { searchRowsFromEntries } from '../jianyu/shared/china-bid-search.js';
 12  
 13  const SEARCH_ENTRIES = [
 14    'https://bid.powerchina.cn/search',
 15    'https://bid.powerchina.cn/',
 16  ];
 17  const API_LIST_ENDPOINT = 'https://bid.powerchina.cn/newcbs/recpro-newmember/BidAnnouncementSummary/list';
 18  const API_DETAIL_ENDPOINT = 'https://bid.powerchina.cn/newcbs/recpro-newmember/BidAnnouncementSummary/getInfo';
 19  const API_DEFAULT_ANNOUNCEMENT_TYPE = '招采公告';
 20  
 21  const PROCUREMENT_TITLE_HINT = /(公告|招标|采购|中标|成交|项目|notice|tender|bidding)/i;
 22  const NAVIGATION_TITLE_HINT = /^(english|中文|chinese|language|home|首页|搜索|search)$/i;
 23  const RETRYABLE_SEARCH_ERROR_HINT = /(detached while handling command|execution context was destroyed|target closed|cannot find context with specified id)/i;
 24  
 25  export function buildSearchCandidates(query) {
 26    const keyword = query.trim();
 27    if (!keyword) return [...SEARCH_ENTRIES];
 28    const encoded = encodeURIComponent(keyword);
 29    return [
 30      `https://bid.powerchina.cn/search?keyword=${encoded}`,
 31      `https://bid.powerchina.cn/search?keywords=${encoded}`,
 32      `https://bid.powerchina.cn/search?q=${encoded}`,
 33      ...SEARCH_ENTRIES,
 34    ];
 35  }
 36  
 37  function dedupeCandidates(items) {
 38    const deduped = [];
 39    const seen = new Set();
 40    for (const item of items) {
 41      const key = `${item.title}\t${item.url}`;
 42      if (seen.has(key)) continue;
 43      seen.add(key);
 44      deduped.push(item);
 45    }
 46    return deduped;
 47  }
 48  
 49  function isLikelyNavigationUrl(rawUrl) {
 50    const urlText = cleanText(rawUrl);
 51    if (!urlText) return true;
 52    try {
 53      const parsed = new URL(urlText);
 54      const pathname = parsed.pathname.toLowerCase().replace(/\/+$/, '') || '/';
 55      const hash = cleanText(parsed.hash).toLowerCase();
 56      if (pathname === '/' || pathname === '/index') return true;
 57      if (pathname === '/search') return true;
 58      if (pathname === '/old' || pathname.startsWith('/old/')) return true;
 59      if (pathname === '/en' || pathname.startsWith('/en/')) return true;
 60      if (pathname === '/zh' || pathname.startsWith('/zh/')) return true;
 61      if (hash === '#/' || hash === '#/index' || hash.startsWith('#/search')) return true;
 62      return false;
 63    } catch {
 64      return true;
 65    }
 66  }
 67  
 68  function isLikelyNavigationTitle(rawTitle) {
 69    const title = cleanText(rawTitle);
 70    if (!title) return true;
 71    const normalized = title.toLowerCase();
 72    if (NAVIGATION_TITLE_HINT.test(normalized)) return true;
 73    if (normalized.length <= 10 && (normalized === 'en' || normalized === 'zh' || normalized.includes('english'))) {
 74      return true;
 75    }
 76    return false;
 77  }
 78  
 79  function filterNavigationRows(items) {
 80    return items.filter((item) => {
 81      const title = cleanText(item.title);
 82      const url = cleanText(item.url);
 83      if (!url || !title) return false;
 84      if (isLikelyNavigationUrl(url)) return false;
 85      if (isLikelyNavigationTitle(title) && !PROCUREMENT_TITLE_HINT.test(title)) return false;
 86      return true;
 87    });
 88  }
 89  
 90  export function buildApiDetailUrl(id) {
 91    const normalizedId = cleanText(id);
 92    if (!normalizedId) return '';
 93    return `${API_DETAIL_ENDPOINT}/${encodeURIComponent(normalizedId)}`;
 94  }
 95  
 96  function toApiCandidate(row) {
 97    const id = cleanText(row.id);
 98    const title = cleanText(row.title);
 99    if (!id || !title) return null;
100  
101    const url = buildApiDetailUrl(id);
102    if (!url) return null;
103  
104    const contextText = cleanText([
105      row.announcementType,
106      row.titleTypeName,
107      row.source,
108      row.publishTime,
109      row.registrationDeadline,
110      row.submissionDeadline,
111      row.bidOpenTime,
112    ].filter(Boolean).join(' | '));
113  
114    const date = normalizeDate(cleanText(row.publishTime || row.bidOpenTime || row.submissionDeadline || ''));
115    return {
116      title,
117      url,
118      date,
119      contextText,
120    };
121  }
122  
/**
 * Query the announcement list API and map the response to search candidates.
 *
 * @param {*} query - Search keyword; normalised with `cleanText`. An empty
 *   keyword sends the request without a `keyWords` filter.
 * @param {number} limit - Maximum number of candidates to return.
 * @returns {Promise<Array>} Up to `limit` de-duplicated candidates.
 * @throws {Error} Tagged `[taxonomy=relay_unavailable]` when the HTTP status
 *   is not OK, when the body is not a JSON object, or when the body reports a
 *   `code` other than 200. Network and JSON parse errors propagate unchanged.
 */
async function searchRowsFromApi(query, limit) {
  const keyword = cleanText(query);
  const payload = {
    pageNum: 1,
    // Request three times the limit, clamped to the range 20..100.
    pageSize: Math.max(20, Math.min(100, limit * 3)),
    announcementType: API_DEFAULT_ANNOUNCEMENT_TYPE,
    // NOTE(review): the meaning of companyType '3' is not visible in this file.
    companyType: '3',
    time: Date.now(),
  };
  if (keyword) payload.keyWords = keyword;

  const response = await fetch(API_LIST_ENDPOINT, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json;charset=utf-8',
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(`[taxonomy=relay_unavailable] site=powerchina command=search api HTTP ${response.status}`);
  }

  const data = await response.json();
  // `response.json()` may resolve to null or a primitive. Report that as an
  // API failure rather than crashing on `data.code` or treating it as empty.
  if (data === null || typeof data !== 'object') {
    throw new Error('[taxonomy=relay_unavailable] site=powerchina command=search api returned a non-object payload');
  }
  // A missing `code` is treated as success.
  if ((data.code ?? 200) !== 200) {
    throw new Error(`[taxonomy=relay_unavailable] site=powerchina command=search api code=${data.code ?? 'unknown'} msg=${cleanText(data.msg)}`);
  }

  const rows = Array.isArray(data.rows) ? data.rows : [];
  const candidates = rows
    .map((row) => toApiCandidate(row))
    .filter(Boolean);
  return dedupeCandidates(candidates).slice(0, limit);
}
158  
/**
 * Registers the `powerchina search` command.
 *
 * Flow of `func`:
 *   1. Query the list API. If it succeeds with zero rows, return `[]`.
 *   2. If the API call throws, fall back to DOM extraction over the candidate
 *      search URLs.
 *   3. Normalise, de-duplicate and drop navigation rows.
 *   4. If nothing is left, raise a tagged error that distinguishes
 *      navigation-only output, a login/verification wall, and a plain failure.
 */
cli({
  site: 'powerchina',
  name: 'search',
  description: '搜索中国电建阳光采购公告',
  domain: 'bid.powerchina.cn',
  strategy: Strategy.COOKIE,
  browser: true,
  args: [
    { name: 'query', required: true, positional: true, help: 'Search keyword, e.g. "procurement"' },
    { name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' },
  ],
  columns: ['rank', 'content_type', 'title', 'publish_time', 'project_code', 'budget_or_limit', 'url'],
  func: async (page, kwargs) => {
    const query = cleanText(kwargs.query);
    // Clamp to 1..50; a missing, zero or non-numeric limit becomes 20.
    const limit = Math.max(1, Math.min(Number(kwargs.limit) || 20, 50));
    let extractedRows = [];
    let apiFailure = null;
    let apiSucceeded = false;

    try {
      const apiRows = await searchRowsFromApi(query, limit);
      extractedRows = apiRows;
      apiSucceeded = true;
    } catch (error) {
      // Keep only the message; it is reported later if the fallback is also empty.
      apiFailure = cleanText(error instanceof Error ? error.message : String(error || ''));
    }

    // The API answered but found nothing: return before touching the browser.
    if (apiSucceeded && extractedRows.length === 0) {
      return [];
    }

    if (!apiSucceeded) {
      try {
        extractedRows = await searchRowsFromEntries(page, {
          query,
          candidateUrls: buildSearchCandidates(query),
          allowedHostFragments: ['bid.powerchina.cn', 'powerchina.cn'],
          limit,
        });
      } catch (error) {
        const message = cleanText(error instanceof Error ? error.message : String(error || ''));
        if (RETRYABLE_SEARCH_ERROR_HINT.test(message)) {
          // Re-tag detached-context failures, keeping the original error (and
          // its stack) reachable through `cause`.
          throw new Error(
            `[taxonomy=relay_unavailable] site=powerchina command=search detached browser context: ${message}`,
            { cause: error },
          );
        }
        throw error;
      }
    }

    const rows = filterNavigationRows(
      dedupeCandidates(extractedRows).map((item) => ({
        title: cleanText(item.title),
        url: cleanText(item.url),
        date: normalizeDate(cleanText(item.date)),
        contextText: cleanText(item.contextText),
      })),
    );

    // Something was extracted, but every row was a navigation/portal link.
    if (rows.length === 0 && extractedRows.length > 0) {
      throw new Error('[taxonomy=empty_result] site=powerchina command=search extracted only navigation/portal rows');
    }

    // Reached only when the DOM fallback ran and produced no rows.
    if (rows.length === 0) {
      const pageText = cleanText(await page.evaluate('document.body ? document.body.innerText : ""'));
      // Login or human-verification wording on the page explains the empty result.
      if (/(请先登录|未登录|登录后|验证码|人机验证)/.test(pageText)) {
        throw new AuthRequiredError(
          'bid.powerchina.cn',
          '[taxonomy=selector_drift] site=powerchina command=search login required or human verification',
        );
      }
      if (apiFailure) {
        throw new Error(`[taxonomy=empty_result] site=powerchina command=search api/dom yielded no result: ${apiFailure}`);
      }
    }

    return toProcurementSearchRecords(rows, {
      site: 'powerchina',
      query,
      limit,
    });
  },
});
240  
// Bundle of helpers exported under one name; going by that name, this is a
// hook for unit tests rather than part of the command's public surface.
export const __test__ = {
  buildSearchCandidates, normalizeDate, dedupeCandidates, filterNavigationRows,
  isLikelyNavigationUrl, isLikelyNavigationTitle, buildApiDetailUrl, toApiCandidate,
};