/ clis / 36kr / search.js
search.js
 1  /**
 2   * 36kr article search — DOM scraping.
 3   *
 4   * Navigates to the 36kr search results page and scrapes rendered articles.
 5   */
 6  import { cli, Strategy } from '@jackwener/opencli/registry';
 7  import { CliError } from '@jackwener/opencli/errors';
 8  cli({
 9      site: '36kr',
10      name: 'search',
11      description: '搜索36氪文章',
12      domain: 'www.36kr.com',
13      strategy: Strategy.PUBLIC,
14      browser: true,
15      args: [
16          { name: 'query', positional: true, required: true, help: 'Search keyword (e.g. "AI", "OpenAI")' },
17          { name: 'limit', type: 'int', default: 20, help: 'Number of results (max 50)' },
18      ],
19      columns: ['rank', 'title', 'date', 'url'],
20      func: async (page, args) => {
21          const count = Math.min(Number(args.limit) || 20, 50);
22          const query = encodeURIComponent(String(args.query ?? ''));
23          await page.goto(`https://www.36kr.com/search/articles/${query}`);
24          // Poll DOM until article links appear (36kr renders client-side)
25          const deadline = Date.now() + 5000;
26          while (Date.now() < deadline) {
27              if (await page.evaluate('document.querySelectorAll("a[href*=\\"/p/\\"]").length'))
28                  break;
29              await new Promise(r => setTimeout(r, 300));
30          }
31          const domItems = await page.evaluate(`
32        (() => {
33          const seen = new Set();
34          const results = [];
35          // article-item-title contains the clickable title link
36          const titleEls = document.querySelectorAll('.article-item-title a[href*="/p/"], .article-item-title[href*="/p/"]');
37          for (const el of titleEls) {
38            const href = el.getAttribute('href') || '';
39            const title = el.textContent?.trim() || '';
40            if (!title || seen.has(href)) continue;
41            seen.add(href);
42            // Look for date near the article item
43            const item = el.closest('[class*="article-item"]') || el.parentElement;
44            const dateEl = item?.querySelector('[class*="time"], [class*="date"], time');
45            const date = dateEl?.textContent?.trim() || '';
46            results.push({
47              title,
48              url: href.startsWith('http') ? href : 'https://36kr.com' + href,
49              date,
50            });
51          }
52          // Fallback: generic /p/ links with meaningful text
53          if (results.length === 0) {
54            const links = document.querySelectorAll('a[href*="/p/"]');
55            for (const el of links) {
56              const href = el.getAttribute('href') || '';
57              const title = el.textContent?.trim() || '';
58              if (!title || title.length < 8 || seen.has(href) || seen.has(title)) continue;
59              seen.add(href);
60              seen.add(title);
61              results.push({ title, url: href.startsWith('http') ? href : 'https://36kr.com' + href, date: '' });
62            }
63          }
64          return results;
65        })()
66      `);
67          const items = Array.isArray(domItems) ? domItems : [];
68          if (items.length === 0) {
69              throw new CliError('NO_DATA', 'No results found', `Try a different query or check your keyword`);
70          }
71          return items.slice(0, count).map((item, i) => ({
72              rank: i + 1,
73              title: item.title,
74              date: item.date,
75              url: item.url,
76          }));
77      },
78  });