// clis/google/search.js
  1  /**
  2   * Google Web Search via browser DOM extraction.
  3   * Uses browser mode to navigate google.com and extract results from the DOM.
  4   *
  5   * Extraction strategy (2026-03): Google no longer uses `.g` class containers.
  6   * Instead, we find all `a` tags containing `h3` within `#rso`, then walk up
  7   * to the result container (`div.tF2Cxc` or closest `div[data-hveid]`) to find
  8   * snippets. This approach is resilient to class name changes.
  9   */
 10  import { cli, Strategy } from '@jackwener/opencli/registry';
 11  import { CliError } from '@jackwener/opencli/errors';
 12  cli({
 13      site: 'google',
 14      name: 'search',
 15      description: 'Search Google',
 16      domain: 'google.com',
 17      strategy: Strategy.PUBLIC,
 18      browser: true,
 19      args: [
 20          { name: 'keyword', positional: true, required: true, help: 'Search query' },
 21          { name: 'limit', type: 'int', default: 10, help: 'Number of results (1-100)' },
 22          { name: 'lang', default: 'en', help: 'Language short code (e.g. en, zh)' },
 23      ],
 24      columns: ['type', 'title', 'url', 'snippet'],
 25      func: async (page, args) => {
 26          const limit = Math.max(1, Math.min(Number(args.limit), 100));
 27          const keyword = encodeURIComponent(args.keyword);
 28          const lang = encodeURIComponent(args.lang);
 29          const url = `https://www.google.com/search?q=${keyword}&hl=${lang}&num=${limit}`;
 30          await page.goto(url);
 31          await page.wait(2);
 32          const results = await page.evaluate(`
 33        (function() {
 34          var results = [];
 35          var seenUrls = {};
 36          var rso = document.querySelector('#rso');
 37          if (!rso) return results;
 38  
 39          // -- Featured snippet (scoped to #rso to avoid matching unrelated elements) --
 40          var featuredEl = rso.querySelector('.xpdopen .hgKElc')
 41                        || rso.querySelector('.IZ6rdc');
 42          if (featuredEl) {
 43            var parentBlock = featuredEl.closest('[data-hveid]') || featuredEl.parentElement;
 44            var fLink = parentBlock ? parentBlock.querySelector('a[href]') : null;
 45            var fUrl = fLink ? fLink.href : '';
 46            if (fUrl) seenUrls[fUrl] = true;
 47            results.push({
 48              type: 'snippet',
 49              title: featuredEl.textContent.trim().slice(0, 200),
 50              url: fUrl,
 51              snippet: '',
 52            });
 53          }
 54  
 55          // -- Standard search results --
 56          // Strategy: find all links containing h3 within #rso
 57          var allLinks = rso.querySelectorAll('a');
 58          for (var i = 0; i < allLinks.length; i++) {
 59            var link = allLinks[i];
 60            var h3 = link.querySelector('h3');
 61            if (!h3) continue;
 62  
 63            var href = link.href || '';
 64            // Skip non-http, Google internal links, and duplicates
 65            if (!href.match(/^https?:\\/\\//)) continue;
 66            if (href.indexOf('google.com/search') !== -1) continue;
 67            if (seenUrls[href]) continue;
 68            seenUrls[href] = true;
 69  
 70            // Walk up to find result container for snippet extraction
 71            var container = link;
 72            for (var j = 0; j < 6; j++) {
 73              if (container.parentElement && container.parentElement !== rso) {
 74                container = container.parentElement;
 75              }
 76              // Stop at a known result boundary
 77              if (container.getAttribute && container.getAttribute('data-hveid')) break;
 78            }
 79  
 80            // Find snippet: look for descriptive text, skip breadcrumbs and metadata
 81            var snippetText = '';
 82            var titleText = h3.textContent.trim();
 83            var candidates = container.querySelectorAll('span, div');
 84            for (var k = 0; k < candidates.length; k++) {
 85              var el = candidates[k];
 86              if (el.querySelector('h3') || el.querySelector('a[href]')) continue;
 87              var text = el.textContent.trim();
 88              if (text.length < 40 || text.length > 500) continue;
 89              if (text === titleText) continue;
 90              // Skip URL breadcrumbs (e.g. "https://example.com › path..." or "Site Namehttps://...")
 91              if (text.indexOf('\u203A') !== -1) continue;
 92              if (new RegExp('https?://').test(text.slice(0, 60))) continue;
 93              snippetText = text;
 94              break;
 95            }
 96  
 97            results.push({
 98              type: 'result',
 99              title: h3.textContent.trim(),
100              url: href,
101              snippet: snippetText.slice(0, 300),
102            });
103          }
104  
105          // -- People Also Ask --
106          var paaContainers = document.querySelectorAll('[data-sgrd="true"]');
107          for (var i = 0; i < paaContainers.length; i++) {
108            var questionEl = paaContainers[i].querySelector('span.CSkcDe');
109            if (questionEl) {
110              results.push({
111                type: 'paa',
112                title: questionEl.textContent.trim(),
113                url: '',
114                snippet: '',
115              });
116            }
117          }
118  
119          return results;
120        })()
121      `);
122          if (!Array.isArray(results) || results.length === 0) {
123              throw new CliError('NOT_FOUND', 'No search results found', 'Try a different keyword or check for CAPTCHA');
124          }
125          return results;
126      },
127  });