/ clis / amazon / rankings.js
rankings.js
  1  import { CommandExecutionError } from '@jackwener/opencli/errors';
  2  import { Strategy } from '@jackwener/opencli/registry';
  3  import { assertUsableState, buildProvenance, cleanText, extractAsin, extractCategoryNodeId, extractReviewCountFromCardText, firstMeaningfulLine, gotoAndReadState, isRankingPaginationUrl, normalizeProductUrl, parsePriceText, parseRatingValue, parseReviewCount, resolveRankingUrl, toAbsoluteAmazonUrl, uniqueNonEmpty, } from './shared.js';
  4  function parseRank(rawRank, fallback) {
  5      const normalized = cleanText(rawRank);
  6      const match = normalized.match(/(\d{1,4})/);
  7      if (!match)
  8          return fallback;
  9      const parsed = Number.parseInt(match[1], 10);
 10      return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
 11  }
 12  function normalizeVisibleCategoryLinks(links) {
 13      const normalized = (links ?? [])
 14          .map((entry) => ({
 15          title: cleanText(entry?.title),
 16          url: toAbsoluteAmazonUrl(entry?.url) ?? '',
 17          node_id: cleanText(entry?.node_id) || extractCategoryNodeId(entry?.url) || null,
 18      }))
 19          .filter((entry) => Boolean(entry.title) && Boolean(entry.url));
 20      const seen = new Set();
 21      const deduped = [];
 22      for (const entry of normalized) {
 23          if (seen.has(entry.url))
 24              continue;
 25          seen.add(entry.url);
 26          deduped.push(entry);
 27      }
 28      return deduped;
 29  }
 30  export function normalizeRankingCandidate(candidate, context) {
 31      const productUrl = normalizeProductUrl(candidate.href);
 32      const asin = extractAsin(candidate.asin ?? '') ?? extractAsin(productUrl ?? '') ?? null;
 33      const title = cleanText(candidate.title) || firstMeaningfulLine(candidate.card_text);
 34      const price = parsePriceText(cleanText(candidate.price_text) || candidate.card_text);
 35      const ratingText = cleanText(candidate.rating_text) || null;
 36      const reviewCountText = cleanText(candidate.review_count_text)
 37          || extractReviewCountFromCardText(candidate.card_text)
 38          || null;
 39      const provenance = buildProvenance(context.sourceUrl);
 40      const categoryUrl = context.categoryUrl || context.sourceUrl;
 41      return {
 42          list_type: context.listType,
 43          rank: parseRank(candidate.rank_text, context.rankFallback),
 44          asin,
 45          title: title || null,
 46          product_url: productUrl,
 47          price_text: price.price_text,
 48          price_value: price.price_value,
 49          currency: price.currency,
 50          rating_text: ratingText,
 51          rating_value: parseRatingValue(ratingText),
 52          review_count_text: reviewCountText,
 53          review_count: parseReviewCount(reviewCountText),
 54          list_title: context.listTitle,
 55          category_title: context.categoryTitle,
 56          category_url: categoryUrl,
 57          category_node_id: extractCategoryNodeId(categoryUrl),
 58          category_path: context.categoryPath,
 59          visible_category_links: context.visibleCategoryLinks,
 60          ...provenance,
 61      };
 62  }
 63  async function readRankingPage(page, listType, url) {
 64      const state = await gotoAndReadState(page, url, 2500, listType);
 65      assertUsableState(state, listType);
 66      return await page.evaluate(`
 67      (() => ({
 68        href: window.location.href,
 69        title: document.title || '',
 70        list_title:
 71          document.querySelector('#zg_banner_text')?.textContent
 72          || document.querySelector('h1')?.textContent
 73          || '',
 74        category_title:
 75          document.querySelector('#zg_browseRoot .zg_selected')?.textContent
 76          || document.querySelector('#wayfinding-breadcrumbs_feature_div ul li:last-child')?.textContent
 77          || document.querySelector('#wayfinding-breadcrumbs_container ul li:last-child')?.textContent
 78          || '',
 79        category_path: Array.from(document.querySelectorAll(
 80          '#zg_browseRoot ul li a, #zg_browseRoot ul li span, ' +
 81          '#wayfinding-breadcrumbs_feature_div ul li a, #wayfinding-breadcrumbs_feature_div ul li span.a-list-item, ' +
 82          '#wayfinding-breadcrumbs_container ul li a, #wayfinding-breadcrumbs_container ul li span.a-list-item'
 83        ))
 84          .map((entry) => (entry.textContent || '').trim())
 85          .filter(Boolean),
 86        cards: Array.from(document.querySelectorAll(
 87          '.p13n-sc-uncoverable-faceout, .zg-grid-general-faceout, [data-asin][class*="p13n"]'
 88        )).map((card) => ({
 89          rank_text:
 90            card.querySelector('.zg-bdg-text')?.textContent
 91            || card.querySelector('[class*="rank"]')?.textContent
 92            || '',
 93          asin:
 94            card.getAttribute('data-asin')
 95            || card.getAttribute('id')
 96            || '',
 97          title:
 98            card.querySelector('[class*="line-clamp"]')?.textContent
 99            || card.querySelector('img')?.getAttribute('alt')
100            || card.querySelector('a[href*="/dp/"]')?.textContent
101            || '',
102          href:
103            card.querySelector('a[href*="/dp/"], a[href*="/gp/product/"]')?.href
104            || '',
105          price_text:
106            card.querySelector('.a-price .a-offscreen')?.textContent
107            || card.querySelector('.a-color-price')?.textContent
108            || '',
109          rating_text:
110            card.querySelector('[aria-label*="out of 5 stars"]')?.getAttribute('aria-label')
111            || '',
112          review_count_text:
113            card.querySelector('a[href*="#customerReviews"]')?.textContent
114            || card.querySelector('.a-size-small')?.textContent
115            || '',
116          card_text: card.innerText || '',
117        })),
118        page_links: Array.from(document.querySelectorAll('.a-pagination a[href], li.a-normal a[href], li.a-selected a[href]'))
119          .map((anchor) => anchor.href || '')
120          .filter(Boolean),
121        visible_category_links: Array.from(document.querySelectorAll(
122          '#zg_browseRoot a[href], #zg-left-col a[href], [class*="zg-browse"] a[href]'
123        )).map((anchor) => ({
124          title: (anchor.textContent || '').trim(),
125          url: anchor.href || '',
126          node_id:
127            anchor.getAttribute('data-node-id')
128            || anchor.dataset?.nodeid
129            || '',
130        }))
131          .filter((entry) => entry.title && entry.url),
132      }))()
133    `);
134  }
/**
 * Builds the hint attached to the error raised when a ranking command finds no items.
 *
 * @param {string} commandName - Command name interpolated into the first sentence.
 * @returns {string} Two sentences joined by a single space.
 */
function createEmptyResultHint(commandName) {
    const verifyStep = `Open the same Amazon ${commandName} page in shared Chrome and verify ranked items are visible.`;
    const robotCheckStep = 'If the page shows a robot check, clear it manually and retry.';
    return `${verifyStep} ${robotCheckStep}`;
}
/**
 * Builds the CLI command options for one Amazon ranking list.
 *
 * The returned `func` walks the ranking pages breadth-first: it starts at the
 * URL resolved from the optional `input` argument, scrapes each page with
 * `readRankingPage`, normalizes every card, and queues pagination links that
 * `isRankingPaginationUrl` accepts. It stops when `limit` items are collected
 * or no pages are left.
 *
 * @param {object} definition - Ranking definition; `commandName`, `description`
 *   and `listType` are read from it.
 * @returns {object} Command options (`site`, `name`, `description`, `domain`,
 *   `strategy`, `navigateBefore`, `args`, `columns`, `func`).
 *   `func(page, kwargs)` resolves to at most `limit` normalized ranking records
 *   and throws `CommandExecutionError` when no item was found.
 */
export function createRankingCliOptions(definition) {
    return {
        site: 'amazon',
        name: definition.commandName,
        description: definition.description,
        domain: 'amazon.com',
        strategy: Strategy.COOKIE,
        navigateBefore: false,
        args: [
            {
                name: 'input',
                positional: true,
                help: 'Ranking URL or supported Amazon path. Omit to use the list root.',
            },
            {
                name: 'limit',
                type: 'int',
                default: 100,
                help: 'Maximum number of ranked items to return (default 100)',
            },
        ],
        columns: ['list_type', 'rank', 'asin', 'title', 'price_text', 'rating_value', 'review_count'],
        func: async (page, kwargs) => {
            // Missing, zero or non-numeric limits fall back to 100; anything below 1 is raised to 1.
            const limit = Math.max(1, Number(kwargs.limit) || 100);
            const initialUrl = resolveRankingUrl(definition.listType, typeof kwargs.input === 'string' ? kwargs.input : undefined);
            const queue = [initialUrl];
            const visited = new Set();
            const seenEntityKeys = new Set();
            const results = [];
            let listTitle = null;
            while (queue.length > 0 && results.length < limit) {
                const nextUrl = queue.shift();
                if (visited.has(nextUrl))
                    continue;
                visited.add(nextUrl);
                const payload = await readRankingPage(page, definition.listType, nextUrl);
                const sourceUrl = cleanText(payload.href) || nextUrl;
                // The page may land on a different URL than the one requested
                // (redirect or canonical form). Mark the landing URL as visited
                // too, otherwise the current page's own pagination link gets
                // queued and the same page is scraped a second time.
                visited.add(sourceUrl);
                const landingUrl = toAbsoluteAmazonUrl(sourceUrl);
                if (landingUrl)
                    visited.add(landingUrl);
                // Keep the previous page's title when this page exposes none.
                listTitle = cleanText(payload.list_title) || cleanText(payload.title) || listTitle;
                const categoryPath = uniqueNonEmpty(payload.category_path ?? []);
                const categoryTitle = cleanText(payload.category_title)
                    || (categoryPath.length > 0 ? categoryPath[categoryPath.length - 1] : '');
                const visibleCategoryLinks = normalizeVisibleCategoryLinks(payload.visible_category_links);
                const cards = payload.cards ?? [];
                for (const card of cards) {
                    const normalized = normalizeRankingCandidate(card, {
                        listType: definition.listType,
                        // Used when the card shows no rank: position in the collected list.
                        rankFallback: results.length + 1,
                        listTitle,
                        sourceUrl,
                        categoryTitle: categoryTitle || null,
                        categoryUrl: sourceUrl,
                        categoryPath,
                        visibleCategoryLinks,
                    });
                    // De-duplicate across pages by ASIN, else by product URL.
                    // Cards with neither have no key and are always kept.
                    const dedupeKey = cleanText(String(normalized.asin ?? ''))
                        || cleanText(String(normalized.product_url ?? ''));
                    if (dedupeKey && seenEntityKeys.has(dedupeKey))
                        continue;
                    if (dedupeKey)
                        seenEntityKeys.add(dedupeKey);
                    results.push(normalized);
                    if (results.length >= limit)
                        break;
                }
                const pageLinks = uniqueNonEmpty(payload.page_links ?? []);
                for (const href of pageLinks) {
                    const absolute = toAbsoluteAmazonUrl(href);
                    if (!absolute || !isRankingPaginationUrl(definition.listType, absolute))
                        continue;
                    if (!visited.has(absolute) && !queue.includes(absolute)) {
                        queue.push(absolute);
                    }
                }
            }
            if (results.length === 0) {
                throw new CommandExecutionError(`amazon ${definition.commandName} did not expose any ranked items`, createEmptyResultHint(definition.commandName));
            }
            return results.slice(0, limit);
        },
    };
}
/**
 * Bundle of this module's helpers under a single export; judging by the name,
 * presumably intended for unit tests only.
 */
export const __test__ = {
    parseRank: parseRank,
    normalizeVisibleCategoryLinks: normalizeVisibleCategoryLinks,
    normalizeRankingCandidate: normalizeRankingCandidate,
};