rankings.js
import { CommandExecutionError } from '@jackwener/opencli/errors';
import { Strategy } from '@jackwener/opencli/registry';
import {
    assertUsableState,
    buildProvenance,
    cleanText,
    extractAsin,
    extractCategoryNodeId,
    extractReviewCountFromCardText,
    firstMeaningfulLine,
    gotoAndReadState,
    isRankingPaginationUrl,
    normalizeProductUrl,
    parsePriceText,
    parseRatingValue,
    parseReviewCount,
    resolveRankingUrl,
    toAbsoluteAmazonUrl,
    uniqueNonEmpty,
} from './shared.js';

/**
 * Pulls a positive integer rank out of a badge/label string.
 *
 * The first run of one to four digits in the cleaned text is parsed as a
 * base-10 integer. When no digits are present, or the parsed value is not a
 * positive finite number, `fallback` is returned unchanged.
 *
 * @param {*} rawRank - Raw rank text scraped from a card.
 * @param {*} fallback - Value returned when no usable rank is found.
 * @returns {*} The parsed rank, or `fallback`.
 */
function parseRank(rawRank, fallback) {
    const digits = cleanText(rawRank).match(/(\d{1,4})/);
    if (!digits) {
        return fallback;
    }
    const rank = Number.parseInt(digits[1], 10);
    if (Number.isFinite(rank) && rank > 0) {
        return rank;
    }
    return fallback;
}

/**
 * Normalizes the category links scraped from a ranking page.
 *
 * Each entry is reduced to `{ title, url, node_id }`. Entries whose cleaned
 * title or absolute URL is empty are dropped, and only the first entry for
 * any given URL is kept; the surviving entries keep their original order.
 *
 * @param {Array<{title?: string, url?: string, node_id?: string}>|null|undefined} links
 *   Raw link records; `null`/`undefined` is treated as an empty list.
 * @returns {Array<{title: string, url: string, node_id: (string|null)}>}
 */
function normalizeVisibleCategoryLinks(links) {
    // A Map iterates in insertion order, so first-seen-wins dedupe and
    // order preservation come for free.
    const firstByUrl = new Map();
    for (const raw of links ?? []) {
        const title = cleanText(raw?.title);
        const url = toAbsoluteAmazonUrl(raw?.url) ?? '';
        const nodeId = cleanText(raw?.node_id) || extractCategoryNodeId(raw?.url) || null;
        const isUsable = Boolean(title) && Boolean(url);
        if (isUsable && !firstByUrl.has(url)) {
            firstByUrl.set(url, { title, url, node_id: nodeId });
        }
    }
    return [...firstByUrl.values()];
}

/**
 * Converts one scraped ranking card into the flat record emitted by the
 * ranking commands.
 *
 * Explicit card fields are preferred; the card's free text (`card_text`) is
 * used as a fallback source for the title, price and review count.
 *
 * @param {object} candidate - Raw card: `rank_text`, `asin`, `title`, `href`,
 *   `price_text`, `rating_text`, `review_count_text`, `card_text`.
 * @param {object} context - Page-level data: `listType`, `rankFallback`,
 *   `listTitle`, `sourceUrl`, `categoryTitle`, `categoryUrl`, `categoryPath`,
 *   `visibleCategoryLinks`.
 * @returns {object} Normalized ranking row, with the fields returned by
 *   `buildProvenance(context.sourceUrl)` spread in last.
 */
export function normalizeRankingCandidate(candidate, context) {
    const productUrl = normalizeProductUrl(candidate.href);
    // Prefer the ASIN carried by the card itself, then one embedded in the product URL.
    const asin = extractAsin(candidate.asin ?? '')
        ?? extractAsin(productUrl ?? '')
        ?? null;
    const resolvedTitle = cleanText(candidate.title) || firstMeaningfulLine(candidate.card_text);
    const priceInfo = parsePriceText(cleanText(candidate.price_text) || candidate.card_text);
    const ratingLabel = cleanText(candidate.rating_text) || null;
    const reviewLabel = cleanText(candidate.review_count_text)
        || extractReviewCountFromCardText(candidate.card_text)
        || null;
    const provenanceFields = buildProvenance(context.sourceUrl);
    // Without an explicit category URL, the page the card came from stands in for it.
    const categoryUrl = context.categoryUrl || context.sourceUrl;

    return {
        list_type: context.listType,
        rank: parseRank(candidate.rank_text, context.rankFallback),
        asin,
        title: resolvedTitle || null,
        product_url: productUrl,
        price_text: priceInfo.price_text,
        price_value: priceInfo.price_value,
        currency: priceInfo.currency,
        rating_text: ratingLabel,
        rating_value: parseRatingValue(ratingLabel),
        review_count_text: reviewLabel,
        review_count: parseReviewCount(reviewLabel),
        list_title: context.listTitle,
        category_title: context.categoryTitle,
        category_url: categoryUrl,
        category_node_id: extractCategoryNodeId(categoryUrl),
        category_path: context.categoryPath,
        visible_category_links: context.visibleCategoryLinks,
        ...provenanceFields,
    };
}

/**
 * Script evaluated inside the ranking page. It returns a plain object with
 * the page URL and title, list/category titles, the category path, one raw
 * record per product card, pagination hrefs and the visible category links.
 * The text is executed in the page, so it must remain self-contained.
 */
const RANKING_PAGE_SNAPSHOT_SCRIPT = `
    (() => ({
      href: window.location.href,
      title: document.title || '',
      list_title:
        document.querySelector('#zg_banner_text')?.textContent
        || document.querySelector('h1')?.textContent
        || '',
      category_title:
        document.querySelector('#zg_browseRoot .zg_selected')?.textContent
        || document.querySelector('#wayfinding-breadcrumbs_feature_div ul li:last-child')?.textContent
        || document.querySelector('#wayfinding-breadcrumbs_container ul li:last-child')?.textContent
        || '',
      category_path: Array.from(document.querySelectorAll(
        '#zg_browseRoot ul li a, #zg_browseRoot ul li span, ' +
        '#wayfinding-breadcrumbs_feature_div ul li a, #wayfinding-breadcrumbs_feature_div ul li span.a-list-item, ' +
        '#wayfinding-breadcrumbs_container ul li a, #wayfinding-breadcrumbs_container ul li span.a-list-item'
      ))
        .map((entry) => (entry.textContent || '').trim())
        .filter(Boolean),
      cards: Array.from(document.querySelectorAll(
        '.p13n-sc-uncoverable-faceout, .zg-grid-general-faceout, [data-asin][class*="p13n"]'
      )).map((card) => ({
        rank_text:
          card.querySelector('.zg-bdg-text')?.textContent
          || card.querySelector('[class*="rank"]')?.textContent
          || '',
        asin:
          card.getAttribute('data-asin')
          || card.getAttribute('id')
          || '',
        title:
          card.querySelector('[class*="line-clamp"]')?.textContent
          || card.querySelector('img')?.getAttribute('alt')
          || card.querySelector('a[href*="/dp/"]')?.textContent
          || '',
        href:
          card.querySelector('a[href*="/dp/"], a[href*="/gp/product/"]')?.href
          || '',
        price_text:
          card.querySelector('.a-price .a-offscreen')?.textContent
          || card.querySelector('.a-color-price')?.textContent
          || '',
        rating_text:
          card.querySelector('[aria-label*="out of 5 stars"]')?.getAttribute('aria-label')
          || '',
        review_count_text:
          card.querySelector('a[href*="#customerReviews"]')?.textContent
          || card.querySelector('.a-size-small')?.textContent
          || '',
        card_text: card.innerText || '',
      })),
      page_links: Array.from(document.querySelectorAll('.a-pagination a[href], li.a-normal a[href], li.a-selected a[href]'))
        .map((anchor) => anchor.href || '')
        .filter(Boolean),
      visible_category_links: Array.from(document.querySelectorAll(
        '#zg_browseRoot a[href], #zg-left-col a[href], [class*="zg-browse"] a[href]'
      )).map((anchor) => ({
        title: (anchor.textContent || '').trim(),
        url: anchor.href || '',
        node_id:
          anchor.getAttribute('data-node-id')
          || anchor.dataset?.nodeid
          || '',
      }))
        .filter((entry) => entry.title && entry.url),
    }))()
  `;

/**
 * Navigates to a ranking page, checks the resulting state and scrapes it.
 *
 * @param {object} page - Browser page handle exposing `evaluate`.
 * @param {string} listType - Ranking list type, forwarded to the state helpers.
 * @param {string} url - Ranking page URL to open.
 * @returns {Promise<object>} The object produced by the in-page snapshot script.
 */
async function readRankingPage(page, listType, url) {
    // NOTE(review): 2500 is presumably a settle/wait time in ms — confirm in shared.js.
    const pageState = await gotoAndReadState(page, url, 2500, listType);
    assertUsableState(pageState, listType);
    const snapshot = await page.evaluate(RANKING_PAGE_SNAPSHOT_SCRIPT);
    return snapshot;
}

/**
 * Builds the hint attached to the error raised when no ranked items are found.
 *
 * @param {string} commandName - Name of the ranking command being run.
 * @returns {string} Two troubleshooting sentences separated by a single space.
 */
function createEmptyResultHint(commandName) {
    const verifyStep = `Open the same Amazon ${commandName} page in shared Chrome and verify ranked items are visible.`;
    const robotCheckStep = 'If the page shows a robot check, clear it manually and retry.';
    return `${verifyStep} ${robotCheckStep}`;
}

/**
 * Computes the key used to detect repeated products across cards and pages:
 * the cleaned ASIN when present, otherwise the cleaned product URL.
 *
 * @param {{asin: *, product_url: *}} item - A normalized ranking row.
 * @returns {*} The key, or a falsy value when neither field yields one.
 */
function buildDedupeKey(item) {
    return cleanText(String(item.asin ?? ''))
        || cleanText(String(item.product_url ?? ''));
}

/**
 * Appends not-yet-known pagination URLs to the crawl queue.
 *
 * A link is queued only when it resolves to an absolute URL, is accepted by
 * `isRankingPaginationUrl` for this list type, and is neither already visited
 * nor already waiting in the queue.
 *
 * @param {string[]} queue - Pending URLs; mutated in place.
 * @param {Set<string>} visited - URLs that have already been taken off the queue.
 * @param {string} listType - Ranking list type.
 * @param {string[]} pageLinks - Pagination hrefs scraped from the current page.
 * @returns {void}
 */
function enqueuePaginationLinks(queue, visited, listType, pageLinks) {
    for (const href of pageLinks) {
        const absolute = toAbsoluteAmazonUrl(href);
        const isPagination = Boolean(absolute) && isRankingPaginationUrl(listType, absolute);
        if (isPagination && !(visited.has(absolute) || queue.includes(absolute))) {
            queue.push(absolute);
        }
    }
}

/**
 * Creates the CLI option object for one Amazon ranking command.
 *
 * The returned `func` resolves the starting URL from the optional `input`
 * argument, then works through a first-in, first-out queue of ranking pages,
 * normalizing each card and skipping products already seen, until `limit`
 * rows are collected or no pages remain.
 *
 * @param {{commandName: string, description: string, listType: string}} definition
 *   Command definition; `listType` and `commandName` are read when `func` runs.
 * @returns {object} Option object with site metadata, `args`, `columns` and `func`.
 *   `func` rejects with `CommandExecutionError` when no ranked item is found.
 */
export function createRankingCliOptions(definition) {
    const runRankingCommand = async (page, kwargs) => {
        // Non-numeric or zero limits fall back to 100; anything below 1 is raised to 1.
        const limit = Math.max(1, Number(kwargs.limit) || 100);
        const { listType } = definition;
        const requestedInput = typeof kwargs.input === 'string' ? kwargs.input : undefined;
        const pendingUrls = [resolveRankingUrl(listType, requestedInput)];
        const visitedUrls = new Set();
        const seenEntityKeys = new Set();
        const rows = [];
        // Carried across pages so a page without its own title inherits the previous one.
        let listTitle = null;

        while (pendingUrls.length > 0 && rows.length < limit) {
            const currentUrl = pendingUrls.shift();
            if (visitedUrls.has(currentUrl)) {
                continue;
            }
            visitedUrls.add(currentUrl);

            const payload = await readRankingPage(page, listType, currentUrl);
            const sourceUrl = cleanText(payload.href) || currentUrl;
            listTitle = cleanText(payload.list_title) || cleanText(payload.title) || listTitle;
            const categoryPath = uniqueNonEmpty(payload.category_path ?? []);
            let categoryTitle = cleanText(payload.category_title);
            if (!categoryTitle) {
                // No explicit category title: use the last category path entry, if any.
                categoryTitle = categoryPath.length > 0 ? categoryPath[categoryPath.length - 1] : '';
            }
            const visibleCategoryLinks = normalizeVisibleCategoryLinks(payload.visible_category_links);
            const pageContext = {
                listType,
                listTitle,
                sourceUrl,
                categoryTitle: categoryTitle || null,
                categoryUrl: sourceUrl,
                categoryPath,
                visibleCategoryLinks,
            };

            for (const card of payload.cards ?? []) {
                // Cards without a parsable rank are numbered by their position in the output.
                const row = normalizeRankingCandidate(card, {
                    ...pageContext,
                    rankFallback: rows.length + 1,
                });
                const dedupeKey = buildDedupeKey(row);
                if (dedupeKey) {
                    if (seenEntityKeys.has(dedupeKey)) {
                        continue;
                    }
                    seenEntityKeys.add(dedupeKey);
                }
                rows.push(row);
                if (rows.length >= limit) {
                    break;
                }
            }

            enqueuePaginationLinks(
                pendingUrls,
                visitedUrls,
                listType,
                uniqueNonEmpty(payload.page_links ?? []),
            );
        }

        if (rows.length === 0) {
            throw new CommandExecutionError(
                `amazon ${definition.commandName} did not expose any ranked items`,
                createEmptyResultHint(definition.commandName),
            );
        }
        return rows.slice(0, limit);
    };

    return {
        site: 'amazon',
        name: definition.commandName,
        description: definition.description,
        domain: 'amazon.com',
        strategy: Strategy.COOKIE,
        navigateBefore: false,
        args: [
            {
                name: 'input',
                positional: true,
                help: 'Ranking URL or supported Amazon path. Omit to use the list root.',
            },
            {
                name: 'limit',
                type: 'int',
                default: 100,
                help: 'Maximum number of ranked items to return (default 100)',
            },
        ],
        columns: ['list_type', 'rank', 'asin', 'title', 'price_text', 'rating_value', 'review_count'],
        func: runRankingCommand,
    };
}

/** Internal helpers exposed for unit tests only. */
export const __test__ = {
    parseRank,
    normalizeVisibleCategoryLinks,
    normalizeRankingCandidate,
};