utils.js
import { ArgumentError } from '@jackwener/opencli/errors';

/**
 * Display labels for the IMDb title-type identifiers this module knows about.
 * A Map is used (rather than a plain object) so that inputs such as
 * "constructor" or "toString" can never resolve to inherited Object members.
 */
const TITLE_TYPE_LABELS = new Map([
    ['movie', 'Movie'],
    ['short', 'Short'],
    ['video', 'Video'],
    ['tvEpisode', 'TV Episode'],
    ['tvMiniSeries', 'TV Mini Series'],
    ['tvMovie', 'TV Movie'],
    ['tvSeries', 'TV Series'],
    ['tvShort', 'TV Short'],
    ['tvSpecial', 'TV Special'],
    ['videoGame', 'Video Game'],
]);

/**
 * Coerce a timeout value into a number that is safe to embed in generated page code.
 * Values that do not convert to a number become 0, which makes the polling helpers
 * perform a single check instead of injecting arbitrary text into the script.
 *
 * @param {*} timeoutMs - Caller-supplied timeout in milliseconds.
 * @returns {number} The numeric timeout, or 0 when the value is not a number.
 */
function toTimeoutNumber(timeoutMs) {
    const value = Number(timeoutMs);
    return Number.isNaN(value) ? 0 : value;
}

/**
 * Normalize an IMDb title or person input to a bare ID.
 *
 * Accepts a bare ID (prefix followed by 7-8 digits) or any string containing a
 * "/title/<id>" or "/name/<id>" path segment, optionally preceded by a
 * two-letter language segment and followed by "/", "?", "#" or end of input.
 * The host part of a URL is not inspected.
 *
 * @param {string} input - Bare ID or IMDb URL.
 * @param {string} prefix - ID prefix to accept (e.g. "tt" or "nm"); it is
 *   interpolated verbatim into the matching regular expressions.
 * @returns {string} The matched ID. Bare IDs are returned trimmed; IDs taken
 *   from a URL are returned exactly as they appear in the URL.
 * @throws {ArgumentError} When no ID with the given prefix can be found,
 *   including when `input` is not a string.
 */
export function normalizeImdbId(input, prefix) {
    // Non-string input is treated as "no match" so callers get an ArgumentError
    // instead of a TypeError from calling .trim() on a non-string.
    const trimmed = typeof input === 'string' ? input.trim() : '';
    const barePattern = new RegExp(`^${prefix}\\d{7,8}$`);
    if (barePattern.test(trimmed)) {
        return trimmed;
    }
    const pathPattern = new RegExp(`/(?:[a-z]{2}/)?(?:title|name)/(${prefix}\\d{7,8})(?:[/?#]|$)`, 'i');
    const pathMatch = trimmed.match(pathPattern);
    if (pathMatch) {
        return pathMatch[1];
    }
    const isTitle = prefix === 'tt';
    throw new ArgumentError(`Invalid IMDb ID: "${String(input)}"`, `Expected ${isTitle ? 'title' : 'name'} ID like ${isTitle ? 'tt1375666' : 'nm0634240'} or an IMDb URL`);
}

/**
 * Convert an ISO 8601 duration string to a short human-readable format for table display.
 * Example: PT2H28M -> 2h 28m.
 *
 * Only the hour and minute components are rendered. A trailing seconds
 * component (e.g. PT2H28M30S) is accepted and dropped rather than causing the
 * whole duration to be rejected.
 *
 * @param {string} iso - Duration such as "PT2H28M", "PT45M" or "PT1H".
 * @returns {string} "<h>h <m>m" with absent components omitted, or "" when the
 *   input is empty, not a string, or not in the supported "PT..." form.
 */
export function formatDuration(iso) {
    if (typeof iso !== 'string' || !iso) {
        return '';
    }
    const match = iso.match(/^PT(?:(\d+)H)?(?:(\d+)M)?(?:\d+(?:\.\d+)?S)?$/);
    if (!match) {
        return '';
    }
    const [, hours, minutes] = match;
    const parts = [];
    if (hours) {
        parts.push(`${hours}h`);
    }
    if (minutes) {
        parts.push(`${minutes}m`);
    }
    return parts.join(' ');
}

/**
 * Force an IMDb page URL to use the English language parameter,
 * reducing structural differences across localized pages.
 *
 * @param {string} url - Absolute URL; it is parsed with the URL constructor.
 * @returns {string} The URL with its "language" query parameter set to "en-US"
 *   (replacing any existing value).
 * @throws {TypeError} When `url` cannot be parsed as an absolute URL.
 */
export function forceEnglishUrl(url) {
    const parsed = new URL(url);
    parsed.searchParams.set('language', 'en-US');
    return parsed.toString();
}

/**
 * Normalize IMDb title-type payloads that may be represented as an object,
 * a raw string, or an empty text field with only an internal id.
 *
 * @param {string|{text?: string, id?: string}|null|undefined} input - Title-type payload.
 *   For objects, a non-blank string `text` wins; otherwise a string `id` is used.
 * @returns {string} The display label for known identifiers (e.g. "tvSeries" ->
 *   "TV Series"), the trimmed raw value for unknown ones, or "" when nothing
 *   usable is present.
 */
export function normalizeImdbTitleType(input) {
    let raw = '';
    if (typeof input === 'string') {
        raw = input;
    } else if (input && typeof input === 'object') {
        if (typeof input.text === 'string' && input.text.trim()) {
            raw = input.text;
        } else if (typeof input.id === 'string') {
            raw = input.id;
        }
    }
    raw = raw.trim();
    if (!raw) {
        return '';
    }
    // Map lookup only sees the entries listed above; unknown values pass through unchanged.
    return TITLE_TYPE_LABELS.get(raw) ?? raw;
}

/**
 * Extract structured JSON-LD data from the page.
 * Accepts a single type string or an array of types to match against @type.
 *
 * The generated script walks every script[type="application/ld+json"] element
 * in document order, descending into arrays and "@graph" arrays, and returns
 * the first object whose "@type" (string or array) contains a wanted type.
 * With no type filter the first JSON-LD object found is returned. Script
 * elements whose content is not valid JSON are skipped.
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @param {string|string[]} [type] - Type name(s) to match; falsy means "any".
 * @returns {Promise<*>} Whatever `page.evaluate` yields: the matching object or null.
 */
export async function extractJsonLd(page, type) {
    const filterTypes = type ? (Array.isArray(type) ? type : [type]) : [];
    return page.evaluate(`
    (function() {
      var scripts = document.querySelectorAll('script[type="application/ld+json"]');
      var wantedTypes = ${JSON.stringify(filterTypes)};

      function matchesType(data) {
        if (wantedTypes.length === 0) {
          return true;
        }
        if (!data || typeof data !== 'object') {
          return false;
        }
        if (wantedTypes.indexOf(data['@type']) !== -1) {
          return true;
        }
        if (Array.isArray(data['@type'])) {
          for (var t = 0; t < data['@type'].length; t++) {
            if (wantedTypes.indexOf(data['@type'][t]) !== -1) return true;
          }
        }
        return false;
      }

      function findMatch(data) {
        if (Array.isArray(data)) {
          for (var i = 0; i < data.length; i++) {
            var itemMatch = findMatch(data[i]);
            if (itemMatch) {
              return itemMatch;
            }
          }
          return null;
        }

        if (!data || typeof data !== 'object') {
          return null;
        }

        if (matchesType(data)) {
          return data;
        }

        if (Array.isArray(data['@graph'])) {
          return findMatch(data['@graph']);
        }

        return null;
      }

      for (var i = 0; i < scripts.length; i++) {
        try {
          var parsed = JSON.parse(scripts[i].textContent || 'null');
          var match = findMatch(parsed);
          if (match) {
            return match;
          }
        } catch (error) {
          void error;
        }
      }

      return null;
    })()
  `);
}

/**
 * Poll until the current IMDb page path matches the expected entity/search path.
 *
 * The generated script tests window.location.pathname every 250 ms until the
 * deadline, then performs one final test.
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @param {string|RegExp} pathPattern - Regular-expression source, or a RegExp whose
 *   source is used. Matching in the page is always case-insensitive; flags on a
 *   RegExp argument are ignored.
 * @param {number} [timeoutMs=15000] - Maximum time to poll, in milliseconds.
 * @returns {Promise<boolean>} true when the path matched before or at the deadline.
 */
export async function waitForImdbPath(page, pathPattern, timeoutMs = 15000) {
    // JSON.stringify(RegExp) yields "{}", which would turn into the character
    // class [object Object] in the page; embed the RegExp's source instead.
    const patternSource = pathPattern instanceof RegExp ? pathPattern.source : pathPattern;
    const result = await page.evaluate(`
    (async function() {
      var deadline = Date.now() + ${toTimeoutNumber(timeoutMs)};
      var pattern = new RegExp(${JSON.stringify(patternSource)}, 'i');
      while (Date.now() < deadline) {
        if (pattern.test(window.location.pathname)) {
          return true;
        }
        await new Promise(function(resolve) { setTimeout(resolve, 250); });
      }
      return pattern.test(window.location.pathname);
    })()
  `);
    return Boolean(result);
}

/**
 * Wait until IMDb search results (or the search UI state) has rendered.
 *
 * The generated script polls every 250 ms and reports ready when any of these holds:
 * the __NEXT_DATA__ JSON contains non-empty titleResults or nameResults, the
 * document contains a link whose href includes "/title/" or "/name/", or the
 * body text includes "No results found for" or "No exact matches".
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @param {number} [timeoutMs=15000] - Maximum time to poll, in milliseconds.
 * @returns {Promise<boolean>} true when one of the ready conditions was observed.
 */
export async function waitForImdbSearchReady(page, timeoutMs = 15000) {
    const result = await page.evaluate(`
    (async function() {
      var deadline = Date.now() + ${toTimeoutNumber(timeoutMs)};

      function hasSearchResults() {
        var nextDataEl = document.getElementById('__NEXT_DATA__');
        if (nextDataEl) {
          try {
            var nextData = JSON.parse(nextDataEl.textContent || 'null');
            var pageProps = nextData && nextData.props && nextData.props.pageProps;
            var titleResults = (pageProps && pageProps.titleResults && pageProps.titleResults.results) || [];
            var nameResults = (pageProps && pageProps.nameResults && pageProps.nameResults.results) || [];
            if (titleResults.length > 0 || nameResults.length > 0) {
              return true;
            }
          } catch (error) {
            void error;
          }
        }

        if (document.querySelector('a[href*="/title/"], a[href*="/name/"]')) {
          return true;
        }

        var body = document.body ? (document.body.textContent || '') : '';
        return body.includes('No results found for') || body.includes('No exact matches');
      }

      while (Date.now() < deadline) {
        if (hasSearchResults()) {
          return true;
        }
        await new Promise(function(resolve) { setTimeout(resolve, 250); });
      }

      return hasSearchResults();
    })()
  `);
    return Boolean(result);
}

/**
 * Wait until IMDb review cards (or the page review summary) has rendered.
 *
 * The generated script polls every 250 ms and reports ready when a review
 * element (article.user-review-item, review-card-parent or tturv-total-reviews
 * test id) exists, or the body text includes "No user reviews" or
 * "Review this title".
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @param {number} [timeoutMs=15000] - Maximum time to poll, in milliseconds.
 * @returns {Promise<boolean>} true when one of the ready conditions was observed.
 */
export async function waitForImdbReviewsReady(page, timeoutMs = 15000) {
    const result = await page.evaluate(`
    (async function() {
      var deadline = Date.now() + ${toTimeoutNumber(timeoutMs)};

      function hasReviewContent() {
        if (document.querySelector('article.user-review-item, [data-testid="review-card-parent"], [data-testid="tturv-total-reviews"]')) {
          return true;
        }
        var body = document.body ? (document.body.textContent || '') : '';
        return body.includes('No user reviews') || body.includes('Review this title');
      }

      while (Date.now() < deadline) {
        if (hasReviewContent()) {
          return true;
        }
        await new Promise(function(resolve) { setTimeout(resolve, 250); });
      }

      return hasReviewContent();
    })()
  `);
    return Boolean(result);
}

/**
 * Read the current IMDb entity id from the page URL/canonical metadata.
 *
 * Candidates are checked in order: location pathname, the canonical link href,
 * then the og:url meta content. The first case-insensitive match of the prefix
 * followed by 7-8 digits wins.
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @param {string} prefix - ID prefix to look for (e.g. "tt" or "nm"); used verbatim as regex source.
 * @returns {Promise<string>} The matched id as it appears in the page, or "" when none is found.
 */
export async function getCurrentImdbId(page, prefix) {
    // Build the regex source here and embed it as a JSON string literal so the
    // prefix cannot break out of the quoted string in the generated script.
    const patternSource = `(${prefix}\\d{7,8})`;
    const result = await page.evaluate(`
    (function() {
      var pattern = new RegExp(${JSON.stringify(patternSource)}, 'i');
      var candidates = [
        window.location.pathname || '',
        document.querySelector('link[rel="canonical"]')?.getAttribute('href') || '',
        document.querySelector('meta[property="og:url"]')?.getAttribute('content') || ''
      ];

      for (var i = 0; i < candidates.length; i++) {
        var match = candidates[i].match(pattern);
        if (match) {
          return match[1];
        }
      }
      return '';
    })()
  `);
    return typeof result === 'string' ? result : '';
}

/**
 * Detect whether the current page is an IMDb bot-challenge or verification page.
 *
 * The check is a case-sensitive substring test: the document title is searched
 * for "Robot Check", "Are you a robot" or "JavaScript is disabled", and the body
 * text for "captcha", "verify that you are human" or "not a robot".
 *
 * @param {{evaluate: Function}} page - Object whose `evaluate` receives the script source as a string.
 * @returns {Promise<boolean>} true when any of the marker strings is present.
 */
export async function isChallengePage(page) {
    const result = await page.evaluate(`
    (function() {
      var title = document.title || '';
      var body = document.body ? (document.body.textContent || '') : '';
      return title.includes('Robot Check') ||
        title.includes('Are you a robot') ||
        title.includes('JavaScript is disabled') ||
        body.includes('captcha') ||
        body.includes('verify that you are human') ||
        body.includes('not a robot');
    })()
  `);
    return Boolean(result);
}