/ clis / imdb / utils.js
utils.js
  1  import { ArgumentError } from '@jackwener/opencli/errors';
  /**
   * Normalize an IMDb title or person reference to its bare, lower-case ID.
   *
   * Accepted forms:
   *   - a bare ID such as `tt1375666` or `nm0634240` (matched case-insensitively);
   *   - any URL or path containing `/title/<id>` or `/name/<id>`, optionally
   *     preceded by a two-letter language segment and followed by `/`, `?`, `#`
   *     or the end of the string (so query strings and fragments are tolerated).
   *
   * @param {string} input - Raw user input; surrounding whitespace is ignored.
   * @param {string} prefix - ID prefix to require, e.g. `'tt'` or `'nm'`. It is
   *   interpolated into a regular expression without escaping.
   * @returns {string} The matched ID, lower-cased.
   * @throws {ArgumentError} When `input` is not a string or holds no ID with the
   *   requested prefix.
   */
  export function normalizeImdbId(input, prefix) {
      const idSource = `${prefix}\\d{7,8}`;
      // Non-string input falls through to the ArgumentError below instead of
      // crashing on `.trim()` with a TypeError.
      const candidate = typeof input === 'string' ? input.trim() : '';

      if (new RegExp(`^${idSource}$`, 'i').test(candidate)) {
          return candidate.toLowerCase();
      }

      const urlPattern = new RegExp(`/(?:[a-z]{2}/)?(?:title|name)/(${idSource})(?:[/?#]|$)`, 'i');
      const urlMatch = candidate.match(urlPattern);
      if (urlMatch) {
          // The URL match ignores case, so fold the capture to one canonical form.
          return urlMatch[1].toLowerCase();
      }

      const isTitle = prefix === 'tt';
      throw new ArgumentError(`Invalid IMDb ID: "${input}"`, `Expected ${isTitle ? 'title' : 'name'} ID like ${isTitle ? 'tt1375666' : 'nm0634240'} or an IMDb URL`);
  }
 /**
  * Convert an ISO 8601 time duration into a compact label for table display.
  * Examples: `PT2H28M` -> `2h 28m`, `PT45M` -> `45m`, `PT1H30M15S` -> `1h 30m 15s`.
  *
  * Only the `PT[nH][nM][nS]` form with whole-number components is recognised;
  * anything else (including non-string input) yields an empty string.
  *
  * @param {string | null | undefined} iso - Duration string such as `PT2H28M`.
  * @returns {string} Space-separated components, or `''` when `iso` is empty or
  *   not in the recognised form.
  */
 export function formatDuration(iso) {
     if (typeof iso !== 'string') {
         return '';
     }
     const match = /^PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$/.exec(iso);
     if (!match) {
         return '';
     }
     const [, hours, minutes, seconds] = match;
     const parts = [];
     // Groups that did not participate in the match are `undefined` and skipped;
     // an explicit "0" is kept, as a non-empty string is truthy.
     if (hours) {
         parts.push(`${hours}h`);
     }
     if (minutes) {
         parts.push(`${minutes}m`);
     }
     if (seconds) {
         parts.push(`${seconds}s`);
     }
     return parts.join(' ');
 }
 /**
  * Return `url` with its `language` query parameter set to `en-US`, so that
  * localized IMDb pages are requested in English and differ less in structure.
  * An existing `language` value is overwritten; other parameters and the
  * fragment are kept.
  *
  * @param {string} url - Absolute URL; `new URL` throws on anything else.
  * @returns {string} The serialized URL including the language parameter.
  */
 export function forceEnglishUrl(url) {
     const target = new URL(url);
     const { searchParams: query } = target;
     query.set('language', 'en-US');
     return target.href;
 }
 /**
  * Display labels for the title-type identifiers this module recognises.
  * A Map is used so that only these exact keys resolve; a plain object would
  * also answer for inherited names such as `constructor` or `toString`.
  */
 const IMDB_TITLE_TYPE_LABELS = new Map([
     ['movie', 'Movie'],
     ['short', 'Short'],
     ['video', 'Video'],
     ['tvEpisode', 'TV Episode'],
     ['tvMiniSeries', 'TV Mini Series'],
     ['tvMovie', 'TV Movie'],
     ['tvSeries', 'TV Series'],
     ['tvShort', 'TV Short'],
     ['tvSpecial', 'TV Special'],
     ['videoGame', 'Video Game'],
 ]);
 /**
  * Normalize an IMDb title-type payload to a display label.
  *
  * The payload may be a raw string, or an object whose `text` is used when it
  * is a non-blank string and whose string `id` is used otherwise. The chosen
  * value is trimmed, then mapped to a label when it is a known identifier;
  * unknown values are returned trimmed but otherwise unchanged.
  *
  * @param {unknown} input - String, `{ text?, id? }` object, or anything else.
  * @returns {string} The label, the trimmed raw value, or `''` when nothing
  *   usable was supplied.
  */
 export function normalizeImdbTitleType(input) {
     let raw = '';
     if (typeof input === 'string') {
         raw = input;
     } else if (input && typeof input === 'object') {
         const { text, id } = input;
         if (typeof text === 'string' && text.trim()) {
             raw = text;
         } else if (typeof id === 'string') {
             raw = id;
         }
     }
     const key = raw.trim();
     return IMDB_TITLE_TYPE_LABELS.get(key) ?? key;
 }
 /**
  * Find the first JSON-LD object embedded in the page, optionally restricted
  * to given `@type` values.
  *
  * Every `<script type="application/ld+json">` element is parsed in document
  * order; script bodies that are not valid JSON are skipped. Within a parsed
  * document, arrays are searched element by element, and an object that does
  * not match is searched through its `@graph` array when it has one.
  *
  * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
  *   called with the search script; presumably it runs the script in the
  *   browser page and resolves to its result.
  * @param {string | string[]} [type] - Wanted `@type` value(s). When omitted
  *   or falsy, the first JSON-LD object found is returned.
  * @returns {Promise<object | null>} The matching object, or `null`.
  */
 export async function extractJsonLd(page, type) {
     let wantedTypes = [];
     if (type) {
         wantedTypes = Array.isArray(type) ? type : [type];
     }
     const finderScript = `
     (function() {
       var wanted = ${JSON.stringify(wantedTypes)};
       var blocks = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));

       // True when no filter was requested, or when the node's @type (a single
       // value or a list of values) contains one of the wanted types.
       function hasWantedType(node) {
         if (wanted.length === 0) {
           return true;
         }
         var declared = node['@type'];
         var names = Array.isArray(declared) ? declared : [declared];
         return names.some(function(name) { return wanted.indexOf(name) !== -1; });
       }

       // Depth-first search through arrays and @graph containers.
       function locate(node) {
         if (Array.isArray(node)) {
           for (var k = 0; k < node.length; k++) {
             var hit = locate(node[k]);
             if (hit) {
               return hit;
             }
           }
           return null;
         }
         if (!node || typeof node !== 'object') {
           return null;
         }
         if (hasWantedType(node)) {
           return node;
         }
         return Array.isArray(node['@graph']) ? locate(node['@graph']) : null;
       }

       for (var b = 0; b < blocks.length; b++) {
         try {
           var found = locate(JSON.parse(blocks[b].textContent || 'null'));
           if (found) {
             return found;
           }
         } catch (ignored) {
           // Malformed JSON-LD in one script must not hide the others.
         }
       }

       return null;
     })()
   `;
     return page.evaluate(finderScript);
 }
/**
 * Wait until `window.location.pathname` matches a pattern.
 *
 * The check runs inside the page: the path is tested every 250 ms until
 * `timeoutMs` has elapsed, followed by one final test after the deadline.
 *
 * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
 *   called with the polling script.
 * @param {string} pathPattern - Regular-expression source, compiled in the
 *   page with the `i` flag.
 * @param {number} [timeoutMs=15000] - Maximum time to keep polling.
 * @returns {Promise<boolean>} `true` when the path matched, `false` otherwise.
 */
export async function waitForImdbPath(page, pathPattern, timeoutMs = 15000) {
    const pollScript = `
    (async function() {
      var stopAt = Date.now() + ${timeoutMs};
      var expected = new RegExp(${JSON.stringify(pathPattern)}, 'i');

      function onExpectedPath() {
        return expected.test(window.location.pathname);
      }

      function pause(ms) {
        return new Promise(function(wake) { setTimeout(wake, ms); });
      }

      while (Date.now() < stopAt) {
        if (onExpectedPath()) {
          return true;
        }
        await pause(250);
      }
      return onExpectedPath();
    })()
  `;
    const matched = await page.evaluate(pollScript);
    return Boolean(matched);
}
/**
 * Wait until the IMDb search page shows results or a "no results" state.
 *
 * The page counts as ready when any of the following holds:
 *   - the `__NEXT_DATA__` element parses as JSON and
 *     `props.pageProps.titleResults.results` or
 *     `props.pageProps.nameResults.results` is non-empty;
 *   - the document contains a link whose href includes `/title/` or `/name/`;
 *   - the body text contains `No results found for` or `No exact matches`.
 *
 * The check runs inside the page every 250 ms until `timeoutMs` has elapsed,
 * followed by one final check after the deadline.
 *
 * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
 *   called with the polling script.
 * @param {number} [timeoutMs=15000] - Maximum time to keep polling.
 * @returns {Promise<boolean>} `true` when the page became ready in time.
 */
export async function waitForImdbSearchReady(page, timeoutMs = 15000) {
    const pollScript = `
    (async function() {
      var stopAt = Date.now() + ${timeoutMs};

      // Results embedded in the page's __NEXT_DATA__ payload, if present.
      function nextDataHasResults() {
        var holder = document.getElementById('__NEXT_DATA__');
        if (!holder) {
          return false;
        }
        try {
          var pageProps = JSON.parse(holder.textContent || 'null')?.props?.pageProps;
          var titles = pageProps?.titleResults?.results || [];
          var names = pageProps?.nameResults?.results || [];
          return titles.length > 0 || names.length > 0;
        } catch (ignored) {
          // Unparseable payload: fall back to the DOM checks.
          return false;
        }
      }

      function searchSettled() {
        if (nextDataHasResults()) {
          return true;
        }
        if (document.querySelector('a[href*="/title/"], a[href*="/name/"]')) {
          return true;
        }
        var pageText = document.body ? (document.body.textContent || '') : '';
        return pageText.includes('No results found for') || pageText.includes('No exact matches');
      }

      function pause(ms) {
        return new Promise(function(wake) { setTimeout(wake, ms); });
      }

      while (Date.now() < stopAt) {
        if (searchSettled()) {
          return true;
        }
        await pause(250);
      }

      return searchSettled();
    })()
  `;
    const ready = await page.evaluate(pollScript);
    return Boolean(ready);
}
/**
 * Wait until the IMDb reviews page shows review content or an empty state.
 *
 * The page counts as ready when it contains an element matching
 * `article.user-review-item`, `[data-testid="review-card-parent"]` or
 * `[data-testid="tturv-total-reviews"]`, or when the body text contains
 * `No user reviews` or `Review this title`.
 *
 * The check runs inside the page every 250 ms until `timeoutMs` has elapsed,
 * followed by one final check after the deadline.
 *
 * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
 *   called with the polling script.
 * @param {number} [timeoutMs=15000] - Maximum time to keep polling.
 * @returns {Promise<boolean>} `true` when the page became ready in time.
 */
export async function waitForImdbReviewsReady(page, timeoutMs = 15000) {
    const pollScript = `
    (async function() {
      var stopAt = Date.now() + ${timeoutMs};
      var reviewSelector = 'article.user-review-item, [data-testid="review-card-parent"], [data-testid="tturv-total-reviews"]';

      function reviewsRendered() {
        if (document.querySelector(reviewSelector)) {
          return true;
        }
        var pageText = document.body ? (document.body.textContent || '') : '';
        if (pageText.includes('No user reviews')) {
          return true;
        }
        return pageText.includes('Review this title');
      }

      function pause(ms) {
        return new Promise(function(wake) { setTimeout(wake, ms); });
      }

      while (Date.now() < stopAt) {
        if (reviewsRendered()) {
          return true;
        }
        await pause(250);
      }

      return reviewsRendered();
    })()
  `;
    const ready = await page.evaluate(pollScript);
    return Boolean(ready);
}
/**
 * Read the IMDb entity ID of the page currently loaded.
 *
 * Three sources are inspected in order and the first one containing
 * `<prefix>` followed by 7-8 digits (case-insensitive) wins:
 *   1. `window.location.pathname`
 *   2. the `href` of `<link rel="canonical">`
 *   3. the `content` of `<meta property="og:url">`
 *
 * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
 *   called with the lookup script.
 * @param {string} prefix - ID prefix to look for, e.g. `'tt'` or `'nm'`. It is
 *   used as regular-expression source without escaping.
 * @returns {Promise<string>} The ID as it appears in the page, or `''` when
 *   none was found or the script did not return a string.
 */
export async function getCurrentImdbId(page, prefix) {
    // Embed the pattern via JSON.stringify, as the other helpers in this file
    // do for their values, so quotes or backslashes in `prefix` cannot break
    // the generated script.
    const idSource = `(${prefix}\\d{7,8})`;
    const found = await page.evaluate(`
    (function() {
      var pattern = new RegExp(${JSON.stringify(idSource)}, 'i');
      var candidates = [
        window.location.pathname || '',
        document.querySelector('link[rel="canonical"]')?.getAttribute('href') || '',
        document.querySelector('meta[property="og:url"]')?.getAttribute('content') || ''
      ];

      for (var i = 0; i < candidates.length; i++) {
        var match = candidates[i].match(pattern);
        if (match) {
          return match[1];
        }
      }
      return '';
    })()
  `);
    return typeof found === 'string' ? found : '';
}
/**
 * Report whether the current page looks like a bot-challenge or verification
 * page rather than real IMDb content.
 *
 * The page is flagged when its title contains `Robot Check`,
 * `Are you a robot` or `JavaScript is disabled`, or when its body text
 * contains `captcha`, `verify that you are human` or `not a robot`.
 * All comparisons are case-sensitive substring checks.
 *
 * @param {{ evaluate: Function }} page - Object whose `evaluate(script)` is
 *   called with the detection script.
 * @returns {Promise<boolean>} `true` when any marker was found.
 */
export async function isChallengePage(page) {
    const detectionScript = `
    (function() {
      var titleMarkers = ['Robot Check', 'Are you a robot', 'JavaScript is disabled'];
      var bodyMarkers = ['captcha', 'verify that you are human', 'not a robot'];
      var pageTitle = document.title || '';
      var pageText = document.body ? (document.body.textContent || '') : '';

      function containsAny(haystack, markers) {
        return markers.some(function(marker) { return haystack.includes(marker); });
      }

      return containsAny(pageTitle, titleMarkers) || containsAny(pageText, bodyMarkers);
    })()
  `;
    const flagged = await page.evaluate(detectionScript);
    return Boolean(flagged);
}