Cradicle Explorer

/ src / stages / rescoring.js
rescoring.js
  1  /**
  2   * Rescoring Stage
  3   * Rescore sites that received B- or below with below-fold screenshots
  4   */
  5  
  6  import { run, getOne, getAll, withTransaction } from '../utils/db.js';
  7  import { readFileSync } from 'fs';
  8  import { normaliseCountryCode } from '../config/countries.js';
  9  import { scoreWebsite } from '../score.js';
 10  import { callLLM } from '../utils/llm-provider.js';
 11  import Logger from '../utils/logger.js';
 12  import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
 13  import { processBatch } from '../utils/error-handler.js';
 14  import { loadScreenshot } from '../utils/screenshot-storage.js';
 15  import { checkBlocklist } from '../utils/site-filters.js';
 16  import { incrementRescored } from '../utils/keyword-counters.js';
 17  import { recordFailure, resetRetries } from '../utils/retry-handler.js';
 18  import { parseCountryFromGoogleDomain } from '../utils/tld-detector.js';
 19  import { cleanInvalidSocialLinks } from '../contacts/prioritize.js';
 20  import { normalizePhoneNumber } from '../utils/phone-normalizer.js';
 21  import { readHtmlDom, hasHtmlDom } from '../utils/html-storage.js';
 22  import { setScoreJson } from '../utils/score-storage.js';
 23  
 24  const logger = new Logger('Rescoring');
 25  const success = (...args) => logger.success(...args);
 26  const info = (...args) => logger.info(...args);
 27  const error = (...args) => logger.error(...args);
 28  
 29  const ENGLISH_ONLY_MARKETS = process.env.ENGLISH_ONLY_MARKETS
 30    ? process.env.ENGLISH_ONLY_MARKETS.split(',').map(c => c.trim().toUpperCase())
 31    : null;
 32  
 33  // Load vision prompt
 34  const VISION_PROMPT = readFileSync(new URL('../../prompts/VISION.md', import.meta.url), 'utf-8');
 35  
 36  // Model configuration (from env or default)
 37  const VISION_MODEL = process.env.VISION_MODEL || 'openai/gpt-4o-mini';
 38  
 39  // Check ENABLE_VISION flag (consolidates old flags)
 40  const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';
 41  
 42  /**
 43   * Clean phone numbers from LLM output
 44   * - Supports both object format {number, label} and legacy string format
 45   * - Normalizes to E.164 format
 46   * - Logs transformations for troubleshooting
 47   * @param {Array<Object|string>} phones - Phone numbers from LLM
 48   * @param {string} context - URL or identifier for logging
 49   * @returns {Array<Object>} Cleaned phone numbers in object format
 50   */
 51  function cleanPhoneNumbers(phones, context) {
 52    if (!Array.isArray(phones)) return [];
 53  
 54    return phones
 55      .map((phone, idx) => {
 56        // Handle legacy string format
 57        if (typeof phone === 'string') {
 58          const cleaned = normalizePhoneNumber(phone);
 59          if (cleaned !== phone) {
 60            info(`  Phone normalization [${context}]: '${phone}' → '${cleaned}'`);
 61          }
 62          return { number: cleaned, label: '' };
 63        }
 64  
 65        // Handle object format {number, label}
 66        if (phone.number) {
 67          const original = phone.number;
 68          const cleaned = normalizePhoneNumber(original);
 69          if (cleaned !== original) {
 70            info(`  Phone normalization [${context}]: '${original}' → '${cleaned}'`);
 71          }
 72          return {
 73            number: cleaned,
 74            label: phone.label || '',
 75          };
 76        }
 77  
 78        // Invalid format
 79        error(`  Invalid phone format at index ${idx} for ${context}: ${JSON.stringify(phone)}`);
 80        return null;
 81      })
 82      .filter(Boolean); // Remove null entries
 83  }
 84  
 85  /**
 86   * Validate base64 image data
 87   * @param {string} base64Data - Base64-encoded image
 88   * @returns {Object} Validation result
 89   */
 90  function validateBase64Image(base64Data) {
 91    // Check if base64 data is present
 92    if (!base64Data || typeof base64Data !== 'string') {
 93      return { valid: false, reason: 'Missing or invalid base64 data' };
 94    }
 95  
 96    // Check minimum length (empty images are typically < 100 bytes)
 97    if (base64Data.length < 100) {
 98      return { valid: false, reason: 'Base64 data too short (likely empty image)' };
 99    }
100  
101    // Check for valid base64 characters (A-Z, a-z, 0-9, +, /, =)
102    const base64Regex = /^[A-Za-z0-9+/=]+$/;
103    if (!base64Regex.test(base64Data)) {
104      return { valid: false, reason: 'Invalid base64 characters' };
105    }
106  
107    // Estimate decoded size (base64 is ~1.33x larger than binary)
108    const estimatedBytes = (base64Data.length * 3) / 4;
109    const estimatedMB = estimatedBytes / (1024 * 1024);
110  
111    // OpenRouter has 20MB limit per request, but images should be much smaller
112    if (estimatedMB > 10) {
113      return { valid: false, reason: `Image too large (~${estimatedMB.toFixed(1)}MB)` };
114    }
115  
116    return { valid: true, sizeBytes: estimatedBytes, sizeMB: estimatedMB };
117  }
118  
/**
 * Ask the vision LLM to transcribe the text visible in a below-fold screenshot.
 *
 * Never rejects for validation or API failures: those are logged and an empty
 * string is returned instead.
 *
 * @param {string} screenshotBase64 - Base64-encoded screenshot
 * @param {number} siteId - Site ID for usage tracking
 * @returns {Promise<string>} Extracted text, or '' when the image is invalid or the call fails
 */
async function extractTextFromImage(screenshotBase64, siteId = null) {
  try {
    // Reject unusable payloads locally instead of spending an API call on them
    const { valid, reason, sizeMB } = validateBase64Image(screenshotBase64);
    if (!valid) {
      error(`  Invalid image data: ${reason}`);
      return '';
    }

    // Large payloads are still sent; the size is logged for debugging
    if (sizeMB > 5) {
      info(`  Large image detected: ${sizeMB.toFixed(1)}MB`);
    }

    const systemMessage = { role: 'system', content: VISION_PROMPT };
    const instructionPart = {
      type: 'text',
      text: 'Extract all visible text from this screenshot:',
    };
    const imagePart = {
      type: 'image_url',
      image_url: {
        url: `data:image/jpeg;base64,${screenshotBase64}`,
      },
    };
    const userMessage = { role: 'user', content: [instructionPart, imagePart] };

    const response = await callLLM({
      model: VISION_MODEL,
      messages: [systemMessage, userMessage],
      temperature: 0.1,
      max_tokens: 2000,
      stage: 'rescoring',
      siteId,
    });

    return response.content || '';
  } catch (err) {
    // A 400 means the request itself was rejected, so it gets its own log line;
    // every failure ends the same way: log and return no text.
    const isBadRequest =
      err.message && (err.message.includes('400') || err.message.includes('Bad Request'));

    if (isBadRequest) {
      error(`  OpenRouter rejected image (400 Bad Request) - invalid format or size`);
    } else {
      error(`  Error extracting text from image: ${err.message}`);
    }
    return '';
  }
}
182  
/**
 * Validate the optional `limit` option.
 * @param {*} limit - Raw option value
 * @returns {number|null} A positive integer, or null when no limit was requested (falsy input)
 * @throws {Error} When a limit was given but is not a positive integer
 */
function parseRescoringLimit(limit) {
  if (!limit) return null;

  const parsed = Number(limit);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    throw new Error(`Invalid rescoring limit: ${limit}`);
  }
  return parsed;
}

/**
 * Vision-disabled path: there are no screenshots to analyze, so `prog_scored` sites are
 * either promoted to `semantic_scored` or reset to `found` when their html_dom is empty.
 * @param {number} startTime - Stage start timestamp (ms) used to compute the duration
 * @returns {Promise<Object>} Stage results
 */
async function promoteWithoutVision(startTime) {
  info('Rescoring stage skipped: ENABLE_VISION=false (no screenshots to analyze)');
  info('Sites are promoted directly to semantic_scored status by the scoring stage');

  // Promote any remaining prog_scored sites (edge case: sites scored before this fix).
  // Empty-string html_dom is excluded here so those rows reach the reset query below;
  // '' is NOT NULL in SQL, so without this they would be promoted with no HTML.
  const { changes: promotedCount } = await run(
    `UPDATE sites
     SET status = 'semantic_scored',
         rescored_at = NOW()
     WHERE status = 'prog_scored'
       AND html_dom IS NOT NULL
       AND html_dom <> ''`
  );

  // Sites at 'prog_scored' with empty html_dom cannot be enriched — reset to 'found' for re-capture
  // Note: score_json/contacts_json are stored on filesystem, not in DB columns.
  const { changes: resetCount } = await run(
    `UPDATE sites
     SET status = 'found',
         error_message = 'html_dom missing at prog_scored stage — reset for re-capture',
         score = NULL,
         updated_at = NOW()
     WHERE status = 'prog_scored'
       AND (html_dom IS NULL OR html_dom = '')`
  );

  if (promotedCount > 0) {
    info(
      `Auto-promoted ${promotedCount} prog_scored → semantic_scored (scoring stage now handles this directly)`
    );
  }
  if (resetCount > 0) {
    info(
      `Reset ${resetCount} prog_scored sites with empty html_dom back to 'found' for re-capture`
    );
  }

  return {
    processed: promotedCount,
    succeeded: promotedCount,
    failed: 0,
    skipped: 0,
    improved: 0,
    duration: Date.now() - startTime,
  };
}

/**
 * Load the sites eligible for rescoring (including retries of earlier errors).
 * Every dynamic value is bound as a query parameter; nothing is interpolated into the SQL.
 * @param {number} cutoff - Maximum score that still qualifies for rescoring
 * @param {number|null} limit - Maximum number of rows, or null for no limit
 * @returns {Promise<Array<Object>>} Candidate site rows
 */
async function fetchRescoringCandidates(cutoff, limit) {
  const params = [cutoff];

  let englishFilter = '';
  if (ENGLISH_ONLY_MARKETS) {
    params.push(ENGLISH_ONLY_MARKETS);
    englishFilter = `AND country_code = ANY($${params.length}::text[])`;
  }

  let limitClause = '';
  if (limit !== null) {
    params.push(limit);
    limitClause = `LIMIT $${params.length}`;
  }

  const sitesQuery = `
    SELECT id, domain, landing_page_url as url, score, grade, country_code
    FROM sites
    WHERE status = 'prog_scored'
      AND score <= $1
      AND screenshot_path IS NOT NULL
      AND (rescored_at IS NULL OR error_message IS NOT NULL)
      ${englishFilter}
    ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC
    ${limitClause}
  `;

  return getAll(sitesQuery, params);
}

/**
 * Mark blocklisted sites (directories/social media/franchises) as ignored and
 * return only the sites that should actually be rescored.
 * @param {Array<Object>} sites - Candidate site rows
 * @returns {Promise<{activeSites: Array<Object>, ignoredCount: number}>}
 */
async function excludeBlocklistedSites(sites) {
  const activeSites = [];
  let ignoredCount = 0;

  for (const site of sites) {
    const blocked = checkBlocklist(site.domain, site.country_code);
    if (!blocked) {
      activeSites.push(site);
      continue;
    }

    await run(
      `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
      [blocked.reason, site.id]
    );
    ignoredCount++;
  }

  return { activeSites, ignoredCount };
}

/**
 * Run the rescoring stage
 * @param {Object} options - Stage options
 * @param {number} options.limit - Limit number of sites to rescore (positive integer)
 * @param {number} options.concurrency - Number of concurrent scoring operations (default: 10)
 * @param {number} options.cutoff - Score cutoff for rescoring
 *   (default: LOW_SCORE_CUTOFF env variable, or 82)
 * @returns {Promise<Object>} Stage results: processed, succeeded, failed, skipped, improved,
 *   duration, plus gradeDistribution when sites were actually rescored
 * @throws {Error} Re-throws any stage-level failure (invalid limit, database error) after logging it
 */
export async function runRescoringStage(options = {}) {
  const startTime = Date.now();
  const concurrency = options.concurrency || 10;

  // Get cutoff from options, then environment variable, then the default.
  // `||` is deliberate: a NaN/0 cutoff from a caller falls back instead of reaching SQL.
  const cutoff = options.cutoff || parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);

  info('Starting Rescoring Stage...');

  // Show deprecation warning if old flags are used
  const legacyFlags = [
    process.env.USE_COMPUTER_VISION_SCORING,
    process.env.USE_COMPUTER_VISION_RESCORING,
    process.env.USE_COMPUTER_VISION_ENRICHMENT,
  ];
  if (legacyFlags.some(flag => flag !== undefined)) {
    logger.warn(
      '[rescoring] WARN: Vision flags (USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
    );
  }

  try {
    // Skip rescoring entirely if vision is disabled (rescoring relies on below-fold screenshots).
    // `return await` keeps failures inside this try so they are logged below.
    if (!ENABLE_VISION) {
      return await promoteWithoutVision(startTime);
    }

    info(`Cutoff: ${cutoff} (B- or below)`);

    const limit = parseRescoringLimit(options.limit);
    const sites = await fetchRescoringCandidates(cutoff, limit);

    if (sites.length === 0) {
      info('No sites need rescoring');
      return {
        processed: 0,
        succeeded: 0,
        failed: 0,
        skipped: 0,
        improved: 0,
        duration: Date.now() - startTime,
      };
    }

    // Blocklisted sites are marked ignored AND removed from the work list, so they are
    // not rescored (which could overwrite their 'ignored' status).
    const { activeSites, ignoredCount } = await excludeBlocklistedSites(sites);

    if (ignoredCount > 0) {
      info(`Marked ${ignoredCount} sites as ignored (directories/social media)`);
    }

    info(`Rescoring ${activeSites.length} sites (score <= ${cutoff})`);

    const stats = {
      processed: 0,
      succeeded: 0,
      failed: 0,
      skipped: 0,
      improved: 0,
      gradeDistribution: {},
    };

    // Process sites in batches (nothing to do when every candidate was blocklisted)
    let results = [];
    let errors = [];
    if (activeSites.length > 0) {
      ({ results, errors } = await processBatch(
        activeSites,
        (site, index) => {
          displayProgress(index + 1, activeSites.length, `Rescoring ${site.url}`);
          return rescoreSite(site.id, site.url, site.score);
        },
        { concurrency }
      ));
    }

    // Count successes, failures, skips, and improvements
    stats.processed = sites.length;
    stats.succeeded = results.filter(r => r !== null).length;
    stats.failed = errors.length;
    stats.skipped = ignoredCount;
    stats.improved = results.filter(r => r && r.improved).length;

    // Calculate grade distribution (after rescoring)
    const grades = await getAll(
      `SELECT grade, COUNT(*) as count
       FROM sites
       WHERE grade IS NOT NULL
       GROUP BY grade`
    );

    for (const g of grades) {
      stats.gradeDistribution[g.grade] = g.count;
    }

    // Log errors
    for (const err of errors) {
      const url = err.item?.url || 'unknown site';
      const message = err.error?.message || err.toString();
      error(`  Failed to rescore ${url}: ${message}`);
    }

    stats.duration = Date.now() - startTime;
    generateStageCompletion('Rescoring', stats);
    info(`Scores improved: ${stats.improved}/${stats.succeeded}`);

    return stats;
  } catch (err) {
    error(`Rescoring stage failed: ${err.message}`);
    throw err;
  }
}
365  
/**
 * Persist the score result and set the site's status and error message in one step.
 * @param {number} siteId - Site ID
 * @param {Object} result - Scoring result written via setScoreJson
 * @param {string} status - New value for sites.status
 * @param {string} errorMessage - New value for sites.error_message
 * @returns {Promise<void>}
 */
async function saveScoreWithStatus(siteId, result, status, errorMessage) {
  setScoreJson(siteId, result);
  await run(
    `UPDATE sites SET status = $1, error_message = $2 WHERE id = $3`,
    [status, errorMessage, siteId]
  );
}

/**
 * Rescore a single site with below-fold screenshots.
 *
 * Outcomes written to the database:
 * - business directory, permanent error page (404/403/410), broken site,
 *   or country mismatch → status 'ignored'
 * - temporary error page (5xx/maintenance/redirect) → stays 'prog_scored' for a later retry
 * - otherwise → 'high_score' when the new score exceeds LOW_SCORE_CUTOFF, else 'vision_scored'
 *
 * @param {number} siteId - Site ID
 * @param {string} url - Site URL (used for logging)
 * @param {number} oldScore - Previous score
 * @returns {Promise<Object>} The scoring result; on the normal path it also carries
 *   `improved` (whether the new score is higher than `oldScore`)
 * @throws {Error} Any failure is recorded via recordFailure and then re-thrown
 */
async function rescoreSite(siteId, url, oldScore) {
  try {
    // Get site data
    const site = await getOne(
      `SELECT
        id, landing_page_url as url, screenshot_path, ssl_status, http_headers, locale_data
       FROM sites
       WHERE id = $1`,
      [siteId]
    );

    // Fail with a clear message instead of a TypeError on `site.screenshot_path`
    if (!site) {
      throw new Error(`Site ${siteId} not found`);
    }

    if (!site.screenshot_path) {
      throw new Error('Missing screenshots');
    }

    // Load HTML from filesystem
    const htmlDom = readHtmlDom(siteId);

    // Load screenshots from file system (including below-fold); the three loads are
    // independent of each other, so they run in parallel
    const [desktopAbove, desktopBelow, mobileAbove] = await Promise.all([
      loadScreenshot(site.screenshot_path, 'desktop_above'),
      loadScreenshot(site.screenshot_path, 'desktop_below'),
      loadScreenshot(site.screenshot_path, 'mobile_above'),
    ]);

    // Extract text from below-fold screenshot
    let visionText = '';
    if (desktopBelow) {
      const desktopBelowBase64 = desktopBelow.toString('base64');
      visionText = await extractTextFromImage(desktopBelowBase64, site.id);
      if (visionText) {
        info(`  Extracted ${visionText.length} chars of text from below-fold screenshot`);
      }
    }

    // Prepare site data for rescoring (match scoreWebsite expectations)
    const siteData = {
      url: site.url,
      domain: new URL(site.url).hostname,
      screenshots: {
        desktop_above: desktopAbove,
        desktop_below: desktopBelow,
        mobile_above: mobileAbove,
      },
      html: htmlDom || '',
      httpHeaders: site.http_headers,
      localeData: site.locale_data, // Locale indicators for country detection
      visionText, // Include vision text for contact extraction
    };

    // Rescore with below-fold screenshots and vision text
    const result = await scoreWebsite(siteData, siteId);
    const overall = result?.overall_calculation;

    // Extract grade and score from nested structure.
    // `??` (not `||`) so a legitimate score of 0 is stored as 0 rather than NULL.
    const grade = overall?.letter_grade || null;
    const score = overall?.conversion_score ?? null;

    // Check if LLM detected a business directory
    // Skip in test mode (E2E test page is not a real local business)
    if (overall?.is_business_directory && process.env.NODE_ENV !== 'test') {
      await saveScoreWithStatus(
        siteId,
        result,
        'ignored',
        'Ignored: Business directory (LLM detected)'
      );

      info(`  ${url} detected as directory by LLM - marked as ignored`);

      return result;
    }

    // Check if LLM detected an error page
    const isErrorPage = overall?.is_error_page;
    const errorType = overall?.error_type;
    const errorDescription = overall?.error_description;

    if (isErrorPage && errorType) {
      // Permanent errors (404, 403, 410) - mark as ignored
      const permanentErrors = ['404', '403', '410'];

      if (permanentErrors.includes(errorType)) {
        const errorMsg = errorDescription || `Permanent error: ${errorType}`;
        await saveScoreWithStatus(siteId, result, 'ignored', errorMsg);

        info(`  ${url} detected as ${errorType} error - marked as ignored`);

        return result;
      }

      // Temporary errors (5xx, maintenance, redirect) - keep status unchanged for retry
      const temporaryErrors = ['5xx', 'maintenance', 'redirect'];

      if (temporaryErrors.includes(errorType)) {
        const errorMsg = errorDescription || `Temporary error: ${errorType}`;
        await saveScoreWithStatus(siteId, result, 'prog_scored', errorMsg);

        info(`  ${url} detected as ${errorType} error - will retry later`);

        return result;
      }
    }

    // Check if LLM detected a broken site (visual rendering failure)
    const isBrokenSite = overall?.is_broken_site || false;
    const brokenSiteDetails = overall?.broken_site_details || [];

    if (isBrokenSite) {
      // If broken on rescoring (both above + below-fold), mark as ignore (not worth retrying)
      const errorMsg = `Broken site on rescore (above + below-fold) - Issues: ${brokenSiteDetails.join(', ')}`;
      await saveScoreWithStatus(siteId, result, 'ignored', errorMsg);

      error(`  ${url} broken site detected on rescore - marked as ignored`);
      info(`      Issues: ${brokenSiteDetails.join(', ')}`);

      return result;
    }

    // Clean up invalid Twitter/X.com links from contact_details
    if (result?.contact_details) {
      result.contact_details = cleanInvalidSocialLinks(result.contact_details);

      // Clean phone numbers to E.164 format (add country code if needed)
      if (result.contact_details.phone_numbers) {
        result.contact_details.phone_numbers = cleanPhoneNumbers(
          result.contact_details.phone_numbers,
          url
        );
      }
    }

    // Extract location information from contact_details
    const city = result?.contact_details?.city || null;
    const countryCode = normaliseCountryCode(result?.contact_details?.country_code) || null;
    const state = result?.contact_details?.state || null;

    // Check for country mismatch (site from wrong country for the search)
    // Skip in test mode (E2E test page has mixed country signals by design)
    if (countryCode && process.env.NODE_ENV !== 'test') {
      const siteRow = await getOne('SELECT google_domain FROM sites WHERE id = $1', [siteId]);
      const googleDomain = siteRow?.google_domain;

      if (googleDomain) {
        const expectedCountry = parseCountryFromGoogleDomain(googleDomain);

        // Only compare when the expected country could be determined; otherwise an
        // unparseable google_domain would flag every site as a mismatch.
        if (expectedCountry && countryCode !== expectedCountry) {
          const errorMsg = `Country mismatch: Site is in ${countryCode}, but search was for ${expectedCountry} (${googleDomain})`;
          setScoreJson(siteId, result);
          await run(
            `UPDATE sites SET
              status = 'ignored',
              error_message = $1,
              city = $2,
              country_code = $3,
              state = $4
             WHERE id = $5`,
            [errorMsg, city, countryCode, state, siteId]
          );

          info(
            `  ${url} country mismatch: Site in ${countryCode}, search for ${expectedCountry} - marked as ignored`
          );

          return result;
        }
      }
    }

    // High-score check: if vision rescoring pushed score above threshold, skip enrichment
    // NOTE(review): this reads LOW_SCORE_CUTOFF only; a cutoff passed to the stage via
    // options is not visible here — confirm that is intended.
    const scoreThreshold = parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);
    const newStatus = score !== null && score > scoreThreshold ? 'high_score' : 'vision_scored';

    // Write score to filesystem; DB columns no longer store score_json blobs
    // NOTE(review): city/country_code/state are overwritten even when the LLM returned
    // none (NULL) — confirm that clearing existing values is intended.
    setScoreJson(siteId, result);
    await run(
      `UPDATE sites SET
        grade = $1,
        score = $2,
        city = $3,
        country_code = $4,
        state = $5,
        status = $6,
        rescored_at = NOW(),
        error_message = NULL
       WHERE id = $7`,
      [grade, score, city, countryCode, state, newStatus, siteId]
    );

    // Reset retry count on successful rescoring
    await resetRetries(siteId);

    // Increment keyword counter for rescored sites
    const siteInfo = await getOne(
      'SELECT keyword, country_code FROM sites WHERE id = $1',
      [siteId]
    );
    if (siteInfo?.keyword && siteInfo?.country_code) {
      await incrementRescored(siteInfo.keyword, siteInfo.country_code);
    }

    const improved = score > oldScore;
    const change = improved ? `↑ +${(score - oldScore).toFixed(1)}` : `→ ${score}`;

    success(`  Rescored ${url}: ${grade} (${oldScore} ${change})`);

    return { ...result, improved };
  } catch (err) {
    // Record failure and increment retry count (marks as 'failing' if limit exceeded).
    // A failure while recording must not replace the original error seen by the caller.
    try {
      await recordFailure(siteId, 'rescoring', err, 'prog_scored');
    } catch (recordErr) {
      error(`  Failed to record rescoring failure for site ${siteId}: ${recordErr.message}`);
    }
    throw err;
  }
}
598  
599  // Exported for unit testing
600  export { cleanPhoneNumbers, validateBase64Image };
601  
/**
 * Aggregate score statistics over every site that has a rescoring timestamp.
 * @returns {Promise<Object>} Row with total_rescored, avg_score_after, min_score and max_score
 */
export async function getRescoringStats() {
  const statsQuery =
    `SELECT
      COUNT(id) as total_rescored,
      AVG(score) as avg_score_after,
      MIN(score) as min_score,
      MAX(score) as max_score
     FROM sites
     WHERE rescored_at IS NOT NULL`;

  const statsRow = await getOne(statsQuery);
  return statsRow;
}