rescoring.js
/**
 * Rescoring Stage
 *
 * Rescores sites that received B- or below, this time including the
 * below-fold screenshot. Text is extracted from that screenshot with a
 * vision LLM and passed to the scorer alongside the stored HTML.
 */

import { run, getOne, getAll, withTransaction } from '../utils/db.js';
import { readFileSync } from 'fs';
import { normaliseCountryCode } from '../config/countries.js';
import { scoreWebsite } from '../score.js';
import { callLLM } from '../utils/llm-provider.js';
import Logger from '../utils/logger.js';
import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
import { processBatch } from '../utils/error-handler.js';
import { loadScreenshot } from '../utils/screenshot-storage.js';
import { checkBlocklist } from '../utils/site-filters.js';
import { incrementRescored } from '../utils/keyword-counters.js';
import { recordFailure, resetRetries } from '../utils/retry-handler.js';
import { parseCountryFromGoogleDomain } from '../utils/tld-detector.js';
import { cleanInvalidSocialLinks } from '../contacts/prioritize.js';
import { normalizePhoneNumber } from '../utils/phone-normalizer.js';
import { readHtmlDom, hasHtmlDom } from '../utils/html-storage.js';
import { setScoreJson } from '../utils/score-storage.js';

const logger = new Logger('Rescoring');
const success = (...args) => logger.success(...args);
const info = (...args) => logger.info(...args);
const error = (...args) => logger.error(...args);

// Optional comma-separated allow-list of country codes; null means "no filter".
const ENGLISH_ONLY_MARKETS = process.env.ENGLISH_ONLY_MARKETS
  ? process.env.ENGLISH_ONLY_MARKETS.split(',').map(c => c.trim().toUpperCase())
  : null;

// Load vision prompt
const VISION_PROMPT = readFileSync(new URL('../../prompts/VISION.md', import.meta.url), 'utf-8');

// Model configuration (from env or default)
const VISION_MODEL = process.env.VISION_MODEL || 'openai/gpt-4o-mini';

// Check ENABLE_VISION flag (consolidates old flags).
// Vision is on unless the variable is exactly the string 'false'.
const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';

/**
 * Clean phone numbers from LLM output
 * - Supports both object format {number, label} and legacy string format
 * - Passes every number through normalizePhoneNumber
 * - Logs transformations for troubleshooting
 * - Entries that are neither a string nor an object with a truthy `number`
 *   (including null/undefined entries) are logged and dropped
 * @param {Array<Object|string>} phones - Phone numbers from LLM
 * @param {string} context - URL or identifier for logging
 * @returns {Array<Object>} Cleaned phone numbers in object format
 */
function cleanPhoneNumbers(phones, context) {
  if (!Array.isArray(phones)) return [];

  return phones
    .map((phone, idx) => {
      // Handle legacy string format
      if (typeof phone === 'string') {
        const cleaned = normalizePhoneNumber(phone);
        if (cleaned !== phone) {
          info(` Phone normalization [${context}]: '${phone}' → '${cleaned}'`);
        }
        return { number: cleaned, label: '' };
      }

      // Handle object format {number, label}.
      // Optional chaining keeps a null/undefined entry from throwing; it falls
      // through to the invalid-format branch instead of failing the whole site.
      if (phone?.number) {
        const original = phone.number;
        const cleaned = normalizePhoneNumber(original);
        if (cleaned !== original) {
          info(` Phone normalization [${context}]: '${original}' → '${cleaned}'`);
        }
        return {
          number: cleaned,
          label: phone.label || '',
        };
      }

      // Invalid format
      error(` Invalid phone format at index ${idx} for ${context}: ${JSON.stringify(phone)}`);
      return null;
    })
    .filter(Boolean); // Remove null entries
}

/**
 * Validate base64 image data
 *
 * Checks, in order: presence and type, minimum length, allowed character set,
 * and estimated decoded size.
 * @param {string} base64Data - Base64-encoded image
 * @returns {Object} `{ valid: false, reason }` on failure, or
 *   `{ valid: true, sizeBytes, sizeMB }` (sizes are estimates) on success
 */
function validateBase64Image(base64Data) {
  // Check if base64 data is present
  if (!base64Data || typeof base64Data !== 'string') {
    return { valid: false, reason: 'Missing or invalid base64 data' };
  }

  // Check minimum length (empty images are typically < 100 bytes)
  if (base64Data.length < 100) {
    return { valid: false, reason: 'Base64 data too short (likely empty image)' };
  }

  // Check for valid base64 characters (A-Z, a-z, 0-9, +, /, =)
  const base64Regex = /^[A-Za-z0-9+/=]+$/;
  if (!base64Regex.test(base64Data)) {
    return { valid: false, reason: 'Invalid base64 characters' };
  }

  // Estimate decoded size (base64 is ~1.33x larger than binary)
  const estimatedBytes = (base64Data.length * 3) / 4;
  const estimatedMB = estimatedBytes / (1024 * 1024);

  // OpenRouter has 20MB limit per request, but images should be much smaller
  if (estimatedMB > 10) {
    return { valid: false, reason: `Image too large (~${estimatedMB.toFixed(1)}MB)` };
  }

  return { valid: true, sizeBytes: estimatedBytes, sizeMB: estimatedMB };
}

/**
 * Extract text from below-fold screenshot using vision LLM
 *
 * Best-effort: every failure (invalid image, API error) is logged and yields
 * an empty string rather than throwing, so scoring can proceed without it.
 * @param {string} screenshotBase64 - Base64-encoded screenshot
 * @param {number} siteId - Site ID for usage tracking
 * @returns {Promise<string>} Extracted text from image ('' on any failure)
 */
async function extractTextFromImage(screenshotBase64, siteId = null) {
  try {
    // Validate base64 image before sending to API
    const validation = validateBase64Image(screenshotBase64);
    if (!validation.valid) {
      error(` Invalid image data: ${validation.reason}`);
      return '';
    }

    // Log image size for debugging
    if (validation.sizeMB > 5) {
      info(` Large image detected: ${validation.sizeMB.toFixed(1)}MB`);
    }

    const messages = [
      {
        role: 'system',
        content: VISION_PROMPT,
      },
      {
        role: 'user',
        content: [
          {
            type: 'text',
            text: 'Extract all visible text from this screenshot:',
          },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/jpeg;base64,${screenshotBase64}`,
            },
          },
        ],
      },
    ];

    const response = await callLLM({
      model: VISION_MODEL,
      messages,
      temperature: 0.1,
      max_tokens: 2000,
      stage: 'rescoring',
      siteId,
    });

    return response.content || '';
  } catch (err) {
    const message = err?.message ?? String(err);

    // Check if this is a 400 error (bad request) - don't retry these.
    // Word-boundary match so unrelated numbers such as "4000ms" are not
    // misreported as a Bad Request.
    if (/\b400\b/.test(message) || message.includes('Bad Request')) {
      error(` OpenRouter rejected image (400 Bad Request) - invalid format or size`);
      return '';
    }

    error(` Error extracting text from image: ${message}`);
    return '';
  }
}

/**
 * Run the rescoring stage
 *
 * When vision is disabled the stage only reconciles leftover 'prog_scored'
 * rows and returns. Otherwise it selects candidate sites, marks blocklisted
 * ones as ignored (and excludes them from rescoring), and rescores the rest.
 * @param {Object} options - Stage options
 * @param {number} options.limit - Limit number of sites to rescore
 * @param {number} options.concurrency - Number of concurrent scoring operations (default: 10)
 * @param {number} options.cutoff - Score cutoff for rescoring (default: 82)
 * @returns {Promise<Object>} Stage results
 * @throws Re-throws any error raised outside the per-site batch after logging it
 */
export async function runRescoringStage(options = {}) {
  const startTime = Date.now();
  const concurrency = options.concurrency || 10;

  // Get cutoff from environment variable or use default
  const cutoff = options.cutoff || parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);

  info('Starting Rescoring Stage...');

  // Show deprecation warning if old flags are used
  const legacyFlags = [
    process.env.USE_COMPUTER_VISION_SCORING,
    process.env.USE_COMPUTER_VISION_RESCORING,
    process.env.USE_COMPUTER_VISION_ENRICHMENT,
  ];
  if (legacyFlags.some(flag => flag !== undefined)) {
    logger.warn(
      '[rescoring] WARN: Vision flags (USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
    );
  }

  try {
    // Skip rescoring entirely if vision is disabled (rescoring relies on below-fold screenshots)
    if (!ENABLE_VISION) {
      info('Rescoring stage skipped: ENABLE_VISION=false (no screenshots to analyze)');
      info('Sites are promoted directly to semantic_scored status by the scoring stage');

      // Promote any remaining prog_scored sites (edge case: sites scored before this fix).
      // Empty-string html_dom is excluded here so those rows are left for the
      // reset query below; with only `IS NOT NULL` they were promoted first and
      // the reset's `html_dom = ''` condition could never match.
      const { changes: promotedCount } = await run(
        `UPDATE sites
         SET status = 'semantic_scored',
             rescored_at = NOW()
         WHERE status = 'prog_scored'
           AND html_dom IS NOT NULL
           AND html_dom <> ''`
      );

      // Sites at 'prog_scored' with empty html_dom cannot be enriched — reset to 'found' for re-capture
      // Note: score_json/contacts_json are stored on filesystem, not in DB columns.
      const { changes: resetCount } = await run(
        `UPDATE sites
         SET status = 'found',
             error_message = 'html_dom missing at prog_scored stage — reset for re-capture',
             score = NULL,
             updated_at = NOW()
         WHERE status = 'prog_scored'
           AND (html_dom IS NULL OR html_dom = '')`
      );

      if (promotedCount > 0) {
        info(
          `Auto-promoted ${promotedCount} prog_scored → semantic_scored (scoring stage now handles this directly)`
        );
      }
      if (resetCount > 0) {
        info(
          `Reset ${resetCount} prog_scored sites with empty html_dom back to 'found' for re-capture`
        );
      }

      return {
        processed: promotedCount,
        succeeded: promotedCount,
        failed: 0,
        skipped: 0,
        improved: 0,
        duration: Date.now() - startTime,
      };
    }

    info(`Cutoff: ${cutoff} (B- or below)`);

    // Get sites that need rescoring (including retry of errors)
    const englishFilter = ENGLISH_ONLY_MARKETS
      ? `AND country_code = ANY($2::text[])`
      : '';

    // The limit is interpolated into the SQL text, so coerce it to a positive
    // integer first; anything else (undefined, NaN, <= 0) means "no limit".
    const limit = Number.parseInt(options.limit, 10);
    const limitClause = Number.isInteger(limit) && limit > 0 ? `LIMIT ${limit}` : '';

    const sitesQuery = `
      SELECT id, domain, landing_page_url as url, score, grade, country_code
      FROM sites
      WHERE status = 'prog_scored'
        AND score <= $1
        AND screenshot_path IS NOT NULL
        AND (rescored_at IS NULL OR error_message IS NOT NULL)
        ${englishFilter}
      ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC
      ${limitClause}
    `;

    const sitesParams = ENGLISH_ONLY_MARKETS ? [cutoff, ENGLISH_ONLY_MARKETS] : [cutoff];
    const candidates = await getAll(sitesQuery, sitesParams);

    if (candidates.length === 0) {
      info('No sites need rescoring');
      return {
        processed: 0,
        succeeded: 0,
        failed: 0,
        skipped: 0,
        improved: 0,
        duration: Date.now() - startTime,
      };
    }

    // Filter out blocklisted sites (directories/social media/franchises).
    // Blocked sites are marked 'ignored' AND kept out of the rescoring batch;
    // otherwise rescoreSite would spend an LLM call on them and overwrite the
    // 'ignored' status with its own final status.
    const sites = [];
    let ignoredCount = 0;
    for (const site of candidates) {
      const blocked = checkBlocklist(site.domain, site.country_code);
      if (!blocked) {
        sites.push(site);
        continue;
      }
      await run(
        `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
        [blocked.reason, site.id]
      );
      ignoredCount++;
    }

    if (ignoredCount > 0) {
      info(`Marked ${ignoredCount} sites as ignored (directories/social media)`);
    }

    if (sites.length === 0) {
      info('No sites need rescoring');
      return {
        processed: 0,
        succeeded: 0,
        failed: 0,
        skipped: ignoredCount,
        improved: 0,
        duration: Date.now() - startTime,
      };
    }

    info(`Rescoring ${sites.length} sites (score <= ${cutoff})`);

    const stats = {
      processed: 0,
      succeeded: 0,
      failed: 0,
      skipped: ignoredCount,
      improved: 0,
      gradeDistribution: {},
    };

    // Process sites in batches
    const { results, errors } = await processBatch(
      sites,
      (site, index) => {
        displayProgress(index + 1, sites.length, `Rescoring ${site.url}`);
        return rescoreSite(site.id, site.url, site.score);
      },
      { concurrency }
    );

    // Count successes, failures, and improvements
    stats.processed = sites.length;
    stats.succeeded = results.filter(r => r !== null).length;
    stats.failed = errors.length;
    stats.improved = results.filter(r => r && r.improved).length;

    // Calculate grade distribution (after rescoring) across all graded sites
    const grades = await getAll(
      `SELECT grade, COUNT(*) as count
       FROM sites
       WHERE grade IS NOT NULL
       GROUP BY grade`
    );

    for (const g of grades) {
      stats.gradeDistribution[g.grade] = g.count;
    }

    // Log errors
    for (const err of errors) {
      const url = err.item?.url || 'unknown site';
      const message = err.error?.message || err.toString();
      error(` Failed to rescore ${url}: ${message}`);
    }

    stats.duration = Date.now() - startTime;
    generateStageCompletion('Rescoring', stats);
    info(`Scores improved: ${stats.improved}/${stats.succeeded}`);

    return stats;
  } catch (err) {
    error(`Rescoring stage failed: ${err.message}`);
    throw err;
  }
}

/**
 * Rescore a single site with below-fold screenshots
 *
 * Outcomes, in the order they are checked after scoring:
 * - business directory (outside test mode)      → status 'ignored'
 * - permanent error page (404/403/410)          → status 'ignored'
 * - temporary error page (5xx/maintenance/redirect) → stays 'prog_scored' with an error message
 * - broken site                                 → status 'ignored'
 * - country mismatch vs. google_domain (outside test mode) → status 'ignored'
 * - otherwise                                   → 'high_score' or 'vision_scored'
 * The score JSON is written via setScoreJson in every one of these cases.
 * @param {number} siteId - Site ID
 * @param {string} url - Site URL
 * @param {number} oldScore - Previous score
 * @returns {Promise<Object>} The scoring result; on the normal path it also
 *   carries an `improved` boolean
 * @throws Re-throws any failure after recording it with recordFailure
 */
async function rescoreSite(siteId, url, oldScore) {
  try {
    // Get site data
    const site = await getOne(
      `SELECT
         id, landing_page_url as url, screenshot_path, ssl_status, http_headers, locale_data
       FROM sites
       WHERE id = $1`,
      [siteId]
    );

    // Fail with a clear message instead of a TypeError on `site.screenshot_path`
    if (!site) {
      throw new Error(`Site ${siteId} not found`);
    }

    // Load HTML from filesystem
    const htmlDom = readHtmlDom(siteId);

    if (!site.screenshot_path) {
      throw new Error('Missing screenshots');
    }

    // Load screenshots from file system (including below-fold).
    // The three loads are independent, so run them concurrently.
    const [desktopAbove, desktopBelow, mobileAbove] = await Promise.all([
      loadScreenshot(site.screenshot_path, 'desktop_above'),
      loadScreenshot(site.screenshot_path, 'desktop_below'),
      loadScreenshot(site.screenshot_path, 'mobile_above'),
    ]);

    // Extract text from below-fold screenshot
    let visionText = '';
    if (desktopBelow) {
      const desktopBelowBase64 = desktopBelow.toString('base64');
      visionText = await extractTextFromImage(desktopBelowBase64, site.id);
      if (visionText) {
        info(` Extracted ${visionText.length} chars of text from below-fold screenshot`);
      }
    }

    // Prepare site data for rescoring (match scoreWebsite expectations)
    const siteData = {
      url: site.url,
      domain: new URL(site.url).hostname,
      screenshots: {
        desktop_above: desktopAbove,
        desktop_below: desktopBelow,
        mobile_above: mobileAbove,
      },
      html: htmlDom || '',
      httpHeaders: site.http_headers,
      localeData: site.locale_data, // Locale indicators for country detection
      visionText, // Include vision text for contact extraction
    };

    // Rescore with below-fold screenshots and vision text
    const result = await scoreWebsite(siteData, siteId);

    // Extract grade and score from nested structure.
    // `??` (not `||`) so a genuine score of 0 is stored rather than turned into NULL.
    const grade = result?.overall_calculation?.letter_grade || null;
    const score = result?.overall_calculation?.conversion_score ?? null;

    // Check if LLM detected a business directory
    // Skip in test mode (E2E test page is not a real local business)
    const isDirectory = result?.overall_calculation?.is_business_directory;

    if (isDirectory && process.env.NODE_ENV !== 'test') {
      // Mark as ignored if LLM detected directory
      setScoreJson(siteId, result);
      await run(
        `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
        ['Ignored: Business directory (LLM detected)', siteId]
      );

      info(` ${url} detected as directory by LLM - marked as ignored`);

      return result;
    }

    // Check if LLM detected an error page
    const isErrorPage = result?.overall_calculation?.is_error_page;
    const errorType = result?.overall_calculation?.error_type;
    const errorDescription = result?.overall_calculation?.error_description;

    if (isErrorPage && errorType) {
      // Permanent errors (404, 403, 410) - mark as ignored
      const permanentErrors = ['404', '403', '410'];

      if (permanentErrors.includes(errorType)) {
        const errorMsg = errorDescription || `Permanent error: ${errorType}`;
        setScoreJson(siteId, result);
        await run(
          `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
          [errorMsg, siteId]
        );

        info(` ${url} detected as ${errorType} error - marked as ignored`);

        return result;
      }

      // Temporary errors (5xx, maintenance, redirect) - keep status unchanged for retry
      const temporaryErrors = ['5xx', 'maintenance', 'redirect'];

      if (temporaryErrors.includes(errorType)) {
        const errorMsg = errorDescription || `Temporary error: ${errorType}`;
        setScoreJson(siteId, result);
        await run(
          `UPDATE sites SET status = 'prog_scored', error_message = $1 WHERE id = $2`,
          [errorMsg, siteId]
        );

        info(` ${url} detected as ${errorType} error - will retry later`);

        return result;
      }
      // Any other error type falls through to the normal handling below.
    }

    // Check if LLM detected a broken site (visual rendering failure)
    const isBrokenSite = result?.overall_calculation?.is_broken_site || false;
    const brokenSiteDetails = result?.overall_calculation?.broken_site_details || [];

    if (isBrokenSite) {
      // If broken on rescoring (both above + below-fold), mark as ignore (not worth retrying)
      const errorMsg = `Broken site on rescore (above + below-fold) - Issues: ${brokenSiteDetails.join(', ')}`;
      setScoreJson(siteId, result);
      await run(
        `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
        [errorMsg, siteId]
      );

      error(` ${url} broken site detected on rescore - marked as ignored`);
      info(` Issues: ${brokenSiteDetails.join(', ')}`);

      return result;
    }

    // Clean up invalid Twitter/X.com links from contact_details
    if (result?.contact_details) {
      result.contact_details = cleanInvalidSocialLinks(result.contact_details);

      // Clean phone numbers to E.164 format (add country code if needed)
      if (result.contact_details.phone_numbers) {
        result.contact_details.phone_numbers = cleanPhoneNumbers(
          result.contact_details.phone_numbers,
          url
        );
      }
    }

    // Extract location information from contact_details
    const city = result?.contact_details?.city || null;
    const countryCode = normaliseCountryCode(result?.contact_details?.country_code) || null;
    const state = result?.contact_details?.state || null;

    // Check for country mismatch (site from wrong country for the search)
    // Skip in test mode (E2E test page has mixed country signals by design)
    if (countryCode && process.env.NODE_ENV !== 'test') {
      const siteRow = await getOne('SELECT google_domain FROM sites WHERE id = $1', [siteId]);
      const googleDomain = siteRow?.google_domain;

      if (googleDomain) {
        const expectedCountry = parseCountryFromGoogleDomain(googleDomain);

        // Check if detected country matches expected country
        if (countryCode !== expectedCountry) {
          const errorMsg = `Country mismatch: Site is in ${countryCode}, but search was for ${expectedCountry} (${googleDomain})`;
          setScoreJson(siteId, result);
          await run(
            `UPDATE sites SET
               status = 'ignored',
               error_message = $1,
               city = $2,
               country_code = $3,
               state = $4
             WHERE id = $5`,
            [errorMsg, city, countryCode, state, siteId]
          );

          info(
            ` ${url} country mismatch: Site in ${countryCode}, search for ${expectedCountry} - marked as ignored`
          );

          return result;
        }
      }
    }

    // High-score check: if vision rescoring pushed score above threshold, skip enrichment
    const scoreThreshold = parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);
    const newStatus = score !== null && score > scoreThreshold ? 'high_score' : 'vision_scored';

    // Write score to filesystem; DB columns no longer store score_json blobs
    setScoreJson(siteId, result);
    await run(
      `UPDATE sites SET
         grade = $1,
         score = $2,
         city = $3,
         country_code = $4,
         state = $5,
         status = $6,
         rescored_at = NOW(),
         error_message = NULL
       WHERE id = $7`,
      [grade, score, city, countryCode, state, newStatus, siteId]
    );

    // Reset retry count on successful rescoring
    await resetRetries(siteId);

    // Increment keyword counter for rescored sites.
    // Read after the UPDATE above, so country_code here is the value just written.
    const siteInfo = await getOne(
      'SELECT keyword, country_code FROM sites WHERE id = $1',
      [siteId]
    );
    if (siteInfo?.keyword && siteInfo?.country_code) {
      await incrementRescored(siteInfo.keyword, siteInfo.country_code);
    }

    // A missing score never counts as an improvement
    const improved = score !== null && score > oldScore;
    const change = improved ? `↑ +${(score - oldScore).toFixed(1)}` : `→ ${score}`;

    success(` Rescored ${url}: ${grade} (${oldScore} ${change})`);

    return { ...result, improved };
  } catch (err) {
    // Record failure and increment retry count (marks as 'failing' if limit exceeded)
    await recordFailure(siteId, 'rescoring', err, 'prog_scored');
    throw err;
  }
}

// Exported for unit testing
export { cleanPhoneNumbers, validateBase64Image };

/**
 * Get rescoring statistics
 *
 * Aggregates over every site whose rescored_at is set.
 * @returns {Promise<Object>} Row with total_rescored, avg_score_after,
 *   min_score and max_score
 */
export async function getRescoringStats() {
  return await getOne(
    `SELECT
       COUNT(id) as total_rescored,
       AVG(score) as avg_score_after,
       MIN(score) as min_score,
       MAX(score) as max_score
     FROM sites
     WHERE rescored_at IS NOT NULL`
  );
}