scoring.test.js
/**
 * Unit Tests for Scoring Stage
 *
 * Tests the runScoringStage() function and its internal scoreSite() logic.
 *
 * Key behaviors tested:
 * - Happy path: site scored, grade/score saved to DB, status updated
 * - High score (> 82 by default): status set to 'high_score'
 * - Low score (<= 82 by default): status set to 'prog_scored'
 * - Business directory detected (is_business_directory=true): site ignored
 * - Non-local business detected (is_local_business=false): site ignored
 * - Error page permanent (isErrorPage + 404/403/410): site ignored
 * - Error page temporary (isErrorPage + 5xx): status='assets_captured'
 * - Broken site: recapture scheduled
 * - Broken site max retries exceeded: site ignored
 * - LOW_SCORE_CUTOFF env var respected
 * - ENABLE_VISION=false: contacts saved, status='semantic_scored'
 * - No sites: returns early with 0 counts
 * - Blocklist filtering marks sites as ignored
 *
 * SQL patterns the assertions in this file rely on (scoring.js is the source of truth):
 *
 * Blocklist UPDATE (the blocklist tests match the literal "status = 'ignored'"):
 *   UPDATE sites SET status = 'ignored', error_message = ? WHERE id = ?
 *   args = [reason, id]
 *
 * NOTE(review): the two patterns below were documented with the older 'ignore'
 * literal and a score_json column; they appear to predate score_json moving to the
 * filesystem. Verify against scoring.js before relying on the argument positions.
 *
 * Directory/Non-local/ErrorPage(perm)/BrokenMax UPDATE (ignore):
 *   UPDATE sites SET\n status = 'ignore',\n error_message = ?,\n score_json = ?\n WHERE id = ?
 *   args = [errorMsg, scoreJson, id]
 *
 * Temporary error / broken site recapture UPDATE:
 *   UPDATE sites SET\n status = 'assets_captured',...
 *   args = [errorMsg, scoreJson, id] (temporary)
 *   args = [errorMsg, recaptureCount, scoreJson, id] (broken)
 *
 * Success UPDATE (score_json / contacts_json are written via setScoreJson() /
 * setContactsJson(), not passed as SQL parameters):
 *   UPDATE sites SET grade=?, score=?, city=?, country_code=?, state=?,
 *     status=?, scored_at=..., WHERE id=?
 *   args = [grade, score, city, countryCode, state, status, id]
 *   indices:  0      1      2     3            4      5       6
 *
 * Run with:
 *   NODE_ENV=test LOGS_DIR=/tmp/test-logs DATABASE_PATH=/tmp/test-sites.db \
 *   node --experimental-test-module-mocks --test tests/stages/scoring.test.js
 */

import { test, describe, mock, beforeEach } from 'node:test';
import assert from 'node:assert';
import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars

// ============================================================================
// MOCK DEFINITIONS — ALL mock.module() calls MUST come before any dynamic imports
// ============================================================================

// Shared state holders for mocks (mutated per-test via beforeEach)
let mockSites = [];
let mockSiteDetails = {};
let mockGradeRows = [];
let mockRecaptureData = { recapture_count: 0 };
let mockKeywordData = null;

// Track DB calls for assertions
const dbCalls = {
  updates: [], // { sql: string, args: any[] }
};

// ---- Shared SQL dispatch helpers -------------------------------------------
// Both the better-sqlite3 mock and the db.js mock answer queries the same way:
// SELECTs are recognised by distinctive SQL fragments, UPDATEs against `sites`
// are recorded in dbCalls.updates. Keeping the rules here stops the two mocks
// from drifting apart.

/** True when `sql` is an UPDATE against `sites` that tests may assert on. */
function isSitesUpdate(sql) {
  return sql.startsWith('UPDATE sites SET') || sql.startsWith('UPDATE sites\n');
}

/**
 * True for the auto-promote UPDATE, recognised by mentioning both
 * 'semantic_scored' and 'assets_captured' statuses in the same statement.
 */
function isAutoPromoteUpdate(sql) {
  return sql.includes("status = 'semantic_scored'") && sql.includes("status = 'assets_captured'");
}

/** True for the grade distribution query. */
function isGradeDistributionQuery(sql) {
  return sql.includes('grade IS NOT NULL') && sql.includes('GROUP BY grade');
}

/** Trim and convert PG-style `$1` placeholders to `?` so assertions can match one SQL shape. */
function toPositionalPlaceholders(sql) {
  return sql.trim().replace(/\$\d+/g, '?');
}

/** Rows for a multi-row SELECT; unknown queries yield an empty list. */
function selectRows(sql) {
  // Main query: SELECT sites needing scoring
  if (sql.includes("status = 'assets_captured'") && sql.includes('score IS NULL')) {
    return mockSites;
  }
  // Grade distribution query
  if (isGradeDistributionQuery(sql)) {
    return mockGradeRows;
  }
  return [];
}

/**
 * Row for a single-row SELECT; unknown queries yield null.
 *
 * @param {string} sql - Trimmed SQL text.
 * @param {*} siteId - Key used to look up mockSiteDetails for the site detail query.
 */
function selectRow(sql, siteId) {
  // Site detail query inside scoreSite (has ssl_status and landing_page_url)
  if (sql.includes('screenshot_path') && sql.includes('ssl_status')) {
    return mockSiteDetails[siteId] || null;
  }

  const mentionsRecapture = sql.includes('recapture_count');
  const mentionsRetry = sql.includes('retry_count');
  // Broken site: SELECT recapture_count (exact match, no retry_count)
  if (mentionsRecapture && !mentionsRetry) {
    return mockRecaptureData;
  }
  // recordFailure: SELECT retry_count
  if (mentionsRetry && !mentionsRecapture) {
    return { retry_count: 0 };
  }

  if (sql.includes('keyword')) {
    // incrementLowScoring selects keyword AND country_code;
    // the programmatic scorer selects keyword only.
    return sql.includes('country_code') ? mockKeywordData : { keyword: 'plumber' };
  }

  // getScoringStats: aggregate stats query
  if (sql.includes('total_sites') && sql.includes('avg_score')) {
    return {
      total_sites: 10,
      scored_sites: 8,
      low_score_sites: 5,
      avg_score: 70.5,
      min_score: 40,
      max_score: 95,
    };
  }
  return null;
}

/**
 * Record `sql`/`args` in dbCalls.updates when it is a tracked UPDATE.
 *
 * @returns {boolean} true when the statement was recorded.
 */
function recordSitesUpdate(sql, args) {
  if (!isSitesUpdate(sql)) {
    return false;
  }
  dbCalls.updates.push({ sql, args });
  return true;
}

/**
 * Result of a run()-style write. The auto-promote UPDATE reports one change per
 * mocked site and is deliberately NOT tracked; every other statement reports a
 * single change and is tracked when it is a `sites` UPDATE.
 */
function recordRun(sql, args) {
  if (isAutoPromoteUpdate(sql)) {
    return { changes: mockSites.length, lastInsertRowid: 0 };
  }
  recordSitesUpdate(sql, args);
  return { changes: 1, lastInsertRowid: 0 };
}

/**
 * MockDatabase: simulates the part of the better-sqlite3 API used here.
 *
 * The constructor receives the dbPath string. prepare().all()/.get()/.run()
 * are intercepted to return controlled test data.
 *
 * SQL DISPATCH STRATEGY:
 * - SELECT queries dispatch by unique SQL fragments
 * - UPDATE queries captured with full SQL + args for assertion
 */
class MockDatabase {
  constructor(_path) {
    this._closed = false;
  }

  prepare(sql) {
    const trimmed = sql.trim();

    return {
      // ---- SELECT (all) ------------------------------------------------
      all: () => selectRows(trimmed),

      // ---- SELECT (get, single row) ------------------------------------
      get: _id => selectRow(trimmed, _id),

      // ---- INSERT / UPDATE -------------------------------------------
      run: (...args) => recordRun(trimmed, args),
    };
  }

  pragma() {
    return undefined;
  }

  exec() {
    return undefined;
  }

  transaction(fn) {
    return (...args) => fn(...args);
  }

  close() {
    this._closed = true;
  }
}

// Mock better-sqlite3 (kept for any direct usage, though scoring.js now uses db.js)
mock.module('better-sqlite3', {
  defaultExport: MockDatabase,
});

// ---- Mock db.js (PostgreSQL layer) — mirrors MockDatabase logic as async functions ----
// scoring.js imports { run, getOne, getAll, query, withTransaction } from '../utils/db.js'
mock.module('../../src/utils/db.js', {
  namedExports: {
    getPool: () => ({}),
    closePool: async () => {},
    createDatabaseConnection: () => ({}),
    closeDatabaseConnection: async () => {},

    getAll: async sql => selectRows(sql.trim()),

    // The first bound parameter is the site id for the site detail query.
    getOne: async (sql, params) => selectRow(sql.trim(), params && params[0]),

    // PG params are normalized to SQLite-style for assertion matching.
    run: async (sql, args) => recordRun(toPositionalPlaceholders(sql), args || []),

    query: async (sql, args) => {
      const recorded = recordSitesUpdate(toPositionalPlaceholders(sql), args || []);
      return { rows: [], rowCount: recorded ? 1 : 0 };
    },

    withTransaction: async fn => {
      const fakeClient = {
        query: async (sql, args) => {
          const normalized = toPositionalPlaceholders(sql);
          if (recordSitesUpdate(normalized, args || [])) {
            return { rows: [], rowCount: 1 };
          }
          // Grade distribution
          if (isGradeDistributionQuery(normalized)) {
            return { rows: mockGradeRows, rowCount: mockGradeRows.length };
          }
          // Keyword lookup inside transaction — return mockKeywordData only if non-null
          const isKeywordLookup =
            normalized.includes('keyword') &&
            normalized.includes('country_code') &&
            normalized.includes('SELECT');
          if (isKeywordLookup && mockKeywordData) {
            return { rows: [mockKeywordData], rowCount: 1 };
          }
          return { rows: [], rowCount: 0 };
        },
      };
      return fn(fakeClient);
    },
  },
});

// Shared scoreWebsite mock — default returns a low-score result
const mockScoreWebsite = mock.fn(async () => ({
  overall_calculation: {
    letter_grade: 'D',
    conversion_score: 65,
    is_business_directory: false,
    is_local_business: true,
    is_error_page: false,
    is_broken_site: false,
    city: 'Sydney',
    country_code: 'AU',
    state: 'NSW',
  },
}));

mock.module('../../src/score.js', {
  namedExports: {
    scoreWebsite: mockScoreWebsite,
    computeGrade: score => {
      if (score >= 97) return 'A+';
      if (score >= 90) return 'A';
      if (score >= 87) return 'B+';
      if (score >= 83) return 'B';
      if (score >= 80) return 'B-';
      if (score >= 60) return 'C';
      return 'F';
    },
    computeScoreFromFactors: mock.fn(() => 75),
    FACTOR_WEIGHTS: {
      headline: 0.15,
      value_proposition: 0.15,
      social_proof: 0.12,
      cta: 0.12,
      mobile: 0.1,
      page_speed: 0.1,
      trust: 0.1,
      contact: 0.08,
      above_fold: 0.08,
    },
  },
});

// Mock programmatic-scorer — returns a low-score result by default
const mockScoreWebsiteProgrammatically = mock.fn(() => ({
  conversion_score: 60,
  letter_grade: 'D',
  is_error_page: false,
  is_broken_site: false,
  is_business_directory: false,
  is_local_business: true,
  is_law_firm: false,
  industry_classification: '',
  country_code: 'AU',
  city: null,
  state: null,
  factor_scores: {},
  contacts: null,
}));

mock.module('../../src/utils/programmatic-scorer.js', { 344 namedExports: { 345 scoreWebsiteProgrammatically: mockScoreWebsiteProgrammatically, 346 }, 347 }); 348 349 // Mock loadScreenshot 350 const mockLoadScreenshot = mock.fn(async () => null); 351 mock.module('../../src/utils/screenshot-storage.js', { 352 namedExports: { 353 loadScreenshot: mockLoadScreenshot, 354 }, 355 }); 356 357 // Mock site-filters — default: not blocked 358 const mockCheckBlocklist = mock.fn(() => null); 359 const mockClassifyIndustry = mock.fn(() => null); 360 mock.module('../../src/utils/site-filters.js', { 361 namedExports: { 362 checkBlocklist: mockCheckBlocklist, 363 classifyIndustry: mockClassifyIndustry, 364 }, 365 }); 366 367 // Mock keyword-counters 368 const mockIncrementLowScoring = mock.fn(); 369 mock.module('../../src/utils/keyword-counters.js', { 370 namedExports: { 371 incrementLowScoring: mockIncrementLowScoring, 372 incrementAssetsScraped: mock.fn(), 373 }, 374 }); 375 376 // Mock retry-handler 377 const mockRecordFailure = mock.fn(); 378 const mockResetRetries = mock.fn(); 379 mock.module('../../src/utils/retry-handler.js', { 380 namedExports: { 381 recordFailure: mockRecordFailure, 382 resetRetries: mockResetRetries, 383 }, 384 }); 385 386 // Mock summary-generator 387 mock.module('../../src/utils/summary-generator.js', { 388 namedExports: { 389 generateStageCompletion: mock.fn(), 390 displayProgress: mock.fn(), 391 }, 392 }); 393 394 // Mock error-handler — provide a real processBatch implementation so the stage works end-to-end 395 const mockProcessBatch = mock.fn(async (items, processor, _opts) => { 396 const results = []; 397 const errors = []; 398 for (let i = 0; i < items.length; i++) { 399 try { 400 const result = await processor(items[i], i); 401 results.push(result); 402 } catch (err) { 403 errors.push(err); 404 } 405 } 406 return { results, errors }; 407 }); 408 mock.module('../../src/utils/error-handler.js', { 409 namedExports: { 410 processBatch: 
mockProcessBatch, 411 retryWithBackoff: mock.fn(async fn => fn()), 412 isRetryableError: mock.fn(() => false), 413 sleep: mock.fn(async () => {}), 414 safeJsonParse: mock.fn(str => JSON.parse(str)), 415 extractDomain: mock.fn(url => new URL(url).hostname), 416 }, 417 }); 418 419 // Mock logger — silent 420 class MockLogger { 421 info() {} 422 success() {} 423 error() {} 424 warn() {} 425 debug() {} 426 } 427 mock.module('../../src/utils/logger.js', { 428 defaultExport: MockLogger, 429 }); 430 431 // Mock dotenv 432 mock.module('dotenv', { 433 namedExports: { config: mock.fn() }, 434 defaultExport: { config: mock.fn() }, 435 }); 436 437 // html-storage: readHtmlDom reads from filesystem — mock to return controlled HTML 438 let mockHtmlDom = '<html></html>'; 439 mock.module('../../src/utils/html-storage.js', { 440 namedExports: { 441 readHtmlDom: mock.fn(() => mockHtmlDom), 442 writeHtmlDom: mock.fn(), 443 hasHtmlDom: mock.fn(() => true), 444 deleteHtmlDom: mock.fn(), 445 writeKeyPagesHtml: mock.fn(), 446 readKeyPagesHtml: mock.fn(() => null), 447 deleteKeyPagesHtml: mock.fn(), 448 deleteAllHtml: mock.fn(), 449 DATA_DIR: '/tmp/test-html-data', 450 }, 451 }); 452 453 // score-storage and contacts-storage: write to filesystem — mock to no-op 454 mock.module('../../src/utils/score-storage.js', { 455 namedExports: { 456 setScoreJson: mock.fn(), 457 getScoreJson: mock.fn(() => null), 458 }, 459 }); 460 461 mock.module('../../src/utils/contacts-storage.js', { 462 namedExports: { 463 setContactsJson: mock.fn(), 464 getContactsJson: mock.fn(() => null), 465 }, 466 }); 467 468 // ============================================================================ 469 // IMPORTS — must come AFTER all mock.module() calls 470 // ============================================================================ 471 472 const { runScoringStage, getScoringStats } = await import('../../src/stages/scoring.js'); 473 const { setContactsJson: mockSetContactsJson } = await 
import('../../src/utils/contacts-storage.js'); 474 475 // ============================================================================ 476 // HELPERS 477 // ============================================================================ 478 479 /** Build a minimal site row returned by the initial DB query. */ 480 function makeSiteRow(overrides = {}) { 481 return { 482 id: 1, 483 domain: 'example.com', 484 url: 'https://example.com', 485 country_code: 'AU', 486 ...overrides, 487 }; 488 } 489 490 /** Build the detailed site record returned by scoreSite's inner SELECT. */ 491 function makeSiteDetail(overrides = {}) { 492 return { 493 id: 1, 494 url: 'https://example.com', 495 screenshot_path: null, 496 html_dom: '<html><body>Example</body></html>', 497 ssl_status: null, 498 http_headers: null, 499 locale_data: null, 500 ...overrides, 501 }; 502 } 503 504 /** 505 * Find a DB update whose SQL contains a specific status literal 506 * (e.g. "status = 'ignored'"). 507 * These statuses are embedded in the SQL string, NOT passed as parameters. 508 */ 509 function findUpdateWithStatus(status) { 510 return dbCalls.updates.find(u => u.sql.includes(`status = '${status}'`)); 511 } 512 513 /** 514 * Find the main "success" update for a site (the one that writes grade/score/status/city). 515 * The success UPDATE has: grade=?, score=?, city=?, country_code=?, state=?, status=?, WHERE id=? 516 * args indices: [0]=grade, [1]=score, [2]=city, [3]=country_code, [4]=state, [5]=status, [6]=id 517 * Note: score_json/contacts_json blobs written to filesystem (setScoreJson/setContactsJson). 518 */ 519 function findMainSuccessUpdate() { 520 return dbCalls.updates.find( 521 u => u.sql.includes('grade = ?') && u.sql.includes('score = ?') && u.sql.includes('scored_at') 522 ); 523 } 524 525 /** Reset all shared mutable state between tests. 
*/ 526 function resetState() { 527 mockSites = []; 528 mockSiteDetails = {}; 529 mockGradeRows = []; 530 mockRecaptureData = { recapture_count: 0 }; 531 mockKeywordData = null; 532 dbCalls.updates = []; 533 534 mockScoreWebsite.mock.resetCalls(); 535 mockScoreWebsiteProgrammatically.mock.resetCalls(); 536 mockLoadScreenshot.mock.resetCalls(); 537 mockCheckBlocklist.mock.resetCalls(); 538 mockIncrementLowScoring.mock.resetCalls(); 539 mockRecordFailure.mock.resetCalls(); 540 mockResetRetries.mock.resetCalls(); 541 mockProcessBatch.mock.resetCalls(); 542 mockSetContactsJson.mock.resetCalls(); 543 } 544 545 /** Set env to a known baseline state before each test. */ 546 function resetEnv() { 547 process.env.NODE_ENV = 'production'; // allow directory/local-business checks 548 process.env.ENABLE_VISION = 'true'; 549 process.env.ENABLE_LLM_SCORING = 'true'; 550 delete process.env.LOW_SCORE_CUTOFF; 551 delete process.env.SCORING_CONCURRENCY; 552 // Remove split-DB paths so createDatabaseConnection doesn't validate non-existent files 553 // (the NODE_ENV != 'test' guard in db.js would otherwise throw) 554 delete process.env.OPS_DB_PATH; 555 delete process.env.TEL_DB_PATH; 556 } 557 558 // ============================================================================ 559 // TESTS 560 // ============================================================================ 561 562 // --------------------------------------------------------------------------- 563 describe('runScoringStage — no sites', () => { 564 beforeEach(() => { 565 resetState(); 566 resetEnv(); 567 }); 568 569 test('returns early with 0 counts when no sites need scoring', async () => { 570 mockSites = []; // DB returns empty list 571 572 const result = await runScoringStage({ limit: 10 }); 573 574 assert.strictEqual(result.processed, 0); 575 assert.strictEqual(result.succeeded, 0); 576 assert.strictEqual(result.failed, 0); 577 assert.strictEqual(result.skipped, 0); 578 assert.ok(typeof result.duration === 'number'); 
579 580 // scoreWebsite should never be called 581 assert.strictEqual(mockScoreWebsite.mock.callCount(), 0); 582 }); 583 }); 584 585 // --------------------------------------------------------------------------- 586 describe('runScoringStage — blocklist filtering', () => { 587 beforeEach(() => { 588 resetState(); 589 resetEnv(); 590 }); 591 592 test('marks blocked sites as ignore via checkBlocklist', async () => { 593 const site = makeSiteRow({ id: 1, domain: 'yelp.com' }); 594 mockSites = [site]; 595 mockSiteDetails[1] = makeSiteDetail({ id: 1, url: 'https://yelp.com' }); 596 597 // checkBlocklist returns a block reason 598 mockCheckBlocklist.mock.mockImplementation(() => ({ 599 reason: 'Business directory: yelp.com', 600 })); 601 602 // scoreWebsite succeeds (the stage still processes the site through processBatch) 603 mockScoreWebsite.mock.mockImplementation(async () => ({ 604 overall_calculation: { 605 letter_grade: 'D-', 606 conversion_score: 60, 607 is_business_directory: false, 608 is_local_business: true, 609 is_error_page: false, 610 is_broken_site: false, 611 }, 612 })); 613 614 await runScoringStage({ limit: 1 }); 615 616 // checkBlocklist called once for the site 617 assert.strictEqual(mockCheckBlocklist.mock.callCount(), 1); 618 619 // The blocklist UPDATE embeds status='ignore' directly in SQL: 620 // "UPDATE sites SET status = 'ignore', error_message = ? WHERE id = ?" 
621 const ignoreUpdate = dbCalls.updates.find( 622 u => 623 u.sql.includes("status = 'ignored'") && 624 u.args.some(a => typeof a === 'string' && a.includes('Business directory')) 625 ); 626 assert.ok(ignoreUpdate, 'Expected an UPDATE with status=ignore from blocklist'); 627 }); 628 629 test('does not mark non-blocked sites as ignore via checkBlocklist', async () => { 630 const site = makeSiteRow(); 631 mockSites = [site]; 632 mockSiteDetails[1] = makeSiteDetail(); 633 634 mockCheckBlocklist.mock.mockImplementation(() => null); // not blocked 635 636 mockScoreWebsite.mock.mockImplementation(async () => ({ 637 overall_calculation: { 638 letter_grade: 'D-', 639 conversion_score: 60, 640 is_business_directory: false, 641 is_local_business: true, 642 is_error_page: false, 643 is_broken_site: false, 644 }, 645 })); 646 647 await runScoringStage({ limit: 1 }); 648 649 // No blocklist-style ignore update should exist 650 const blocklistIgnore = dbCalls.updates.find( 651 u => 652 u.sql.includes("status = 'ignored'") && 653 u.args.some(a => typeof a === 'string' && a.includes('Business directory')) 654 ); 655 assert.ok(!blocklistIgnore, 'Should NOT have a blocklist ignore update for non-blocked site'); 656 }); 657 }); 658 659 // --------------------------------------------------------------------------- 660 describe('runScoringStage — happy path scoring', () => { 661 beforeEach(() => { 662 resetState(); 663 resetEnv(); 664 }); 665 666 test('scores site, saves grade/score to DB, sets status to scored (low score)', async () => { 667 const site = makeSiteRow(); 668 mockSites = [site]; 669 mockSiteDetails[1] = makeSiteDetail(); 670 mockCheckBlocklist.mock.mockImplementation(() => null); 671 672 mockScoreWebsite.mock.mockImplementation(async () => ({ 673 overall_calculation: { 674 letter_grade: 'D', 675 conversion_score: 65, 676 is_business_directory: false, 677 is_local_business: true, 678 is_error_page: false, 679 is_broken_site: false, 680 city: 'Melbourne', 681 country_code: 
'AU', 682 state: 'VIC', 683 }, 684 })); 685 686 await runScoringStage({ limit: 1 }); 687 688 assert.strictEqual(mockScoreWebsite.mock.callCount(), 1); 689 690 // The main success UPDATE writes grade, score, city, country_code, state, status 691 const successUpdate = findMainSuccessUpdate(); 692 assert.ok(successUpdate, 'Expected a success UPDATE with grade/score fields'); 693 694 // args: [grade, score, city, countryCode, state, contactsJson, FS_SENTINEL, status, id] 695 // scoreJson removed — now written to filesystem via setScoreJson() 696 const { args } = successUpdate; 697 assert.strictEqual(args[0], 'D'); // grade 698 assert.strictEqual(args[1], 65); // score 699 assert.strictEqual(args[2], 'Melbourne'); // city 700 assert.strictEqual(args[3], 'AU'); // country_code 701 assert.strictEqual(args[4], 'VIC'); // state 702 assert.strictEqual(args[5], 'prog_scored'); // status 703 704 // resetRetries should be called on success 705 assert.strictEqual(mockResetRetries.mock.callCount(), 1); 706 }); 707 708 test('calls scoreWebsite with correct site data including html and url', async () => { 709 mockHtmlDom = '<html>test</html>'; 710 const site = makeSiteRow({ id: 1, url: 'https://example.com' }); 711 mockSites = [site]; 712 mockSiteDetails[1] = makeSiteDetail({ 713 screenshot_path: null, 714 }); 715 mockCheckBlocklist.mock.mockImplementation(() => null); 716 717 mockScoreWebsite.mock.mockImplementation(async () => ({ 718 overall_calculation: { 719 letter_grade: 'C+', 720 conversion_score: 78, 721 is_business_directory: false, 722 is_local_business: true, 723 is_error_page: false, 724 is_broken_site: false, 725 }, 726 })); 727 728 await runScoringStage({ limit: 1 }); 729 730 const callArgs = mockScoreWebsite.mock.calls[0].arguments; 731 const siteData = callArgs[0]; 732 733 assert.strictEqual(siteData.html, '<html>test</html>'); 734 assert.strictEqual(siteData.url, 'https://example.com'); 735 assert.ok('screenshots' in siteData, 'siteData should have screenshots 
property'); 736 }); 737 738 test('loads screenshots from disk when screenshot_path is set', async () => { 739 const site = makeSiteRow(); 740 mockSites = [site]; 741 mockSiteDetails[1] = makeSiteDetail({ screenshot_path: '/screenshots/1' }); 742 mockCheckBlocklist.mock.mockImplementation(() => null); 743 744 const fakeScreenshot = Buffer.from('fake-image'); 745 mockLoadScreenshot.mock.mockImplementation(async () => fakeScreenshot); 746 747 mockScoreWebsite.mock.mockImplementation(async () => ({ 748 overall_calculation: { 749 letter_grade: 'C-', 750 conversion_score: 70, 751 is_business_directory: false, 752 is_local_business: true, 753 is_error_page: false, 754 is_broken_site: false, 755 }, 756 })); 757 758 await runScoringStage({ limit: 1 }); 759 760 // loadScreenshot called twice: desktop_above + mobile_above 761 assert.strictEqual(mockLoadScreenshot.mock.callCount(), 2); 762 }); 763 764 test('does not load screenshots when screenshot_path is null', async () => { 765 const site = makeSiteRow(); 766 mockSites = [site]; 767 mockSiteDetails[1] = makeSiteDetail({ screenshot_path: null }); 768 mockCheckBlocklist.mock.mockImplementation(() => null); 769 770 mockScoreWebsite.mock.mockImplementation(async () => ({ 771 overall_calculation: { 772 letter_grade: 'D-', 773 conversion_score: 60, 774 is_business_directory: false, 775 is_local_business: true, 776 is_error_page: false, 777 is_broken_site: false, 778 }, 779 })); 780 781 await runScoringStage({ limit: 1 }); 782 783 assert.strictEqual(mockLoadScreenshot.mock.callCount(), 0); 784 }); 785 }); 786 787 // --------------------------------------------------------------------------- 788 describe('runScoringStage — score threshold / HIGH_SCORE', () => { 789 beforeEach(() => { 790 resetState(); 791 resetEnv(); 792 }); 793 794 test('sets status to high_score when score > 82 (default threshold)', async () => { 795 const site = makeSiteRow(); 796 mockSites = [site]; 797 mockSiteDetails[1] = makeSiteDetail(); 798 
mockCheckBlocklist.mock.mockImplementation(() => null); 799 800 mockScoreWebsite.mock.mockImplementation(async () => ({ 801 overall_calculation: { 802 letter_grade: 'A-', 803 conversion_score: 90, 804 is_business_directory: false, 805 is_local_business: true, 806 is_error_page: false, 807 is_broken_site: false, 808 }, 809 })); 810 811 await runScoringStage({ limit: 1 }); 812 813 const successUpdate = findMainSuccessUpdate(); 814 assert.ok(successUpdate, 'Expected a success UPDATE'); 815 assert.strictEqual(successUpdate.args[5], 'high_score', 'status should be high_score'); 816 }); 817 818 test('sets status to scored when score is exactly 82 (at threshold, not above)', async () => { 819 const site = makeSiteRow(); 820 mockSites = [site]; 821 mockSiteDetails[1] = makeSiteDetail(); 822 mockCheckBlocklist.mock.mockImplementation(() => null); 823 824 mockScoreWebsite.mock.mockImplementation(async () => ({ 825 overall_calculation: { 826 letter_grade: 'B-', 827 conversion_score: 82, 828 is_business_directory: false, 829 is_local_business: true, 830 is_error_page: false, 831 is_broken_site: false, 832 }, 833 })); 834 835 await runScoringStage({ limit: 1 }); 836 837 // 82 is NOT > 82 so it should be 'scored' 838 const successUpdate = findMainSuccessUpdate(); 839 assert.ok(successUpdate, 'Expected a success UPDATE'); 840 assert.strictEqual( 841 successUpdate.args[5], 842 'prog_scored', 843 'status should be scored for score=82' 844 ); 845 }); 846 847 test('sets status to scored when score is 1 (well below threshold)', async () => { 848 const site = makeSiteRow(); 849 mockSites = [site]; 850 mockSiteDetails[1] = makeSiteDetail(); 851 mockCheckBlocklist.mock.mockImplementation(() => null); 852 853 mockScoreWebsite.mock.mockImplementation(async () => ({ 854 overall_calculation: { 855 letter_grade: 'F', 856 conversion_score: 1, 857 is_business_directory: false, 858 is_local_business: true, 859 is_error_page: false, 860 is_broken_site: false, 861 }, 862 })); 863 864 await 
runScoringStage({ limit: 1 }); 865 866 const successUpdate = findMainSuccessUpdate(); 867 assert.ok(successUpdate, 'Expected a success UPDATE'); 868 assert.strictEqual( 869 successUpdate.args[5], 870 'prog_scored', 871 'status should be prog_scored for very low score' 872 ); 873 }); 874 875 test('respects LOW_SCORE_CUTOFF=90: score 88 is still scored (88 is NOT > 90)', async () => { 876 process.env.LOW_SCORE_CUTOFF = '90'; 877 878 const site = makeSiteRow(); 879 mockSites = [site]; 880 mockSiteDetails[1] = makeSiteDetail(); 881 mockCheckBlocklist.mock.mockImplementation(() => null); 882 883 mockScoreWebsite.mock.mockImplementation(async () => ({ 884 overall_calculation: { 885 letter_grade: 'B+', 886 conversion_score: 88, 887 is_business_directory: false, 888 is_local_business: true, 889 is_error_page: false, 890 is_broken_site: false, 891 }, 892 })); 893 894 await runScoringStage({ limit: 1 }); 895 896 const successUpdate = findMainSuccessUpdate(); 897 assert.ok(successUpdate, 'Expected a success UPDATE'); 898 assert.strictEqual( 899 successUpdate.args[5], 900 'prog_scored', 901 'score 88 should be scored when cutoff=90' 902 ); 903 }); 904 905 test('respects LOW_SCORE_CUTOFF=70: score 75 becomes high_score (75 > 70)', async () => { 906 process.env.LOW_SCORE_CUTOFF = '70'; 907 908 const site = makeSiteRow(); 909 mockSites = [site]; 910 mockSiteDetails[1] = makeSiteDetail(); 911 mockCheckBlocklist.mock.mockImplementation(() => null); 912 913 mockScoreWebsite.mock.mockImplementation(async () => ({ 914 overall_calculation: { 915 letter_grade: 'C', 916 conversion_score: 75, 917 is_business_directory: false, 918 is_local_business: true, 919 is_error_page: false, 920 is_broken_site: false, 921 }, 922 })); 923 924 await runScoringStage({ limit: 1 }); 925 926 const successUpdate = findMainSuccessUpdate(); 927 assert.ok(successUpdate, 'Expected a success UPDATE'); 928 assert.strictEqual( 929 successUpdate.args[5], 930 'high_score', 931 'score 75 should be high_score when 
cutoff=70' 932 ); 933 }); 934 }); 935 936 // --------------------------------------------------------------------------- 937 describe('runScoringStage — ENABLE_VISION=false (HTML-only mode)', () => { 938 beforeEach(() => { 939 resetState(); 940 resetEnv(); 941 process.env.ENABLE_VISION = 'false'; 942 process.env.ENABLE_LLM_SCORING = 'false'; // prevent early orchestrator-mode return 943 }); 944 945 test('sets status to rescored instead of scored in HTML-only mode (low score)', async () => { 946 const site = makeSiteRow(); 947 mockSites = [site]; 948 mockSiteDetails[1] = makeSiteDetail(); 949 mockCheckBlocklist.mock.mockImplementation(() => null); 950 951 mockScoreWebsite.mock.mockImplementation(async () => ({ 952 overall_calculation: { 953 letter_grade: 'D-', 954 conversion_score: 60, 955 is_business_directory: false, 956 is_local_business: true, 957 is_error_page: false, 958 is_broken_site: false, 959 }, 960 contact_details: [{ email: 'owner@example.com' }], 961 })); 962 963 await runScoringStage({ limit: 1 }); 964 965 // Status must be 'rescored' in HTML-only mode (args[5]) 966 const successUpdate = findMainSuccessUpdate(); 967 assert.ok(successUpdate, 'Expected a success UPDATE'); 968 assert.strictEqual( 969 successUpdate.args[5], 970 'semantic_scored', 971 'status should be semantic_scored in HTML-only mode' 972 ); 973 }); 974 975 test('saves contacts_json (non-null) when ENABLE_VISION=false and contact_details present', async () => { 976 const site = makeSiteRow(); 977 mockSites = [site]; 978 mockSiteDetails[1] = makeSiteDetail(); 979 mockCheckBlocklist.mock.mockImplementation(() => null); 980 981 // In programmatic mode (ENABLE_LLM_SCORING=false), contacts come from the programmatic scorer 982 mockScoreWebsiteProgrammatically.mock.mockImplementation(() => ({ 983 conversion_score: 40, 984 letter_grade: 'F', 985 is_error_page: false, 986 is_broken_site: false, 987 is_business_directory: false, 988 is_local_business: true, 989 is_law_firm: false, 990 
industry_classification: '', 991 country_code: 'AU', 992 city: null, 993 state: null, 994 factor_scores: {}, 995 contacts: { 996 email_addresses: [{ email: 'contact@example.com', source: 'footer' }], 997 phone_numbers: [], 998 social_profiles: [], 999 has_contact_form: false, 1000 key_pages: [], 1001 }, 1002 })); 1003 1004 await runScoringStage({ limit: 1 }); 1005 1006 // contacts written to filesystem via setContactsJson, not stored in DB 1007 assert.equal(mockSetContactsJson.mock.callCount(), 1, 'setContactsJson should be called once'); 1008 const [calledSiteId, calledContacts] = mockSetContactsJson.mock.calls[0].arguments; 1009 assert.equal(calledSiteId, 1, 'setContactsJson siteId should match'); 1010 assert.ok(calledContacts, 'contacts arg should be truthy'); 1011 const parsedContacts = typeof calledContacts === 'string' ? JSON.parse(calledContacts) : calledContacts; 1012 assert.ok(parsedContacts && typeof parsedContacts === 'object', 'contacts arg should be a JSON object'); 1013 }); 1014 1015 test('does not save contacts_json when ENABLE_VISION=true (contacts_json=null)', async () => { 1016 process.env.ENABLE_VISION = 'true'; // Override the beforeEach 1017 1018 const site = makeSiteRow(); 1019 mockSites = [site]; 1020 mockSiteDetails[1] = makeSiteDetail(); 1021 mockCheckBlocklist.mock.mockImplementation(() => null); 1022 1023 mockScoreWebsite.mock.mockImplementation(async () => ({ 1024 overall_calculation: { 1025 letter_grade: 'D', 1026 conversion_score: 65, 1027 is_business_directory: false, 1028 is_local_business: true, 1029 is_error_page: false, 1030 is_broken_site: false, 1031 }, 1032 contact_details: [{ email: 'owner@example.com' }], 1033 })); 1034 1035 await runScoringStage({ limit: 1 }); 1036 1037 // when vision is enabled, setContactsJson should NOT be called 1038 assert.equal( 1039 mockSetContactsJson.mock.callCount(), 1040 0, 1041 'setContactsJson should not be called when ENABLE_VISION=true' 1042 ); 1043 }); 1044 1045 test('still sets high_score 
status in HTML-only mode when score > threshold', async () => { 1046 const site = makeSiteRow(); 1047 mockSites = [site]; 1048 mockSiteDetails[1] = makeSiteDetail(); 1049 mockCheckBlocklist.mock.mockImplementation(() => null); 1050 1051 // In programmatic mode (ENABLE_LLM_SCORING=false from beforeEach), use programmatic mock 1052 mockScoreWebsiteProgrammatically.mock.mockImplementation(() => ({ 1053 conversion_score: 98, 1054 letter_grade: 'A+', 1055 is_error_page: false, 1056 is_broken_site: false, 1057 is_business_directory: false, 1058 is_local_business: true, 1059 is_law_firm: false, 1060 industry_classification: '', 1061 country_code: 'AU', 1062 city: null, 1063 state: null, 1064 factor_scores: {}, 1065 contacts: null, 1066 })); 1067 1068 await runScoringStage({ limit: 1 }); 1069 1070 // high_score takes priority over HTML-only 'rescored' path 1071 const successUpdate = findMainSuccessUpdate(); 1072 assert.ok(successUpdate, 'Expected a success UPDATE'); 1073 assert.strictEqual( 1074 successUpdate.args[5], 1075 'high_score', 1076 'high_score takes priority even in HTML-only mode' 1077 ); 1078 }); 1079 }); 1080 1081 // --------------------------------------------------------------------------- 1082 describe('runScoringStage — business directory detection', () => { 1083 beforeEach(() => { 1084 resetState(); 1085 resetEnv(); 1086 process.env.NODE_ENV = 'production'; // enable directory/local checks 1087 }); 1088 1089 test('marks site as ignore when LLM detects business directory (is_business_directory=true)', async () => { 1090 const site = makeSiteRow(); 1091 mockSites = [site]; 1092 mockSiteDetails[1] = makeSiteDetail(); 1093 mockCheckBlocklist.mock.mockImplementation(() => null); 1094 1095 mockScoreWebsite.mock.mockImplementation(async () => ({ 1096 overall_calculation: { 1097 letter_grade: 'F', 1098 conversion_score: 30, 1099 is_business_directory: true, 1100 is_local_business: true, 1101 is_error_page: false, 1102 is_broken_site: false, 1103 }, 1104 })); 1105 
1106 await runScoringStage({ limit: 1 }); 1107 1108 // The directory UPDATE: UPDATE sites SET status='ignore', error_message=?, score_json=? WHERE id=? 1109 // args = ['Ignored: Business directory (LLM detected)', scoreJson, id] 1110 const ignoreUpdate = findUpdateWithStatus('ignored'); 1111 assert.ok(ignoreUpdate, 'Expected an UPDATE setting status=ignore'); 1112 assert.ok( 1113 ignoreUpdate.args[0].includes('Business directory'), 1114 'Error message should mention business directory' 1115 ); 1116 }); 1117 1118 test('marks site as ignore when LLM detects non-local business (is_local_business=false)', async () => { 1119 const site = makeSiteRow(); 1120 mockSites = [site]; 1121 mockSiteDetails[1] = makeSiteDetail(); 1122 mockCheckBlocklist.mock.mockImplementation(() => null); 1123 1124 mockScoreWebsite.mock.mockImplementation(async () => ({ 1125 overall_calculation: { 1126 letter_grade: 'B-', 1127 conversion_score: 80, 1128 is_business_directory: false, 1129 is_local_business: false, 1130 is_error_page: false, 1131 is_broken_site: false, 1132 }, 1133 })); 1134 1135 await runScoringStage({ limit: 1 }); 1136 1137 const ignoreUpdate = findUpdateWithStatus('ignored'); 1138 assert.ok(ignoreUpdate, 'Expected an UPDATE setting status=ignore for non-local business'); 1139 assert.ok( 1140 ignoreUpdate.args[0].toLowerCase().includes('local'), 1141 'Error message should mention local business' 1142 ); 1143 }); 1144 1145 test('does NOT mark site as ignore in test mode (NODE_ENV=test) for directory detection', async () => { 1146 process.env.NODE_ENV = 'test'; 1147 1148 const site = makeSiteRow(); 1149 mockSites = [site]; 1150 mockSiteDetails[1] = makeSiteDetail(); 1151 mockCheckBlocklist.mock.mockImplementation(() => null); 1152 1153 mockScoreWebsite.mock.mockImplementation(async () => ({ 1154 overall_calculation: { 1155 letter_grade: 'F', 1156 conversion_score: 30, 1157 is_business_directory: true, 1158 is_local_business: false, 1159 is_error_page: false, 1160 is_broken_site: 
false, 1161 }, 1162 })); 1163 1164 await runScoringStage({ limit: 1 }); 1165 1166 // In test mode the directory/non-local checks are skipped — should score normally 1167 const successUpdate = findMainSuccessUpdate(); 1168 assert.ok(successUpdate, 'Expected a normal success UPDATE in test mode'); 1169 1170 // No ignore update from directory/local detection 1171 const ignoreUpdate = dbCalls.updates.find( 1172 u => 1173 u.sql.includes("status = 'ignored'") && 1174 u.args.some(a => typeof a === 'string' && (a.includes('directory') || a.includes('local'))) 1175 ); 1176 assert.ok(!ignoreUpdate, 'Should NOT have a directory/local ignore UPDATE in test mode'); 1177 }); 1178 }); 1179 1180 // --------------------------------------------------------------------------- 1181 describe('runScoringStage — error page detection', () => { 1182 beforeEach(() => { 1183 resetState(); 1184 resetEnv(); 1185 }); 1186 1187 test('marks site as ignore when is_error_page=true and errorType=404 (permanent)', async () => { 1188 const site = makeSiteRow(); 1189 mockSites = [site]; 1190 mockSiteDetails[1] = makeSiteDetail(); 1191 mockCheckBlocklist.mock.mockImplementation(() => null); 1192 1193 mockScoreWebsite.mock.mockImplementation(async () => ({ 1194 overall_calculation: { 1195 letter_grade: null, 1196 conversion_score: null, 1197 is_business_directory: false, 1198 is_local_business: true, 1199 is_error_page: true, 1200 error_type: '404', 1201 error_description: 'Page not found', 1202 is_broken_site: false, 1203 }, 1204 })); 1205 1206 await runScoringStage({ limit: 1 }); 1207 1208 const ignoreUpdate = findUpdateWithStatus('ignored'); 1209 assert.ok(ignoreUpdate, 'Expected UPDATE setting status=ignore for 404 error'); 1210 assert.ok( 1211 ignoreUpdate.args[0].includes('Page not found'), 1212 'Error message should come from error_description' 1213 ); 1214 }); 1215 1216 test('marks site as ignore when errorType=403 (permanent)', async () => { 1217 const site = makeSiteRow(); 1218 mockSites = 
[site]; 1219 mockSiteDetails[1] = makeSiteDetail(); 1220 mockCheckBlocklist.mock.mockImplementation(() => null); 1221 1222 mockScoreWebsite.mock.mockImplementation(async () => ({ 1223 overall_calculation: { 1224 letter_grade: null, 1225 conversion_score: null, 1226 is_business_directory: false, 1227 is_local_business: true, 1228 is_error_page: true, 1229 error_type: '403', 1230 error_description: 'Access forbidden', 1231 is_broken_site: false, 1232 }, 1233 })); 1234 1235 await runScoringStage({ limit: 1 }); 1236 1237 const ignoreUpdate = findUpdateWithStatus('ignored'); 1238 assert.ok(ignoreUpdate, 'Expected UPDATE setting status=ignore for 403 error'); 1239 assert.ok(ignoreUpdate.args[0].includes('Access forbidden')); 1240 }); 1241 1242 test('marks site as ignore when errorType=410 (permanent), uses fallback message', async () => { 1243 const site = makeSiteRow(); 1244 mockSites = [site]; 1245 mockSiteDetails[1] = makeSiteDetail(); 1246 mockCheckBlocklist.mock.mockImplementation(() => null); 1247 1248 mockScoreWebsite.mock.mockImplementation(async () => ({ 1249 overall_calculation: { 1250 letter_grade: null, 1251 conversion_score: null, 1252 is_business_directory: false, 1253 is_local_business: true, 1254 is_error_page: true, 1255 error_type: '410', 1256 error_description: null, // null → falls back to "Permanent error: 410" 1257 is_broken_site: false, 1258 }, 1259 })); 1260 1261 await runScoringStage({ limit: 1 }); 1262 1263 const ignoreUpdate = findUpdateWithStatus('ignored'); 1264 assert.ok(ignoreUpdate, 'Expected UPDATE setting status=ignore for 410 error'); 1265 // Fallback message: "Permanent error: 410" 1266 assert.ok( 1267 ignoreUpdate.args[0].includes('410') || ignoreUpdate.args[0].includes('Permanent'), 1268 'Error message should reference 410 or Permanent error' 1269 ); 1270 }); 1271 1272 test('keeps status=assets_captured for temporary 5xx error page', async () => { 1273 const site = makeSiteRow(); 1274 mockSites = [site]; 1275 mockSiteDetails[1] = 
makeSiteDetail(); 1276 mockCheckBlocklist.mock.mockImplementation(() => null); 1277 1278 mockScoreWebsite.mock.mockImplementation(async () => ({ 1279 overall_calculation: { 1280 letter_grade: null, 1281 conversion_score: null, 1282 is_business_directory: false, 1283 is_local_business: true, 1284 is_error_page: true, 1285 error_type: '5xx', 1286 error_description: 'Server error', 1287 is_broken_site: false, 1288 }, 1289 })); 1290 1291 await runScoringStage({ limit: 1 }); 1292 1293 // Temporary error: UPDATE sets status = 'assets_captured' in SQL 1294 const temporaryUpdate = findUpdateWithStatus('assets_captured'); 1295 assert.ok(temporaryUpdate, 'Expected UPDATE setting status=assets_captured for 5xx'); 1296 assert.ok( 1297 temporaryUpdate.args[0].includes('Server error'), 1298 'Error description should be saved' 1299 ); 1300 1301 // Must NOT have an ignore UPDATE 1302 const ignoreUpdate = findUpdateWithStatus('ignored'); 1303 assert.ok(!ignoreUpdate, 'Should NOT set status=ignore for temporary 5xx error'); 1304 }); 1305 1306 test('keeps status=assets_captured for maintenance error page', async () => { 1307 const site = makeSiteRow(); 1308 mockSites = [site]; 1309 mockSiteDetails[1] = makeSiteDetail(); 1310 mockCheckBlocklist.mock.mockImplementation(() => null); 1311 1312 mockScoreWebsite.mock.mockImplementation(async () => ({ 1313 overall_calculation: { 1314 letter_grade: null, 1315 conversion_score: null, 1316 is_business_directory: false, 1317 is_local_business: true, 1318 is_error_page: true, 1319 error_type: 'maintenance', 1320 error_description: 'Under maintenance', 1321 is_broken_site: false, 1322 }, 1323 })); 1324 1325 await runScoringStage({ limit: 1 }); 1326 1327 const temporaryUpdate = findUpdateWithStatus('assets_captured'); 1328 assert.ok( 1329 temporaryUpdate, 1330 'Expected UPDATE setting status=assets_captured for maintenance page' 1331 ); 1332 }); 1333 }); 1334 1335 // --------------------------------------------------------------------------- 1336 
describe('runScoringStage — broken site detection', () => {
  /**
   * Build the scoreWebsite() payload shared by this suite: an unscored site
   * (null grade/score) flagged as broken, carrying the given detail strings.
   *
   * @param {string[]} brokenSiteDetails - values for broken_site_details
   * @returns {{ overall_calculation: object }} mocked scoring result
   */
  const brokenSiteResult = brokenSiteDetails => ({
    overall_calculation: {
      letter_grade: null,
      conversion_score: null,
      is_business_directory: false,
      is_local_business: true,
      is_error_page: false,
      is_broken_site: true,
      broken_site_details: brokenSiteDetails,
    },
  });

  /**
   * Locate the recorded recapture UPDATE, i.e. the statement whose SQL both
   * keeps status = 'assets_captured' and writes recapture_count.
   * Its bound args are [errorMsg, recaptureCount, scoreJson, id].
   *
   * @returns {{ sql: string, args: any[] } | undefined}
   */
  const findRecaptureUpdate = () =>
    dbCalls.updates.find(
      u => u.sql.includes("status = 'assets_captured'") && u.sql.includes('recapture_count')
    );

  beforeEach(() => {
    resetState();
    resetEnv();
  });

  test('schedules recapture for broken site (first attempt)', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockRecaptureData = { recapture_count: 0 }; // no previous recapture attempts

    mockScoreWebsite.mock.mockImplementation(async () =>
      brokenSiteResult(['JavaScript errors', 'Blank page'])
    );

    await runScoringStage({ limit: 1 });

    const brokenUpdate = findRecaptureUpdate();
    assert.ok(brokenUpdate, 'Expected a broken-site UPDATE with assets_captured + recapture_count');

    // args: [errorMsg, recaptureCount, scoreJson, id]
    assert.ok(
      brokenUpdate.args[0].includes('Broken site'),
      'Error message should mention broken site'
    );
    assert.strictEqual(brokenUpdate.args[1], 1, 'recapture_count should be 1 (0+1)');

    // A first failure must only schedule a retry, never discard the site.
    const ignoreUpdate = findUpdateWithStatus('ignored');
    assert.ok(!ignoreUpdate, 'Should NOT mark as ignored on first broken-site attempt');
  });

  test('increments recapture_count on second broken site attempt (count 1→2)', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockRecaptureData = { recapture_count: 1 }; // one previous attempt

    mockScoreWebsite.mock.mockImplementation(async () => brokenSiteResult(['Layout broken']));

    await runScoringStage({ limit: 1 });

    const brokenUpdate = findRecaptureUpdate();
    assert.ok(brokenUpdate, 'Expected a broken-site UPDATE');
    // args[1] = recaptureCount = 1 + 1
    assert.strictEqual(brokenUpdate.args[1], 2, 'recapture_count should be 2 on second attempt');
  });

  test('marks site as ignore when broken site exceeds max retries (recapture_count >= 3 → count=4)', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    // 3 previous attempts; 3+1=4 > 3 → max exceeded
    mockRecaptureData = { recapture_count: 3 };

    mockScoreWebsite.mock.mockImplementation(async () => brokenSiteResult(['Persistent crash']));

    await runScoringStage({ limit: 1 });

    // When the maximum is exceeded the site is given up on: the UPDATE matched
    // here is the one found by findUpdateWithStatus('ignored').
    const ignoreUpdate = findUpdateWithStatus('ignored');
    assert.ok(ignoreUpdate, 'Expected status=ignored when max recapture retries exceeded');
    assert.ok(
      ignoreUpdate.args[0].includes('Max recapture'),
      'Error message should mention Max recapture'
    );

    // ...and no further recapture (assets_captured + recapture_count) is scheduled.
    assert.ok(!findRecaptureUpdate(), 'Should NOT schedule another recapture when max exceeded');
  });
});

// ---------------------------------------------------------------------------
describe('runScoringStage — keyword counter', () => {
  /**
   * Build a scoreWebsite() payload for an ordinary, healthy local-business
   * site (no directory / error-page / broken-site flags) with the given grade.
   *
   * @param {string} letterGrade - value for letter_grade
   * @param {number} conversionScore - value for conversion_score
   * @returns {{ overall_calculation: object }} mocked scoring result
   */
  const scoredResult = (letterGrade, conversionScore) => ({
    overall_calculation: {
      letter_grade: letterGrade,
      conversion_score: conversionScore,
      is_business_directory: false,
      is_local_business: true,
      is_error_page: false,
      is_broken_site: false,
    },
  });

  beforeEach(() => {
    resetState();
    resetEnv();
  });

  test('increments low_scoring counter when site scores low and has keyword data', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockKeywordData = { keyword: 'plumber near me', country_code: 'AU' };

    mockScoreWebsite.mock.mockImplementation(async () => scoredResult('F', 45));

    await runScoringStage({ limit: 1 });

    assert.strictEqual(mockIncrementLowScoring.mock.callCount(), 1);
  });

  test('does not increment low_scoring counter for high score site (score > threshold)', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockKeywordData = { keyword: 'plumber near me', country_code: 'AU' };

    mockScoreWebsite.mock.mockImplementation(async () => scoredResult('A', 95));

    await runScoringStage({ limit: 1 });

    assert.strictEqual(mockIncrementLowScoring.mock.callCount(), 0);
  });

  test('does not increment low_scoring counter when keyword data is null', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockKeywordData = null; // no keyword on site

    mockScoreWebsite.mock.mockImplementation(async () => scoredResult('F', 55));

    await runScoringStage({ limit: 1 });

    assert.strictEqual(mockIncrementLowScoring.mock.callCount(), 0);
  });
});

// ---------------------------------------------------------------------------
describe('runScoringStage — error handling', () => {
  beforeEach(() => {
    resetState();
    resetEnv();
  });

  test('records failure via recordFailure when scoreWebsite throws', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);

    mockScoreWebsite.mock.mockImplementation(async () => {
      throw new Error('OpenRouter API timeout');
    });

    const result = await runScoringStage({ limit: 1 });

    // The stage resolves (does not reject) and reports the site as failed.
    assert.strictEqual(result.processed, 1);
    assert.strictEqual(result.failed, 1);
    assert.strictEqual(result.succeeded, 0);

    // recordFailure called with correct site ID and stage name
    // API: recordFailure(siteId, stageName, error, currentStatus)
    assert.strictEqual(mockRecordFailure.mock.callCount(), 1);
    const [failedSiteId, failedStage] = mockRecordFailure.mock.calls[0].arguments;
    assert.strictEqual(failedSiteId, 1, 'siteId should be 1');
    assert.strictEqual(failedStage, 'scoring', 'stageName should be scoring');
  });

  test('returns correct counts for mixed success/failure across multiple sites', async () => {
    const urls = { 1: 'https://site1.com', 2: 'https://site2.com' };
    mockSites = [makeSiteRow({ id: 1, url: urls[1] }), makeSiteRow({ id: 2, url: urls[2] })];
    mockSiteDetails[1] = makeSiteDetail({ id: 1, url: urls[1] });
    mockSiteDetails[2] = makeSiteDetail({ id: 2, url: urls[2] });
    mockCheckBlocklist.mock.mockImplementation(() => null);

    // site1 scores normally, site2 makes the scorer throw.
    mockScoreWebsite.mock.mockImplementation(async siteData => {
      if (siteData.url === 'https://site2.com') {
        throw new Error('API error for site2');
      }
      return {
        overall_calculation: {
          letter_grade: 'D-',
          conversion_score: 60,
          is_business_directory: false,
          is_local_business: true,
          is_error_page: false,
          is_broken_site: false,
        },
      };
    });

    const result = await runScoringStage({ limit: 2 });

    assert.strictEqual(result.processed, 2);
    assert.strictEqual(result.succeeded, 1);
    assert.strictEqual(result.failed, 1);
  });
});

// ---------------------------------------------------------------------------
describe('runScoringStage — return value structure', () => {
  /**
   * Build a scoreWebsite() payload for an ordinary, healthy local-business
   * site (no directory / error-page / broken-site flags) with the given grade.
   *
   * @param {string} letterGrade - value for letter_grade
   * @param {number} conversionScore - value for conversion_score
   * @returns {{ overall_calculation: object }} mocked scoring result
   */
  const scoredResult = (letterGrade, conversionScore) => ({
    overall_calculation: {
      letter_grade: letterGrade,
      conversion_score: conversionScore,
      is_business_directory: false,
      is_local_business: true,
      is_error_page: false,
      is_broken_site: false,
    },
  });

  beforeEach(() => {
    resetState();
    resetEnv();
  });

  test('returns all required fields in the stats object', async () => {
    mockSites = [makeSiteRow()];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);
    mockGradeRows = [{ grade: 'D-', count: 1 }];

    mockScoreWebsite.mock.mockImplementation(async () => scoredResult('D-', 60));

    const result = await runScoringStage({ limit: 1 });

    const requiredFields = [
      'processed',
      'succeeded',
      'failed',
      'skipped',
      'duration',
      'gradeDistribution',
    ];
    for (const field of requiredFields) {
      assert.ok(field in result, `Should have ${field} field`);
    }
    assert.ok(typeof result.duration === 'number', 'Duration should be a number');
    assert.strictEqual(result.gradeDistribution['D-'], 1);
  });

  test('returns correct counts for a fully successful run of 2 sites', async () => {
    const urls = { 1: 'https://site1.com', 2: 'https://site2.com' };
    mockSites = [makeSiteRow({ id: 1, url: urls[1] }), makeSiteRow({ id: 2, url: urls[2] })];
    mockSiteDetails[1] = makeSiteDetail({ id: 1, url: urls[1] });
    mockSiteDetails[2] = makeSiteDetail({ id: 2, url: urls[2] });
    mockCheckBlocklist.mock.mockImplementation(() => null);

    mockScoreWebsite.mock.mockImplementation(async () => scoredResult('C', 75));

    const result = await runScoringStage({ limit: 2 });

    assert.strictEqual(result.processed, 2);
    assert.strictEqual(result.succeeded, 2);
    assert.strictEqual(result.failed, 0);
  });

  test('duration is positive number in milliseconds', async () => {
    mockSites = [];

    const result = await runScoringStage();

    // Despite the title, only non-negativity is asserted: 0 is accepted.
    assert.ok(typeof result.duration === 'number', 'duration should be number');
    assert.ok(result.duration >= 0, 'duration should be non-negative');
  });
});

// ─── classifyIndustry truthy branch ─────────────────────────────────────────

describe('runScoringStage — classifyIndustry ignore', () => {
  beforeEach(() => {
    // Same full reset as every other suite, so these tests do not depend on
    // mock implementations, site fixtures or env vars left by earlier suites.
    resetState();
    resetEnv();
    dbCalls.updates = [];
    mockClassifyIndustry.mock.resetCalls();
  });

  /**
   * Find a recorded UPDATE that sets status = 'ignored' and has at least one
   * bound argument containing the given text.
   *
   * @param {string} reasonText - substring expected in one of the bound args
   * @returns {{ sql: string, args: any[] } | undefined}
   */
  const findIgnoredUpdateMentioning = reasonText =>
    dbCalls.updates.find(
      u => u.sql.includes("status = 'ignored'") && u.args.some(a => String(a).includes(reasonText))
    );

  test('marks site as ignore when classifyIndustry returns legal industry', async () => {
    mockSites = [makeSiteRow({ id: 1, domain: 'lawfirm.com' })];
    mockSiteDetails[1] = makeSiteDetail();
    mockCheckBlocklist.mock.mockImplementation(() => null);

    // classifyIndustry returns legal industry
    mockClassifyIndustry.mock.mockImplementationOnce(() => ({
      type: 'legal',
      reason: 'domain contains "law"',
    }));

    await runScoringStage({ limit: 1 });

    // Only the recorded UPDATE is asserted, not the returned counts
    // (presumably the site is filtered out before batch processing — the
    // original author noted `processed` may be 0 here).
    const ignoreUpdate = findIgnoredUpdateMentioning('Ignored');
    assert.ok(ignoreUpdate, 'Should have marked site as ignored for legal industry');
  });

  test('marks site as ignore when classifyIndustry returns regulated industry (non-legal)', async () => {
    mockSites = [makeSiteRow({ id: 2, domain: 'mypharmacy.com' })];
    mockSiteDetails[2] = makeSiteDetail({ id: 2 });
    mockCheckBlocklist.mock.mockImplementation(() => null);

    mockClassifyIndustry.mock.mockImplementationOnce(() => ({
      type: 'pharmaceutical',
      reason: 'domain contains "pharma"',
    }));

    await runScoringStage({ limit: 1 });

    const ignoreUpdate = findIgnoredUpdateMentioning('Regulated');
    assert.ok(ignoreUpdate, 'Should have marked site as ignored for regulated industry');
  });
});

// ─── getScoringStats ─────────────────────────────────────────────────────────

describe('getScoringStats', () => {
  test('returns stats object with expected fields', async () => {
    mockGradeRows = [
      { grade: 'A', count: 2 },
      { grade: 'B', count: 3 },
      { grade: 'F', count: 5 },
    ];

    const stats = await getScoringStats();

    assert.ok(typeof stats.total_sites === 'number', 'total_sites should be a number');
    assert.ok(typeof stats.avg_score === 'number', 'avg_score should be a number');
    assert.ok(typeof stats.gradeDistribution === 'object', 'gradeDistribution should be an object');

    // Each mocked grade row must surface as gradeDistribution[grade] === count.
    assert.strictEqual(stats.gradeDistribution.A, 2);
    assert.strictEqual(stats.gradeDistribution.B, 3);
    assert.strictEqual(stats.gradeDistribution.F, 5);
  });

  test('returns gradeDistribution as empty object when no grades exist', async () => {
    mockGradeRows = [];

    const stats = await getScoringStats();

    assert.deepStrictEqual(stats.gradeDistribution, {});
  });
});