assets.test.js
/**
 * Unit Tests for Assets Stage
 *
 * Exercises the runAssetsStage(), captureSiteScreenshots(), getAssetsStats(),
 * and backfillScreenshots() flows.
 *
 * Key behaviors tested:
 * - HTML DOM must be non-null/non-empty before marking assets_captured
 * - Null html from capture throws and triggers retry
 * - Empty html from capture throws and triggers retry
 * - Successful capture with valid HTML marks assets_captured
 * - Vision-enabled path captures screenshots and validates them
 * - Blocklist filtering marks sites as 'ignore'
 * - Error page detection schedules retry in 7 days
 * - getAssetsStats() returns correct shape
 * - backfillScreenshots() delegates to runAssetsStage
 *
 * Run with:
 *   NODE_ENV=test LOGS_DIR=/tmp/test-logs DATABASE_PATH=/tmp/test-assets.db \
 *   node --experimental-test-module-mocks --test tests/stages/assets.test.js
 */

import { test, describe, mock, beforeEach } from 'node:test';
import assert from 'node:assert';
import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars

// ============================================================================
// MOCK DEFINITIONS — ALL mock.module() calls MUST come before any dynamic imports
// ============================================================================

// Every UPDATE and query issued through the mocked DB layer is recorded here
// so tests can assert on what was written.
const dbCalls = { updates: [], queries: [] };

// Mutable fixtures read by the mocks; tests reassign these to steer behavior.
let mockSitesFound = []; // rows handed back by the site-selection queries
let mockKeywordData = null; // row handed back by the keyword lookup
let mockPageHtml = '<html><body>Test page content</body></html>'; // page.content() result
let mockStatsRow = {
  total_sites: 10,
  sites_with_screenshots: 5,
  captured_sites: 8,
  pending_capture: 2,
  failed_capture: 1,
};
let mockDedupeStats = { sitesIgnored: 0, duplicateDomains: 0, crossBorder: 0 };

// Return value of the croppedScreenshotsExist mock
let mockScreenshotsExist = { exists: true, missing: [] };
// Return value of the captureWebsite mock (declared immediately below)
let mockCaptureResult = null;
// Return value of the checkBlocklist mock
let mockBlocklistResult = null;
// Return value of the detectErrorPage mock
let mockErrorPageResult = { isErrorPage: false };

/**
 * Minimal stand-in for a better-sqlite3 Database.
 *
 * prepare() returns a statement whose all()/get()/run() route on substrings of
 * the SQL text and answer from the shared mock state; `UPDATE sites`
 * statements are recorded in dbCalls.updates.
 */
class MockDatabase {
  constructor(_path) {
    this._closed = false;
  }

  prepare(sql) {
    const trimmed = sql.trim();

    return {
      all: () => {
        dbCalls.queries.push(trimmed);
        // Main query: SELECT sites needing HTML capture (ENABLE_VISION=false path)
        if (trimmed.includes("status = 'found'") && trimmed.includes('html_dom IS NULL')) {
          return mockSitesFound;
        }
        // Vision-enabled main query
        if (
          trimmed.includes("status = 'found' OR status = 'assets_captured'") ||
          (trimmed.includes("status = 'found'") && trimmed.includes('screenshot_path'))
        ) {
          return mockSitesFound;
        }
        // backfillScreenshots query
        if (trimmed.includes('screenshot_path IS NULL') && trimmed.includes('LIMIT ?')) {
          return mockSitesFound;
        }
        return [];
      },

      get: _id => {
        dbCalls.queries.push(trimmed);
        // Stats query
        if (trimmed.includes('total_sites')) {
          return mockStatsRow;
        }
        // retry_count query from recordFailure
        if (trimmed.includes('retry_count')) {
          return { retry_count: 0 };
        }
        // keyword query from incrementAssetsScraped
        if (trimmed.includes('keyword') && trimmed.includes('country_code')) {
          return mockKeywordData;
        }
        return null;
      },

      run: (...args) => {
        if (trimmed.startsWith('UPDATE sites')) {
          dbCalls.updates.push({ sql: trimmed, args });
        }
        return { changes: 1, lastInsertRowid: 0 };
      },
    };
  }

  pragma() {
    return undefined;
  }

  exec() {
    return undefined;
  }

  // Runs the callback directly; no real transactional behavior is emulated.
  transaction(fn) {
    return (...args) => fn(...args);
  }

  close() {
    this._closed = true;
  }
}

/**
 * Builds the pg-style result for a keyword lookup from the current
 * mockKeywordData. rowCount always equals rows.length so callers that check
 * either field see a consistent answer.
 *
 * @returns {{ rows: object[], rowCount: number }}
 */
function keywordLookupResult() {
  const rows = mockKeywordData ? [mockKeywordData] : [];
  return { rows, rowCount: rows.length };
}

/**
 * True when the SQL text looks like the keyword lookup (mentions both
 * `keyword` and `country_code`).
 *
 * @param {string} sql - Trimmed SQL text.
 * @returns {boolean}
 */
function isKeywordLookup(sql) {
  return sql.includes('keyword') && sql.includes('country_code');
}

// Mock better-sqlite3 (legacy — assets.js now uses db.js, but kept for any transitive deps)
mock.module('better-sqlite3', {
  defaultExport: MockDatabase,
});

// Mock db.js — assets.js uses db.js (PostgreSQL), not better-sqlite3 directly
mock.module('../../src/utils/db.js', {
  namedExports: {
    getPool: () => ({}),
    getAll: async sql => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      // HTML-only path: sites WHERE status = 'found' AND html_dom IS NULL
      if (trimmed.includes("status = 'found'") && trimmed.includes('html_dom IS NULL')) {
        return mockSitesFound;
      }
      // Vision path: status = 'found' OR status = 'assets_captured'
      if (trimmed.includes("status = 'found' OR status = 'assets_captured'")) {
        return mockSitesFound;
      }
      // backfillScreenshots query
      if (trimmed.includes('screenshot_path IS NULL')) {
        return mockSitesFound;
      }
      return [];
    },
    getOne: async sql => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      if (trimmed.includes('total_sites')) {
        return mockStatsRow;
      }
      if (trimmed.includes('retry_count')) {
        return { retry_count: 0 };
      }
      if (isKeywordLookup(trimmed)) {
        return mockKeywordData;
      }
      return null;
    },
    run: async (sql, ...args) => {
      const trimmed = sql.trim();
      if (trimmed.startsWith('UPDATE sites')) {
        dbCalls.updates.push({ sql: trimmed, args });
      }
      return { changes: 1, lastInsertRowid: 0 };
    },
    query: async (sql, params) => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      if (trimmed.startsWith('SELECT') || trimmed.startsWith('WITH')) {
        if (isKeywordLookup(trimmed)) {
          return keywordLookupResult();
        }
        return { rows: [], rowCount: 0 };
      }
      if (trimmed.startsWith('UPDATE sites')) {
        dbCalls.updates.push({ sql: trimmed, args: [params] });
      }
      return { rows: [], rowCount: 1 };
    },
    withTransaction: async fn => {
      // The callback receives a fake client exposing only query().
      const fakeClient = {
        query: async (sql, params) => {
          const trimmed = sql.trim();
          dbCalls.queries.push(trimmed);
          if (trimmed.startsWith('UPDATE sites')) {
            dbCalls.updates.push({ sql: trimmed, args: [params] });
          }
          if (isKeywordLookup(trimmed)) {
            return keywordLookupResult();
          }
          return { rows: [], rowCount: 1 };
        },
      };
      return await fn(fakeClient);
    },
    closePool: async () => {},
    createDatabaseConnection: () => ({}),
    closeDatabaseConnection: async () => {},
  },
});

// Mutable capture result for vision tests: returns mockCaptureResult when a
// test has set one, otherwise a default successful capture.
const mockCaptureWebsite = mock.fn(async () => {
  if (mockCaptureResult !== null) return mockCaptureResult;
  return {
    html: '<html><body>Test page content</body></html>',
    screenshots: {
      desktop_above: Buffer.from('img1'),
      desktop_below: Buffer.from('img2'),
      mobile_above: Buffer.from('img3'),
    },
    screenshotsUncropped: {
      desktop_above: Buffer.from('unc1'),
      desktop_below: Buffer.from('unc2'),
      mobile_above: Buffer.from('unc3'),
    },
    httpStatusCode: 200,
    sslStatus: 'https',
    httpHeaders: '{}',
    localeData: '{"htmlLang":"en","hreflangs":[]}',
  };
});

// Mock response object for page.goto()
const mockResponse = () => ({
  status: () => 200,
  headers: () => ({}),
});

mock.module('../../src/capture.js', {
  namedExports: {
    captureWebsite: mockCaptureWebsite,
    launchBrowser: mock.fn(async () => ({ close: async () => {} })),
    createStealthContext: mock.fn(async () => ({
      newPage: async () => ({
        goto: async () => mockResponse(),
        // Reads mockPageHtml at call time so tests can swap the page content.
        content: async () => mockPageHtml,
        close: async () => {},
        evaluate: async () => ({ htmlLang: null, hreflangs: [] }),
        waitForTimeout: async () => {},
        on: () => {},
      }),
      close: async () => {},
    })),
  },
});

// Mock screenshot-storage
const mockSaveScreenshots = mock.fn(async () => '/tmp/screenshots/1');
const mockCroppedScreenshotsExist = mock.fn(async () => mockScreenshotsExist);
mock.module('../../src/utils/screenshot-storage.js', {
  namedExports: {
    saveScreenshots: mockSaveScreenshots,
    croppedScreenshotsExist: mockCroppedScreenshotsExist,
  },
});

// Mock site-filters
const mockCheckBlocklist = mock.fn(() => mockBlocklistResult);
mock.module('../../src/utils/site-filters.js', {
  namedExports: {
    checkBlocklist: mockCheckBlocklist,
  },
});

// Mock keyword-counters
const mockIncrementAssetsScraped = mock.fn();
mock.module('../../src/utils/keyword-counters.js', {
  namedExports: {
    incrementAssetsScraped: mockIncrementAssetsScraped,
  },
});

// Mock dedupe-locale-aware
const mockDeduplicateSites = mock.fn(() => mockDedupeStats);
mock.module('../../src/utils/dedupe-locale-aware.js', {
  namedExports: {
    deduplicateSites: mockDeduplicateSites,
  },
});

// Mock error-page-detector
const mockDetectErrorPage = mock.fn(() => mockErrorPageResult);
mock.module('../../src/utils/error-page-detector.js', {
  namedExports: {
    detectErrorPage: mockDetectErrorPage,
  },
});

// Mock retry-handler
const mockRecordFailure = mock.fn();
const mockResetRetries = mock.fn();
mock.module('../../src/utils/retry-handler.js', {
  namedExports: {
    recordFailure: mockRecordFailure,
    resetRetries: mockResetRetries,
  },
});

// Mock adaptive-concurrency
mock.module('../../src/utils/adaptive-concurrency.js', {
  namedExports: {
    getAdaptiveConcurrencyFast: mock.fn(() => 1),
  },
});

// Mock countries
mock.module('../../src/config/countries.js', {
  namedExports: {
    getCountryByCode: mock.fn(() => ({ code: 'AU', name: 'Australia' })),
  },
});

// Mock detect-language
mock.module('../../src/utils/detect-language.js', {
  namedExports: {
    deriveLanguageCode: mock.fn(() => 'en'),
  },
});

// Mock summary-generator
mock.module('../../src/utils/summary-generator.js', {
  namedExports: {
    generateStageCompletion: mock.fn(),
    displayProgress: mock.fn(),
  },
});

// Mock error-handler processBatch - returns { results, errors } matching real API
mock.module('../../src/utils/error-handler.js', {
  namedExports: {
    // Sequentially runs the processor over every item, collecting thrown
    // errors instead of propagating them.
    processBatch: mock.fn(async (items, processor) => {
      const results = [];
      const errors = [];
      for (let i = 0; i < items.length; i++) {
        try {
          const result = await processor(items[i], i);
          results.push(result);
        } catch (err) {
          errors.push({ item: items[i], error: err });
        }
      }
      return { results, errors };
    }),
    // Pass-through: the timeout is not enforced in tests.
    withTimeout: mock.fn(async promise => promise),
  },
});

// Mock html-storage — assets.js writes HTML to filesystem, stores 'fs' sentinel in DB
const mockWriteHtmlDom = mock.fn();
const mockDeleteHtmlDom = mock.fn();
mock.module('../../src/utils/html-storage.js', {
  namedExports: {
    writeHtmlDom: mockWriteHtmlDom,
    hasHtmlDom: mock.fn(() => false),
    deleteHtmlDom: mockDeleteHtmlDom,
    readHtmlDom: mock.fn(() => null),
    writeKeyPagesHtml: mock.fn(),
    readKeyPagesHtml: mock.fn(() => null),
    deleteKeyPagesHtml: mock.fn(),
    deleteAllHtml: mock.fn(),
    DATA_DIR: '/tmp/test-html-data',
  },
});

// ============================================================================
// DYNAMIC IMPORT — after all mocks are in place
// ============================================================================

const { runAssetsStage, getAssetsStats, backfillScreenshots } =
  await import('../../src/stages/assets.js');

// ============================================================================
// HELPERS
// ============================================================================

/**
 * Restores every piece of shared mock state to its default and clears the
 * recorded calls on all tracked mock functions, so each test starts from the
 * same baseline regardless of what the previous test changed.
 */
function resetMocks() {
  mockSitesFound = [];
  mockKeywordData = null;
  mockPageHtml = '<html><body>Test page content</body></html>';
  mockCaptureResult = null;
  mockBlocklistResult = null;
  mockErrorPageResult = { isErrorPage: false };
  mockScreenshotsExist = { exists: true, missing: [] };
  // Stats and dedupe fixtures are reassigned by individual tests; reset them
  // too so a later test never observes an earlier test's values.
  mockStatsRow = {
    total_sites: 10,
    sites_with_screenshots: 5,
    captured_sites: 8,
    pending_capture: 2,
    failed_capture: 1,
  };
  mockDedupeStats = { sitesIgnored: 0, duplicateDomains: 0, crossBorder: 0 };
  dbCalls.updates = [];
  dbCalls.queries = [];
  mockRecordFailure.mock.resetCalls();
  mockResetRetries.mock.resetCalls();
  mockCaptureWebsite.mock.resetCalls();
  mockSaveScreenshots.mock.resetCalls();
  mockCheckBlocklist.mock.resetCalls();
  mockDetectErrorPage.mock.resetCalls();
  mockIncrementAssetsScraped.mock.resetCalls();
  mockDeduplicateSites.mock.resetCalls();
  mockWriteHtmlDom.mock.resetCalls();
  mockDeleteHtmlDom.mock.resetCalls();
}

// ============================================================================
// TESTS — HTML-only path (ENABLE_VISION=false)
// ============================================================================

describe('Assets Stage - html_dom validation (ENABLE_VISION=false)', () => {
  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'false';
  });

  test('rejects site when html is null (triggers retry via recordFailure)', async () => {
    mockSitesFound = [
      { id: 1, url: 'https://example.com', domain: 'example.com', country_code: 'AU' },
    ];
    mockPageHtml = null;

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called once'
    );
    const failCall = mockRecordFailure.mock.calls[0].arguments;
    assert.strictEqual(failCall[0], 1, 'siteId should be 1');
    assert.strictEqual(failCall[1], 'assets', 'stage should be assets');
    assert.match(
      failCall[2].message,
      /HTML DOM capture failed/i,
      'error should mention HTML DOM capture'
    );

    const capturedUpdates = dbCalls.updates.filter(u =>
      JSON.stringify(u.args).includes('assets_captured')
    );
    assert.strictEqual(capturedUpdates.length, 0, 'should not mark as assets_captured');
  });

  test('rejects site when html is empty/whitespace (triggers retry via recordFailure)', async () => {
    mockSitesFound = [
      { id: 2, url: 'https://example2.com', domain: 'example2.com', country_code: 'AU' },
    ];
    mockPageHtml = ' ';

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called once'
    );
    const failCall = mockRecordFailure.mock.calls[0].arguments;
    assert.match(
      failCall[2].message,
      /HTML DOM capture failed/i,
      'error should mention HTML DOM capture'
    );
  });

  test('accepts site when html has valid content', async () => {
    mockSitesFound = [
      { id: 3, url: 'https://example3.com', domain: 'example3.com', country_code: 'AU' },
    ];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    mockPageHtml = '<html><body><h1>Real business page</h1></body></html>';

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(mockRecordFailure.mock.callCount(), 0, 'recordFailure should not be called');
    assert.strictEqual(mockResetRetries.mock.callCount(), 1, 'resetRetries should be called');

    const capturedUpdates = dbCalls.updates.filter(u =>
      JSON.stringify(u.args).includes('assets_captured')
    );
    assert.ok(capturedUpdates.length > 0, 'should mark as assets_captured');

    // HTML is written to filesystem via writeHtmlDom, not stored in DB
    assert.equal(mockWriteHtmlDom.mock.callCount(), 1, 'writeHtmlDom should be called once');
    const [wSiteId, wHtml] = mockWriteHtmlDom.mock.calls[0].arguments;
    assert.equal(wSiteId, 3, 'writeHtmlDom siteId should match');
    assert.ok(
      wHtml.includes('Real business page'),
      'html_dom should contain the captured HTML'
    );
  });

  test('returns early with zero counts when no sites need capture', async () => {
    mockSitesFound = [];

    const result = await runAssetsStage({ limit: 10 });

    assert.strictEqual(result.processed, 0);
    assert.strictEqual(result.succeeded, 0);
    assert.strictEqual(result.failed, 0);
    assert.ok(typeof result.duration === 'number', 'duration should be a number');
  });

  test('increments keyword counter on successful capture', async () => {
    mockSitesFound = [
      { id: 5, url: 'https://example5.com', domain: 'example5.com', country_code: 'AU' },
    ];
    mockKeywordData = { keyword: 'electrician', country_code: 'AU' };
    mockPageHtml = '<html><body>Valid content</body></html>';

    await runAssetsStage();

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      1,
      'incrementAssetsScraped should be called'
    );
  });

  test('skips keyword increment when keyword data is missing', async () => {
    mockSitesFound = [
      { id: 6, url: 'https://example6.com', domain: 'example6.com', country_code: 'AU' },
    ];
    mockKeywordData = null;
    mockPageHtml = '<html><body>Valid content</body></html>';

    await runAssetsStage();

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      0,
      'incrementAssetsScraped should not be called without keyword data'
    );
  });

  test('marks blocklisted sites as ignore', async () => {
    mockSitesFound = [{ id: 7, url: 'https://yelp.com', domain: 'yelp.com', country_code: 'US' }];
    mockBlocklistResult = { reason: 'Directory site: yelp.com' };

    await runAssetsStage();

    // The blocklist reason must reach an UPDATE on the site row. Params may be
    // recorded flat or nested depending on which db.js entry point was used, so
    // search the serialized args (same approach as the vision-mode test).
    const ignoreUpdates = dbCalls.updates.filter(
      u =>
        u.sql.includes("status = 'ignored'") ||
        JSON.stringify(u.args).includes('Directory site: yelp.com')
    );
    assert.ok(ignoreUpdates.length > 0, 'should mark blocklisted site as ignore');
    // Blocklisted sites must not be reported as capture failures
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      0,
      'recordFailure should not be called for blocklisted sites'
    );
  });

  test('deduplicates sites before processing', { skip: 'deduplicateSites removed (DR-106): UNIQUE constraint prevents duplicates at insert time' }, async () => {
    mockSitesFound = [];

    await runAssetsStage();

    assert.ok(mockDeduplicateSites.mock.callCount() > 0, 'deduplicateSites should be called');
  });

  test('returns stats with processed/succeeded/failed counts', async () => {
    mockSitesFound = [
      { id: 8, url: 'https://good.com', domain: 'good.com', country_code: 'AU' },
      { id: 9, url: 'https://bad.com', domain: 'bad.com', country_code: 'AU' },
    ];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    // mockPageHtml is shared by every mocked page, so both sites receive the
    // same valid HTML here; this test only pins the shape of the result.
    mockPageHtml = '<html>valid</html>';

    const result = await runAssetsStage();

    assert.ok('processed' in result, 'result should have processed');
    assert.ok('succeeded' in result, 'result should have succeeded');
    assert.ok('failed' in result, 'result should have failed');
    assert.ok('duration' in result, 'result should have duration');
  });
});
// ============================================================================
// TESTS — Vision-enabled path (ENABLE_VISION=true)
// ============================================================================

/**
 * Builds a site row as used by the vision-path tests: no screenshot, no
 * html_dom and no error recorded yet.
 *
 * @param {number} id - Site id.
 * @param {string} domain - Bare domain; the url is `https://` + domain.
 * @param {string} [countryCode='AU'] - Value for country_code.
 * @returns {object} Site row fixture.
 */
function makeVisionSite(id, domain, countryCode = 'AU') {
  return {
    id,
    url: `https://${domain}`,
    domain,
    country_code: countryCode,
    screenshot_path: null,
    html_dom: null,
    error_message: null,
  };
}

/**
 * Builds a captureWebsite() result fixture: a successful HTTP 200 capture with
 * all three cropped and uncropped screenshots, with any field overridable.
 *
 * @param {object} [overrides] - Fields replacing the defaults (e.g. html,
 *   httpStatusCode, or `screenshots: null`).
 * @returns {object} Capture result fixture.
 */
function makeCaptureResult(overrides = {}) {
  return {
    html: '<html><body>Content</body></html>',
    screenshots: {
      desktop_above: Buffer.from('a'),
      desktop_below: Buffer.from('b'),
      mobile_above: Buffer.from('c'),
    },
    screenshotsUncropped: {
      desktop_above: Buffer.from('ua'),
      desktop_below: Buffer.from('ub'),
      mobile_above: Buffer.from('uc'),
    },
    httpStatusCode: 200,
    sslStatus: 'https',
    httpHeaders: '{}',
    localeData: '{}',
    ...overrides,
  };
}

describe('Assets Stage - vision-enabled path (ENABLE_VISION=true)', () => {
  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'true';
  });

  test('returns early when no candidate sites exist', async () => {
    mockSitesFound = [];

    const result = await runAssetsStage({ limit: 5 });

    assert.strictEqual(result.processed, 0);
    assert.strictEqual(result.succeeded, 0);
    assert.strictEqual(result.failed, 0);
    assert.ok(typeof result.duration === 'number');
  });

  test('captures screenshots and saves to disk on success', async () => {
    mockSitesFound = [makeVisionSite(10, 'vision.com')];
    mockKeywordData = { keyword: 'dentist', country_code: 'AU' };
    mockCaptureResult = makeCaptureResult({ html: '<html><body>Vision page</body></html>' });
    mockScreenshotsExist = { exists: true, missing: [] };

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(mockCaptureWebsite.mock.callCount(), 1, 'captureWebsite should be called');
    assert.strictEqual(mockSaveScreenshots.mock.callCount(), 1, 'saveScreenshots should be called');
    assert.strictEqual(mockResetRetries.mock.callCount(), 1, 'resetRetries should be called');
    assert.strictEqual(mockRecordFailure.mock.callCount(), 0, 'no failures expected');

    const capturedUpdates = dbCalls.updates.filter(u =>
      JSON.stringify(u.args).includes('assets_captured')
    );
    assert.ok(capturedUpdates.length > 0, 'should mark site as assets_captured');
  });

  test('throws when screenshot validation fails after save', async () => {
    mockSitesFound = [makeVisionSite(11, 'badevision.com')];
    mockCaptureResult = makeCaptureResult({ html: '<html><body>Content</body></html>' });
    // Simulate screenshot files missing after save
    mockScreenshotsExist = { exists: false, missing: ['desktop_above', 'mobile_above'] };

    await runAssetsStage({ limit: 1 });

    // recordFailure should be called because validation failed
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called when screenshots missing'
    );
    const failCall = mockRecordFailure.mock.calls[0].arguments;
    assert.match(
      failCall[2].message,
      /Screenshot validation failed/i,
      'error should mention screenshot validation'
    );
  });

  test('schedules retry when error page detected', async () => {
    mockSitesFound = [makeVisionSite(12, 'errorpage.com')];
    mockCaptureResult = makeCaptureResult({
      html: '<html><body>404 Not Found</body></html>',
      screenshots: null,
      screenshotsUncropped: null,
      httpStatusCode: 200, // False positive - 200 but it's really an error page
    });
    mockErrorPageResult = {
      isErrorPage: true,
      indicator: '404 text in body',
      wordCount: 5,
    };

    await runAssetsStage({ limit: 1 });

    // Should have updated with recapture_at (7-day retry)
    // The SQL uses NOW() + INTERVAL '7 days' (PG) or datetime('+7 days') (SQLite translated)
    const retryUpdates = dbCalls.updates.filter(
      u => u.sql.includes('recapture_at') && (u.sql.includes('7 days') || u.sql.includes('+7'))
    );
    assert.ok(retryUpdates.length > 0, 'should schedule retry in 7 days');
    // Should have called recordFailure (error page throws)
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called for error pages'
    );
  });

  test('records failure for HTTP error status codes', async () => {
    mockSitesFound = [makeVisionSite(13, 'http-error.com')];
    mockCaptureResult = makeCaptureResult({
      html: '<html><body>Not Found</body></html>',
      screenshots: null,
      screenshotsUncropped: null,
      httpStatusCode: 404,
    });

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called for HTTP 404'
    );
    const failCall = mockRecordFailure.mock.calls[0].arguments;
    assert.match(failCall[2].message, /HTTP 404/i, 'error message should mention HTTP 404');
  });

  test('marks blocklisted sites as ignore in vision mode', async () => {
    mockSitesFound = [makeVisionSite(14, 'facebook.com', 'US')];
    mockBlocklistResult = { reason: 'Social media: facebook.com' };

    await runAssetsStage({ limit: 1 });

    const ignoreUpdates = dbCalls.updates.filter(u =>
      JSON.stringify(u.args).includes('Social media')
    );
    assert.ok(ignoreUpdates.length > 0, 'should mark social media as ignore');
    // captureWebsite should NOT be called for blocked sites
    assert.strictEqual(mockCaptureWebsite.mock.callCount(), 0, 'should not capture blocked sites');
  });

  test('deduplicates sites before processing in vision mode', { skip: 'deduplicateSites removed (DR-106): UNIQUE constraint prevents duplicates at insert time' }, async () => {
    mockSitesFound = [];

    await runAssetsStage();

    assert.ok(mockDeduplicateSites.mock.callCount() > 0, 'deduplicateSites should be called');
  });

  test('logs deduplication stats when sites are ignored', { skip: 'deduplicateSites removed (DR-106): UNIQUE constraint prevents duplicates at insert time' }, async () => {
    mockDedupeStats = { sitesIgnored: 3, duplicateDomains: 2, crossBorder: 1 };
    mockSitesFound = [];

    const result = await runAssetsStage();

    assert.strictEqual(result.processed, 0);
    assert.ok(mockDeduplicateSites.mock.callCount() > 0);
  });

  test('increments keyword counter on successful vision capture', async () => {
    mockSitesFound = [makeVisionSite(15, 'dentist.com')];
    mockKeywordData = { keyword: 'dentist', country_code: 'AU' };
    mockCaptureResult = makeCaptureResult({ html: '<html><body>Dentist page</body></html>' });

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      1,
      'incrementAssetsScraped should be called'
    );
  });
});

// ============================================================================
// TESTS — getAssetsStats()
// ============================================================================

describe('getAssetsStats()', () => {
  beforeEach(() => {
    resetMocks();
  });

  test('returns stats object with expected fields', async () => {
    mockStatsRow = {
      total_sites: 100,
      sites_with_screenshots: 42,
      captured_sites: 80,
      pending_capture: 15,
      failed_capture: 5,
    };

    const stats = await getAssetsStats();

    assert.ok(stats !== null && typeof stats === 'object', 'should return an object');
    assert.strictEqual(stats.total_sites, 100);
    assert.strictEqual(stats.sites_with_screenshots, 42);
    assert.strictEqual(stats.captured_sites, 80);
    assert.strictEqual(stats.pending_capture, 15);
    assert.strictEqual(stats.failed_capture, 5);
  });

  test('returns zeros when no sites exist', async () => {
    mockStatsRow = {
      total_sites: 0,
      sites_with_screenshots: 0,
      captured_sites: 0,
      pending_capture: 0,
      failed_capture: 0,
    };

    const stats = await getAssetsStats();

    assert.strictEqual(stats.total_sites, 0);
    assert.strictEqual(stats.sites_with_screenshots, 0);
    assert.strictEqual(stats.captured_sites, 0);
    assert.strictEqual(stats.pending_capture, 0);
    assert.strictEqual(stats.failed_capture, 0);
  });

  test('all numeric fields are numbers', async () => {
    // Set the fixture explicitly so this test does not depend on whatever row
    // an earlier test happened to leave in place.
    mockStatsRow = {
      total_sites: 10,
      sites_with_screenshots: 5,
      captured_sites: 8,
      pending_capture: 2,
      failed_capture: 1,
    };

    const stats = await getAssetsStats();
    const numericFields = [
      'total_sites',
      'sites_with_screenshots',
      'captured_sites',
      'pending_capture',
      'failed_capture',
    ];
    for (const field of numericFields) {
      assert.ok(
        typeof stats[field] === 'number',
        `${field} should be a number, got ${typeof stats[field]}`
      );
    }
  });
});

// ============================================================================
// TESTS — backfillScreenshots()
// ============================================================================

describe('backfillScreenshots()', () => {
  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'false';
  });

  test('returns zero counts when no sites need backfill', async () => {
    mockSitesFound = [];

    const result = await backfillScreenshots(10);

    assert.strictEqual(result.processed, 0);
    assert.strictEqual(result.succeeded, 0);
    assert.strictEqual(result.failed, 0);
  });

  test('delegates to runAssetsStage when sites need backfill', async () => {
    mockSitesFound = [
      { id: 20, url: 'https://backfill.com', domain: 'backfill.com', country_code: 'AU' },
    ];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    mockPageHtml = '<html><body>Backfill page</body></html>';

    const result = await backfillScreenshots(5);

    // Should have processed the site via runAssetsStage
    assert.ok('processed' in result, 'result should have processed field');
    assert.ok('succeeded' in result, 'result should have succeeded field');
    assert.ok('failed' in result, 'result should have failed field');
  });

  test('uses default limit of 10', async () => {
    mockSitesFound = [];

    const result = await backfillScreenshots();

    assert.strictEqual(result.processed, 0);
  });
});

// ============================================================================
// TESTS — Legacy flag deprecation warning
// ============================================================================

describe('Assets Stage - legacy flag handling', () => {
  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'false';
  });

  test('does not throw when legacy flags are set', async () => {
    const previousFlag = process.env.ENABLE_SCREENSHOT_CAPTURE;
    process.env.ENABLE_SCREENSHOT_CAPTURE = 'true';
    mockSitesFound = [];

    try {
      const result = await runAssetsStage();

      assert.strictEqual(result.processed, 0, 'should still return valid result with legacy flags');
    } finally {
      // Always undo the env change, even when the stage throws or the
      // assertion fails, so the flag cannot leak into later tests.
      if (previousFlag === undefined) {
        delete process.env.ENABLE_SCREENSHOT_CAPTURE;
      } else {
        process.env.ENABLE_SCREENSHOT_CAPTURE = previousFlag;
      }
    }
  });
});