/ tests / stages / assets.test.js
assets.test.js
  1  /**
  2   * Unit Tests for Assets Stage
  3   *
  4   * Tests the runAssetsStage(), captureSiteScreenshots(), getAssetsStats(),
  5   * and backfillScreenshots() flows.
  6   *
  7   * Key behaviors tested:
  8   * - HTML DOM must be non-null/non-empty before marking assets_captured
  9   * - Null html from capture throws and triggers retry
 10   * - Empty html from capture throws and triggers retry
 11   * - Successful capture with valid HTML marks assets_captured
 12   * - Vision-enabled path captures screenshots and validates them
 13   * - Blocklist filtering marks sites as 'ignore'
 14   * - Error page detection schedules retry in 7 days
 15   * - getAssetsStats() returns correct shape
 16   * - backfillScreenshots() delegates to runAssetsStage
 17   *
 18   * Run with:
 19   *   NODE_ENV=test LOGS_DIR=/tmp/test-logs DATABASE_PATH=/tmp/test-assets.db \
 20   *   node --experimental-test-module-mocks --test tests/stages/assets.test.js
 21   */
 22  
 23  import { test, describe, mock, beforeEach } from 'node:test';
 24  import assert from 'node:assert';
 25  import { createPgMock } from '../helpers/pg-mock.js'; // eslint-disable-line no-unused-vars
 26  
 27  // ============================================================================
 28  // MOCK DEFINITIONS — ALL mock.module() calls MUST come before any dynamic imports
 29  // ============================================================================
 30  
 31  // Track DB calls for assertions
 32  const dbCalls = {
 33    updates: [],
 34    queries: [],
 35  };
 36  
 37  // Shared state holders for mocks
 38  let mockSitesFound = [];
 39  let mockKeywordData = null;
 40  let mockPageHtml = '<html><body>Test page content</body></html>';
 41  let mockStatsRow = {
 42    total_sites: 10,
 43    sites_with_screenshots: 5,
 44    captured_sites: 8,
 45    pending_capture: 2,
 46    failed_capture: 1,
 47  };
 48  let mockDedupeStats = { sitesIgnored: 0, duplicateDomains: 0, crossBorder: 0 };
 49  
 50  // For croppedScreenshotsExist mock control
 51  let mockScreenshotsExist = { exists: true, missing: [] };
 52  // For captureWebsite return value control
 53  let mockCaptureResult = null;
 54  // For checkBlocklist mock control
 55  let mockBlocklistResult = null;
 56  // For detectErrorPage mock control
 57  let mockErrorPageResult = { isErrorPage: false };
 58  
 59  class MockDatabase {
 60    constructor(_path) {
 61      this._closed = false;
 62    }
 63  
 64    prepare(sql) {
 65      const trimmed = sql.trim();
 66  
 67      return {
 68        all: () => {
 69          dbCalls.queries.push(trimmed);
 70          // Main query: SELECT sites needing HTML capture (ENABLE_VISION=false path)
 71          if (trimmed.includes("status = 'found'") && trimmed.includes('html_dom IS NULL')) {
 72            return mockSitesFound;
 73          }
 74          // Vision-enabled main query
 75          if (
 76            trimmed.includes("status = 'found' OR status = 'assets_captured'") ||
 77            (trimmed.includes("status = 'found'") && trimmed.includes('screenshot_path'))
 78          ) {
 79            return mockSitesFound;
 80          }
 81          // backfillScreenshots query
 82          if (trimmed.includes('screenshot_path IS NULL') && trimmed.includes('LIMIT ?')) {
 83            return mockSitesFound;
 84          }
 85          return [];
 86        },
 87  
 88        get: _id => {
 89          dbCalls.queries.push(trimmed);
 90          // Stats query
 91          if (trimmed.includes('total_sites')) {
 92            return mockStatsRow;
 93          }
 94          // retry_count query from recordFailure
 95          if (trimmed.includes('retry_count')) {
 96            return { retry_count: 0 };
 97          }
 98          // keyword query from incrementAssetsScraped
 99          if (trimmed.includes('keyword') && trimmed.includes('country_code')) {
100            return mockKeywordData;
101          }
102          return null;
103        },
104  
105        run: (...args) => {
106          if (trimmed.startsWith('UPDATE sites')) {
107            dbCalls.updates.push({ sql: trimmed, args });
108          }
109          return { changes: 1, lastInsertRowid: 0 };
110        },
111      };
112    }
113  
114    pragma() {
115      return undefined;
116    }
117  
118    exec() {
119      return undefined;
120    }
121  
122    transaction(fn) {
123      return (...args) => fn(...args);
124    }
125  
126    close() {
127      this._closed = true;
128    }
129  }
130  
// Mock better-sqlite3 (legacy — assets.js now uses db.js, but kept for any transitive deps)
mock.module('better-sqlite3', {
  defaultExport: MockDatabase,
});

// Append an `UPDATE sites` statement (already trimmed) to dbCalls.updates; other SQL is ignored.
function recordSiteUpdate(trimmedSql, args) {
  if (trimmedSql.startsWith('UPDATE sites')) {
    dbCalls.updates.push({ sql: trimmedSql, args });
  }
}

// True when the SQL text looks like the keyword lookup (mentions keyword and country_code).
function isKeywordLookup(trimmedSql) {
  return trimmedSql.includes('keyword') && trimmedSql.includes('country_code');
}

// pg-style result for the keyword lookup. rowCount mirrors rows.length so the
// fake result is self-consistent (it was previously 0 even when a row was returned).
function keywordLookupResult() {
  const rows = mockKeywordData ? [mockKeywordData] : [];
  return { rows, rowCount: rows.length };
}

// Mock db.js — assets.js uses db.js (PostgreSQL), not better-sqlite3 directly.
// Answers are chosen by marker substrings in the SQL and served from the shared mock state.
mock.module('../../src/utils/db.js', {
  namedExports: {
    getPool: () => ({}),

    // Multi-row SELECTs: any of the three site-selection queries yields mockSitesFound
    getAll: async sql => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      const wantsSites =
        // HTML-only path: status = 'found' AND html_dom IS NULL
        (trimmed.includes("status = 'found'") && trimmed.includes('html_dom IS NULL')) ||
        // Vision path: status = 'found' OR status = 'assets_captured'
        trimmed.includes("status = 'found' OR status = 'assets_captured'") ||
        // backfillScreenshots query
        trimmed.includes('screenshot_path IS NULL');
      return wantsSites ? mockSitesFound : [];
    },

    // Single-row SELECTs: stats row, retry counter, or keyword row; otherwise null
    getOne: async sql => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      if (trimmed.includes('total_sites')) {
        return mockStatsRow;
      }
      if (trimmed.includes('retry_count')) {
        return { retry_count: 0 };
      }
      return isKeywordLookup(trimmed) ? mockKeywordData : null;
    },

    // Write helper: args are recorded exactly as passed (rest arguments)
    run: async (sql, ...args) => {
      recordSiteUpdate(sql.trim(), args);
      return { changes: 1, lastInsertRowid: 0 };
    },

    // Raw pg-style query: params are recorded wrapped in an array (args: [params])
    query: async (sql, params) => {
      const trimmed = sql.trim();
      dbCalls.queries.push(trimmed);
      if (trimmed.startsWith('SELECT') || trimmed.startsWith('WITH')) {
        return isKeywordLookup(trimmed) ? keywordLookupResult() : { rows: [], rowCount: 0 };
      }
      recordSiteUpdate(trimmed, [params]);
      return { rows: [], rowCount: 1 };
    },

    // Runs fn immediately with a fake client; there is no real transaction
    withTransaction: async fn => {
      const fakeClient = {
        query: async (sql, params) => {
          const trimmed = sql.trim();
          dbCalls.queries.push(trimmed);
          recordSiteUpdate(trimmed, [params]);
          return isKeywordLookup(trimmed) ? keywordLookupResult() : { rows: [], rowCount: 1 };
        },
      };
      return await fn(fakeClient);
    },

    closePool: async () => {},
    createDatabaseConnection: () => ({}),
    closeDatabaseConnection: async () => {},
  },
});
213  
// captureWebsite stub. A test may set mockCaptureResult to dictate the return
// value; while it is null a default successful capture is produced instead.
const mockCaptureWebsite = mock.fn(async () =>
  mockCaptureResult !== null
    ? mockCaptureResult
    : {
        html: '<html><body>Test page content</body></html>',
        screenshots: {
          desktop_above: Buffer.from('img1'),
          desktop_below: Buffer.from('img2'),
          mobile_above: Buffer.from('img3'),
        },
        screenshotsUncropped: {
          desktop_above: Buffer.from('unc1'),
          desktop_below: Buffer.from('unc2'),
          mobile_above: Buffer.from('unc3'),
        },
        httpStatusCode: 200,
        sslStatus: 'https',
        httpHeaders: '{}',
        localeData: '{"htmlLang":"en","hreflangs":[]}',
      }
);
235  
// Response stub returned by the fake page's goto(): HTTP 200 with no headers
function mockResponse() {
  return {
    status: () => 200,
    headers: () => ({}),
  };
}

// Fake browser page. content() reads mockPageHtml at call time, so a test can
// change the HTML after the page object has been created.
const createFakePage = async () => ({
  goto: async () => mockResponse(),
  content: async () => mockPageHtml,
  close: async () => {},
  evaluate: async () => ({ htmlLang: null, hreflangs: [] }),
  waitForTimeout: async () => {},
  on: () => {},
});

// capture.js mock: captureWebsite is the shared stub; the browser and context
// are inert objects that hand out fake pages.
mock.module('../../src/capture.js', {
  namedExports: {
    captureWebsite: mockCaptureWebsite,
    launchBrowser: mock.fn(async () => ({ close: async () => {} })),
    createStealthContext: mock.fn(async () => ({
      newPage: createFakePage,
      close: async () => {},
    })),
  },
});
259  
// Handles for the utility mocks that tests assert on. Each stub returns the
// current value of its backing mock* variable, read at call time.

// screenshot-storage
const mockSaveScreenshots = mock.fn(async () => '/tmp/screenshots/1');
const mockCroppedScreenshotsExist = mock.fn(async () => mockScreenshotsExist);
// site-filters
const mockCheckBlocklist = mock.fn(() => mockBlocklistResult);
// keyword-counters
const mockIncrementAssetsScraped = mock.fn();
// dedupe-locale-aware
const mockDeduplicateSites = mock.fn(() => mockDedupeStats);
// error-page-detector
const mockDetectErrorPage = mock.fn(() => mockErrorPageResult);
// retry-handler
const mockRecordFailure = mock.fn();
const mockResetRetries = mock.fn();

// Register the module mocks in the same order as the handles above
const handleBackedModuleMocks = [
  [
    '../../src/utils/screenshot-storage.js',
    { saveScreenshots: mockSaveScreenshots, croppedScreenshotsExist: mockCroppedScreenshotsExist },
  ],
  ['../../src/utils/site-filters.js', { checkBlocklist: mockCheckBlocklist }],
  ['../../src/utils/keyword-counters.js', { incrementAssetsScraped: mockIncrementAssetsScraped }],
  ['../../src/utils/dedupe-locale-aware.js', { deduplicateSites: mockDeduplicateSites }],
  ['../../src/utils/error-page-detector.js', { detectErrorPage: mockDetectErrorPage }],
  [
    '../../src/utils/retry-handler.js',
    { recordFailure: mockRecordFailure, resetRetries: mockResetRetries },
  ],
];
for (const [specifier, namedExports] of handleBackedModuleMocks) {
  mock.module(specifier, { namedExports });
}
311  
// Utility mocks with fixed answers that no test needs a handle to:
// adaptive-concurrency, countries, detect-language, summary-generator.
const fixedValueModuleMocks = [
  ['../../src/utils/adaptive-concurrency.js', { getAdaptiveConcurrencyFast: mock.fn(() => 1) }],
  [
    '../../src/config/countries.js',
    { getCountryByCode: mock.fn(() => ({ code: 'AU', name: 'Australia' })) },
  ],
  ['../../src/utils/detect-language.js', { deriveLanguageCode: mock.fn(() => 'en') }],
  [
    '../../src/utils/summary-generator.js',
    { generateStageCompletion: mock.fn(), displayProgress: mock.fn() },
  ],
];
for (const [specifier, namedExports] of fixedValueModuleMocks) {
  mock.module(specifier, { namedExports });
}
340  
// Sequential processBatch stand-in: awaits processor(item, index) for each
// item in order, collecting return values in `results` and thrown errors
// (paired with their item) in `errors` — the { results, errors } shape of the real API.
async function processBatchSequentially(items, processor) {
  const outcome = { results: [], errors: [] };
  for (const [index, item] of items.entries()) {
    try {
      outcome.results.push(await processor(item, index));
    } catch (error) {
      outcome.errors.push({ item, error });
    }
  }
  return outcome;
}

// error-handler mock: withTimeout simply passes the promise through
mock.module('../../src/utils/error-handler.js', {
  namedExports: {
    processBatch: mock.fn(processBatchSequentially),
    withTimeout: mock.fn(async promise => promise),
  },
});
360  
// html-storage mock — assets.js writes HTML to the filesystem and stores an 'fs' sentinel in the DB.
// Only the write/delete stubs are kept as handles for assertions; the readers report "nothing stored".
const mockWriteHtmlDom = mock.fn();
const mockDeleteHtmlDom = mock.fn();
const htmlStorageExports = {
  writeHtmlDom: mockWriteHtmlDom,
  hasHtmlDom: mock.fn(() => false),
  deleteHtmlDom: mockDeleteHtmlDom,
  readHtmlDom: mock.fn(() => null),
  writeKeyPagesHtml: mock.fn(),
  readKeyPagesHtml: mock.fn(() => null),
  deleteKeyPagesHtml: mock.fn(),
  deleteAllHtml: mock.fn(),
  DATA_DIR: '/tmp/test-html-data',
};
mock.module('../../src/utils/html-storage.js', { namedExports: htmlStorageExports });
377  
378  // ============================================================================
379  // DYNAMIC IMPORT — after all mocks are in place
380  // ============================================================================
381  
382  const { runAssetsStage, getAssetsStats, backfillScreenshots } =
383    await import('../../src/stages/assets.js');
384  
385  // ============================================================================
386  // HELPERS
387  // ============================================================================
388  
/**
 * Restore every piece of shared mock state to its initial value and clear the
 * call history of the tracked mock functions.
 *
 * Called from each suite's beforeEach so that one test's fixtures cannot leak
 * into the next. This includes mockStatsRow, mockDedupeStats and the
 * croppedScreenshotsExist call history, which tests mutate but which were
 * previously never restored.
 */
function resetMocks() {
  // Fixture values read by the mocks
  mockSitesFound = [];
  mockKeywordData = null;
  mockPageHtml = '<html><body>Test page content</body></html>';
  mockCaptureResult = null;
  mockBlocklistResult = null;
  mockErrorPageResult = { isErrorPage: false };
  mockScreenshotsExist = { exists: true, missing: [] };
  mockStatsRow = {
    total_sites: 10,
    sites_with_screenshots: 5,
    captured_sites: 8,
    pending_capture: 2,
    failed_capture: 1,
  };
  mockDedupeStats = { sitesIgnored: 0, duplicateDomains: 0, crossBorder: 0 };

  // Fresh arrays, so references captured by an earlier test stay untouched
  dbCalls.updates = [];
  dbCalls.queries = [];

  const trackedMocks = [
    mockRecordFailure,
    mockResetRetries,
    mockCaptureWebsite,
    mockSaveScreenshots,
    mockCroppedScreenshotsExist,
    mockCheckBlocklist,
    mockDetectErrorPage,
    mockIncrementAssetsScraped,
    mockDeduplicateSites,
    mockWriteHtmlDom,
    mockDeleteHtmlDom,
  ];
  for (const trackedMock of trackedMocks) {
    trackedMock.mock.resetCalls();
  }
}
410  
411  // ============================================================================
412  // TESTS — HTML-only path (ENABLE_VISION=false)
413  // ============================================================================
414  
// Suite for the HTML-only capture path. ENABLE_VISION is forced to 'false'
// before every test; in this mode the page HTML comes from the fake page's
// content(), i.e. from mockPageHtml.
describe('Assets Stage - html_dom validation (ENABLE_VISION=false)', () => {
  // Minimal `sites` row as returned by the selection query
  const makeSite = (id, domain, countryCode = 'AU') => ({
    id,
    url: `https://${domain}`,
    domain,
    country_code: countryCode,
  });

  // Recorded UPDATEs whose serialized args mention `text`. Serializing makes the
  // check independent of whether args were recorded flat (run) or nested (query).
  const updatesMentioning = text =>
    dbCalls.updates.filter(u => JSON.stringify(u.args).includes(text));

  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'false';
  });

  test('rejects site when html is null (triggers retry via recordFailure)', async () => {
    mockSitesFound = [makeSite(1, 'example.com')];
    mockPageHtml = null;

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called once'
    );
    const [siteId, stage, failure] = mockRecordFailure.mock.calls[0].arguments;
    assert.strictEqual(siteId, 1, 'siteId should be 1');
    assert.strictEqual(stage, 'assets', 'stage should be assets');
    assert.match(
      failure.message,
      /HTML DOM capture failed/i,
      'error should mention HTML DOM capture'
    );

    assert.strictEqual(
      updatesMentioning('assets_captured').length,
      0,
      'should not mark as assets_captured'
    );
  });

  test('rejects site when html is empty/whitespace (triggers retry via recordFailure)', async () => {
    mockSitesFound = [makeSite(2, 'example2.com')];
    mockPageHtml = '   ';

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called once'
    );
    const [, , failure] = mockRecordFailure.mock.calls[0].arguments;
    assert.match(
      failure.message,
      /HTML DOM capture failed/i,
      'error should mention HTML DOM capture'
    );
  });

  test('accepts site when html has valid content', async () => {
    mockSitesFound = [makeSite(3, 'example3.com')];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    mockPageHtml = '<html><body><h1>Real business page</h1></body></html>';

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(mockRecordFailure.mock.callCount(), 0, 'recordFailure should not be called');
    assert.strictEqual(mockResetRetries.mock.callCount(), 1, 'resetRetries should be called');
    assert.ok(updatesMentioning('assets_captured').length > 0, 'should mark as assets_captured');

    // HTML is written to filesystem via writeHtmlDom, not stored in DB
    assert.equal(mockWriteHtmlDom.mock.callCount(), 1, 'writeHtmlDom should be called once');
    const [wSiteId, wHtml] = mockWriteHtmlDom.mock.calls[0].arguments;
    assert.equal(wSiteId, 3, 'writeHtmlDom siteId should match');
    assert.ok(
      wHtml.includes('Real business page'),
      'html_dom should contain the captured HTML'
    );
  });

  test('returns early with zero counts when no sites need capture', async () => {
    mockSitesFound = [];

    const result = await runAssetsStage({ limit: 10 });

    assert.strictEqual(result.processed, 0);
    assert.strictEqual(result.succeeded, 0);
    assert.strictEqual(result.failed, 0);
    assert.ok(typeof result.duration === 'number', 'duration should be a number');
  });

  test('increments keyword counter on successful capture', async () => {
    mockSitesFound = [makeSite(5, 'example5.com')];
    mockKeywordData = { keyword: 'electrician', country_code: 'AU' };
    mockPageHtml = '<html><body>Valid content</body></html>';

    await runAssetsStage();

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      1,
      'incrementAssetsScraped should be called'
    );
  });

  test('skips keyword increment when keyword data is missing', async () => {
    mockSitesFound = [makeSite(6, 'example6.com')];
    mockKeywordData = null;
    mockPageHtml = '<html><body>Valid content</body></html>';

    await runAssetsStage();

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      0,
      'incrementAssetsScraped should not be called without keyword data'
    );
  });

  test('marks blocklisted sites as ignore', async () => {
    mockSitesFound = [makeSite(7, 'yelp.com', 'US')];
    mockBlocklistResult = { reason: 'Directory site: yelp.com' };

    await runAssetsStage();

    // The ignore UPDATE is recognised either by its SQL text or by the blocklist
    // reason appearing among its (serialized) parameters.
    const ignoreUpdates = dbCalls.updates.filter(
      u =>
        u.sql.includes("status = 'ignored'") ||
        JSON.stringify(u.args).includes('Directory site: yelp.com')
    );
    assert.ok(ignoreUpdates.length > 0, 'should mark blocklisted site as ignore');
    // A blocklisted site must not reach the failure path
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      0,
      'recordFailure should not be called for blocklisted sites'
    );
  });

  test('deduplicates sites before processing', { skip: 'deduplicateSites removed (DR-106): UNIQUE constraint prevents duplicates at insert time' }, async () => {
    mockSitesFound = [];

    await runAssetsStage();

    assert.ok(mockDeduplicateSites.mock.callCount() > 0, 'deduplicateSites should be called');
  });

  test('returns stats with processed/succeeded/failed counts', async () => {
    mockSitesFound = [makeSite(8, 'good.com'), makeSite(9, 'bad.com')];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    // mockPageHtml is shared by every fake page, so both sites receive the same
    // HTML; this test therefore only pins the shape of the returned stats object.
    mockPageHtml = '<html>valid</html>';

    const result = await runAssetsStage();

    for (const key of ['processed', 'succeeded', 'failed', 'duration']) {
      assert.ok(key in result, `result should have ${key}`);
    }
  });
});
592  
593  // ============================================================================
594  // TESTS — Vision-enabled path (ENABLE_VISION=true)
595  // ============================================================================
596  
// Suite for the vision-enabled capture path. ENABLE_VISION is forced to 'true'
// before every test; in this mode capture results come from the captureWebsite
// stub, i.e. from mockCaptureResult.
describe('Assets Stage - vision-enabled path (ENABLE_VISION=true)', () => {
  const DEDUPE_REMOVED =
    'deduplicateSites removed (DR-106): UNIQUE constraint prevents duplicates at insert time';

  // `sites` row with the extra columns the vision tests supply, all still empty
  const visionSite = (id, domain, countryCode = 'AU') => ({
    id,
    url: `https://${domain}`,
    domain,
    country_code: countryCode,
    screenshot_path: null,
    html_dom: null,
    error_message: null,
  });

  // The three viewport buffers that make up one screenshot set
  const shots = (above, below, mobile) => ({
    desktop_above: Buffer.from(above),
    desktop_below: Buffer.from(below),
    mobile_above: Buffer.from(mobile),
  });

  // captureWebsite result for a healthy HTTP 200 page; `overrides` replaces individual fields
  const captureResult = (html, overrides = {}) => ({
    html,
    screenshots: shots('a', 'b', 'c'),
    screenshotsUncropped: shots('ua', 'ub', 'uc'),
    httpStatusCode: 200,
    sslStatus: 'https',
    httpHeaders: '{}',
    localeData: '{}',
    ...overrides,
  });

  // Result variant for captures that produced no screenshots
  const NO_SCREENSHOTS = { screenshots: null, screenshotsUncropped: null };

  // Recorded UPDATEs whose serialized args mention `text`
  const updatesMentioning = text =>
    dbCalls.updates.filter(u => JSON.stringify(u.args).includes(text));

  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'true';
  });

  test('returns early when no candidate sites exist', async () => {
    mockSitesFound = [];

    const result = await runAssetsStage({ limit: 5 });

    assert.strictEqual(result.processed, 0);
    assert.strictEqual(result.succeeded, 0);
    assert.strictEqual(result.failed, 0);
    assert.ok(typeof result.duration === 'number');
  });

  test('captures screenshots and saves to disk on success', async () => {
    mockSitesFound = [visionSite(10, 'vision.com')];
    mockKeywordData = { keyword: 'dentist', country_code: 'AU' };
    mockCaptureResult = captureResult('<html><body>Vision page</body></html>');
    mockScreenshotsExist = { exists: true, missing: [] };

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(mockCaptureWebsite.mock.callCount(), 1, 'captureWebsite should be called');
    assert.strictEqual(mockSaveScreenshots.mock.callCount(), 1, 'saveScreenshots should be called');
    assert.strictEqual(mockResetRetries.mock.callCount(), 1, 'resetRetries should be called');
    assert.strictEqual(mockRecordFailure.mock.callCount(), 0, 'no failures expected');
    assert.ok(
      updatesMentioning('assets_captured').length > 0,
      'should mark site as assets_captured'
    );
  });

  test('throws when screenshot validation fails after save', async () => {
    mockSitesFound = [visionSite(11, 'badevision.com')];
    mockCaptureResult = captureResult('<html><body>Content</body></html>');
    // Pretend the files are missing on disk after the save step
    mockScreenshotsExist = { exists: false, missing: ['desktop_above', 'mobile_above'] };

    await runAssetsStage({ limit: 1 });

    // The failed validation must surface through recordFailure
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called when screenshots missing'
    );
    const [, , failure] = mockRecordFailure.mock.calls[0].arguments;
    assert.match(
      failure.message,
      /Screenshot validation failed/i,
      'error should mention screenshot validation'
    );
  });

  test('schedules retry when error page detected', async () => {
    mockSitesFound = [visionSite(12, 'errorpage.com')];
    // HTTP 200 on purpose: the page only looks like an error from its content
    mockCaptureResult = captureResult('<html><body>404 Not Found</body></html>', NO_SCREENSHOTS);
    mockErrorPageResult = {
      isErrorPage: true,
      indicator: '404 text in body',
      wordCount: 5,
    };

    await runAssetsStage({ limit: 1 });

    // The UPDATE sets recapture_at with a 7-day offset:
    // NOW() + INTERVAL '7 days' (PG) or datetime('+7 days') (SQLite translated)
    const mentionsSevenDays = sql => sql.includes('7 days') || sql.includes('+7');
    const retryUpdates = dbCalls.updates.filter(
      u => u.sql.includes('recapture_at') && mentionsSevenDays(u.sql)
    );
    assert.ok(retryUpdates.length > 0, 'should schedule retry in 7 days');
    // The error page also counts as a failure
    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called for error pages'
    );
  });

  test('records failure for HTTP error status codes', async () => {
    mockSitesFound = [visionSite(13, 'http-error.com')];
    mockCaptureResult = captureResult('<html><body>Not Found</body></html>', {
      ...NO_SCREENSHOTS,
      httpStatusCode: 404,
    });

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockRecordFailure.mock.callCount(),
      1,
      'recordFailure should be called for HTTP 404'
    );
    const [, , failure] = mockRecordFailure.mock.calls[0].arguments;
    assert.match(failure.message, /HTTP 404/i, 'error message should mention HTTP 404');
  });

  test('marks blocklisted sites as ignore in vision mode', async () => {
    mockSitesFound = [visionSite(14, 'facebook.com', 'US')];
    mockBlocklistResult = { reason: 'Social media: facebook.com' };

    await runAssetsStage({ limit: 1 });

    assert.ok(updatesMentioning('Social media').length > 0, 'should mark social media as ignore');
    // Blocked sites must never reach captureWebsite
    assert.strictEqual(mockCaptureWebsite.mock.callCount(), 0, 'should not capture blocked sites');
  });

  test('deduplicates sites before processing in vision mode', { skip: DEDUPE_REMOVED }, async () => {
    mockSitesFound = [];

    await runAssetsStage();

    assert.ok(mockDeduplicateSites.mock.callCount() > 0, 'deduplicateSites should be called');
  });

  test('logs deduplication stats when sites are ignored', { skip: DEDUPE_REMOVED }, async () => {
    mockDedupeStats = { sitesIgnored: 3, duplicateDomains: 2, crossBorder: 1 };
    mockSitesFound = [];

    const result = await runAssetsStage();

    assert.strictEqual(result.processed, 0);
    assert.ok(mockDeduplicateSites.mock.callCount() > 0);
  });

  test('increments keyword counter on successful vision capture', async () => {
    mockSitesFound = [visionSite(15, 'dentist.com')];
    mockKeywordData = { keyword: 'dentist', country_code: 'AU' };
    mockCaptureResult = captureResult('<html><body>Dentist page</body></html>');

    await runAssetsStage({ limit: 1 });

    assert.strictEqual(
      mockIncrementAssetsScraped.mock.callCount(),
      1,
      'incrementAssetsScraped should be called'
    );
  });
});
865  
866  // ============================================================================
867  // TESTS — getAssetsStats()
868  // ============================================================================
869  
describe('getAssetsStats()', () => {
  // The stat columns every test in this suite inspects, in assertion order.
  const STAT_FIELDS = [
    'total_sites',
    'sites_with_screenshots',
    'captured_sites',
    'pending_capture',
    'failed_capture',
  ];

  beforeEach(() => {
    resetMocks();
  });

  test('returns stats object with expected fields', async () => {
    const expectedRow = {
      total_sites: 100,
      sites_with_screenshots: 42,
      captured_sites: 80,
      pending_capture: 15,
      failed_capture: 5,
    };
    // Hand the mock its own copy so the expectations stay untouched.
    mockStatsRow = { ...expectedRow };

    const stats = await getAssetsStats();

    assert.ok(typeof stats === 'object' && stats !== null, 'should return an object');
    for (const field of STAT_FIELDS) {
      assert.strictEqual(stats[field], expectedRow[field]);
    }
  });

  test('returns zeros when no sites exist', async () => {
    // Build a row where every tracked counter is zero.
    mockStatsRow = Object.fromEntries(STAT_FIELDS.map((field) => [field, 0]));

    const stats = await getAssetsStats();

    for (const field of STAT_FIELDS) {
      assert.strictEqual(stats[field], 0);
    }
  });

  test('all numeric fields are numbers', async () => {
    // Runs against whatever stats row resetMocks() leaves in place.
    const stats = await getAssetsStats();

    for (const field of STAT_FIELDS) {
      const actualType = typeof stats[field];
      assert.ok(actualType === 'number', `${field} should be a number, got ${actualType}`);
    }
  });
});
929  
930  // ============================================================================
931  // TESTS — backfillScreenshots()
932  // ============================================================================
933  
describe('backfillScreenshots()', () => {
  beforeEach(() => {
    resetMocks();
    // These cases run with the vision flag switched off.
    process.env.ENABLE_VISION = 'false';
  });

  test('returns zero counts when no sites need backfill', async () => {
    mockSitesFound = [];

    const { processed, succeeded, failed } = await backfillScreenshots(10);

    assert.strictEqual(processed, 0);
    assert.strictEqual(succeeded, 0);
    assert.strictEqual(failed, 0);
  });

  test('delegates to runAssetsStage when sites need backfill', async () => {
    const backfillSite = {
      id: 20,
      url: 'https://backfill.com',
      domain: 'backfill.com',
      country_code: 'AU',
    };
    mockSitesFound = [backfillSite];
    mockKeywordData = { keyword: 'plumber', country_code: 'AU' };
    mockPageHtml = '<html><body>Backfill page</body></html>';

    const result = await backfillScreenshots(5);

    // Only the shape of the summary is checked here, not the counts.
    for (const field of ['processed', 'succeeded', 'failed']) {
      assert.ok(field in result, `result should have ${field} field`);
    }
  });

  test('uses default limit of 10', async () => {
    mockSitesFound = [];

    // Called without an argument so the function's own default applies.
    const { processed } = await backfillScreenshots();

    assert.strictEqual(processed, 0);
  });
});
973  
974  // ============================================================================
975  // TESTS — Legacy flag deprecation warning
976  // ============================================================================
977  
describe('Assets Stage - legacy flag handling', () => {
  beforeEach(() => {
    resetMocks();
    process.env.ENABLE_VISION = 'false';
  });

  /**
   * Verifies that runAssetsStage() still resolves with a valid result while
   * the legacy ENABLE_SCREENSHOT_CAPTURE flag is set.
   *
   * process.env is shared by every test in the process, so the flag is put
   * back in a `finally` block. Without it, a rejected stage run or a failed
   * assertion would leave the flag set for every test that runs afterwards.
   */
  test('does not throw when legacy flags are set', async () => {
    // Remember any pre-existing value so the environment ends up as we found it.
    const previousFlag = process.env.ENABLE_SCREENSHOT_CAPTURE;
    process.env.ENABLE_SCREENSHOT_CAPTURE = 'true';
    mockSitesFound = [];

    try {
      const result = await runAssetsStage();

      assert.strictEqual(result.processed, 0, 'should still return valid result with legacy flags');
    } finally {
      // Assigning undefined to process.env stores the string 'undefined', so
      // an originally-unset flag has to be deleted rather than assigned.
      if (previousFlag === undefined) {
        delete process.env.ENABLE_SCREENSHOT_CAPTURE;
      } else {
        process.env.ENABLE_SCREENSHOT_CAPTURE = previousFlag;
      }
    }
  });
});