// tests/pipeline/assets-stage.test.js
  1  /**
  2   * Assets Stage Tests
  3   * Tests for screenshot capture and HTML-only modes
  4   */
  5  
  6  import { describe, it, before, after, mock } from 'node:test';
  7  import assert from 'node:assert/strict';
  8  import Database from 'better-sqlite3';
  9  import { createPgMock } from '../helpers/pg-mock.js';
 10  import fs from 'fs/promises';
 11  import path from 'path';
 12  import { fileURLToPath } from 'url';
 13  
 14  const __dirname = path.dirname(fileURLToPath(import.meta.url));
 15  
 16  // ── In-memory DB + pg mock ──────────────────────────────────────────────────
 17  
 18  const db = new Database(':memory:');
 19  
 20  db.exec(`
 21    CREATE TABLE IF NOT EXISTS sites (
 22      id INTEGER PRIMARY KEY AUTOINCREMENT,
 23      domain TEXT NOT NULL,
 24      landing_page_url TEXT NOT NULL,
 25      status TEXT DEFAULT 'found',
 26      screenshot_path TEXT,
 27      html_dom TEXT,
 28      http_status_code INTEGER,
 29      ssl_status TEXT,
 30      http_headers TEXT,
 31      locale_data TEXT,
 32      error_message TEXT,
 33      recapture_at TEXT,
 34      country_code TEXT DEFAULT 'AU',
 35      keyword TEXT,
 36      score REAL,
 37      grade TEXT,
 38      rescored_at DATETIME
 39    )
 40  `);
 41  
 42  mock.module('../../src/utils/db.js', {
 43    namedExports: createPgMock(db),
 44  });
 45  
 46  /**
 47   * Insert test site
 48   */
 49  function insertTestSite(domain, status = 'found') {
 50    const stmt = db.prepare(`
 51      INSERT INTO sites (domain, landing_page_url, status, keyword, country_code)
 52      VALUES (?, ?, ?, 'test keyword', 'AU')
 53    `);
 54    const result = stmt.run(domain, `https://${domain}`, status);
 55    return result.lastInsertRowid;
 56  }
 57  
 58  describe('Assets Stage - Screenshot Capture Bug Fix', () => {
 59    before(() => {
 60      db.exec('DELETE FROM sites');
 61    });
 62  
 63    after(() => {
 64      // nothing to clean up — in-memory db
 65    });
 66  
 67    it('should NOT mark sites as assets_captured when screenshot_path is NULL', () => {
 68      // Simulate what happens when ENABLE_SCREENSHOT_CAPTURE=false
 69      const siteId = insertTestSite('example.com', 'found');
 70  
 71      // Simulate successful HTML capture but no screenshots
 72      const screenshotPath = null;
 73      const htmlDom = '<html><body>Test</body></html>';
 74      const httpStatusCode = 200;
 75      const sslStatus = 'https';
 76      const httpHeaders = '{}';
 77      const localeData = '{}';
 78  
 79      // This is the logic from assets.js (AFTER fix)
 80      const isSuccess = httpStatusCode >= 200 && httpStatusCode < 400;
 81      const hasScreenshots = screenshotPath !== null;
 82      const newStatus = isSuccess && hasScreenshots ? 'assets_captured' : 'found';
 83  
 84      // Update database
 85      db.prepare(
 86        `
 87        UPDATE sites SET
 88          screenshot_path = ?,
 89          html_dom = ?,
 90          http_status_code = ?,
 91          ssl_status = ?,
 92          http_headers = ?,
 93          locale_data = ?,
 94          status = ?,
 95          error_message = NULL,
 96          recapture_at = NULL
 97        WHERE id = ?
 98      `
 99      ).run(
100        screenshotPath,
101        htmlDom,
102        httpStatusCode,
103        sslStatus,
104        httpHeaders,
105        localeData,
106        newStatus,
107        siteId
108      );
109  
110      // Verify site is NOT marked as assets_captured
111      const site = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
112      assert.strictEqual(
113        site.status,
114        'found',
115        'Site should remain in "found" status when screenshot_path is NULL'
116      );
117      assert.strictEqual(site.screenshot_path, null, 'screenshot_path should be NULL');
118      assert.strictEqual(site.html_dom, htmlDom, 'html_dom should be saved');
119      assert.strictEqual(site.http_status_code, 200, 'http_status_code should be saved');
120    });
121  
122    it('should mark sites as assets_captured when screenshots are captured', () => {
123      const siteId = insertTestSite('example-with-screenshots.com', 'found');
124  
125      // Simulate successful screenshot + HTML capture
126      const screenshotPath = 'screenshots/123';
127      const htmlDom = '<html><body>Test</body></html>';
128      const httpStatusCode = 200;
129      const sslStatus = 'https';
130      const httpHeaders = '{}';
131      const localeData = '{}';
132  
133      // This is the logic from assets.js (AFTER fix)
134      const isSuccess = httpStatusCode >= 200 && httpStatusCode < 400;
135      const hasScreenshots = screenshotPath !== null;
136      const newStatus = isSuccess && hasScreenshots ? 'assets_captured' : 'found';
137  
138      // Update database
139      db.prepare(
140        `
141        UPDATE sites SET
142          screenshot_path = ?,
143          html_dom = ?,
144          http_status_code = ?,
145          ssl_status = ?,
146          http_headers = ?,
147          locale_data = ?,
148          status = ?,
149          error_message = NULL,
150          recapture_at = NULL
151        WHERE id = ?
152      `
153      ).run(
154        screenshotPath,
155        htmlDom,
156        httpStatusCode,
157        sslStatus,
158        httpHeaders,
159        localeData,
160        newStatus,
161        siteId
162      );
163  
164      // Verify site IS marked as assets_captured
165      const site = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
166      assert.strictEqual(
167        site.status,
168        'assets_captured',
169        'Site should be marked as "assets_captured" when screenshots exist'
170      );
171      assert.strictEqual(site.screenshot_path, screenshotPath, 'screenshot_path should be saved');
172      assert.strictEqual(site.html_dom, htmlDom, 'html_dom should be saved');
173      assert.strictEqual(site.http_status_code, 200, 'http_status_code should be saved');
174    });
175  
176    it('should keep sites in "found" status when HTTP status is an error', () => {
177      const siteId = insertTestSite('example-404.com', 'found');
178  
179      // Simulate 404 response
180      const screenshotPath = null;
181      const htmlDom = '<html><body>404 Not Found</body></html>';
182      const httpStatusCode = 404;
183      const sslStatus = 'https';
184      const httpHeaders = '{}';
185      const localeData = '{}';
186  
187      // This is the logic from assets.js (AFTER fix)
188      const isSuccess = httpStatusCode >= 200 && httpStatusCode < 400;
189      const hasScreenshots = screenshotPath !== null;
190      const newStatus = isSuccess && hasScreenshots ? 'assets_captured' : 'found';
191  
192      // Update database
193      db.prepare(
194        `
195        UPDATE sites SET
196          screenshot_path = ?,
197          html_dom = ?,
198          http_status_code = ?,
199          ssl_status = ?,
200          http_headers = ?,
201          locale_data = ?,
202          status = ?,
203          error_message = NULL,
204          recapture_at = NULL
205        WHERE id = ?
206      `
207      ).run(
208        screenshotPath,
209        htmlDom,
210        httpStatusCode,
211        sslStatus,
212        httpHeaders,
213        localeData,
214        newStatus,
215        siteId
216      );
217  
218      // Verify site is NOT marked as assets_captured
219      const site = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
220      assert.strictEqual(
221        site.status,
222        'found',
223        'Site should remain in "found" status for HTTP errors'
224      );
225      assert.strictEqual(site.http_status_code, 404, 'http_status_code should be saved');
226    });
227  
228    it('scoring stage should skip sites with NULL screenshot_path', () => {
229      // Insert a site with assets_captured but NULL screenshot_path (edge case)
230      const siteId = insertTestSite('edge-case.com', 'assets_captured');
231      db.prepare('UPDATE sites SET screenshot_path = NULL WHERE id = ?').run(siteId);
232  
233      // This is the query from scoring.js (AFTER fix)
234      const query = `
235        SELECT id, domain, landing_page_url as url, country_code
236        FROM sites
237        WHERE status = 'assets_captured'
238          AND screenshot_path IS NOT NULL
239          AND (score IS NULL OR error_message IS NOT NULL)
240      `;
241  
242      const sites = db.prepare(query).all();
243  
244      // Verify the edge case site is NOT included
245      const foundSite = sites.find(s => s.id === siteId);
246      assert.strictEqual(
247        foundSite,
248        undefined,
249        'Scoring stage should skip sites with NULL screenshot_path'
250      );
251    });
252  
253    it('scoring stage should include sites with valid screenshot_path', () => {
254      // Insert a site with valid screenshot_path
255      const siteId = insertTestSite('valid-site.com', 'assets_captured');
256      db.prepare('UPDATE sites SET screenshot_path = ? WHERE id = ?').run('screenshots/123', siteId);
257  
258      // This is the query from scoring.js (AFTER fix)
259      const query = `
260        SELECT id, domain, landing_page_url as url, country_code
261        FROM sites
262        WHERE status = 'assets_captured'
263          AND screenshot_path IS NOT NULL
264          AND (score IS NULL OR error_message IS NOT NULL)
265      `;
266  
267      const sites = db.prepare(query).all();
268  
269      // Verify the valid site IS included
270      const foundSite = sites.find(s => s.id === siteId);
271      assert.notStrictEqual(
272        foundSite,
273        undefined,
274        'Scoring stage should include sites with valid screenshot_path'
275      );
276      assert.strictEqual(foundSite.domain, 'valid-site.com');
277    });
278  
279    it('should validate screenshot files exist before marking as assets_captured', async () => {
280      // This test validates the bug fix:
281      // Previously, sites could get screenshot_path set but files not written to disk
282      // Causing them to be stuck in 'found' status and preventing reprocessing
283  
284      const siteId = insertTestSite('test-validation.com', 'found');
285  
286      // Import the screenshot validation function
287      const { croppedScreenshotsExist } = await import('../../src/utils/screenshot-storage.js');
288  
289      // Case 1: screenshot_path is NULL - no validation needed
290      const screenshotPathNull = null;
291      const hasScreenshotsNull = screenshotPathNull !== null;
292      assert.strictEqual(
293        hasScreenshotsNull,
294        false,
295        'hasScreenshots should be false when screenshot_path is NULL'
296      );
297  
298      // Case 2: screenshot_path is set but files don't exist (BUG SCENARIO)
299      const screenshotPathInvalid = 'screenshots/99999'; // Non-existent directory
300      const { exists: filesExist, missing } = await croppedScreenshotsExist(screenshotPathInvalid);
301      assert.strictEqual(filesExist, false, 'Validation should detect missing screenshot files');
302      assert.strictEqual(
303        missing.length,
304        3,
305        'All 3 cropped screenshots should be reported as missing'
306      );
307  
308      // The fix: In assets.js, after saveScreenshots(), we now validate with croppedScreenshotsExist()
309      // If validation fails, it throws an error preventing the site from being marked as assets_captured
310      // This prevents the bug where screenshot_path is set but files don't exist
311    });
312  
313    it('should cleanup legacy sites with invalid screenshot_path values', async () => {
314      // This test validates the cleanup logic for existing broken sites
315      const { croppedScreenshotsExist } = await import('../../src/utils/screenshot-storage.js');
316  
317      // Create a site with invalid screenshot_path (legacy bug scenario)
318      const siteId = insertTestSite('legacy-broken.com', 'found');
319      db.prepare('UPDATE sites SET screenshot_path = ? WHERE id = ?').run(
320        'screenshots/99999',
321        siteId
322      );
323  
324      // Verify the screenshot_path is set but files don't exist
325      const siteBefore = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
326      assert.strictEqual(siteBefore.screenshot_path, 'screenshots/99999');
327  
328      const { exists } = await croppedScreenshotsExist(siteBefore.screenshot_path);
329      assert.strictEqual(exists, false, 'Screenshot files should not exist');
330  
331      // Simulate the cleanup logic from assets.js
332      if (!exists) {
333        db.prepare('UPDATE sites SET screenshot_path = NULL WHERE id = ?').run(siteId);
334      }
335  
336      // Verify the site was cleaned up
337      const siteAfter = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
338      assert.strictEqual(
339        siteAfter.screenshot_path,
340        null,
341        'Cleanup should reset screenshot_path to NULL'
342      );
343      assert.strictEqual(
344        siteAfter.status,
345        'found',
346        'Site should remain in found status for reprocessing'
347      );
348    });
349  });