assets-stage.test.js
/**
 * Assets Stage Tests
 * Tests for screenshot capture and HTML-only modes
 *
 * The status decision and SQL statements below are local copies of the logic
 * this file attributes to assets.js / scoring.js; neither module is imported
 * here, so the copies must be kept in sync with them by hand.
 */

import { describe, it, before, after, mock } from 'node:test';
import assert from 'node:assert/strict';
import Database from 'better-sqlite3';
import { createPgMock } from '../helpers/pg-mock.js';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// ── In-memory DB + pg mock ──────────────────────────────────────────────────

const db = new Database(':memory:');

db.exec(`
  CREATE TABLE IF NOT EXISTS sites (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT NOT NULL,
    landing_page_url TEXT NOT NULL,
    status TEXT DEFAULT 'found',
    screenshot_path TEXT,
    html_dom TEXT,
    http_status_code INTEGER,
    ssl_status TEXT,
    http_headers TEXT,
    locale_data TEXT,
    error_message TEXT,
    recapture_at TEXT,
    country_code TEXT DEFAULT 'AU',
    keyword TEXT,
    score REAL,
    grade TEXT,
    rescored_at DATETIME
  )
`);

// Registered before any dynamic import of project modules further down, so
// those imports resolve '../../src/utils/db.js' to the in-memory mock.
mock.module('../../src/utils/db.js', {
  namedExports: createPgMock(db),
});

/**
 * Insert a test site row with a fixed keyword ('test keyword') and country ('AU').
 *
 * @param {string} domain - Domain name; the landing page URL is `https://${domain}`.
 * @param {string} [status='found'] - Initial value of the `status` column.
 * @returns {number|bigint} Row id of the inserted site.
 */
function insertTestSite(domain, status = 'found') {
  const stmt = db.prepare(`
    INSERT INTO sites (domain, landing_page_url, status, keyword, country_code)
    VALUES (?, ?, ?, 'test keyword', 'AU')
  `);
  const result = stmt.run(domain, `https://${domain}`, status);
  return result.lastInsertRowid;
}

/**
 * Decide the status a site gets after a capture attempt.
 *
 * A site only becomes 'assets_captured' when the HTTP status is in [200, 400)
 * AND a screenshot path was produced; otherwise it stays 'found'.
 *
 * @param {number} httpStatusCode - HTTP status code of the landing page response.
 * @param {string|null} screenshotPath - Screenshot location, or null when none was taken.
 * @returns {'assets_captured'|'found'} Status to store.
 */
function resolveCaptureStatus(httpStatusCode, screenshotPath) {
  const isSuccess = httpStatusCode >= 200 && httpStatusCode < 400;
  const hasScreenshots = screenshotPath !== null;
  return isSuccess && hasScreenshots ? 'assets_captured' : 'found';
}

/**
 * Persist a simulated capture result for a site and return the stored row.
 *
 * @param {number|bigint} siteId - Id of the site row to update.
 * @param {object} capture - Simulated capture output.
 * @param {string|null} capture.screenshotPath - Screenshot location or null.
 * @param {string} capture.htmlDom - Captured HTML.
 * @param {number} capture.httpStatusCode - HTTP status code.
 * @param {string} [capture.sslStatus='https'] - Value for `ssl_status`.
 * @param {string} [capture.httpHeaders='{}'] - Value for `http_headers`.
 * @param {string} [capture.localeData='{}'] - Value for `locale_data`.
 * @returns {object} The site row as read back from the database after the update.
 */
function saveCaptureResult(siteId, capture) {
  const {
    screenshotPath,
    htmlDom,
    httpStatusCode,
    sslStatus = 'https',
    httpHeaders = '{}',
    localeData = '{}',
  } = capture;
  const newStatus = resolveCaptureStatus(httpStatusCode, screenshotPath);

  db.prepare(
    `
    UPDATE sites SET
      screenshot_path = ?,
      html_dom = ?,
      http_status_code = ?,
      ssl_status = ?,
      http_headers = ?,
      locale_data = ?,
      status = ?,
      error_message = NULL,
      recapture_at = NULL
    WHERE id = ?
  `
  ).run(
    screenshotPath,
    htmlDom,
    httpStatusCode,
    sslStatus,
    httpHeaders,
    localeData,
    newStatus,
    siteId
  );

  return db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
}

/**
 * Run the scoring-stage selection query against the in-memory database.
 *
 * @returns {object[]} Rows with status 'assets_captured', a non-NULL
 *   screenshot_path, and either no score yet or a recorded error message.
 */
function selectScoringCandidates() {
  return db
    .prepare(
      `
    SELECT id, domain, landing_page_url as url, country_code
    FROM sites
    WHERE status = 'assets_captured'
      AND screenshot_path IS NOT NULL
      AND (score IS NULL OR error_message IS NOT NULL)
  `
    )
    .all();
}

describe('Assets Stage - Screenshot Capture Bug Fix', () => {
  before(() => {
    db.exec('DELETE FROM sites');
  });

  after(() => {
    // nothing to clean up — in-memory db
  });

  it('should NOT mark sites as assets_captured when screenshot_path is NULL', () => {
    // Simulate what happens when ENABLE_SCREENSHOT_CAPTURE=false:
    // successful HTML capture but no screenshots
    const siteId = insertTestSite('example.com', 'found');
    const htmlDom = '<html><body>Test</body></html>';

    const site = saveCaptureResult(siteId, {
      screenshotPath: null,
      htmlDom,
      httpStatusCode: 200,
    });

    // Verify site is NOT marked as assets_captured
    assert.strictEqual(
      site.status,
      'found',
      'Site should remain in "found" status when screenshot_path is NULL'
    );
    assert.strictEqual(site.screenshot_path, null, 'screenshot_path should be NULL');
    assert.strictEqual(site.html_dom, htmlDom, 'html_dom should be saved');
    assert.strictEqual(site.http_status_code, 200, 'http_status_code should be saved');
  });

  it('should mark sites as assets_captured when screenshots are captured', () => {
    // Simulate successful screenshot + HTML capture
    const siteId = insertTestSite('example-with-screenshots.com', 'found');
    const screenshotPath = 'screenshots/123';
    const htmlDom = '<html><body>Test</body></html>';

    const site = saveCaptureResult(siteId, {
      screenshotPath,
      htmlDom,
      httpStatusCode: 200,
    });

    // Verify site IS marked as assets_captured
    assert.strictEqual(
      site.status,
      'assets_captured',
      'Site should be marked as "assets_captured" when screenshots exist'
    );
    assert.strictEqual(site.screenshot_path, screenshotPath, 'screenshot_path should be saved');
    assert.strictEqual(site.html_dom, htmlDom, 'html_dom should be saved');
    assert.strictEqual(site.http_status_code, 200, 'http_status_code should be saved');
  });

  it('should keep sites in "found" status when HTTP status is an error', () => {
    const htmlDom = '<html><body>404 Not Found</body></html>';

    // Scenario 1: 404 response and no screenshots
    const siteId = insertTestSite('example-404.com', 'found');
    const site = saveCaptureResult(siteId, {
      screenshotPath: null,
      htmlDom,
      httpStatusCode: 404,
    });

    assert.strictEqual(
      site.status,
      'found',
      'Site should remain in "found" status for HTTP errors'
    );
    assert.strictEqual(site.http_status_code, 404, 'http_status_code should be saved');

    // Scenario 2: 404 response WITH a screenshot path. Scenario 1 alone would
    // still pass if the HTTP-status check were removed (the missing screenshot
    // already forces 'found'), so this case isolates the HTTP-status condition.
    const siteWithShotId = insertTestSite('example-404-with-screenshots.com', 'found');
    const siteWithShot = saveCaptureResult(siteWithShotId, {
      screenshotPath: 'screenshots/404',
      htmlDom,
      httpStatusCode: 404,
    });

    assert.strictEqual(
      siteWithShot.status,
      'found',
      'Site should remain in "found" status for HTTP errors even when screenshots exist'
    );
    assert.strictEqual(siteWithShot.http_status_code, 404, 'http_status_code should be saved');
  });

  it('scoring stage should skip sites with NULL screenshot_path', () => {
    // Insert a site with assets_captured but NULL screenshot_path (edge case)
    const siteId = insertTestSite('edge-case.com', 'assets_captured');
    db.prepare('UPDATE sites SET screenshot_path = NULL WHERE id = ?').run(siteId);

    const sites = selectScoringCandidates();

    // Verify the edge case site is NOT included
    const foundSite = sites.find((s) => s.id === siteId);
    assert.strictEqual(
      foundSite,
      undefined,
      'Scoring stage should skip sites with NULL screenshot_path'
    );
  });

  it('scoring stage should include sites with valid screenshot_path', () => {
    // Insert a site with valid screenshot_path
    const siteId = insertTestSite('valid-site.com', 'assets_captured');
    db.prepare('UPDATE sites SET screenshot_path = ? WHERE id = ?').run('screenshots/123', siteId);

    const sites = selectScoringCandidates();

    // Verify the valid site IS included
    const foundSite = sites.find((s) => s.id === siteId);
    assert.notStrictEqual(
      foundSite,
      undefined,
      'Scoring stage should include sites with valid screenshot_path'
    );
    assert.strictEqual(foundSite.domain, 'valid-site.com');
  });

  it('should validate screenshot files exist before marking as assets_captured', async () => {
    // This test validates the bug fix:
    // Previously, sites could get screenshot_path set but files not written to disk
    // Causing them to be stuck in 'found' status and preventing reprocessing

    // Import the screenshot validation function
    const { croppedScreenshotsExist } = await import('../../src/utils/screenshot-storage.js');

    // Case 1: screenshot_path is NULL - no validation needed
    const screenshotPathNull = null;
    const hasScreenshotsNull = screenshotPathNull !== null;
    assert.strictEqual(
      hasScreenshotsNull,
      false,
      'hasScreenshots should be false when screenshot_path is NULL'
    );

    // Case 2: screenshot_path is set but files don't exist (BUG SCENARIO)
    const screenshotPathInvalid = 'screenshots/99999'; // Non-existent directory
    const { exists: filesExist, missing } = await croppedScreenshotsExist(screenshotPathInvalid);
    assert.strictEqual(filesExist, false, 'Validation should detect missing screenshot files');
    assert.strictEqual(
      missing.length,
      3,
      'All 3 cropped screenshots should be reported as missing'
    );

    // The fix: In assets.js, after saveScreenshots(), we now validate with croppedScreenshotsExist()
    // If validation fails, it throws an error preventing the site from being marked as assets_captured
    // This prevents the bug where screenshot_path is set but files don't exist
  });

  it('should cleanup legacy sites with invalid screenshot_path values', async () => {
    // This test validates the cleanup logic for existing broken sites
    const { croppedScreenshotsExist } = await import('../../src/utils/screenshot-storage.js');

    // Create a site with invalid screenshot_path (legacy bug scenario)
    const siteId = insertTestSite('legacy-broken.com', 'found');
    db.prepare('UPDATE sites SET screenshot_path = ? WHERE id = ?').run(
      'screenshots/99999',
      siteId
    );

    // Verify the screenshot_path is set but files don't exist
    const siteBefore = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
    assert.strictEqual(siteBefore.screenshot_path, 'screenshots/99999');

    const { exists } = await croppedScreenshotsExist(siteBefore.screenshot_path);
    assert.strictEqual(exists, false, 'Screenshot files should not exist');

    // Simulate the cleanup logic from assets.js
    if (!exists) {
      db.prepare('UPDATE sites SET screenshot_path = NULL WHERE id = ?').run(siteId);
    }

    // Verify the site was cleaned up
    const siteAfter = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
    assert.strictEqual(
      siteAfter.screenshot_path,
      null,
      'Cleanup should reset screenshot_path to NULL'
    );
    assert.strictEqual(
      siteAfter.status,
      'found',
      'Site should remain in found status for reprocessing'
    );
  });
});