/ src / utils / backfill-screenshots.js
backfill-screenshots.js
  1  /**
  2   * Backfill Screenshots Utility
  3   * Finds sites with missing screenshots and recaptures them
  4   *
  5   * DEPRECATED: The active pipeline uses backfillScreenshots() defined in
  6   * src/stages/assets.js, which queries PostgreSQL directly. This file is only
  7   * referenced by scripts/backfill-screenshots.js (a standalone CLI script) and
  8   * its test files. It still uses the SQLite .prepare() API and is not called
  9   * by the pipeline or cron. Do not use for new code.
 10   */
 11  
 12  import { launchBrowser, createStealthContext, captureScreenshots } from '../capture.js';
 13  import Logger from './logger.js';
 14  
// Shared logger for this module, constructed with the name 'BackfillScreenshots'.
const logger = new Logger('BackfillScreenshots');
 16  
 17  /**
 18   * Find sites with missing screenshots
 19   * @param {Database} db - SQLite database instance
 20   * @param {number} limit - Maximum number of sites to return
 21   * @returns {Array} Sites with missing screenshots
 22   */
 23  export function findSitesWithMissingScreenshots(db, limit = 100) {
 24    const query = `
 25      SELECT 
 26        id,
 27        domain,
 28        landing_page_url,
 29        keyword,
 30        screenshot_above_desktop IS NULL as missing_desktop_above,
 31        screenshot_below_desktop IS NULL as missing_desktop_below,
 32        screenshot_above_mobile IS NULL as missing_mobile_above,
 33        screenshot_above_desktop_uncropped IS NULL as missing_desktop_above_uncropped,
 34        screenshot_below_desktop_uncropped IS NULL as missing_desktop_below_uncropped,
 35        screenshot_above_mobile_uncropped IS NULL as missing_mobile_above_uncropped
 36      FROM sites
 37      WHERE 
 38        screenshot_above_desktop IS NULL 
 39        OR screenshot_below_desktop IS NULL 
 40        OR screenshot_above_mobile IS NULL
 41        OR screenshot_above_desktop_uncropped IS NULL
 42        OR screenshot_below_desktop_uncropped IS NULL
 43        OR screenshot_above_mobile_uncropped IS NULL
 44      ORDER BY created_at DESC
 45      LIMIT ?
 46    `;
 47  
 48    return db.prepare(query).all(limit);
 49  }
 50  
 51  /**
 52   * Update site with new screenshots
 53   * @param {Database} db - SQLite database instance
 54   * @param {number} siteId - Site ID
 55   * @param {Object} screenshots - Screenshot buffers
 56   * @param {Object} screenshotsUncropped - Uncropped screenshot buffers
 57   */
 58  export function updateSiteScreenshots(db, siteId, screenshots, screenshotsUncropped) {
 59    const stmt = db.prepare(`
 60      UPDATE sites
 61      SET 
 62        screenshot_above_desktop = ?,
 63        screenshot_below_desktop = ?,
 64        screenshot_above_mobile = ?,
 65        screenshot_above_desktop_uncropped = ?,
 66        screenshot_below_desktop_uncropped = ?,
 67        screenshot_above_mobile_uncropped = ?,
 68        updated_at = CURRENT_TIMESTAMP
 69      WHERE id = ?
 70    `);
 71  
 72    stmt.run(
 73      screenshots.desktop_above || null,
 74      screenshots.desktop_below || null,
 75      screenshots.mobile_above || null,
 76      screenshotsUncropped.desktop_above || null,
 77      screenshotsUncropped.desktop_below || null,
 78      screenshotsUncropped.mobile_above || null,
 79      siteId
 80    );
 81  
 82    logger.success(`Updated screenshots for site ID ${siteId}`);
 83  }
 84  
 85  /**
 86   * Backfill screenshots for sites with missing data
 87   * @param {Database} db - SQLite database instance
 88   * @param {number} limit - Maximum number of sites to process
 89   * @returns {Promise<Object>} Results summary
 90   */
 91  export async function backfillScreenshots(db, limit = 100) {
 92    logger.info('Starting screenshot backfill...');
 93  
 94    const sites = findSitesWithMissingScreenshots(db, limit);
 95  
 96    if (sites.length === 0) {
 97      logger.success('No sites with missing screenshots found');
 98      return {
 99        total: 0,
100        success: 0,
101        failed: 0,
102        sites: [],
103      };
104    }
105  
106    logger.info(`Found ${sites.length} sites with missing screenshots`);
107  
108    const browser = await launchBrowser({ headless: true });
109    const context = await createStealthContext(browser);
110  
111    const results = {
112      total: sites.length,
113      success: 0,
114      failed: 0,
115      sites: [],
116    };
117  
118    try {
119      for (const site of sites) {
120        logger.info(`Processing ${site.domain} (ID: ${site.id})...`);
121  
122        try {
123          const captureResult = await captureScreenshots(context, site.landing_page_url, site.domain);
124  
125          if (captureResult.error) {
126            throw new Error(captureResult.error);
127          }
128  
129          // Update database with new screenshots
130          updateSiteScreenshots(
131            db,
132            site.id,
133            captureResult.screenshots,
134            captureResult.screenshotsUncropped
135          );
136  
137          results.success++;
138          results.sites.push({
139            id: site.id,
140            domain: site.domain,
141            status: 'success',
142          });
143  
144          logger.success(`Successfully backfilled screenshots for ${site.domain}`);
145        } catch (error) {
146          logger.error(`Failed to backfill screenshots for ${site.domain}`, error);
147  
148          results.failed++;
149          results.sites.push({
150            id: site.id,
151            domain: site.domain,
152            status: 'failed',
153            error: error.message,
154          });
155        }
156      }
157    } finally {
158      await context.close();
159      await browser.close();
160    }
161  
162    logger.success(
163      `Screenshot backfill complete: ${results.success} success, ${results.failed} failed`
164    );
165  
166    return results;
167  }
168  
// Default export: a single object bundling the three functions that are also
// exported by name from this module.
export default {
  findSitesWithMissingScreenshots,
  updateSiteScreenshots,
  backfillScreenshots,
};