/ scripts / analyze-screenshot-similarity.js
analyze-screenshot-similarity.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * Analyze Screenshot Similarity
  5   *
  6   * Analyzes the size differences between cropped and uncropped screenshots
  7   * to help determine if the deduplication thresholds should be adjusted.
  8   */
  9  
 10  import { readdir, stat } from 'fs/promises';
 11  import { join, dirname } from 'path';
 12  import { fileURLToPath } from 'url';
 13  import sharp from 'sharp';
 14  
 15  const __filename = fileURLToPath(import.meta.url);
 16  const __dirname = dirname(__filename);
 17  
 18  const SCREENSHOTS_DIR = process.env.SCREENSHOT_BASE_PATH || join(__dirname, '..', 'screenshots');
 19  
 20  // Current thresholds
 21  const SIZE_DIFF_THRESHOLD_BYTES = 5120; // 5KB
 22  const SIZE_DIFF_THRESHOLD_PERCENT = 1; // 1%
 23  
 24  // Screenshot pairs to check
 25  const SCREENSHOT_PAIRS = [
 26    { cropped: 'desktop_above.jpg', uncropped: 'desktop_above_uncropped.jpg', type: 'desktop_above' },
 27    { cropped: 'desktop_below.jpg', uncropped: 'desktop_below_uncropped.jpg', type: 'desktop_below' },
 28    { cropped: 'mobile_above.jpg', uncropped: 'mobile_above_uncropped.jpg', type: 'mobile_above' },
 29  ];
 30  
 31  /**
 32   * Format bytes to human readable format
 33   */
 34  function formatBytes(bytes) {
 35    if (bytes === 0) return '0 Bytes';
 36    const k = 1024;
 37    const sizes = ['Bytes', 'KB', 'MB', 'GB'];
 38    const i = Math.floor(Math.log(bytes) / Math.log(k));
 39    return `${Math.round((bytes / Math.pow(k, i)) * 100) / 100} ${sizes[i]}`;
 40  }
 41  
 42  /**
 43   * Compare cropped and uncropped screenshots
 44   */
 45  async function compareScreenshots(croppedPath, uncroppedPath) {
 46    try {
 47      // Load cropped image and get dimensions
 48      const croppedBuffer = await sharp(croppedPath).toBuffer();
 49      const croppedMetadata = await sharp(croppedPath).metadata();
 50  
 51      // Resize uncropped to match cropped dimensions
 52      const resizedUncroppedBuffer = await sharp(uncroppedPath)
 53        .resize(croppedMetadata.width, croppedMetadata.height, {
 54          fit: 'cover',
 55          position: 'entropy',
 56        })
 57        .jpeg({
 58          quality: 85,
 59          mozjpeg: true,
 60        })
 61        .toBuffer();
 62  
 63      // Compare file sizes
 64      const croppedSize = croppedBuffer.length;
 65      const uncroppedSize = resizedUncroppedBuffer.length;
 66      const sizeDiff = Math.abs(croppedSize - uncroppedSize);
 67      const sizeDiffPercent = (sizeDiff / uncroppedSize) * 100;
 68  
 69      const isIdentical =
 70        sizeDiff < SIZE_DIFF_THRESHOLD_BYTES && sizeDiffPercent < SIZE_DIFF_THRESHOLD_PERCENT;
 71  
 72      return {
 73        croppedSize,
 74        uncroppedSize,
 75        sizeDiff,
 76        sizeDiffPercent,
 77        isIdentical,
 78      };
 79    } catch (error) {
 80      return { error: error.message };
 81    }
 82  }
 83  
 84  async function main() {
 85    console.log('šŸ“Š Analyzing screenshot similarity statistics...\n');
 86  
 87    const limit = parseInt(process.argv[2]) || 100; // Default to analyzing 100 folders
 88  
 89    // Get all screenshot folders
 90    let screenshotFolders;
 91    try {
 92      screenshotFolders = await readdir(SCREENSHOTS_DIR);
 93      screenshotFolders = screenshotFolders.slice(0, limit);
 94    } catch (error) {
 95      console.error('āŒ Error reading screenshots directory:', error.message);
 96      process.exit(1);
 97    }
 98  
 99    console.log(`Analyzing ${screenshotFolders.length} screenshot folders\n`);
100  
101    const stats = {
102      byType: {},
103    };
104  
105    // Initialize stats for each type
106    SCREENSHOT_PAIRS.forEach(pair => {
107      stats.byType[pair.type] = {
108        checked: 0,
109        identical: 0,
110        different: 0,
111        errors: 0,
112        differences: [], // Store all differences for distribution analysis
113      };
114    });
115  
116    // Check each folder
117    for (const folder of screenshotFolders) {
118      const folderPath = join(SCREENSHOTS_DIR, folder);
119  
120      // Skip if not a directory
121      try {
122        const folderStats = await stat(folderPath);
123        if (!folderStats.isDirectory()) {
124          continue;
125        }
126      } catch {
127        continue;
128      }
129  
130      // Check each screenshot pair
131      for (const pair of SCREENSHOT_PAIRS) {
132        const croppedPath = join(folderPath, pair.cropped);
133        const uncroppedPath = join(folderPath, pair.uncropped);
134  
135        // Skip if either file doesn't exist
136        try {
137          await stat(croppedPath);
138          await stat(uncroppedPath);
139        } catch {
140          continue;
141        }
142  
143        stats.byType[pair.type].checked++;
144  
145        // Compare screenshots
146        const comparison = await compareScreenshots(croppedPath, uncroppedPath);
147  
148        if (comparison.error) {
149          stats.byType[pair.type].errors++;
150          continue;
151        }
152  
153        if (comparison.isIdentical) {
154          stats.byType[pair.type].identical++;
155        } else {
156          stats.byType[pair.type].different++;
157          stats.byType[pair.type].differences.push({
158            folder,
159            sizeDiff: comparison.sizeDiff,
160            sizeDiffPercent: comparison.sizeDiffPercent,
161            croppedSize: comparison.croppedSize,
162            uncroppedSize: comparison.uncroppedSize,
163          });
164        }
165      }
166    }
167  
168    // Print results
169    console.log('šŸ“Š Results by Screenshot Type:\n');
170  
171    for (const [type, typeStats] of Object.entries(stats.byType)) {
172      if (typeStats.checked === 0) continue;
173  
174      console.log(`\n=== ${type.toUpperCase()} ===`);
175      console.log(`Checked: ${typeStats.checked}`);
176      console.log(
177        `Identical (would delete): ${typeStats.identical} (${Math.round((typeStats.identical / typeStats.checked) * 100)}%)`
178      );
179      console.log(
180        `Different (would keep): ${typeStats.different} (${Math.round((typeStats.different / typeStats.checked) * 100)}%)`
181      );
182  
183      if (typeStats.differences.length > 0) {
184        // Sort by size difference
185        typeStats.differences.sort((a, b) => a.sizeDiff - b.sizeDiff);
186  
187        console.log('\nSize Difference Distribution:');
188  
189        // Show percentiles
190        const p10 = typeStats.differences[Math.floor(typeStats.differences.length * 0.1)];
191        const p25 = typeStats.differences[Math.floor(typeStats.differences.length * 0.25)];
192        const p50 = typeStats.differences[Math.floor(typeStats.differences.length * 0.5)];
193        const p75 = typeStats.differences[Math.floor(typeStats.differences.length * 0.75)];
194        const p90 = typeStats.differences[Math.floor(typeStats.differences.length * 0.9)];
195  
196        console.log(
197          `  10th percentile: ${formatBytes(p10.sizeDiff)} (${p10.sizeDiffPercent.toFixed(2)}%)`
198        );
199        console.log(
200          `  25th percentile: ${formatBytes(p25.sizeDiff)} (${p25.sizeDiffPercent.toFixed(2)}%)`
201        );
202        console.log(
203          `  50th percentile (median): ${formatBytes(p50.sizeDiff)} (${p50.sizeDiffPercent.toFixed(2)}%)`
204        );
205        console.log(
206          `  75th percentile: ${formatBytes(p75.sizeDiff)} (${p75.sizeDiffPercent.toFixed(2)}%)`
207        );
208        console.log(
209          `  90th percentile: ${formatBytes(p90.sizeDiff)} (${p90.sizeDiffPercent.toFixed(2)}%)`
210        );
211  
212        // Show smallest differences that are still being kept
213        console.log('\nSmallest 5 differences being KEPT:');
214        typeStats.differences.slice(0, 5).forEach((diff, i) => {
215          console.log(
216            `  ${i + 1}. Folder ${diff.folder}: ${formatBytes(diff.sizeDiff)} (${diff.sizeDiffPercent.toFixed(2)}%) - cropped: ${formatBytes(diff.croppedSize)}, uncropped: ${formatBytes(diff.uncroppedSize)}`
217          );
218        });
219  
220        // Count how many would be caught with different thresholds
221        const wouldCatchAt10KB = typeStats.differences.filter(
222          d => d.sizeDiff < 10240 && d.sizeDiffPercent < 2
223        ).length;
224        const wouldCatchAt15KB = typeStats.differences.filter(
225          d => d.sizeDiff < 15360 && d.sizeDiffPercent < 3
226        ).length;
227        const wouldCatchAt20KB = typeStats.differences.filter(
228          d => d.sizeDiff < 20480 && d.sizeDiffPercent < 5
229        ).length;
230  
231        console.log('\nPotential Additional Deletions with Higher Thresholds:');
232        console.log(`  At 10KB & 2%: ${wouldCatchAt10KB} additional files`);
233        console.log(`  At 15KB & 3%: ${wouldCatchAt15KB} additional files`);
234        console.log(`  At 20KB & 5%: ${wouldCatchAt20KB} additional files`);
235      }
236    }
237  
238    console.log('\n\nšŸ’” Current Thresholds:');
239    console.log(`   Size difference: < ${formatBytes(SIZE_DIFF_THRESHOLD_BYTES)}`);
240    console.log(`   Percentage difference: < ${SIZE_DIFF_THRESHOLD_PERCENT}%`);
241    console.log(
242      '\nNote: Both conditions must be met (AND logic) for files to be considered identical.'
243    );
244  }
245  
// Start the analysis. Anything that escapes main() is reported and turned into
// a non-zero exit code. The glyph is a Unicode escape so the source stays ASCII
// and cannot be garbled by an encoding mismatch.
main().catch(error => {
  console.error('\u274C Fatal error:', error);
  process.exit(1);
});