#!/usr/bin/env node

/**
 * Analyze Screenshot Similarity (analyze-screenshot-similarity.js)
 *
 * Analyzes the size differences between cropped and uncropped screenshots
 * to help determine if the deduplication thresholds should be adjusted.
 *
 * Usage: node analyze-screenshot-similarity.js [limit]
 *   limit - maximum number of screenshot folders to analyze (default: 100)
 *
 * The screenshots directory is taken from the SCREENSHOT_BASE_PATH
 * environment variable, falling back to `../screenshots` relative to this file.
 */

import { readdir, stat } from 'fs/promises';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import sharp from 'sharp';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

const SCREENSHOTS_DIR = process.env.SCREENSHOT_BASE_PATH || join(__dirname, '..', 'screenshots');

// Number of folders analyzed when no (valid) limit is given on the command line
const DEFAULT_FOLDER_LIMIT = 100;

// Current thresholds
const SIZE_DIFF_THRESHOLD_BYTES = 5120; // 5KB
const SIZE_DIFF_THRESHOLD_PERCENT = 1; // 1%

// Screenshot pairs to check
const SCREENSHOT_PAIRS = [
  { cropped: 'desktop_above.jpg', uncropped: 'desktop_above_uncropped.jpg', type: 'desktop_above' },
  { cropped: 'desktop_below.jpg', uncropped: 'desktop_below_uncropped.jpg', type: 'desktop_below' },
  { cropped: 'mobile_above.jpg', uncropped: 'mobile_above_uncropped.jpg', type: 'mobile_above' },
];

// Percentiles reported for the size-difference distribution
const PERCENTILES = [
  { label: '10th percentile', fraction: 0.1 },
  { label: '25th percentile', fraction: 0.25 },
  { label: '50th percentile (median)', fraction: 0.5 },
  { label: '75th percentile', fraction: 0.75 },
  { label: '90th percentile', fraction: 0.9 },
];

// Looser "what-if" thresholds evaluated against the pairs that are currently kept
const ALTERNATIVE_THRESHOLDS = [
  { label: '10KB & 2%', bytes: 10240, percent: 2 },
  { label: '15KB & 3%', bytes: 15360, percent: 3 },
  { label: '20KB & 5%', bytes: 20480, percent: 5 },
];

const BYTE_UNITS = ['Bytes', 'KB', 'MB', 'GB'];

/**
 * Format bytes to human readable format
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} The size rounded to two decimals with a unit, e.g. "5 KB".
 */
function formatBytes(bytes) {
  if (bytes === 0) return '0 Bytes';
  if (!Number.isFinite(bytes)) return `${bytes} Bytes`;

  const k = 1024;
  const magnitude = Math.abs(bytes);
  const rawIndex = Math.floor(Math.log(magnitude) / Math.log(k));
  // Clamp so that values below 1 byte or at/above 1024 GB still map to a known unit
  const unitIndex = Math.min(Math.max(rawIndex, 0), BYTE_UNITS.length - 1);
  const scaled = Math.round((magnitude / Math.pow(k, unitIndex)) * 100) / 100;

  return `${bytes < 0 ? '-' : ''}${scaled} ${BYTE_UNITS[unitIndex]}`;
}

/**
 * Parse the optional folder limit from the command line.
 *
 * @param {string | undefined} rawValue - The raw CLI argument, if any.
 * @returns {number} A positive integer; DEFAULT_FOLDER_LIMIT when the argument
 *   is missing, not a number, or not positive.
 */
function parseFolderLimit(rawValue) {
  const parsed = Number.parseInt(rawValue, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_FOLDER_LIMIT;
}

/**
 * Tell whether a path exists and is a directory.
 *
 * @param {string} path - Path to inspect.
 * @returns {Promise<boolean>} False when the path cannot be stat'ed.
 */
async function isDirectory(path) {
  try {
    return (await stat(path)).isDirectory();
  } catch {
    // Entries that cannot be stat'ed are simply not analyzed
    return false;
  }
}

/**
 * List up to `limit` sub-directory names of the screenshots directory.
 *
 * The limit counts directories only, so stray files in the base directory
 * do not reduce the number of folders analyzed.
 *
 * @param {string} baseDir - Directory containing one folder per screenshot set.
 * @param {number} limit - Maximum number of folder names to return.
 * @returns {Promise<string[]>} Folder names, in the order returned by readdir.
 * @throws When the base directory itself cannot be read.
 */
async function listScreenshotFolders(baseDir, limit) {
  const entries = await readdir(baseDir);
  const folders = [];

  for (const entry of entries) {
    if (folders.length >= limit) break;
    if (await isDirectory(join(baseDir, entry))) {
      folders.push(entry);
    }
  }

  return folders;
}

/**
 * Compare cropped and uncropped screenshots
 *
 * The uncropped image is resized to the cropped image's dimensions and
 * re-encoded as JPEG; the two encoded buffer sizes are then compared against
 * the current thresholds.
 *
 * @param {string} croppedPath - Path of the cropped screenshot.
 * @param {string} uncroppedPath - Path of the uncropped screenshot.
 * @returns {Promise<object>} Either
 *   `{ croppedSize, uncroppedSize, sizeDiff, sizeDiffPercent, isIdentical }`
 *   or `{ error }` (the error message) when either image could not be processed.
 */
async function compareScreenshots(croppedPath, uncroppedPath) {
  try {
    // Load cropped image and get dimensions (independent reads, run concurrently)
    const [croppedBuffer, croppedMetadata] = await Promise.all([
      sharp(croppedPath).toBuffer(),
      sharp(croppedPath).metadata(),
    ]);

    // Resize uncropped to match cropped dimensions
    const resizedUncroppedBuffer = await sharp(uncroppedPath)
      .resize(croppedMetadata.width, croppedMetadata.height, {
        fit: 'cover',
        position: 'entropy',
      })
      .jpeg({
        quality: 85,
        mozjpeg: true,
      })
      .toBuffer();

    // Compare file sizes
    const croppedSize = croppedBuffer.length;
    const uncroppedSize = resizedUncroppedBuffer.length;
    const sizeDiff = Math.abs(croppedSize - uncroppedSize);
    const sizeDiffPercent = (sizeDiff / uncroppedSize) * 100;

    // Both conditions must hold for the pair to count as identical
    const isIdentical =
      sizeDiff < SIZE_DIFF_THRESHOLD_BYTES && sizeDiffPercent < SIZE_DIFF_THRESHOLD_PERCENT;

    return {
      croppedSize,
      uncroppedSize,
      sizeDiff,
      sizeDiffPercent,
      isIdentical,
    };
  } catch (error) {
    return { error: error.message };
  }
}

/**
 * Print the report for a single screenshot type.
 *
 * Sorts `typeStats.differences` in place by ascending size difference.
 *
 * @param {string} type - Screenshot type name (e.g. "desktop_above").
 * @param {object} typeStats - Counters and recorded differences for that type.
 */
function printTypeReport(type, typeStats) {
  const { checked, identical, different, errors, differences } = typeStats;
  const percentOfChecked = count => Math.round((count / checked) * 100);

  console.log(`\n=== ${type.toUpperCase()} ===`);
  console.log(`Checked: ${checked}`);
  console.log(`Identical (would delete): ${identical} (${percentOfChecked(identical)}%)`);
  console.log(`Different (would keep): ${different} (${percentOfChecked(different)}%)`);
  if (errors > 0) {
    // Failed comparisons are part of "Checked" but of neither bucket above
    console.log(`Errors (could not compare): ${errors} (${percentOfChecked(errors)}%)`);
  }

  if (differences.length === 0) return;

  // Sort by size difference
  differences.sort((a, b) => a.sizeDiff - b.sizeDiff);

  console.log('\nSize Difference Distribution:');

  // Show percentiles
  for (const { label, fraction } of PERCENTILES) {
    const sample = differences[Math.floor(differences.length * fraction)];
    console.log(
      ` ${label}: ${formatBytes(sample.sizeDiff)} (${sample.sizeDiffPercent.toFixed(2)}%)`
    );
  }

  // Show smallest differences that are still being kept
  console.log('\nSmallest 5 differences being KEPT:');
  differences.slice(0, 5).forEach((diff, i) => {
    console.log(
      ` ${i + 1}. Folder ${diff.folder}: ${formatBytes(diff.sizeDiff)} (${diff.sizeDiffPercent.toFixed(2)}%) - cropped: ${formatBytes(diff.croppedSize)}, uncropped: ${formatBytes(diff.uncroppedSize)}`
    );
  });

  // Count how many would be caught with different thresholds
  console.log('\nPotential Additional Deletions with Higher Thresholds:');
  for (const { label, bytes, percent } of ALTERNATIVE_THRESHOLDS) {
    const wouldCatch = differences.filter(
      d => d.sizeDiff < bytes && d.sizeDiffPercent < percent
    ).length;
    console.log(` At ${label}: ${wouldCatch} additional files`);
  }
}

/**
 * Entry point: scan the screenshot folders, compare every cropped/uncropped
 * pair that exists, and print per-type statistics plus the current thresholds.
 *
 * Exits the process with code 1 when the screenshots directory cannot be read.
 *
 * NOTE(review): the leading glyphs in several log strings below look like
 * mis-encoded emoji; they are left unchanged - confirm the intended characters.
 */
async function main() {
  console.log('š Analyzing screenshot similarity statistics...\n');

  const limit = parseFolderLimit(process.argv[2]);

  // Get all screenshot folders
  let screenshotFolders;
  try {
    screenshotFolders = await listScreenshotFolders(SCREENSHOTS_DIR, limit);
  } catch (error) {
    console.error('ā Error reading screenshots directory:', error.message);
    process.exit(1);
  }

  console.log(`Analyzing ${screenshotFolders.length} screenshot folders\n`);

  // Initialize stats for each type
  const stats = {
    byType: Object.fromEntries(
      SCREENSHOT_PAIRS.map(pair => [
        pair.type,
        {
          checked: 0,
          identical: 0,
          different: 0,
          errors: 0,
          differences: [], // Store all differences for distribution analysis
        },
      ])
    ),
  };

  // Check each folder
  for (const folder of screenshotFolders) {
    const folderPath = join(SCREENSHOTS_DIR, folder);

    // Check each screenshot pair
    for (const pair of SCREENSHOT_PAIRS) {
      const croppedPath = join(folderPath, pair.cropped);
      const uncroppedPath = join(folderPath, pair.uncropped);

      // Skip if either file doesn't exist
      try {
        await Promise.all([stat(croppedPath), stat(uncroppedPath)]);
      } catch {
        continue;
      }

      const typeStats = stats.byType[pair.type];
      typeStats.checked++;

      // Compare screenshots
      const comparison = await compareScreenshots(croppedPath, uncroppedPath);

      if (comparison.error) {
        typeStats.errors++;
        // Report on stderr so the failure reason is not lost and stdout stays a clean report
        console.warn(`Could not compare ${pair.type} in folder ${folder}: ${comparison.error}`);
        continue;
      }

      if (comparison.isIdentical) {
        typeStats.identical++;
      } else {
        typeStats.different++;
        typeStats.differences.push({
          folder,
          sizeDiff: comparison.sizeDiff,
          sizeDiffPercent: comparison.sizeDiffPercent,
          croppedSize: comparison.croppedSize,
          uncroppedSize: comparison.uncroppedSize,
        });
      }
    }
  }

  // Print results
  console.log('š Results by Screenshot Type:\n');

  for (const [type, typeStats] of Object.entries(stats.byType)) {
    if (typeStats.checked === 0) continue;
    printTypeReport(type, typeStats);
  }

  console.log('\n\nš” Current Thresholds:');
  console.log(` Size difference: < ${formatBytes(SIZE_DIFF_THRESHOLD_BYTES)}`);
  console.log(` Percentage difference: < ${SIZE_DIFF_THRESHOLD_PERCENT}%`);
  console.log(
    '\nNote: Both conditions must be met (AND logic) for files to be considered identical.'
  );
}

main().catch(error => {
  console.error('ā Fatal error:', error);
  process.exit(1);
});