// constants/files.ts
  1  /**
  2   * Binary file extensions to skip for text-based operations.
  3   * These files can't be meaningfully compared as text and are often large.
  4   */
  5  export const BINARY_EXTENSIONS = new Set([
  6    // Images
  7    '.png',
  8    '.jpg',
  9    '.jpeg',
 10    '.gif',
 11    '.bmp',
 12    '.ico',
 13    '.webp',
 14    '.tiff',
 15    '.tif',
 16    // Videos
 17    '.mp4',
 18    '.mov',
 19    '.avi',
 20    '.mkv',
 21    '.webm',
 22    '.wmv',
 23    '.flv',
 24    '.m4v',
 25    '.mpeg',
 26    '.mpg',
 27    // Audio
 28    '.mp3',
 29    '.wav',
 30    '.ogg',
 31    '.flac',
 32    '.aac',
 33    '.m4a',
 34    '.wma',
 35    '.aiff',
 36    '.opus',
 37    // Archives
 38    '.zip',
 39    '.tar',
 40    '.gz',
 41    '.bz2',
 42    '.7z',
 43    '.rar',
 44    '.xz',
 45    '.z',
 46    '.tgz',
 47    '.iso',
 48    // Executables/binaries
 49    '.exe',
 50    '.dll',
 51    '.so',
 52    '.dylib',
 53    '.bin',
 54    '.o',
 55    '.a',
 56    '.obj',
 57    '.lib',
 58    '.app',
 59    '.msi',
 60    '.deb',
 61    '.rpm',
 62    // Documents (PDF is here; FileReadTool excludes it at the call site)
 63    '.pdf',
 64    '.doc',
 65    '.docx',
 66    '.xls',
 67    '.xlsx',
 68    '.ppt',
 69    '.pptx',
 70    '.odt',
 71    '.ods',
 72    '.odp',
 73    // Fonts
 74    '.ttf',
 75    '.otf',
 76    '.woff',
 77    '.woff2',
 78    '.eot',
 79    // Bytecode / VM artifacts
 80    '.pyc',
 81    '.pyo',
 82    '.class',
 83    '.jar',
 84    '.war',
 85    '.ear',
 86    '.node',
 87    '.wasm',
 88    '.rlib',
 89    // Database files
 90    '.sqlite',
 91    '.sqlite3',
 92    '.db',
 93    '.mdb',
 94    '.idx',
 95    // Design / 3D
 96    '.psd',
 97    '.ai',
 98    '.eps',
 99    '.sketch',
100    '.fig',
101    '.xd',
102    '.blend',
103    '.3ds',
104    '.max',
105    // Flash
106    '.swf',
107    '.fla',
108    // Lock/profiling data
109    '.lockb',
110    '.dat',
111    '.data',
112  ])
113  
/**
 * Check if a file path has a binary extension.
 *
 * The extension is everything from the last `.` in `filePath` to the end of
 * the string, lowercased before it is looked up in `BINARY_EXTENSIONS`.
 * Consequences of that definition:
 * - only the final suffix counts (`archive.tar.gz` is checked as `.gz`);
 * - a dotfile such as `.db` is treated as having the extension `.db`;
 * - a dot that only appears in a directory name yields a suffix containing a
 *   path separator, which never matches an entry.
 *
 * @param filePath - File name or path to inspect.
 * @returns `true` when the lowercased suffix is in `BINARY_EXTENSIONS`.
 */
export function hasBinaryExtension(filePath: string): boolean {
  const dotIndex = filePath.lastIndexOf('.')
  // No dot means no extension. Return early instead of relying on
  // `slice(-1)` yielding the final character, which merely happens not to
  // match any entry.
  if (dotIndex === -1) {
    return false
  }
  return BINARY_EXTENSIONS.has(filePath.slice(dotIndex).toLowerCase())
}

/**
 * Number of bytes to read for binary content detection.
 */
const BINARY_CHECK_SIZE = 8192

/**
 * Check if a buffer contains binary content by looking for null bytes
 * or a high proportion of non-printable characters.
 *
 * Only the first `BINARY_CHECK_SIZE` bytes (or the whole buffer if smaller)
 * are inspected:
 * - any null byte in that window marks the buffer as binary immediately;
 * - otherwise ASCII control bytes (0x01-0x1F and DEL 0x7F), excluding tab,
 *   newline and carriage return, are counted, and the buffer is binary when
 *   they make up more than 10% of the inspected bytes;
 * - bytes >= 0x80 are not counted, so multi-byte UTF-8 text is not penalised.
 *
 * @param buffer - Raw file contents (or a prefix of them).
 * @returns `true` if the inspected window looks binary; `false` otherwise,
 *   including for an empty buffer.
 */
export function isBinaryContent(buffer: Buffer): boolean {
  // Check first BINARY_CHECK_SIZE bytes (or full buffer if smaller)
  const checkSize = Math.min(buffer.length, BINARY_CHECK_SIZE)

  // Nothing to inspect: answer explicitly instead of via 0 / 0 (NaN).
  if (checkSize === 0) {
    return false
  }

  let nonPrintable = 0
  // subarray is a view (no copy), and iterating it yields plain numbers, so
  // no non-null assertion on indexed access is needed.
  for (const byte of buffer.subarray(0, checkSize)) {
    // Null byte is a strong indicator of binary
    if (byte === 0) {
      return true
    }
    // Printable ASCII is 32-126; tab, newline and carriage return are the
    // only control bytes tolerated as ordinary text.
    const isControl = byte < 32 || byte === 127
    const isCommonWhitespace = byte === 9 || byte === 10 || byte === 13
    if (isControl && !isCommonWhitespace) {
      nonPrintable++
    }
  }

  // If more than 10% non-printable, likely binary
  return nonPrintable / checkSize > 0.1
}