/ utils / generatedFiles.ts
generatedFiles.ts
  1  import { basename, extname, posix, sep } from 'path'
  2  
  3  /**
  4   * File patterns that should be excluded from attribution.
  5   * Based on GitHub Linguist vendored patterns and common generated file patterns.
  6   */
  7  
  8  // Exact file name matches (case-insensitive)
  9  const EXCLUDED_FILENAMES = new Set([
 10    'package-lock.json',
 11    'yarn.lock',
 12    'pnpm-lock.yaml',
 13    'bun.lockb',
 14    'bun.lock',
 15    'composer.lock',
 16    'gemfile.lock',
 17    'cargo.lock',
 18    'poetry.lock',
 19    'pipfile.lock',
 20    'shrinkwrap.json',
 21    'npm-shrinkwrap.json',
 22  ])
 23  
 24  // File extension patterns (case-insensitive)
 25  const EXCLUDED_EXTENSIONS = new Set([
 26    '.lock',
 27    '.min.js',
 28    '.min.css',
 29    '.min.html',
 30    '.bundle.js',
 31    '.bundle.css',
 32    '.generated.ts',
 33    '.generated.js',
 34    '.d.ts', // TypeScript declaration files
 35  ])
 36  
 37  // Directory patterns that indicate generated/vendored content
 38  const EXCLUDED_DIRECTORIES = [
 39    '/dist/',
 40    '/build/',
 41    '/out/',
 42    '/output/',
 43    '/node_modules/',
 44    '/vendor/',
 45    '/vendored/',
 46    '/third_party/',
 47    '/third-party/',
 48    '/external/',
 49    '/.next/',
 50    '/.nuxt/',
 51    '/.svelte-kit/',
 52    '/coverage/',
 53    '/__pycache__/',
 54    '/.tox/',
 55    '/venv/',
 56    '/.venv/',
 57    '/target/release/',
 58    '/target/debug/',
 59  ]
 60  
 61  // Filename patterns using regex for more complex matching
 62  const EXCLUDED_FILENAME_PATTERNS = [
 63    /^.*\.min\.[a-z]+$/i, // *.min.*
 64    /^.*-min\.[a-z]+$/i, // *-min.*
 65    /^.*\.bundle\.[a-z]+$/i, // *.bundle.*
 66    /^.*\.generated\.[a-z]+$/i, // *.generated.*
 67    /^.*\.gen\.[a-z]+$/i, // *.gen.*
 68    /^.*\.auto\.[a-z]+$/i, // *.auto.*
 69    /^.*_generated\.[a-z]+$/i, // *_generated.*
 70    /^.*_gen\.[a-z]+$/i, // *_gen.*
 71    /^.*\.pb\.(go|js|ts|py|rb)$/i, // Protocol buffer generated files
 72    /^.*_pb2?\.py$/i, // Python protobuf files
 73    /^.*\.pb\.h$/i, // C++ protobuf headers
 74    /^.*\.grpc\.[a-z]+$/i, // gRPC generated files
 75    /^.*\.swagger\.[a-z]+$/i, // Swagger generated files
 76    /^.*\.openapi\.[a-z]+$/i, // OpenAPI generated files
 77  ]
 78  
 79  /**
 80   * Check if a file should be excluded from attribution based on Linguist-style rules.
 81   *
 82   * @param filePath - Relative file path from repository root
 83   * @returns true if the file should be excluded from attribution
 84   */
 85  export function isGeneratedFile(filePath: string): boolean {
 86    // Normalize path separators for consistent pattern matching (patterns use posix-style /)
 87    const normalizedPath =
 88      posix.sep + filePath.split(sep).join(posix.sep).replace(/^\/+/, '')
 89    const fileName = basename(filePath).toLowerCase()
 90    const ext = extname(filePath).toLowerCase()
 91  
 92    // Check exact filename matches
 93    if (EXCLUDED_FILENAMES.has(fileName)) {
 94      return true
 95    }
 96  
 97    // Check extension matches
 98    if (EXCLUDED_EXTENSIONS.has(ext)) {
 99      return true
100    }
101  
102    // Check for compound extensions like .min.js
103    const parts = fileName.split('.')
104    if (parts.length > 2) {
105      const compoundExt = '.' + parts.slice(-2).join('.')
106      if (EXCLUDED_EXTENSIONS.has(compoundExt)) {
107        return true
108      }
109    }
110  
111    // Check directory patterns
112    for (const dir of EXCLUDED_DIRECTORIES) {
113      if (normalizedPath.includes(dir)) {
114        return true
115      }
116    }
117  
118    // Check filename patterns
119    for (const pattern of EXCLUDED_FILENAME_PATTERNS) {
120      if (pattern.test(fileName)) {
121        return true
122      }
123    }
124  
125    return false
126  }
127  
/**
 * Drop every path that isGeneratedFile classifies as generated, keeping the
 * remaining paths in their original order. The input array is not modified.
 *
 * @param files - Array of file paths
 * @returns Array of files that are not generated
 */
export function filterGeneratedFiles(files: string[]): string[] {
  const isNotGenerated = (candidate: string): boolean =>
    !isGeneratedFile(candidate)
  return files.filter(isNotGenerated)
}