/ lib / file-filter.ts
file-filter.ts
  1  import type { ChangedFile } from './types';
  2  
  3  /**
  4   * Built-in patterns for generated / low-value files.
  5   * Supports: exact basename match, *.ext suffix, *.mid.* middle wildcard.
  6   */
  7  const DEFAULT_GENERATED_PATTERNS: string[] = [
  8    // Lock files
  9    'poetry.lock',
 10    'Pipfile.lock',
 11    'package-lock.json',
 12    'yarn.lock',
 13    'pnpm-lock.yaml',
 14    'Gemfile.lock',
 15    'Cargo.lock',
 16    'go.sum',
 17    'composer.lock',
 18    'flake.lock',
 19    'pdm.lock',
 20    'uv.lock',
 21    'bun.lockb',
 22    'packages.lock.json',
 23    'Podfile.lock',
 24    'pubspec.lock',
 25  
 26    // Minified assets
 27    '*.min.js',
 28    '*.min.css',
 29  
 30    // Generated code
 31    '*.generated.*',
 32    '*.g.dart',
 33    '*.freezed.dart',
 34    '*.pb.go',
 35    '*.pb.ts',
 36  
 37    // Snapshots
 38    '*.snap',
 39  ];
 40  
 41  /**
 42   * Files exceeding this many changed lines (additions + deletions) are
 43   * auto-classified as generated, unless they have a recognized source extension.
 44   */
 45  const LARGE_FILE_THRESHOLD = 2000;
 46  
 47  const SOURCE_EXTENSIONS = new Set([
 48    '.ts',
 49    '.tsx',
 50    '.js',
 51    '.jsx',
 52    '.mjs',
 53    '.cjs',
 54    '.py',
 55    '.rb',
 56    '.rs',
 57    '.go',
 58    '.java',
 59    '.kt',
 60    '.swift',
 61    '.c',
 62    '.cpp',
 63    '.h',
 64    '.hpp',
 65    '.cs',
 66    '.dart',
 67    '.vue',
 68    '.svelte',
 69    '.astro',
 70    '.php',
 71    '.scala',
 72    '.ex',
 73    '.exs',
 74    '.clj',
 75    '.hs',
 76    '.ml',
 77    '.fs',
 78    '.lua',
 79    '.zig',
 80    '.nim',
 81    '.md',
 82    '.mdx',
 83    '.txt',
 84    '.yml',
 85    '.yaml',
 86    '.toml',
 87    '.json',
 88    '.xml',
 89    '.html',
 90    '.css',
 91    '.scss',
 92    '.less',
 93    '.sql',
 94  ]);
 95  
 96  export interface FileClassification {
 97    filename: string;
 98    classification: 'generated' | 'normal';
 99    reason?: string;
100  }
101  
/** Changed files split into the two buckets produced by classification. */
export interface FilterResult {
  /** Files that were not classified as generated, passed through unchanged. */
  normalFiles: ChangedFile[];
  /** One classification record per file that was classified as generated. */
  generatedFiles: FileClassification[];
}
106  
/**
 * Tests whether the basename of `filename` matches a simple glob `pattern`.
 *
 * Only the last `/`-separated path component is compared; directory parts of
 * the path are ignored. In the pattern, `*` matches any run of characters
 * (including none) and every other character is literal. This covers the
 * three shapes used by the built-in list:
 *   - `yarn.lock`      exact basename
 *   - `*.min.js`       suffix
 *   - `*.generated.*`  text anywhere in the basename
 * and also handles wildcards in any other position (`schema-*.json`,
 * `*.gen.*.ts`) instead of ignoring the text after the last `*` or comparing
 * the `*` literally.
 *
 * @param filename Path of the file, using `/` as separator.
 * @param pattern Basename glob; `*` is the only special character.
 * @returns `true` when the basename matches the pattern.
 */
function matchesPattern(filename: string, pattern: string): boolean {
  const basename = filename.split('/').pop() ?? filename;

  const segments = pattern.split('*');
  if (segments.length === 1) {
    // No wildcard at all: exact basename match.
    return basename === pattern;
  }

  const head = segments[0] ?? '';
  const tail = segments[segments.length - 1] ?? '';

  // The literal prefix and suffix must both fit without overlapping.
  if (basename.length < head.length + tail.length) return false;
  if (!basename.startsWith(head) || !basename.endsWith(tail)) return false;

  // Literal pieces between wildcards must appear in order, strictly between
  // the prefix and the suffix. Taking the leftmost occurrence each time
  // leaves the most room for the remaining pieces.
  const limit = basename.length - tail.length;
  let cursor = head.length;
  for (const segment of segments.slice(1, -1)) {
    const at = basename.indexOf(segment, cursor);
    if (at === -1 || at + segment.length > limit) return false;
    cursor = at + segment.length;
  }

  return true;
}
124  
/**
 * Splits changed files into normal files and generated / low-value files.
 *
 * A file is classified as generated when either:
 *   1. its basename matches one of the built-in patterns or `extraPatterns`
 *      (built-ins are checked first; the first match is reported), or
 *   2. its additions plus deletions exceed `LARGE_FILE_THRESHOLD` and its
 *      extension is not a recognized source extension.
 *
 * The extension is taken from the basename only and compared
 * case-insensitively, so `README.MD` counts as `.md`, and a dot inside a
 * directory name or an extension-less file is never mistaken for an
 * extension.
 *
 * @param changedFiles Files to classify; `filename`, `additions` and
 *   `deletions` are read from each entry.
 * @param extraPatterns Additional basename patterns, checked after the
 *   built-in ones.
 * @returns Normal files (same objects, input order) and one classification
 *   record, with a reason, per generated file.
 */
export function classifyFiles(changedFiles: ChangedFile[], extraPatterns: string[] = []): FilterResult {
  const patterns = [...DEFAULT_GENERATED_PATTERNS, ...extraPatterns];
  const normalFiles: ChangedFile[] = [];
  const generatedFiles: FileClassification[] = [];

  for (const file of changedFiles) {
    const { filename } = file;

    const matchedPattern = patterns.find((p) => matchesPattern(filename, p));
    if (matchedPattern) {
      generatedFiles.push({
        filename,
        classification: 'generated',
        reason: `matches pattern: ${matchedPattern}`,
      });
      continue;
    }

    // Size threshold for non-source files. Derive the extension from the
    // basename so dots in directory names do not count, and lower-case it
    // because SOURCE_EXTENSIONS holds lower-case entries only.
    const basename = filename.slice(filename.lastIndexOf('/') + 1);
    const dotIndex = basename.lastIndexOf('.');
    const ext = dotIndex === -1 ? '' : basename.slice(dotIndex).toLowerCase();

    const totalChanged = file.additions + file.deletions;
    if (totalChanged > LARGE_FILE_THRESHOLD && !SOURCE_EXTENSIONS.has(ext)) {
      generatedFiles.push({
        filename,
        classification: 'generated',
        reason: `exceeds ${LARGE_FILE_THRESHOLD} changed lines (${totalChanged})`,
      });
      continue;
    }

    normalFiles.push(file);
  }

  return { normalFiles, generatedFiles };
}
158  
/**
 * Extracts the file path from a `diff --git a/<old> b/<new>` header line.
 *
 * When old and new paths are identical (the usual, non-rename case) the
 * header is split symmetrically, which stays correct even if the path itself
 * contains the text " b/". Otherwise the header is split at the last " b/"
 * and the new path is returned. Returns `undefined` when the line does not
 * have the expected shape (for example when the paths are quoted).
 */
function parseDiffHeaderPath(headerLine: string): string | undefined {
  const prefix = 'diff --git ';
  if (headerLine.startsWith(prefix)) {
    // Drop a trailing CR so CRLF diffs parse the same way as LF diffs.
    const rest = headerLine.slice(prefix.length).replace(/\r$/, '');
    // rest === "a/" + path + " b/" + path  =>  rest.length === 2 * path.length + 5
    const pathLength = (rest.length - 5) / 2;
    if (Number.isInteger(pathLength) && pathLength > 0) {
      const candidate = rest.slice(2, 2 + pathLength);
      if (rest === `a/${candidate} b/${candidate}`) return candidate;
    }
  }

  const match = headerLine.match(/diff --git a\/(.*) b\/(.*)/);
  return match?.[2] ?? match?.[1];
}

/**
 * Removes the sections of a unified git diff that belong to generated files.
 *
 * The diff is scanned line by line. Each `diff --git` header starts a new
 * file section; the section is dropped when the header's path is contained in
 * `generatedFilenames`, and kept otherwise. Lines before the first header are
 * always kept.
 *
 * @param rawDiff Diff text with one `diff --git` header per file.
 * @param generatedFilenames Paths (as they appear after `b/` in the header)
 *   whose sections should be removed.
 * @returns The diff without the removed sections; `rawDiff` itself when the
 *   set is empty.
 */
export function filterDiff(rawDiff: string, generatedFilenames: Set<string>): string {
  if (generatedFilenames.size === 0) return rawDiff;

  const kept: string[] = [];
  let skipping = false;

  for (const line of rawDiff.split('\n')) {
    if (line.startsWith('diff --git')) {
      const filePath = parseDiffHeaderPath(line);
      // An unparseable header ends any skip in progress so its section is kept.
      skipping = filePath ? generatedFilenames.has(filePath) : false;
    }

    if (!skipping) {
      kept.push(line);
    }
  }

  return kept.join('\n');
}
180  
/**
 * Renders an `<excluded_files>` block listing the files that were left out of
 * the diff and file contents.
 *
 * Each file becomes one `  - <filename> (<reason>)` bullet. Because `reason`
 * is optional, the parenthetical is omitted when no reason was recorded
 * rather than printing `(undefined)`.
 *
 * @param generatedFiles Classification records of the excluded files.
 * @returns The rendered block, or an empty string when the list is empty.
 */
export function buildExcludedFilesSummary(generatedFiles: FileClassification[]): string {
  if (generatedFiles.length === 0) return '';

  const lines = generatedFiles.map((f) =>
    f.reason === undefined ? `  - ${f.filename}` : `  - ${f.filename} (${f.reason})`,
  );

  return `<excluded_files count="${generatedFiles.length}">
The following files were changed in this PR but excluded from the diff and file contents because they are generated or low-value for review purposes:
${lines.join('\n')}
</excluded_files>`;
}
190  }