file-filter.ts
import type { ChangedFile } from './types';

/**
 * Built-in patterns for generated / low-value files.
 *
 * Patterns are matched against the file's basename only. A pattern without
 * `*` must equal the basename exactly; each `*` matches any run of characters
 * (including none), e.g. `*.ext`, `*.mid.*`, `prefix_*.ext`.
 */
const DEFAULT_GENERATED_PATTERNS: string[] = [
  // Lock files
  'poetry.lock',
  'Pipfile.lock',
  'package-lock.json',
  'yarn.lock',
  'pnpm-lock.yaml',
  'Gemfile.lock',
  'Cargo.lock',
  'go.sum',
  'composer.lock',
  'flake.lock',
  'pdm.lock',
  'uv.lock',
  'bun.lockb',
  'packages.lock.json',
  'Podfile.lock',
  'pubspec.lock',

  // Minified assets
  '*.min.js',
  '*.min.css',

  // Generated code
  '*.generated.*',
  '*.g.dart',
  '*.freezed.dart',
  '*.pb.go',
  '*.pb.ts',

  // Snapshots
  '*.snap',
];

/**
 * Files exceeding this many changed lines (additions + deletions) are
 * auto-classified as generated, unless they have a recognized source extension.
 */
const LARGE_FILE_THRESHOLD = 2000;

/** Lower-case extensions (with leading dot) that are exempt from the size threshold. */
const SOURCE_EXTENSIONS = new Set([
  '.ts',
  '.tsx',
  '.js',
  '.jsx',
  '.mjs',
  '.cjs',
  '.py',
  '.rb',
  '.rs',
  '.go',
  '.java',
  '.kt',
  '.swift',
  '.c',
  '.cpp',
  '.h',
  '.hpp',
  '.cs',
  '.dart',
  '.vue',
  '.svelte',
  '.astro',
  '.php',
  '.scala',
  '.ex',
  '.exs',
  '.clj',
  '.hs',
  '.ml',
  '.fs',
  '.lua',
  '.zig',
  '.nim',
  '.md',
  '.mdx',
  '.txt',
  '.yml',
  '.yaml',
  '.toml',
  '.json',
  '.xml',
  '.html',
  '.css',
  '.scss',
  '.less',
  '.sql',
]);

/** Classification verdict for a single changed file. */
export interface FileClassification {
  filename: string;
  classification: 'generated' | 'normal';
  /** Human-readable explanation of why the file was classified this way. */
  reason?: string;
}

/** Result of splitting a change set into reviewable and excluded files. */
export interface FilterResult {
  normalFiles: ChangedFile[];
  generatedFiles: FileClassification[];
}

/** Returns the last `/`-separated segment of a path. */
function basenameOf(filename: string): string {
  return filename.split('/').pop() ?? filename;
}

/**
 * Returns the lower-cased extension of a path's basename, including the
 * leading dot, or `''` when the basename contains no dot.
 */
function extensionOf(filename: string): string {
  const basename = basenameOf(filename);
  const dot = basename.lastIndexOf('.');
  return dot === -1 ? '' : basename.slice(dot).toLowerCase();
}

/**
 * Tests a file path against a single basename pattern.
 *
 * @param filename - Path of the file; only its basename is considered.
 * @param pattern - Exact basename, or a glob in which `*` matches any run of
 *   characters (including an empty one).
 * @returns `true` when the basename matches the pattern.
 */
function matchesPattern(filename: string, pattern: string): boolean {
  const basename = basenameOf(filename);

  // No wildcard: exact basename match.
  if (!pattern.includes('*')) {
    return basename === pattern;
  }

  const segments = pattern.split('*');
  const head = segments[0] ?? '';
  const tail = segments[segments.length - 1] ?? '';

  // The literal text before the first `*` and after the last `*` is anchored
  // to the start / end of the basename and must not overlap.
  if (basename.length < head.length + tail.length) return false;
  if (!basename.startsWith(head) || !basename.endsWith(tail)) return false;

  // Literal segments between wildcards must appear in order inside the
  // region left over between the anchored head and tail.
  const limit = basename.length - tail.length;
  let cursor = head.length;
  for (const segment of segments.slice(1, -1)) {
    const found = basename.indexOf(segment, cursor);
    if (found === -1 || found + segment.length > limit) return false;
    cursor = found + segment.length;
  }
  return true;
}

/**
 * Splits changed files into those worth reviewing and those treated as
 * generated / low-value.
 *
 * A file is classified as generated when its basename matches one of the
 * built-in or extra patterns, or when its changed-line count exceeds
 * `LARGE_FILE_THRESHOLD` and its (case-insensitive) extension is not a
 * recognized source extension.
 *
 * @param changedFiles - Files changed in the diff.
 * @param extraPatterns - Additional basename patterns, checked after the built-in ones.
 * @returns The normal files (input order preserved) and the classifications
 *   of the excluded files.
 */
export function classifyFiles(changedFiles: ChangedFile[], extraPatterns: string[] = []): FilterResult {
  const patterns = [...DEFAULT_GENERATED_PATTERNS, ...extraPatterns];
  const normalFiles: ChangedFile[] = [];
  const generatedFiles: FileClassification[] = [];

  for (const file of changedFiles) {
    const matchedPattern = patterns.find((p) => matchesPattern(file.filename, p));
    if (matchedPattern) {
      generatedFiles.push({
        filename: file.filename,
        classification: 'generated',
        reason: `matches pattern: ${matchedPattern}`,
      });
      continue;
    }

    // Size threshold for non-source files. The extension comes from the
    // basename and is lower-cased so `README.MD` is treated like `README.md`.
    const totalChanged = file.additions + file.deletions;
    if (totalChanged > LARGE_FILE_THRESHOLD && !SOURCE_EXTENSIONS.has(extensionOf(file.filename))) {
      generatedFiles.push({
        filename: file.filename,
        classification: 'generated',
        reason: `exceeds ${LARGE_FILE_THRESHOLD} changed lines (${totalChanged})`,
      });
      continue;
    }

    normalFiles.push(file);
  }

  return { normalFiles, generatedFiles };
}

/** Captures everything after `diff --git ` (`.` stops before a trailing `\r`). */
const DIFF_HEADER_TAIL = /^diff --git (.*)/;

/** Fallback split of a header into old and new paths (greedy on the old path). */
const DIFF_HEADER_PATHS = /diff --git a\/(.*) b\/(.*)/;

/**
 * Extracts the (new) file path from a `diff --git a/<old> b/<new>` header.
 *
 * When old and new paths are identical the header has the shape
 * `a/P b/P`, so it can be split by length; this stays correct even when the
 * path itself contains ` b/`. Otherwise (e.g. renames) the header is split
 * with a regex and the new path is returned.
 */
function parseDiffHeaderPath(header: string): string | undefined {
  const tail = DIFF_HEADER_TAIL.exec(header)?.[1];
  if (tail !== undefined && tail.startsWith('a/')) {
    // "a/" + P + " b/" + P  =>  length is 5 + 2 * |P|
    const pathLength = (tail.length - 5) / 2;
    if (Number.isInteger(pathLength) && pathLength >= 0) {
      const candidate = tail.slice(2, 2 + pathLength);
      if (tail === `a/${candidate} b/${candidate}`) return candidate;
    }
  }

  const match = DIFF_HEADER_PATHS.exec(header);
  return match?.[2] ?? match?.[1];
}

/**
 * Removes the per-file sections of a unified git diff that belong to
 * generated files.
 *
 * @param rawDiff - Diff text made of `diff --git` sections.
 * @param generatedFilenames - Paths whose sections should be dropped.
 * @returns The diff without the sections for the given files; the input is
 *   returned unchanged when the set is empty.
 */
export function filterDiff(rawDiff: string, generatedFilenames: Set<string>): string {
  if (generatedFilenames.size === 0) return rawDiff;

  const result: string[] = [];
  let skipping = false;

  for (const line of rawDiff.split('\n')) {
    // Each header starts a new section and re-decides whether to skip it.
    if (line.startsWith('diff --git')) {
      const filePath = parseDiffHeaderPath(line);
      skipping = filePath ? generatedFilenames.has(filePath) : false;
    }

    if (!skipping) {
      result.push(line);
    }
  }

  return result.join('\n');
}

/**
 * Renders an `<excluded_files>` block listing the files that were left out
 * of the diff, one per line with its reason when one is recorded.
 *
 * @param generatedFiles - Classifications of the excluded files.
 * @returns The summary block, or `''` when nothing was excluded.
 */
export function buildExcludedFilesSummary(generatedFiles: FileClassification[]): string {
  if (generatedFiles.length === 0) return '';

  // `reason` is optional; omit the parenthetical rather than print "(undefined)".
  const lines = generatedFiles.map((f) =>
    f.reason === undefined ? ` - ${f.filename}` : ` - ${f.filename} (${f.reason})`,
  );

  return `<excluded_files count="${generatedFiles.length}">
The following files were changed in this PR but excluded from the diff and file contents because they are generated or low-value for review purposes:
${lines.join('\n')}
</excluded_files>`;
}