// utils/gitDiff.ts
  1  import type { StructuredPatchHunk } from 'diff'
  2  import { access, readFile } from 'fs/promises'
  3  import { dirname, join, relative, sep } from 'path'
  4  import { getCwd } from './cwd.js'
  5  import { getCachedRepository } from './detectRepository.js'
  6  import { execFileNoThrow, execFileNoThrowWithCwd } from './execFileNoThrow.js'
  7  import { isFileWithinReadSizeLimit } from './file.js'
  8  import {
  9    findGitRoot,
 10    getDefaultBranch,
 11    getGitDir,
 12    getIsGit,
 13    gitExe,
 14  } from './git.js'
 15  
 16  export type GitDiffStats = {
 17    filesCount: number
 18    linesAdded: number
 19    linesRemoved: number
 20  }
 21  
 22  export type PerFileStats = {
 23    added: number
 24    removed: number
 25    isBinary: boolean
 26    isUntracked?: boolean
 27  }
 28  
 29  export type GitDiffResult = {
 30    stats: GitDiffStats
 31    perFileStats: Map<string, PerFileStats>
 32    hunks: Map<string, StructuredPatchHunk[]>
 33  }
 34  
 35  const GIT_TIMEOUT_MS = 5000
 36  const MAX_FILES = 50
 37  const MAX_DIFF_SIZE_BYTES = 1_000_000 // 1 MB - skip files larger than this
 38  const MAX_LINES_PER_FILE = 400 // GitHub's auto-load limit
 39  const MAX_FILES_FOR_DETAILS = 500 // Skip per-file details if more files than this
 40  
 41  /**
 42   * Fetch git diff stats and hunks comparing working tree to HEAD.
 43   * Returns null if not in a git repo or if git commands fail.
 44   *
 45   * Returns null during merge/rebase/cherry-pick/revert operations since the
 46   * working tree contains incoming changes that weren't intentionally
 47   * made by the user.
 48   */
 49  export async function fetchGitDiff(): Promise<GitDiffResult | null> {
 50    const isGit = await getIsGit()
 51    if (!isGit) return null
 52  
 53    // Skip diff calculation during transient git states since the
 54    // working tree contains incoming changes, not user-intentional edits
 55    if (await isInTransientGitState()) {
 56      return null
 57    }
 58  
 59    // Quick probe: use --shortstat to get totals without loading all content.
 60    // This is O(1) memory and lets us detect massive diffs (e.g., jj workspaces)
 61    // before committing to expensive operations.
 62    const { stdout: shortstatOut, code: shortstatCode } = await execFileNoThrow(
 63      gitExe(),
 64      ['--no-optional-locks', 'diff', 'HEAD', '--shortstat'],
 65      { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
 66    )
 67  
 68    if (shortstatCode === 0) {
 69      const quickStats = parseShortstat(shortstatOut)
 70      if (quickStats && quickStats.filesCount > MAX_FILES_FOR_DETAILS) {
 71        // Too many files - return accurate totals but skip per-file details
 72        // to avoid loading hundreds of MB into memory
 73        return {
 74          stats: quickStats,
 75          perFileStats: new Map(),
 76          hunks: new Map(),
 77        }
 78      }
 79    }
 80  
 81    // Get stats via --numstat (all uncommitted changes vs HEAD)
 82    const { stdout: numstatOut, code: numstatCode } = await execFileNoThrow(
 83      gitExe(),
 84      ['--no-optional-locks', 'diff', 'HEAD', '--numstat'],
 85      { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
 86    )
 87  
 88    if (numstatCode !== 0) return null
 89  
 90    const { stats, perFileStats } = parseGitNumstat(numstatOut)
 91  
 92    // Include untracked files (new files not yet staged)
 93    // Just filenames - no content reading for performance
 94    const remainingSlots = MAX_FILES - perFileStats.size
 95    if (remainingSlots > 0) {
 96      const untrackedStats = await fetchUntrackedFiles(remainingSlots)
 97      if (untrackedStats) {
 98        stats.filesCount += untrackedStats.size
 99        for (const [path, fileStats] of untrackedStats) {
100          perFileStats.set(path, fileStats)
101        }
102      }
103    }
104  
105    // Return stats only - hunks are fetched on-demand via fetchGitDiffHunks()
106    // to avoid expensive git diff HEAD call on every poll
107    return { stats, perFileStats, hunks: new Map() }
108  }
109  
/**
 * Load the parsed hunks of `git diff HEAD` on demand (for DiffDialog).
 * Kept apart from fetchGitDiff() so polling does not pay for a full diff.
 *
 * Resolves to an empty map outside a git repo, during a transient git state,
 * or when the git command exits non-zero.
 */
export async function fetchGitDiffHunks(): Promise<
  Map<string, StructuredPatchHunk[]>
> {
  const insideRepo = await getIsGit()
  if (!insideRepo || (await isInTransientGitState())) {
    return new Map()
  }

  const fullDiff = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'diff', 'HEAD'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )

  return fullDiff.code === 0 ? parseGitDiff(fullDiff.stdout) : new Map()
}
136  
/** Output of parseGitNumstat: overall totals plus capped per-file entries. */
export type NumstatResult = {
  /** Totals over every valid numstat line. */
  stats: GitDiffStats
  /** Per-file entries keyed by path (only the first MAX_FILES are kept). */
  perFileStats: Map<string, PerFileStats>
}
141  
/**
 * Turn `git diff --numstat` output into totals and per-file stats.
 *
 * Each line has the form `<added>\t<removed>\t<filename>`; binary files carry
 * '-' in both count columns and contribute 0 lines. Lines with fewer than
 * three tab-separated fields are ignored. Totals cover every valid line, but
 * only the first MAX_FILES files are stored in perFileStats.
 */
export function parseGitNumstat(stdout: string): NumstatResult {
  const perFileStats = new Map<string, PerFileStats>()
  const totals: GitDiffStats = { filesCount: 0, linesAdded: 0, linesRemoved: 0 }

  for (const row of stdout.trim().split('\n')) {
    if (!row) continue

    // Anything past the second tab belongs to the filename (it may hold tabs).
    const [addedField, removedField, ...pathFields] = row.split('\t')
    if (pathFields.length === 0) continue

    const isBinary = addedField === '-' || removedField === '-'
    let fileAdded = 0
    let fileRemoved = 0
    if (!isBinary) {
      fileAdded = parseInt(addedField ?? '0', 10) || 0
      fileRemoved = parseInt(removedField ?? '0', 10) || 0
    }

    totals.filesCount += 1
    totals.linesAdded += fileAdded
    totals.linesRemoved += fileRemoved

    // Keep per-file detail only while under the cap.
    if (perFileStats.size < MAX_FILES) {
      perFileStats.set(pathFields.join('\t'), {
        added: fileAdded,
        removed: fileRemoved,
        isBinary,
      })
    }
  }

  return { stats: totals, perFileStats }
}
190  
/**
 * Parse unified diff output into per-file hunks.
 * Splits on "diff --git" and parses each file section independently.
 *
 * Limits applied:
 * - MAX_FILES: stop once this many files have been collected
 * - Sections longer than MAX_DIFF_SIZE_BYTES: skipped entirely (not in result)
 * - Other sections: at most MAX_LINES_PER_FILE diff lines are retained
 *
 * Files without any hunk (e.g. binary or mode-only changes) are omitted.
 *
 * Header lines (index, ---, +++, mode changes, ...) are recognised by
 * position - everything before the file's first "@@" - rather than by prefix.
 * Inside a hunk a removed line whose content starts with "--" appears as
 * "---..." and an added line starting with "++" appears as "+++..."; a
 * prefix-based filter would silently drop those content lines.
 *
 * @param stdout Raw output of `git diff`
 * @returns Map from file path (the "b/" side of the header) to its hunks
 */
export function parseGitDiff(
  stdout: string,
): Map<string, StructuredPatchHunk[]> {
  const result = new Map<string, StructuredPatchHunk[]>()
  if (!stdout.trim()) return result

  // Split by file diffs
  const fileDiffs = stdout.split(/^diff --git /m).filter(Boolean)

  for (const fileDiff of fileDiffs) {
    // Stop after MAX_FILES
    if (result.size >= MAX_FILES) break

    // Skip oversized file sections
    if (fileDiff.length > MAX_DIFF_SIZE_BYTES) {
      continue
    }

    const lines = fileDiff.split('\n')

    // Extract filename from first line: "a/path/to/file b/path/to/file"
    const headerMatch = lines[0]?.match(/^a\/(.+?) b\/(.+)$/)
    if (!headerMatch) continue
    const filePath = headerMatch[2] ?? headerMatch[1] ?? ''

    const fileHunks: StructuredPatchHunk[] = []
    let currentHunk: StructuredPatchHunk | null = null
    let lineCount = 0

    for (let i = 1; i < lines.length; i++) {
      const line = lines[i] ?? ''

      // Hunk header: @@ -oldStart,oldLines +newStart,newLines @@
      const hunkMatch = line.match(
        /^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/,
      )
      if (hunkMatch) {
        if (currentHunk) {
          fileHunks.push(currentHunk)
        }
        currentHunk = {
          oldStart: parseInt(hunkMatch[1] ?? '0', 10),
          oldLines: parseInt(hunkMatch[2] ?? '1', 10),
          newStart: parseInt(hunkMatch[3] ?? '0', 10),
          newLines: parseInt(hunkMatch[4] ?? '1', 10),
          lines: [],
        }
        continue
      }

      // Until the first hunk header, every line is file metadata
      // (index/---/+++/mode/"Binary files" ...) and carries no diff content.
      if (!currentHunk) continue

      const isDiffLine =
        line.startsWith('+') ||
        line.startsWith('-') ||
        line.startsWith(' ') ||
        line === ''
      if (!isDiffLine) continue

      // Stop adding lines once we hit the per-file limit
      if (lineCount >= MAX_LINES_PER_FILE) continue

      // Force a flat string copy so the retained line does not keep the whole
      // parent diff string alive (V8 may represent split() results as slices
      // of the parent). '' + line allocates a new string.
      currentHunk.lines.push('' + line)
      lineCount++
    }

    // Don't forget the last hunk
    if (currentHunk) {
      fileHunks.push(currentHunk)
    }

    if (fileHunks.length > 0) {
      result.set(filePath, fileHunks)
    }
  }

  return result
}
299  
/**
 * Report whether a merge, rebase, cherry-pick, or revert is in progress.
 * Diff calculation is skipped in those states because the working tree
 * contains incoming changes that weren't intentionally made.
 *
 * Detection probes the git dir for the operation's HEAD marker files with
 * fs.access, avoiding process spawns. Resolves to false when no git dir is
 * found.
 */
async function isInTransientGitState(): Promise<boolean> {
  const gitDir = await getGitDir(getCwd())
  if (!gitDir) return false

  // Resolves to true when the marker is accessible; any access error means no.
  const markerExists = async (marker: string): Promise<boolean> => {
    const markerPath = join(gitDir, marker)
    try {
      await access(markerPath)
      return true
    } catch {
      return false
    }
  }

  const markers = ['MERGE_HEAD', 'REBASE_HEAD', 'CHERRY_PICK_HEAD', 'REVERT_HEAD']
  const present = await Promise.all(markers.map(marker => markerExists(marker)))
  return present.includes(true)
}
327  
/**
 * List untracked (and not gitignored) files without reading their contents.
 * Every entry is reported with zero line counts and isUntracked set, so the
 * UI can show the name with a note to stage it.
 *
 * @param maxFiles Maximum number of untracked files to include
 * @returns Entries keyed by path, or null when git fails or nothing is untracked
 */
async function fetchUntrackedFiles(
  maxFiles: number,
): Promise<Map<string, PerFileStats> | null> {
  const listing = await execFileNoThrow(
    gitExe(),
    ['--no-optional-locks', 'ls-files', '--others', '--exclude-standard'],
    { timeout: GIT_TIMEOUT_MS, preserveOutputOnError: false },
  )
  if (listing.code !== 0) return null

  // Blank output collapses to an empty list, which is reported as null.
  const untrackedPaths = listing.stdout.trim().split('\n').filter(Boolean)
  if (untrackedPaths.length === 0) return null

  // Names only - no content reading.
  return new Map(
    untrackedPaths.slice(0, maxFiles).map((filePath): [string, PerFileStats] => [
      filePath,
      { added: 0, removed: 0, isBinary: false, isUntracked: true },
    ]),
  )
}
363  
/**
 * Extract totals from `git diff --shortstat` output.
 * Example input: " 1648 files changed, 52341 insertions(+), 8123 deletions(-)"
 *
 * The insertions and deletions clauses are each optional; a missing clause
 * counts as 0. Returns null when no "N file(s) changed" summary is found.
 * Used as a quick probe before more expensive diff operations.
 */
export function parseShortstat(stdout: string): GitDiffStats | null {
  const summary =
    /(\d+)\s+files?\s+changed(?:,\s+(\d+)\s+insertions?\(\+\))?(?:,\s+(\d+)\s+deletions?\(-\))?/.exec(
      stdout,
    )
  if (summary === null) return null

  const [, files, insertions, deletions] = summary
  const toCount = (digits: string | undefined): number =>
    parseInt(digits ?? '0', 10)

  return {
    filesCount: toCount(files),
    linesAdded: toCount(insertions),
    linesRemoved: toCount(deletions),
  }
}
383  
/** Timeout (ms) for each git command issued while diffing a single file. */
const SINGLE_FILE_DIFF_TIMEOUT_MS = 3_000
385  
/** Structured single-file diff. */
export type ToolUseDiff = {
  /** Path of the file relative to the git root, using '/' separators. */
  filename: string
  /** 'modified' for tracked files, 'added' for untracked ones. */
  status: 'modified' | 'added'
  /** Number of added lines in the patch. */
  additions: number
  /** Number of removed lines in the patch. */
  deletions: number
  /** additions + deletions. */
  changes: number
  /** Hunk text of the diff, starting at the first "@@" header. */
  patch: string
  /** GitHub "owner/repo" when available (null for non-github.com or unknown repos) */
  repository: string | null
}
396  
/**
 * Fetch a structured diff for a single file against the merge base with the
 * default branch. This produces a PR-like diff showing all changes since
 * the branch diverged. Falls back to diffing against HEAD if the merge base
 * cannot be determined (e.g., on the default branch itself).
 * For untracked files, generates a synthetic diff showing all additions.
 *
 * @param absoluteFilePath Absolute path of the file to diff
 * @returns The diff, or null when the file is not inside a git repo, the git
 *   diff fails or is empty, or the untracked file cannot be turned into a
 *   synthetic diff
 */
export async function fetchSingleFileGitDiff(
  absoluteFilePath: string,
): Promise<ToolUseDiff | null> {
  const gitRoot = findGitRoot(dirname(absoluteFilePath))
  if (!gitRoot) return null

  // Repo-relative path with platform separators normalised to '/'.
  const gitPath = relative(gitRoot, absoluteFilePath).split(sep).join('/')
  const repository = getCachedRepository()

  // Check if the file is tracked by git. The '--' separator keeps a path that
  // begins with '-' from being parsed as an option, which would make the
  // command fail and misclassify a tracked file as untracked.
  const { code: lsFilesCode } = await execFileNoThrowWithCwd(
    gitExe(),
    ['--no-optional-locks', 'ls-files', '--error-unmatch', '--', gitPath],
    { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
  )

  if (lsFilesCode === 0) {
    // File is tracked - diff against merge base for PR-like view
    const diffRef = await getDiffRef(gitRoot)
    const { stdout, code } = await execFileNoThrowWithCwd(
      gitExe(),
      ['--no-optional-locks', 'diff', diffRef, '--', gitPath],
      { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
    )
    if (code !== 0) return null
    // Empty output means no difference from the ref; nothing to report.
    if (!stdout) return null
    return {
      ...parseRawDiffToToolUseDiff(gitPath, stdout, 'modified'),
      repository,
    }
  }

  // File is untracked - generate synthetic diff
  const syntheticDiff = await generateSyntheticDiff(gitPath, absoluteFilePath)
  if (!syntheticDiff) return null
  return { ...syntheticDiff, repository }
}
442  
/**
 * Parse raw unified diff output into the structured ToolUseDiff format.
 * The patch is everything from the first "@@" header onward; additions and
 * deletions are counted from the lines inside hunks.
 *
 * Counting is driven by position, not by prefix: inside a hunk every line
 * starting with '+' is an addition and every line starting with '-' is a
 * deletion, including content such as "+++i;" (added "++i;") or
 * "--- note" (removed "-- note"). File header lines are excluded because
 * counting is inactive before the first "@@" and after any later
 * "diff --git" line until the next "@@".
 *
 * @param filename Path to report in the result
 * @param rawDiff Raw `git diff` output for the file
 * @param status Status to report in the result
 * @returns The diff fields without `repository`
 */
function parseRawDiffToToolUseDiff(
  filename: string,
  rawDiff: string,
  status: 'modified' | 'added',
): Omit<ToolUseDiff, 'repository'> {
  const patchLines: string[] = []
  let patchStarted = false // true once the first hunk header has been seen
  let inHunkBody = false // true while lines belong to a hunk, not a file header
  let additions = 0
  let deletions = 0

  for (const line of rawDiff.split('\n')) {
    if (line.startsWith('@@')) {
      patchStarted = true
      inHunkBody = true
    } else if (line.startsWith('diff --git ')) {
      // Hunk content is always prefixed (' ', '+', '-', '\'), so this can
      // only be the start of another file's header.
      inHunkBody = false
    }

    if (!patchStarted) continue
    patchLines.push(line)

    if (!inHunkBody) continue
    if (line.startsWith('+')) {
      additions++
    } else if (line.startsWith('-')) {
      deletions++
    }
  }

  return {
    filename,
    status,
    additions,
    deletions,
    changes: additions + deletions,
    patch: patchLines.join('\n'),
  }
}
482  
/**
 * Pick the ref a PR-like diff should be taken against.
 *
 * The base branch is CLAUDE_CODE_BASE_REF when that env var is set (e.g. by
 * CCR managed containers), otherwise the default branch. The result is the
 * merge base of HEAD and that branch, or 'HEAD' when merge-base fails or
 * prints nothing.
 *
 * @param gitRoot Directory in which git is run
 */
async function getDiffRef(gitRoot: string): Promise<string> {
  const baseBranch =
    process.env.CLAUDE_CODE_BASE_REF || (await getDefaultBranch())

  const mergeBase = await execFileNoThrowWithCwd(
    gitExe(),
    ['--no-optional-locks', 'merge-base', 'HEAD', baseBranch],
    { cwd: gitRoot, timeout: SINGLE_FILE_DIFF_TIMEOUT_MS },
  )

  // Use the printed commit only on success; otherwise fall back to HEAD.
  const commit = mergeBase.code === 0 ? mergeBase.stdout.trim() : ''
  return commit || 'HEAD'
}
503  
/**
 * Build an "all lines added" diff for a file that git does not track.
 *
 * The file is read as UTF-8 and every line is emitted with a '+' prefix under
 * a single `@@ -0,0 +1,N @@` header. A final newline in the file does not
 * count as an extra line.
 *
 * @param gitPath Path to report as the filename
 * @param absoluteFilePath File to read
 * @returns The diff fields without `repository`, or null when the size check
 *   rejects the file or reading it throws
 */
async function generateSyntheticDiff(
  gitPath: string,
  absoluteFilePath: string,
): Promise<Omit<ToolUseDiff, 'repository'> | null> {
  try {
    const withinLimit = isFileWithinReadSizeLimit(
      absoluteFilePath,
      MAX_DIFF_SIZE_BYTES,
    )
    if (!withinLimit) return null

    const text = await readFile(absoluteFilePath, 'utf-8')
    const rows = text.split('\n')
    // split() leaves a trailing '' when the file ends with a newline
    if (rows[rows.length - 1] === '') {
      rows.pop()
    }

    const total = rows.length
    const body = rows.map(row => `+${row}`).join('\n')

    return {
      filename: gitPath,
      status: 'added',
      additions: total,
      deletions: 0,
      changes: total,
      patch: `@@ -0,0 +1,${total} @@\n${body}`,
    }
  } catch {
    return null
  }
}
531    }
532  }