// utils/heapDumpService.ts
  1  /**
  2   * Service for heap dump capture.
  3   * Used by the /heapdump command.
  4   */
  5  
  6  import { createWriteStream, writeFileSync } from 'fs'
  7  import { readdir, readFile, writeFile } from 'fs/promises'
  8  import { join } from 'path'
  9  import { pipeline } from 'stream/promises'
 10  import {
 11    getHeapSnapshot,
 12    getHeapSpaceStatistics,
 13    getHeapStatistics,
 14    type HeapSpaceInfo,
 15  } from 'v8'
 16  import { getSessionId } from '../bootstrap/state.js'
 17  import { logEvent } from '../services/analytics/index.js'
 18  import { logForDebugging } from './debug.js'
 19  import { toError } from './errors.js'
 20  import { getDesktopPath } from './file.js'
 21  import { getFsImplementation } from './fsOperations.js'
 22  import { logError } from './log.js'
 23  import { jsonStringify } from './slowOperations.js'
 24  
/**
 * Result of a heap dump attempt.
 * On success, `heapPath`/`diagPath` point at the files written to disk;
 * on failure, `error` carries the failure message.
 */
export type HeapDumpResult = {
  success: boolean
  heapPath?: string // Path to the .heapsnapshot file (set on success)
  diagPath?: string // Path to the diagnostics JSON file (set on success)
  error?: string // Failure message (set when success is false)
}
 31  
/**
 * Memory diagnostics captured alongside heap dump.
 * Helps identify if leak is in V8 heap (captured in snapshot) or native memory (not captured).
 */
export type MemoryDiagnostics = {
  timestamp: string // ISO-8601 capture time
  sessionId: string // Current session id (also used as the dump filename stem)
  trigger: 'manual' | 'auto-1.5GB' // What initiated the dump
  dumpNumber: number // 1st, 2nd, etc. auto dump in this session (0 for manual)
  uptimeSeconds: number // process.uptime() at capture time
  memoryUsage: {
    // Raw process.memoryUsage() values, all in bytes
    heapUsed: number
    heapTotal: number
    external: number
    arrayBuffers: number
    rss: number
  }
  memoryGrowthRate: {
    // Approximation: RSS averaged over total uptime (rss / uptime),
    // not an instantaneous growth rate
    bytesPerSecond: number
    mbPerHour: number
  }
  v8HeapStats: {
    heapSizeLimit: number // Max heap size allowed
    mallocedMemory: number // Memory allocated outside V8 heap
    peakMallocedMemory: number // Peak native memory
    detachedContexts: number // Leaked contexts - key leak indicator!
    nativeContexts: number // Active contexts
  }
  // Per-space stats; absent on runtimes without getHeapSpaceStatistics (e.g. Bun)
  v8HeapSpaces?: Array<{
    name: string
    size: number
    used: number
    available: number
  }>
  resourceUsage: {
    maxRSS: number // Peak RSS in bytes
    userCPUTime: number
    systemCPUTime: number
  }
  activeHandles: number // Leaked timers, sockets, file handles
  activeRequests: number // Pending async operations
  openFileDescriptors?: number // Linux/macOS - indicates resource leaks
  analysis: {
    potentialLeaks: string[] // Human-readable heuristic leak indicators
    recommendation: string // Summary line derived from potentialLeaks
  }
  smapsRollup?: string // Linux only - detailed memory breakdown
  platform: string // process.platform
  nodeVersion: string // process.version
  ccVersion: string // Build version baked in at compile time
}
 83  
 84  /**
 85   * Capture memory diagnostics.
 86   * This helps identify if the leak is in V8 heap (captured) or native memory (not captured).
 87   */
 88  export async function captureMemoryDiagnostics(
 89    trigger: 'manual' | 'auto-1.5GB',
 90    dumpNumber = 0,
 91  ): Promise<MemoryDiagnostics> {
 92    const usage = process.memoryUsage()
 93    const heapStats = getHeapStatistics()
 94    const resourceUsage = process.resourceUsage()
 95    const uptimeSeconds = process.uptime()
 96  
 97    // getHeapSpaceStatistics() is not available in Bun
 98    let heapSpaceStats: HeapSpaceInfo[] | undefined
 99    try {
100      heapSpaceStats = getHeapSpaceStatistics()
101    } catch {
102      // Not available in Bun runtime
103    }
104  
105    // Get active handles/requests count (these are internal APIs but stable)
106    const activeHandles = (
107      process as unknown as { _getActiveHandles: () => unknown[] }
108    )._getActiveHandles().length
109    const activeRequests = (
110      process as unknown as { _getActiveRequests: () => unknown[] }
111    )._getActiveRequests().length
112  
113    // Try to count open file descriptors (Linux/macOS)
114    let openFileDescriptors: number | undefined
115    try {
116      openFileDescriptors = (await readdir('/proc/self/fd')).length
117    } catch {
118      // Not on Linux - try macOS approach would require lsof, skip for now
119    }
120  
121    // Try to read Linux smaps_rollup for detailed memory breakdown
122    let smapsRollup: string | undefined
123    try {
124      smapsRollup = await readFile('/proc/self/smaps_rollup', 'utf8')
125    } catch {
126      // Not on Linux or no access - this is fine
127    }
128  
129    // Calculate native memory (RSS - heap) and growth rate
130    const nativeMemory = usage.rss - usage.heapUsed
131    const bytesPerSecond = uptimeSeconds > 0 ? usage.rss / uptimeSeconds : 0
132    const mbPerHour = (bytesPerSecond * 3600) / (1024 * 1024)
133  
134    // Identify potential leaks
135    const potentialLeaks: string[] = []
136    if (heapStats.number_of_detached_contexts > 0) {
137      potentialLeaks.push(
138        `${heapStats.number_of_detached_contexts} detached context(s) - possible iframe/context leak`,
139      )
140    }
141    if (activeHandles > 100) {
142      potentialLeaks.push(
143        `${activeHandles} active handles - possible timer/socket leak`,
144      )
145    }
146    if (nativeMemory > usage.heapUsed) {
147      potentialLeaks.push(
148        'Native memory > heap - leak may be in native addons (node-pty, sharp, etc.)',
149      )
150    }
151    if (mbPerHour > 100) {
152      potentialLeaks.push(
153        `High memory growth rate: ${mbPerHour.toFixed(1)} MB/hour`,
154      )
155    }
156    if (openFileDescriptors && openFileDescriptors > 500) {
157      potentialLeaks.push(
158        `${openFileDescriptors} open file descriptors - possible file/socket leak`,
159      )
160    }
161  
162    return {
163      timestamp: new Date().toISOString(),
164      sessionId: getSessionId(),
165      trigger,
166      dumpNumber,
167      uptimeSeconds,
168      memoryUsage: {
169        heapUsed: usage.heapUsed,
170        heapTotal: usage.heapTotal,
171        external: usage.external,
172        arrayBuffers: usage.arrayBuffers,
173        rss: usage.rss,
174      },
175      memoryGrowthRate: {
176        bytesPerSecond,
177        mbPerHour,
178      },
179      v8HeapStats: {
180        heapSizeLimit: heapStats.heap_size_limit,
181        mallocedMemory: heapStats.malloced_memory,
182        peakMallocedMemory: heapStats.peak_malloced_memory,
183        detachedContexts: heapStats.number_of_detached_contexts,
184        nativeContexts: heapStats.number_of_native_contexts,
185      },
186      v8HeapSpaces: heapSpaceStats?.map(space => ({
187        name: space.space_name,
188        size: space.space_size,
189        used: space.space_used_size,
190        available: space.space_available_size,
191      })),
192      resourceUsage: {
193        maxRSS: resourceUsage.maxRSS * 1024, // Convert KB to bytes
194        userCPUTime: resourceUsage.userCPUTime,
195        systemCPUTime: resourceUsage.systemCPUTime,
196      },
197      activeHandles,
198      activeRequests,
199      openFileDescriptors,
200      analysis: {
201        potentialLeaks,
202        recommendation:
203          potentialLeaks.length > 0
204            ? `WARNING: ${potentialLeaks.length} potential leak indicator(s) found. See potentialLeaks array.`
205            : 'No obvious leak indicators. Check heap snapshot for retained objects.',
206      },
207      smapsRollup,
208      platform: process.platform,
209      nodeVersion: process.version,
210      ccVersion: MACRO.VERSION,
211    }
212  }
213  
214  /**
215   * Core heap dump function — captures heap snapshot + diagnostics to ~/Desktop.
216   *
217   * Diagnostics are written BEFORE the heap snapshot is captured, because the
218   * V8 heap snapshot serialization can crash for very large heaps. By writing
219   * diagnostics first, we still get useful memory info even if the snapshot fails.
220   */
221  export async function performHeapDump(
222    trigger: 'manual' | 'auto-1.5GB' = 'manual',
223    dumpNumber = 0,
224  ): Promise<HeapDumpResult> {
225    try {
226      const sessionId = getSessionId()
227  
228      // Capture diagnostics before any other async I/O —
229      // the heap dump itself allocates memory and would skew the numbers.
230      const diagnostics = await captureMemoryDiagnostics(trigger, dumpNumber)
231  
232      const toGB = (bytes: number): string =>
233        (bytes / 1024 / 1024 / 1024).toFixed(3)
234      logForDebugging(`[HeapDump] Memory state:
235    heapUsed: ${toGB(diagnostics.memoryUsage.heapUsed)} GB (in snapshot)
236    external: ${toGB(diagnostics.memoryUsage.external)} GB (NOT in snapshot)
237    rss: ${toGB(diagnostics.memoryUsage.rss)} GB (total process)
238    ${diagnostics.analysis.recommendation}`)
239  
240      const dumpDir = getDesktopPath()
241      await getFsImplementation().mkdir(dumpDir)
242  
243      const suffix = dumpNumber > 0 ? `-dump${dumpNumber}` : ''
244      const heapFilename = `${sessionId}${suffix}.heapsnapshot`
245      const diagFilename = `${sessionId}${suffix}-diagnostics.json`
246      const heapPath = join(dumpDir, heapFilename)
247      const diagPath = join(dumpDir, diagFilename)
248  
249      // Write diagnostics first (cheap, unlikely to fail)
250      await writeFile(diagPath, jsonStringify(diagnostics, null, 2), {
251        mode: 0o600,
252      })
253      logForDebugging(`[HeapDump] Diagnostics written to ${diagPath}`)
254  
255      // Write heap snapshot (this can crash for very large heaps)
256      await writeHeapSnapshot(heapPath)
257      logForDebugging(`[HeapDump] Heap dump written to ${heapPath}`)
258  
259      logEvent('tengu_heap_dump', {
260        triggerManual: trigger === 'manual',
261        triggerAuto15GB: trigger === 'auto-1.5GB',
262        dumpNumber,
263        success: true,
264      })
265  
266      return { success: true, heapPath, diagPath }
267    } catch (err) {
268      const error = toError(err)
269      logError(error)
270      logEvent('tengu_heap_dump', {
271        triggerManual: trigger === 'manual',
272        triggerAuto15GB: trigger === 'auto-1.5GB',
273        dumpNumber,
274        success: false,
275      })
276      return { success: false, error: error.message }
277    }
278  }
279  
/**
 * Write heap snapshot to a file.
 * Uses pipeline() which handles stream cleanup automatically on errors.
 *
 * Takes a Bun-specific fast path when running under Bun, since Bun's heap
 * snapshots are not streamed (see comment inside).
 *
 * @param filepath Destination path; the file is created with mode 0600.
 */
async function writeHeapSnapshot(filepath: string): Promise<void> {
  if (typeof Bun !== 'undefined') {
    // In Bun, heapsnapshots are currently not streaming.
    // Use synchronous I/O despite potentially large filesize so that we avoid cloning the string for cross-thread usage.
    //
    /* eslint-disable custom-rules/no-sync-fs -- intentionally sync to avoid cloning large heap snapshot string for cross-thread usage */
    // @ts-expect-error 2nd argument is in the next version of Bun
    writeFileSync(filepath, Bun.generateHeapSnapshot('v8', 'arraybuffer'), {
      mode: 0o600,
    })
    /* eslint-enable custom-rules/no-sync-fs */

    // Force GC to try to free that heap snapshot sooner.
    Bun.gc(true)
    return
  }
  // Node path: V8 exposes the snapshot as a readable stream, so pipe it to
  // disk without materializing the whole snapshot in memory.
  const writeStream = createWriteStream(filepath, { mode: 0o600 })
  const heapSnapshotStream = getHeapSnapshot()
  await pipeline(heapSnapshotStream, writeStream)
}