Cradicle Explorer

pidLock.ts
  1  /**
  2   * PID-Based Version Locking
  3   *
  4   * This module provides PID-based locking for running Claude Code versions.
  5   * Unlike mtime-based locking (which can hold locks for 30 days after a crash),
  6   * PID-based locking can immediately detect when a process is no longer running.
  7   *
  8   * Lock files contain JSON with the PID and metadata, and staleness is determined
  9   * by checking if the process is still alive.
 10   */
 11  
 12  import { basename, join } from 'path'
 13  import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
 14  import { logForDebugging } from '../debug.js'
 15  import { isEnvDefinedFalsy, isEnvTruthy } from '../envUtils.js'
 16  import { isENOENT, toError } from '../errors.js'
 17  import { getFsImplementation } from '../fsOperations.js'
 18  import { getProcessCommand } from '../genericProcessUtils.js'
 19  import { logError } from '../log.js'
 20  import {
 21    jsonParse,
 22    jsonStringify,
 23    writeFileSync_DEPRECATED,
 24  } from '../slowOperations.js'
 25  
 26  /**
 27   * Check if PID-based version locking is enabled.
 28   * When disabled, falls back to mtime-based locking (30-day timeout).
 29   *
 30   * Controlled by GrowthBook gate with local override:
 31   * - Set ENABLE_PID_BASED_VERSION_LOCKING=true to force-enable
 32   * - Set ENABLE_PID_BASED_VERSION_LOCKING=false to force-disable
 33   * - If unset, GrowthBook gate (tengu_pid_based_version_locking) controls rollout
 34   */
 35  export function isPidBasedLockingEnabled(): boolean {
 36    const envVar = process.env.ENABLE_PID_BASED_VERSION_LOCKING
 37    // If env var is explicitly set, respect it
 38    if (isEnvTruthy(envVar)) {
 39      return true
 40    }
 41    if (isEnvDefinedFalsy(envVar)) {
 42      return false
 43    }
 44    // GrowthBook controls gradual rollout (returns false for external users)
 45    return getFeatureValue_CACHED_MAY_BE_STALE(
 46      'tengu_pid_based_version_locking',
 47      false,
 48    )
 49  }
 50  
 51  /**
 52   * Content stored in a version lock file
 53   */
 54  export type VersionLockContent = {
 55    pid: number
 56    version: string
 57    execPath: string
 58    acquiredAt: number // timestamp when lock was acquired
 59  }
 60  
 61  /**
 62   * Information about a lock for diagnostic purposes
 63   */
 64  export type LockInfo = {
 65    version: string
 66    pid: number
 67    isProcessRunning: boolean
 68    execPath: string
 69    acquiredAt: Date
 70    lockFilePath: string
 71  }
 72  
 73  // Fallback stale timeout (2 hours) - used when PID check is inconclusive
 74  // This is much shorter than the previous 30-day timeout but still allows
 75  // for edge cases like network filesystems where PID check might fail
 76  const FALLBACK_STALE_MS = 2 * 60 * 60 * 1000
 77  
 78  /**
 79   * Check if a process with the given PID is currently running
 80   * Uses signal 0 which doesn't actually send a signal but checks if we can
 81   */
 82  export function isProcessRunning(pid: number): boolean {
 83    // PID 0 is special - it refers to the current process group, not a real process
 84    // PID 1 is init/systemd and is always running but shouldn't be considered for locks
 85    if (pid <= 1) {
 86      return false
 87    }
 88  
 89    try {
 90      process.kill(pid, 0)
 91      return true
 92    } catch {
 93      return false
 94    }
 95  }
 96  
 97  /**
 98   * Validate that a running process is actually a Claude process
 99   * This helps mitigate PID reuse issues
100   */
101  function isClaudeProcess(pid: number, expectedExecPath: string): boolean {
102    if (!isProcessRunning(pid)) {
103      return false
104    }
105  
106    // If the PID matches our current process, we know it's valid
107    // This handles test environments where the command might not contain 'claude'
108    if (pid === process.pid) {
109      return true
110    }
111  
112    try {
113      const command = getProcessCommand(pid)
114      if (!command) {
115        // If we can't get the command, trust the PID check
116        // This is conservative - we'd rather not delete a running version
117        return true
118      }
119  
120      // Check if the command contains 'claude' or the expected exec path
121      const normalizedCommand = command.toLowerCase()
122      const normalizedExecPath = expectedExecPath.toLowerCase()
123  
124      return (
125        normalizedCommand.includes('claude') ||
126        normalizedCommand.includes(normalizedExecPath)
127      )
128    } catch {
129      // If command check fails, trust the PID check
130      return true
131    }
132  }
133  
/**
 * Read a lock file and parse it into a validated {@link VersionLockContent}.
 *
 * The parsed JSON is treated as untrusted: it must be an object whose `pid` is
 * an integer and whose `version` and `execPath` are non-empty strings.
 * `acquiredAt` is not validated here.
 *
 * @param lockFilePath - Path of the lock file to read.
 * @returns The parsed content, or null if the file is missing, unreadable,
 *   empty, not valid JSON, or fails validation.
 */
export function readLockContent(
  lockFilePath: string,
): VersionLockContent | null {
  const fs = getFsImplementation()

  try {
    const raw = fs.readFileSync(lockFilePath, { encoding: 'utf8' })
    if (!raw || raw.trim() === '') {
      return null
    }

    // Narrow from unknown instead of asserting the type up front, so the
    // checks below are what actually establish the shape.
    const parsed: unknown = jsonParse(raw)
    if (typeof parsed !== 'object' || parsed === null) {
      return null
    }

    const { pid, version, execPath } = parsed as Partial<
      Record<keyof VersionLockContent, unknown>
    >

    const hasValidPid = typeof pid === 'number' && Number.isInteger(pid)
    const hasValidVersion = typeof version === 'string' && version !== ''
    const hasValidExecPath = typeof execPath === 'string' && execPath !== ''
    if (!hasValidPid || !hasValidVersion || !hasValidExecPath) {
      return null
    }

    return parsed as VersionLockContent
  } catch {
    // Any read or parse failure is reported as "no usable lock content".
    return null
  }
}
160  
/**
 * Determine whether a lock file is held by a live Claude process.
 *
 * @param lockFilePath - Path of the lock file to inspect.
 * @returns true when the file parses, its PID is running and the process
 *   looks like Claude; false otherwise.
 */
export function isLockActive(lockFilePath: string): boolean {
  const lock = readLockContent(lockFilePath)
  if (!lock) return false

  const { pid, execPath } = lock

  // Primary check: the recorded PID must be alive.
  if (!isProcessRunning(pid)) return false

  // Secondary check: guard against PID reuse by an unrelated program.
  if (!isClaudeProcess(pid, execPath)) {
    logForDebugging(
      `Lock PID ${pid} is running but does not appear to be Claude - treating as stale`,
    )
    return false
  }

  // Fallback for old lock files (> 2 hours by mtime): probe the PID once more.
  // Note that this only repeats the liveness check; age alone never makes a
  // lock stale.
  const fs = getFsImplementation()
  try {
    const { mtimeMs } = fs.statSync(lockFilePath)
    const lockAgeMs = Date.now() - mtimeMs
    const isOld = lockAgeMs > FALLBACK_STALE_MS
    if (isOld && !isProcessRunning(pid)) {
      return false
    }
  } catch {
    // stat failed: keep trusting the PID checks above.
  }

  return true
}
206  
/**
 * Write lock content by staging it in a temp file and renaming it into place,
 * so readers never observe a partially written lock file.
 *
 * @param lockFilePath - Final path of the lock file.
 * @param content - Lock payload to serialize as pretty-printed JSON.
 * @throws Rethrows any serialization, write, or rename error after a
 *   best-effort removal of the temp file.
 */
function writeLockFile(
  lockFilePath: string,
  content: VersionLockContent,
): void {
  const fs = getFsImplementation()
  // PID + timestamp keep the staging name unique per writer.
  const stagingPath = `${lockFilePath}.tmp.${process.pid}.${Date.now()}`

  try {
    const payload = jsonStringify(content, null, 2)
    writeFileSync_DEPRECATED(stagingPath, payload, {
      encoding: 'utf8',
      flush: true,
    })
    fs.renameSync(stagingPath, lockFilePath)
  } catch (writeError) {
    // Best-effort removal of the staging file; ENOENT is expected when the
    // failure happened before the file was created.
    try {
      fs.unlinkSync(stagingPath)
    } catch {
      // Nothing useful to do if cleanup fails.
    }
    throw writeError
  }
}
233  
/**
 * Attempt to take the PID lock for a version.
 *
 * @param versionPath - Path of the version being locked; its basename is
 *   stored in the lock file and used in log messages.
 * @param lockFilePath - Path of the lock file to create.
 * @returns A release function on success. null if an active lock already
 *   exists (including one held by this process), if another writer's content
 *   is found on read-back, or if writing the lock file fails.
 */
export async function tryAcquireLock(
  versionPath: string,
  lockFilePath: string,
): Promise<(() => void) | null> {
  const fs = getFsImplementation()
  const versionName = basename(versionPath)

  // Same activity check as cleanup uses: PID alive AND recognisably Claude,
  // so a reused PID does not block acquisition.
  if (isLockActive(lockFilePath)) {
    const holder = readLockContent(lockFilePath)
    logForDebugging(
      `Cannot acquire lock for ${versionName} - held by PID ${holder?.pid}`,
    )
    return null
  }

  // Releases the lock, but only if the file still names this process as owner.
  const releaseLock = (): void => {
    try {
      const current = readLockContent(lockFilePath)
      if (current?.pid === process.pid) {
        fs.unlinkSync(lockFilePath)
        logForDebugging(`Released PID lock for ${versionName}`)
      }
    } catch (error) {
      logForDebugging(`Failed to release lock for ${versionName}: ${error}`)
    }
  }

  const ownLock: VersionLockContent = {
    pid: process.pid,
    version: versionName,
    execPath: process.execPath,
    acquiredAt: Date.now(),
  }

  try {
    writeLockFile(lockFilePath, ownLock)

    // Read back what is on disk: if a concurrent writer replaced our file,
    // its PID will be there instead of ours and we back off.
    const onDisk = readLockContent(lockFilePath)
    if (onDisk?.pid !== process.pid) {
      return null
    }

    logForDebugging(`Acquired PID lock for ${versionName} (PID ${process.pid})`)
    return releaseLock
  } catch (error) {
    logForDebugging(`Failed to acquire lock for ${versionName}: ${error}`)
    return null
  }
}
294  
/**
 * Acquire a lock and hold it until the process ends.
 * Used for locking the currently running version.
 *
 * The lock is released on the `exit` event and on SIGINT/SIGTERM.
 *
 * @param versionPath - Path of the version to lock.
 * @param lockFilePath - Path of the lock file to create.
 * @returns true if the lock was acquired, false if it could not be.
 */
export async function acquireProcessLifetimeLock(
  versionPath: string,
  lockFilePath: string,
): Promise<boolean> {
  const release = await tryAcquireLock(versionPath, lockFilePath)

  if (!release) {
    return false
  }

  // Never let a release failure disturb process shutdown.
  const cleanup = (): void => {
    try {
      release()
    } catch {
      // Ignore errors during process exit
    }
  }

  process.on('exit', cleanup)

  for (const signal of ['SIGINT', 'SIGTERM'] as const) {
    process.once(signal, () => {
      cleanup()

      // Installing a listener for these signals disables Node's default
      // "terminate on signal" behaviour. `once` has already detached this
      // listener, so a count of zero means nobody else handles the signal:
      // re-raise it so the process still terminates as it would have without
      // us. When other listeners exist, shutdown is left to them.
      if (process.listenerCount(signal) === 0) {
        process.kill(process.pid, signal)
      }
    })
  }

  // The release function is intentionally not called here: the lock is held
  // until the process exits.
  return true
}
325  
/**
 * Run a callback while holding a version lock, releasing it afterwards
 * whether the callback succeeds or throws.
 *
 * @param versionPath - Path of the version to lock.
 * @param lockFilePath - Path of the lock file to create.
 * @param callback - Work to perform while the lock is held; may be async.
 * @returns true if the callback ran to completion, false if the lock could
 *   not be acquired (the callback is not invoked in that case).
 * @throws Whatever the callback throws, after the lock has been released.
 */
export async function withLock(
  versionPath: string,
  lockFilePath: string,
  callback: () => void | Promise<void>,
): Promise<boolean> {
  const releaseLock = await tryAcquireLock(versionPath, lockFilePath)
  if (!releaseLock) {
    return false
  }

  try {
    await callback()
  } finally {
    // Always hand the lock back, even when the callback failed.
    releaseLock()
  }

  return true
}
348  
/**
 * Collect diagnostic information for every readable `.lock` file in a directory.
 *
 * @param locksDir - Directory containing the lock files.
 * @returns One entry per lock file whose content parses successfully. Empty
 *   if the directory does not exist; other errors are logged and whatever was
 *   collected so far is returned.
 */
export function getAllLockInfo(locksDir: string): LockInfo[] {
  const fs = getFsImplementation()
  const collected: LockInfo[] = []

  try {
    const lockNames = fs
      .readdirStringSync(locksDir)
      .filter((name: string) => name.endsWith('.lock'))

    for (const lockName of lockNames) {
      const lockFilePath = join(locksDir, lockName)
      const lock = readLockContent(lockFilePath)

      // Unreadable or invalid lock files are simply left out of the report.
      if (!lock) continue

      const { version, pid, execPath, acquiredAt } = lock
      collected.push({
        version,
        pid,
        isProcessRunning: isProcessRunning(pid),
        execPath,
        acquiredAt: new Date(acquiredAt),
        lockFilePath,
      })
    }
  } catch (error) {
    // A missing locks directory just means there is nothing to report.
    if (!isENOENT(error)) {
      logError(toError(error))
    }
  }

  return collected
}
385  
/**
 * Remove stale `.lock` entries from a locks directory.
 *
 * Two kinds of entries are handled:
 * - PID-based lock files (JSON with a PID): removed when the lock is not active.
 * - Legacy proper-lockfile locks (directories from mtime-based locking):
 *   always removed.
 *
 * @param locksDir - Directory containing the lock entries.
 * @returns Number of entries removed. 0 if the directory does not exist.
 */
export function cleanupStaleLocks(locksDir: string): number {
  const fs = getFsImplementation()
  let removed = 0

  try {
    const candidates = fs
      .readdirStringSync(locksDir)
      .filter((name: string) => name.endsWith('.lock'))

    for (const entryName of candidates) {
      const entryPath = join(locksDir, entryName)

      try {
        if (fs.lstatSync(entryPath).isDirectory()) {
          // Directory locks come from the older locking mechanism and are
          // removed unconditionally.
          fs.rmSync(entryPath, { recursive: true, force: true })
          removed += 1
          logForDebugging(`Cleaned up legacy directory lock: ${entryName}`)
          continue
        }

        // File lock: keep it while its owner is still alive.
        if (isLockActive(entryPath)) continue

        fs.unlinkSync(entryPath)
        removed += 1
        logForDebugging(`Cleaned up stale lock: ${entryName}`)
      } catch {
        // A failure on one entry must not stop the rest of the sweep.
      }
    }
  } catch (error) {
    if (isENOENT(error)) {
      return 0
    }
    logError(toError(error))
  }

  return removed
}
433  }