/ utils / telemetry / perfettoTracing.ts
perfettoTracing.ts
   1  /**
   2   * Perfetto Tracing for Claude Code (Ant-only)
   3   *
   4   * This module generates traces in the Chrome Trace Event format that can be
   5   * viewed in ui.perfetto.dev or Chrome's chrome://tracing.
   6   *
   7   * NOTE: This feature is ant-only and eliminated from external builds.
   8   *
   9   * The trace file includes:
  10   * - Agent hierarchy (parent-child relationships in a swarm)
  11   * - API requests with TTFT, TTLT, prompt length, cache stats, msg ID, speculative flag
  12   * - Tool executions with name, duration, and token usage
  13   * - User input waiting time
  14   *
  15   * Usage:
  16   * 1. Enable via CLAUDE_CODE_PERFETTO_TRACE=1 or CLAUDE_CODE_PERFETTO_TRACE=<path>
  17   * 2. Optionally set CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S=<positive integer> to write the
  18   *    trace file periodically (default: write only on exit).
  19   * 3. Run Claude Code normally
  20   * 4. Trace file is written to ~/.claude/traces/trace-<session-id>.json
  21   *    or to the specified path
  22   * 5. Open in ui.perfetto.dev to visualize
  23   */
  24  
  25  import { feature } from 'bun:bundle'
  26  import { mkdirSync, writeFileSync } from 'fs'
  27  import { mkdir, writeFile } from 'fs/promises'
  28  import { dirname, join } from 'path'
  29  import { getSessionId } from '../../bootstrap/state.js'
  30  import { registerCleanup } from '../cleanupRegistry.js'
  31  import { logForDebugging } from '../debug.js'
  32  import {
  33    getClaudeConfigHomeDir,
  34    isEnvDefinedFalsy,
  35    isEnvTruthy,
  36  } from '../envUtils.js'
  37  import { errorMessage } from '../errors.js'
  38  import { djb2Hash } from '../hash.js'
  39  import { jsonStringify } from '../slowOperations.js'
  40  import { getAgentId, getAgentName, getParentSessionId } from '../teammate.js'
  41  
  42  /**
  43   * Chrome Trace Event format types
  44   * See: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU
  45   */
  46  
  47  export type TraceEventPhase =
  48    | 'B' // Begin duration event
  49    | 'E' // End duration event
  50    | 'X' // Complete event (with duration)
  51    | 'i' // Instant event
  52    | 'C' // Counter event
  53    | 'b' // Async begin
  54    | 'n' // Async instant
  55    | 'e' // Async end
  56    | 'M' // Metadata event
  57  
  58  export type TraceEvent = {
  59    name: string
  60    cat: string
  61    ph: TraceEventPhase
  62    ts: number // Timestamp in microseconds
  63    pid: number // Process ID (we use 1 for main, agent IDs for subagents)
  64    tid: number // Thread ID (we use numeric hash of agent name or 1 for main)
  65    dur?: number // Duration in microseconds (for 'X' events)
  66    args?: Record<string, unknown>
  67    id?: string // For async events
  68    scope?: string
  69  }
  70  
  71  /**
  72   * Agent info for tracking hierarchy
  73   */
  74  type AgentInfo = {
  75    agentId: string
  76    agentName: string
  77    parentAgentId?: string
  78    processId: number
  79    threadId: number
  80  }
  81  
  82  /**
  83   * Pending span for tracking begin/end pairs
  84   */
  85  type PendingSpan = {
  86    name: string
  87    category: string
  88    startTime: number
  89    agentInfo: AgentInfo
  90    args: Record<string, unknown>
  91  }
  92  
  93  // Global state for the Perfetto tracer
  94  let isEnabled = false
  95  let tracePath: string | null = null
  96  // Metadata events (ph: 'M' — process/thread names, parent links) are kept
  97  // separate so they survive eviction — Perfetto UI needs them to label
  98  // tracks. Bounded by agent count (~3 events per agent).
  99  const metadataEvents: TraceEvent[] = []
 100  const events: TraceEvent[] = []
 101  // events[] cap. Cron-driven sessions run for days; 22 push sites × many
 102  // turns would otherwise grow unboundedly (periodicWrite flushes to disk but
 103  // does not truncate — it writes the full snapshot). At ~300B/event this is
 104  // ~30MB, enough trace history for any debugging session. Eviction drops the
 105  // oldest half when hit, amortized O(1).
 106  const MAX_EVENTS = 100_000
 107  const pendingSpans = new Map<string, PendingSpan>()
 108  const agentRegistry = new Map<string, AgentInfo>()
 109  let totalAgentCount = 0
 110  let startTimeMs = 0
 111  let spanIdCounter = 0
 112  let traceWritten = false // Flag to avoid double writes
 113  
 114  // Map agent IDs to numeric process IDs (Perfetto requires numeric IDs)
 115  let processIdCounter = 1
 116  const agentIdToProcessId = new Map<string, number>()
 117  
 118  // Periodic write interval handle
 119  let writeIntervalId: ReturnType<typeof setInterval> | null = null
 120  
 121  const STALE_SPAN_TTL_MS = 30 * 60 * 1000 // 30 minutes
 122  const STALE_SPAN_CLEANUP_INTERVAL_MS = 60 * 1000 // 1 minute
 123  let staleSpanCleanupId: ReturnType<typeof setInterval> | null = null
 124  
 125  /**
 126   * Convert a string to a numeric hash for use as thread ID
 127   */
 128  function stringToNumericHash(str: string): number {
 129    return Math.abs(djb2Hash(str)) || 1 // Ensure non-zero
 130  }
 131  
 132  /**
 133   * Get or create a numeric process ID for an agent
 134   */
 135  function getProcessIdForAgent(agentId: string): number {
 136    const existing = agentIdToProcessId.get(agentId)
 137    if (existing !== undefined) return existing
 138  
 139    processIdCounter++
 140    agentIdToProcessId.set(agentId, processIdCounter)
 141    return processIdCounter
 142  }
 143  
 144  /**
 145   * Get current agent info
 146   */
 147  function getCurrentAgentInfo(): AgentInfo {
 148    const agentId = getAgentId() ?? getSessionId()
 149    const agentName = getAgentName() ?? 'main'
 150    const parentSessionId = getParentSessionId()
 151  
 152    // Check if we've already registered this agent
 153    const existing = agentRegistry.get(agentId)
 154    if (existing) return existing
 155  
 156    const info: AgentInfo = {
 157      agentId,
 158      agentName,
 159      parentAgentId: parentSessionId,
 160      processId: agentId === getSessionId() ? 1 : getProcessIdForAgent(agentId),
 161      threadId: stringToNumericHash(agentName),
 162    }
 163  
 164    agentRegistry.set(agentId, info)
 165    totalAgentCount++
 166    return info
 167  }
 168  
 169  /**
 170   * Get timestamp in microseconds relative to trace start
 171   */
 172  function getTimestamp(): number {
 173    return (Date.now() - startTimeMs) * 1000
 174  }
 175  
 176  /**
 177   * Generate a unique span ID
 178   */
 179  function generateSpanId(): string {
 180    return `span_${++spanIdCounter}`
 181  }
 182  
 183  /**
 184   * Evict pending spans older than STALE_SPAN_TTL_MS.
 185   * Mirrors the TTL cleanup pattern in sessionTracing.ts.
 186   */
 187  function evictStaleSpans(): void {
 188    const now = getTimestamp()
 189    const ttlUs = STALE_SPAN_TTL_MS * 1000 // Convert ms to microseconds
 190    for (const [spanId, span] of pendingSpans) {
 191      if (now - span.startTime > ttlUs) {
 192        // Emit an end event so the span shows up in the trace as incomplete
 193        events.push({
 194          name: span.name,
 195          cat: span.category,
 196          ph: 'E',
 197          ts: now,
 198          pid: span.agentInfo.processId,
 199          tid: span.agentInfo.threadId,
 200          args: {
 201            ...span.args,
 202            evicted: true,
 203            duration_ms: (now - span.startTime) / 1000,
 204          },
 205        })
 206        pendingSpans.delete(spanId)
 207      }
 208    }
 209  }
 210  
 211  /**
 212   * Build the full trace document (Chrome Trace JSON format).
 213   */
 214  function buildTraceDocument(): string {
 215    return jsonStringify({
 216      traceEvents: [...metadataEvents, ...events],
 217      metadata: {
 218        session_id: getSessionId(),
 219        trace_start_time: new Date(startTimeMs).toISOString(),
 220        agent_count: totalAgentCount,
 221        total_event_count: metadataEvents.length + events.length,
 222      },
 223    })
 224  }
 225  
 226  /**
 227   * Drop the oldest half of events[] when over MAX_EVENTS. Called from the
 228   * stale-span cleanup interval (60s). The half-batch splice keeps this
 229   * amortized O(1) — we don't pay splice cost per-push. A synthetic marker
 230   * is inserted so the gap is visible in ui.perfetto.dev.
 231   */
 232  function evictOldestEvents(): void {
 233    if (events.length < MAX_EVENTS) return
 234    const dropped = events.splice(0, MAX_EVENTS / 2)
 235    events.unshift({
 236      name: 'trace_truncated',
 237      cat: '__metadata',
 238      ph: 'i',
 239      ts: dropped[dropped.length - 1]?.ts ?? 0,
 240      pid: 1,
 241      tid: 0,
 242      args: { dropped_events: dropped.length },
 243    })
 244    logForDebugging(
 245      `[Perfetto] Evicted ${dropped.length} oldest events (cap ${MAX_EVENTS})`,
 246    )
 247  }
 248  
 249  /**
 250   * Initialize Perfetto tracing
 251   * Call this early in the application lifecycle
 252   */
 253  export function initializePerfettoTracing(): void {
 254    const envValue = process.env.CLAUDE_CODE_PERFETTO_TRACE
 255    logForDebugging(
 256      `[Perfetto] initializePerfettoTracing called, env value: ${envValue}`,
 257    )
 258  
 259    // Wrap in feature() for dead code elimination - entire block removed from external builds
 260    if (feature('PERFETTO_TRACING')) {
 261      if (!envValue || isEnvDefinedFalsy(envValue)) {
 262        logForDebugging(
 263          '[Perfetto] Tracing disabled (env var not set or disabled)',
 264        )
 265        return
 266      }
 267  
 268      isEnabled = true
 269      startTimeMs = Date.now()
 270  
 271      // Determine trace file path
 272      if (isEnvTruthy(envValue)) {
 273        const tracesDir = join(getClaudeConfigHomeDir(), 'traces')
 274        tracePath = join(tracesDir, `trace-${getSessionId()}.json`)
 275      } else {
 276        // Use the provided path
 277        tracePath = envValue
 278      }
 279  
 280      logForDebugging(
 281        `[Perfetto] Tracing enabled, will write to: ${tracePath}, isEnabled=${isEnabled}`,
 282      )
 283  
 284      // Start periodic full-trace write if CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S is a positive integer
 285      const intervalSec = parseInt(
 286        process.env.CLAUDE_CODE_PERFETTO_WRITE_INTERVAL_S ?? '',
 287        10,
 288      )
 289      if (intervalSec > 0) {
 290        writeIntervalId = setInterval(() => {
 291          void periodicWrite()
 292        }, intervalSec * 1000)
 293        // Don't let the interval keep the process alive on its own
 294        if (writeIntervalId.unref) writeIntervalId.unref()
 295        logForDebugging(
 296          `[Perfetto] Periodic write enabled, interval: ${intervalSec}s`,
 297        )
 298      }
 299  
 300      // Start stale span cleanup interval
 301      staleSpanCleanupId = setInterval(() => {
 302        evictStaleSpans()
 303        evictOldestEvents()
 304      }, STALE_SPAN_CLEANUP_INTERVAL_MS)
 305      if (staleSpanCleanupId.unref) staleSpanCleanupId.unref()
 306  
 307      // Register cleanup to write final trace on exit
 308      registerCleanup(async () => {
 309        logForDebugging('[Perfetto] Cleanup callback invoked')
 310        await writePerfettoTrace()
 311      })
 312  
 313      // Also register a beforeExit handler as a fallback
 314      // This ensures the trace is written even if cleanup registry is not called
 315      process.on('beforeExit', () => {
 316        logForDebugging('[Perfetto] beforeExit handler invoked')
 317        void writePerfettoTrace()
 318      })
 319  
 320      // Register a synchronous exit handler as a last resort
 321      // This is the final fallback to ensure trace is written before process exits
 322      process.on('exit', () => {
 323        if (!traceWritten) {
 324          logForDebugging(
 325            '[Perfetto] exit handler invoked, writing trace synchronously',
 326          )
 327          writePerfettoTraceSync()
 328        }
 329      })
 330  
 331      // Emit process metadata events for main process
 332      const mainAgent = getCurrentAgentInfo()
 333      emitProcessMetadata(mainAgent)
 334    }
 335  }
 336  
 337  /**
 338   * Emit metadata events for a process/agent
 339   */
 340  function emitProcessMetadata(agentInfo: AgentInfo): void {
 341    if (!isEnabled) return
 342  
 343    // Process name
 344    metadataEvents.push({
 345      name: 'process_name',
 346      cat: '__metadata',
 347      ph: 'M',
 348      ts: 0,
 349      pid: agentInfo.processId,
 350      tid: 0,
 351      args: { name: agentInfo.agentName },
 352    })
 353  
 354    // Thread name (same as process for now)
 355    metadataEvents.push({
 356      name: 'thread_name',
 357      cat: '__metadata',
 358      ph: 'M',
 359      ts: 0,
 360      pid: agentInfo.processId,
 361      tid: agentInfo.threadId,
 362      args: { name: agentInfo.agentName },
 363    })
 364  
 365    // Add parent info if available
 366    if (agentInfo.parentAgentId) {
 367      metadataEvents.push({
 368        name: 'parent_agent',
 369        cat: '__metadata',
 370        ph: 'M',
 371        ts: 0,
 372        pid: agentInfo.processId,
 373        tid: 0,
 374        args: {
 375          parent_agent_id: agentInfo.parentAgentId,
 376        },
 377      })
 378    }
 379  }
 380  
 381  /**
 382   * Check if Perfetto tracing is enabled
 383   */
 384  export function isPerfettoTracingEnabled(): boolean {
 385    return isEnabled
 386  }
 387  
 388  /**
 389   * Register a new agent in the trace
 390   * Call this when a subagent/teammate is spawned
 391   */
 392  export function registerAgent(
 393    agentId: string,
 394    agentName: string,
 395    parentAgentId?: string,
 396  ): void {
 397    if (!isEnabled) return
 398  
 399    const info: AgentInfo = {
 400      agentId,
 401      agentName,
 402      parentAgentId,
 403      processId: getProcessIdForAgent(agentId),
 404      threadId: stringToNumericHash(agentName),
 405    }
 406  
 407    agentRegistry.set(agentId, info)
 408    totalAgentCount++
 409    emitProcessMetadata(info)
 410  }
 411  
 412  /**
 413   * Unregister an agent from the trace.
 414   * Call this when an agent completes, fails, or is aborted to free memory.
 415   */
 416  export function unregisterAgent(agentId: string): void {
 417    if (!isEnabled) return
 418    agentRegistry.delete(agentId)
 419    agentIdToProcessId.delete(agentId)
 420  }
 421  
 422  /**
 423   * Start an API call span
 424   */
 425  export function startLLMRequestPerfettoSpan(args: {
 426    model: string
 427    promptTokens?: number
 428    messageId?: string
 429    isSpeculative?: boolean
 430    querySource?: string
 431  }): string {
 432    if (!isEnabled) return ''
 433  
 434    const spanId = generateSpanId()
 435    const agentInfo = getCurrentAgentInfo()
 436  
 437    pendingSpans.set(spanId, {
 438      name: 'API Call',
 439      category: 'api',
 440      startTime: getTimestamp(),
 441      agentInfo,
 442      args: {
 443        model: args.model,
 444        prompt_tokens: args.promptTokens,
 445        message_id: args.messageId,
 446        is_speculative: args.isSpeculative ?? false,
 447        query_source: args.querySource,
 448      },
 449    })
 450  
 451    // Emit begin event
 452    events.push({
 453      name: 'API Call',
 454      cat: 'api',
 455      ph: 'B',
 456      ts: pendingSpans.get(spanId)!.startTime,
 457      pid: agentInfo.processId,
 458      tid: agentInfo.threadId,
 459      args: pendingSpans.get(spanId)!.args,
 460    })
 461  
 462    return spanId
 463  }
 464  
 465  /**
 466   * End an API call span with response metadata
 467   */
 468  export function endLLMRequestPerfettoSpan(
 469    spanId: string,
 470    metadata: {
 471      ttftMs?: number
 472      ttltMs?: number
 473      promptTokens?: number
 474      outputTokens?: number
 475      cacheReadTokens?: number
 476      cacheCreationTokens?: number
 477      messageId?: string
 478      success?: boolean
 479      error?: string
 480      /** Time spent in pre-request setup (client creation, retries) before the successful attempt */
 481      requestSetupMs?: number
 482      /** Timestamps (Date.now()) of each attempt start — used to emit retry sub-spans */
 483      attemptStartTimes?: number[]
 484    },
 485  ): void {
 486    if (!isEnabled || !spanId) return
 487  
 488    const pending = pendingSpans.get(spanId)
 489    if (!pending) return
 490  
 491    const endTime = getTimestamp()
 492    const duration = endTime - pending.startTime
 493  
 494    const promptTokens =
 495      metadata.promptTokens ?? (pending.args.prompt_tokens as number | undefined)
 496    const ttftMs = metadata.ttftMs
 497    const ttltMs = metadata.ttltMs
 498    const outputTokens = metadata.outputTokens
 499    const cacheReadTokens = metadata.cacheReadTokens
 500  
 501    // Compute derived metrics
 502    // ITPS: input tokens per second (prompt processing speed)
 503    const itps =
 504      ttftMs !== undefined && promptTokens !== undefined && ttftMs > 0
 505        ? Math.round((promptTokens / (ttftMs / 1000)) * 100) / 100
 506        : undefined
 507  
 508    // OTPS: output tokens per second (sampling speed)
 509    const samplingMs =
 510      ttltMs !== undefined && ttftMs !== undefined ? ttltMs - ttftMs : undefined
 511    const otps =
 512      samplingMs !== undefined && outputTokens !== undefined && samplingMs > 0
 513        ? Math.round((outputTokens / (samplingMs / 1000)) * 100) / 100
 514        : undefined
 515  
 516    // Cache hit rate: percentage of prompt tokens from cache
 517    const cacheHitRate =
 518      cacheReadTokens !== undefined &&
 519      promptTokens !== undefined &&
 520      promptTokens > 0
 521        ? Math.round((cacheReadTokens / promptTokens) * 10000) / 100
 522        : undefined
 523  
 524    const requestSetupMs = metadata.requestSetupMs
 525    const attemptStartTimes = metadata.attemptStartTimes
 526  
 527    // Merge metadata with original args
 528    const args = {
 529      ...pending.args,
 530      ttft_ms: ttftMs,
 531      ttlt_ms: ttltMs,
 532      prompt_tokens: promptTokens,
 533      output_tokens: outputTokens,
 534      cache_read_tokens: cacheReadTokens,
 535      cache_creation_tokens: metadata.cacheCreationTokens,
 536      message_id: metadata.messageId ?? pending.args.message_id,
 537      success: metadata.success ?? true,
 538      error: metadata.error,
 539      duration_ms: duration / 1000,
 540      request_setup_ms: requestSetupMs,
 541      // Derived metrics
 542      itps,
 543      otps,
 544      cache_hit_rate_pct: cacheHitRate,
 545    }
 546  
 547    // Emit Request Setup sub-span when there was measurable setup time
 548    // (client creation, param building, retries before the successful attempt)
 549    const setupUs =
 550      requestSetupMs !== undefined && requestSetupMs > 0
 551        ? requestSetupMs * 1000
 552        : 0
 553    if (setupUs > 0) {
 554      const setupEndTs = pending.startTime + setupUs
 555  
 556      events.push({
 557        name: 'Request Setup',
 558        cat: 'api,setup',
 559        ph: 'B',
 560        ts: pending.startTime,
 561        pid: pending.agentInfo.processId,
 562        tid: pending.agentInfo.threadId,
 563        args: {
 564          request_setup_ms: requestSetupMs,
 565          attempt_count: attemptStartTimes?.length ?? 1,
 566        },
 567      })
 568  
 569      // Emit retry attempt sub-spans within Request Setup.
 570      // Each failed attempt runs from its start to the next attempt's start.
 571      if (attemptStartTimes && attemptStartTimes.length > 1) {
 572        // attemptStartTimes[0] is the reference point (first attempt).
 573        // Convert wall-clock deltas into Perfetto-relative microseconds.
 574        const baseWallMs = attemptStartTimes[0]!
 575        for (let i = 0; i < attemptStartTimes.length - 1; i++) {
 576          const attemptStartUs =
 577            pending.startTime + (attemptStartTimes[i]! - baseWallMs) * 1000
 578          const attemptEndUs =
 579            pending.startTime + (attemptStartTimes[i + 1]! - baseWallMs) * 1000
 580  
 581          events.push({
 582            name: `Attempt ${i + 1} (retry)`,
 583            cat: 'api,retry',
 584            ph: 'B',
 585            ts: attemptStartUs,
 586            pid: pending.agentInfo.processId,
 587            tid: pending.agentInfo.threadId,
 588            args: { attempt: i + 1 },
 589          })
 590          events.push({
 591            name: `Attempt ${i + 1} (retry)`,
 592            cat: 'api,retry',
 593            ph: 'E',
 594            ts: attemptEndUs,
 595            pid: pending.agentInfo.processId,
 596            tid: pending.agentInfo.threadId,
 597          })
 598        }
 599      }
 600  
 601      events.push({
 602        name: 'Request Setup',
 603        cat: 'api,setup',
 604        ph: 'E',
 605        ts: setupEndTs,
 606        pid: pending.agentInfo.processId,
 607        tid: pending.agentInfo.threadId,
 608      })
 609    }
 610  
 611    // Emit sub-spans for First Token and Sampling phases (before API Call end)
 612    // Using B/E pairs in proper nesting order for correct Perfetto visualization
 613    if (ttftMs !== undefined) {
 614      // First Token starts after request setup (if any)
 615      const firstTokenStartTs = pending.startTime + setupUs
 616      const firstTokenEndTs = firstTokenStartTs + ttftMs * 1000
 617  
 618      // First Token phase: from successful attempt start to first token
 619      events.push({
 620        name: 'First Token',
 621        cat: 'api,ttft',
 622        ph: 'B',
 623        ts: firstTokenStartTs,
 624        pid: pending.agentInfo.processId,
 625        tid: pending.agentInfo.threadId,
 626        args: {
 627          ttft_ms: ttftMs,
 628          prompt_tokens: promptTokens,
 629          itps,
 630          cache_hit_rate_pct: cacheHitRate,
 631        },
 632      })
 633      events.push({
 634        name: 'First Token',
 635        cat: 'api,ttft',
 636        ph: 'E',
 637        ts: firstTokenEndTs,
 638        pid: pending.agentInfo.processId,
 639        tid: pending.agentInfo.threadId,
 640      })
 641  
 642      // Sampling phase: from first token to last token
 643      // Note: samplingMs = ttltMs - ttftMs still includes setup time in ttltMs,
 644      // so we compute the actual sampling duration for the span as the time from
 645      // first token to API call end (endTime), not samplingMs directly.
 646      const actualSamplingMs =
 647        ttltMs !== undefined ? ttltMs - ttftMs - setupUs / 1000 : undefined
 648      if (actualSamplingMs !== undefined && actualSamplingMs > 0) {
 649        events.push({
 650          name: 'Sampling',
 651          cat: 'api,sampling',
 652          ph: 'B',
 653          ts: firstTokenEndTs,
 654          pid: pending.agentInfo.processId,
 655          tid: pending.agentInfo.threadId,
 656          args: {
 657            sampling_ms: actualSamplingMs,
 658            output_tokens: outputTokens,
 659            otps,
 660          },
 661        })
 662        events.push({
 663          name: 'Sampling',
 664          cat: 'api,sampling',
 665          ph: 'E',
 666          ts: firstTokenEndTs + actualSamplingMs * 1000,
 667          pid: pending.agentInfo.processId,
 668          tid: pending.agentInfo.threadId,
 669        })
 670      }
 671    }
 672  
 673    // Emit API Call end event (after sub-spans)
 674    events.push({
 675      name: pending.name,
 676      cat: pending.category,
 677      ph: 'E',
 678      ts: endTime,
 679      pid: pending.agentInfo.processId,
 680      tid: pending.agentInfo.threadId,
 681      args,
 682    })
 683  
 684    pendingSpans.delete(spanId)
 685  }
 686  
 687  /**
 688   * Start a tool execution span
 689   */
 690  export function startToolPerfettoSpan(
 691    toolName: string,
 692    args?: Record<string, unknown>,
 693  ): string {
 694    if (!isEnabled) return ''
 695  
 696    const spanId = generateSpanId()
 697    const agentInfo = getCurrentAgentInfo()
 698  
 699    pendingSpans.set(spanId, {
 700      name: `Tool: ${toolName}`,
 701      category: 'tool',
 702      startTime: getTimestamp(),
 703      agentInfo,
 704      args: {
 705        tool_name: toolName,
 706        ...args,
 707      },
 708    })
 709  
 710    // Emit begin event
 711    events.push({
 712      name: `Tool: ${toolName}`,
 713      cat: 'tool',
 714      ph: 'B',
 715      ts: pendingSpans.get(spanId)!.startTime,
 716      pid: agentInfo.processId,
 717      tid: agentInfo.threadId,
 718      args: pendingSpans.get(spanId)!.args,
 719    })
 720  
 721    return spanId
 722  }
 723  
 724  /**
 725   * End a tool execution span
 726   */
 727  export function endToolPerfettoSpan(
 728    spanId: string,
 729    metadata?: {
 730      success?: boolean
 731      error?: string
 732      resultTokens?: number
 733    },
 734  ): void {
 735    if (!isEnabled || !spanId) return
 736  
 737    const pending = pendingSpans.get(spanId)
 738    if (!pending) return
 739  
 740    const endTime = getTimestamp()
 741    const duration = endTime - pending.startTime
 742  
 743    const args = {
 744      ...pending.args,
 745      success: metadata?.success ?? true,
 746      error: metadata?.error,
 747      result_tokens: metadata?.resultTokens,
 748      duration_ms: duration / 1000,
 749    }
 750  
 751    // Emit end event
 752    events.push({
 753      name: pending.name,
 754      cat: pending.category,
 755      ph: 'E',
 756      ts: endTime,
 757      pid: pending.agentInfo.processId,
 758      tid: pending.agentInfo.threadId,
 759      args,
 760    })
 761  
 762    pendingSpans.delete(spanId)
 763  }
 764  
 765  /**
 766   * Start a user input waiting span
 767   */
 768  export function startUserInputPerfettoSpan(context?: string): string {
 769    if (!isEnabled) return ''
 770  
 771    const spanId = generateSpanId()
 772    const agentInfo = getCurrentAgentInfo()
 773  
 774    pendingSpans.set(spanId, {
 775      name: 'Waiting for User Input',
 776      category: 'user_input',
 777      startTime: getTimestamp(),
 778      agentInfo,
 779      args: {
 780        context,
 781      },
 782    })
 783  
 784    // Emit begin event
 785    events.push({
 786      name: 'Waiting for User Input',
 787      cat: 'user_input',
 788      ph: 'B',
 789      ts: pendingSpans.get(spanId)!.startTime,
 790      pid: agentInfo.processId,
 791      tid: agentInfo.threadId,
 792      args: pendingSpans.get(spanId)!.args,
 793    })
 794  
 795    return spanId
 796  }
 797  
 798  /**
 799   * End a user input waiting span
 800   */
 801  export function endUserInputPerfettoSpan(
 802    spanId: string,
 803    metadata?: {
 804      decision?: string
 805      source?: string
 806    },
 807  ): void {
 808    if (!isEnabled || !spanId) return
 809  
 810    const pending = pendingSpans.get(spanId)
 811    if (!pending) return
 812  
 813    const endTime = getTimestamp()
 814    const duration = endTime - pending.startTime
 815  
 816    const args = {
 817      ...pending.args,
 818      decision: metadata?.decision,
 819      source: metadata?.source,
 820      duration_ms: duration / 1000,
 821    }
 822  
 823    // Emit end event
 824    events.push({
 825      name: pending.name,
 826      cat: pending.category,
 827      ph: 'E',
 828      ts: endTime,
 829      pid: pending.agentInfo.processId,
 830      tid: pending.agentInfo.threadId,
 831      args,
 832    })
 833  
 834    pendingSpans.delete(spanId)
 835  }
 836  
 837  /**
 838   * Emit an instant event (marker)
 839   */
 840  export function emitPerfettoInstant(
 841    name: string,
 842    category: string,
 843    args?: Record<string, unknown>,
 844  ): void {
 845    if (!isEnabled) return
 846  
 847    const agentInfo = getCurrentAgentInfo()
 848  
 849    events.push({
 850      name,
 851      cat: category,
 852      ph: 'i',
 853      ts: getTimestamp(),
 854      pid: agentInfo.processId,
 855      tid: agentInfo.threadId,
 856      args,
 857    })
 858  }
 859  
 860  /**
 861   * Emit a counter event for tracking metrics over time
 862   */
 863  export function emitPerfettoCounter(
 864    name: string,
 865    values: Record<string, number>,
 866  ): void {
 867    if (!isEnabled) return
 868  
 869    const agentInfo = getCurrentAgentInfo()
 870  
 871    events.push({
 872      name,
 873      cat: 'counter',
 874      ph: 'C',
 875      ts: getTimestamp(),
 876      pid: agentInfo.processId,
 877      tid: agentInfo.threadId,
 878      args: values,
 879    })
 880  }
 881  
 882  /**
 883   * Start an interaction span (wraps a full user request cycle)
 884   */
 885  export function startInteractionPerfettoSpan(userPrompt?: string): string {
 886    if (!isEnabled) return ''
 887  
 888    const spanId = generateSpanId()
 889    const agentInfo = getCurrentAgentInfo()
 890  
 891    pendingSpans.set(spanId, {
 892      name: 'Interaction',
 893      category: 'interaction',
 894      startTime: getTimestamp(),
 895      agentInfo,
 896      args: {
 897        user_prompt_length: userPrompt?.length,
 898      },
 899    })
 900  
 901    // Emit begin event
 902    events.push({
 903      name: 'Interaction',
 904      cat: 'interaction',
 905      ph: 'B',
 906      ts: pendingSpans.get(spanId)!.startTime,
 907      pid: agentInfo.processId,
 908      tid: agentInfo.threadId,
 909      args: pendingSpans.get(spanId)!.args,
 910    })
 911  
 912    return spanId
 913  }
 914  
 915  /**
 916   * End an interaction span
 917   */
 918  export function endInteractionPerfettoSpan(spanId: string): void {
 919    if (!isEnabled || !spanId) return
 920  
 921    const pending = pendingSpans.get(spanId)
 922    if (!pending) return
 923  
 924    const endTime = getTimestamp()
 925    const duration = endTime - pending.startTime
 926  
 927    // Emit end event
 928    events.push({
 929      name: pending.name,
 930      cat: pending.category,
 931      ph: 'E',
 932      ts: endTime,
 933      pid: pending.agentInfo.processId,
 934      tid: pending.agentInfo.threadId,
 935      args: {
 936        ...pending.args,
 937        duration_ms: duration / 1000,
 938      },
 939    })
 940  
 941    pendingSpans.delete(spanId)
 942  }
 943  
 944  // ---------------------------------------------------------------------------
 945  // Periodic write helpers
 946  // ---------------------------------------------------------------------------
 947  
 948  /**
 949   * Stop the periodic write timer.
 950   */
 951  function stopWriteInterval(): void {
 952    if (staleSpanCleanupId) {
 953      clearInterval(staleSpanCleanupId)
 954      staleSpanCleanupId = null
 955    }
 956    if (writeIntervalId) {
 957      clearInterval(writeIntervalId)
 958      writeIntervalId = null
 959    }
 960  }
 961  
 962  /**
 963   * Force-close any remaining open spans at session end.
 964   */
 965  function closeOpenSpans(): void {
 966    for (const [spanId, pending] of pendingSpans) {
 967      const endTime = getTimestamp()
 968      events.push({
 969        name: pending.name,
 970        cat: pending.category,
 971        ph: 'E',
 972        ts: endTime,
 973        pid: pending.agentInfo.processId,
 974        tid: pending.agentInfo.threadId,
 975        args: {
 976          ...pending.args,
 977          incomplete: true,
 978          duration_ms: (endTime - pending.startTime) / 1000,
 979        },
 980      })
 981      pendingSpans.delete(spanId)
 982    }
 983  }
 984  
 985  /**
 986   * Write the full trace to disk.  Errors are logged but swallowed so that a
 987   * transient I/O problem does not crash the session — the next periodic tick
 988   * (or the final exit write) will retry with a complete snapshot.
 989   */
 990  async function periodicWrite(): Promise<void> {
 991    if (!isEnabled || !tracePath || traceWritten) return
 992  
 993    try {
 994      await mkdir(dirname(tracePath), { recursive: true })
 995      await writeFile(tracePath, buildTraceDocument())
 996      logForDebugging(
 997        `[Perfetto] Periodic write: ${events.length} events to ${tracePath}`,
 998      )
 999    } catch (error) {
1000      logForDebugging(
1001        `[Perfetto] Periodic write failed: ${errorMessage(error)}`,
1002        { level: 'error' },
1003      )
1004    }
1005  }
1006  
1007  /**
1008   * Final async write: close open spans and write the complete trace.
1009   * Idempotent — sets `traceWritten` on success so subsequent calls are no-ops.
1010   */
1011  async function writePerfettoTrace(): Promise<void> {
1012    if (!isEnabled || !tracePath || traceWritten) {
1013      logForDebugging(
1014        `[Perfetto] Skipping final write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
1015      )
1016      return
1017    }
1018  
1019    stopWriteInterval()
1020    closeOpenSpans()
1021  
1022    logForDebugging(
1023      `[Perfetto] writePerfettoTrace called: events=${events.length}`,
1024    )
1025  
1026    try {
1027      await mkdir(dirname(tracePath), { recursive: true })
1028      await writeFile(tracePath, buildTraceDocument())
1029      traceWritten = true
1030      logForDebugging(`[Perfetto] Trace finalized at: ${tracePath}`)
1031    } catch (error) {
1032      logForDebugging(
1033        `[Perfetto] Failed to write final trace: ${errorMessage(error)}`,
1034        { level: 'error' },
1035      )
1036    }
1037  }
1038  
1039  /**
1040   * Final synchronous write (fallback for process 'exit' handler where async is forbidden).
1041   */
1042  function writePerfettoTraceSync(): void {
1043    if (!isEnabled || !tracePath || traceWritten) {
1044      logForDebugging(
1045        `[Perfetto] Skipping final sync write: isEnabled=${isEnabled}, tracePath=${tracePath}, traceWritten=${traceWritten}`,
1046      )
1047      return
1048    }
1049  
1050    stopWriteInterval()
1051    closeOpenSpans()
1052  
1053    logForDebugging(
1054      `[Perfetto] writePerfettoTraceSync called: events=${events.length}`,
1055    )
1056  
1057    try {
1058      const dir = dirname(tracePath)
1059      // eslint-disable-next-line custom-rules/no-sync-fs -- Only called from process.on('exit') handler
1060      mkdirSync(dir, { recursive: true })
1061      // eslint-disable-next-line custom-rules/no-sync-fs, eslint-plugin-n/no-sync -- Required for process 'exit' handler which doesn't support async
1062      writeFileSync(tracePath, buildTraceDocument())
1063      traceWritten = true
1064      logForDebugging(`[Perfetto] Trace finalized synchronously at: ${tracePath}`)
1065    } catch (error) {
1066      logForDebugging(
1067        `[Perfetto] Failed to write final trace synchronously: ${errorMessage(error)}`,
1068        { level: 'error' },
1069      )
1070    }
1071  }
1072  
1073  /**
1074   * Get all recorded events (for testing)
1075   */
1076  export function getPerfettoEvents(): TraceEvent[] {
1077    return [...metadataEvents, ...events]
1078  }
1079  
1080  /**
1081   * Reset the tracer state (for testing)
1082   */
1083  export function resetPerfettoTracer(): void {
1084    if (staleSpanCleanupId) {
1085      clearInterval(staleSpanCleanupId)
1086      staleSpanCleanupId = null
1087    }
1088    stopWriteInterval()
1089    metadataEvents.length = 0
1090    events.length = 0
1091    pendingSpans.clear()
1092    agentRegistry.clear()
1093    agentIdToProcessId.clear()
1094    totalAgentCount = 0
1095    processIdCounter = 1
1096    spanIdCounter = 0
1097    isEnabled = false
1098    tracePath = null
1099    startTimeMs = 0
1100    traceWritten = false
1101  }
1102  
1103  /**
1104   * Trigger a periodic write immediately (for testing)
1105   */
1106  export async function triggerPeriodicWriteForTesting(): Promise<void> {
1107    await periodicWrite()
1108  }
1109  
1110  /**
1111   * Evict stale spans immediately (for testing)
1112   */
1113  export function evictStaleSpansForTesting(): void {
1114    evictStaleSpans()
1115  }
1116  
1117  export const MAX_EVENTS_FOR_TESTING = MAX_EVENTS
1118  export function evictOldestEventsForTesting(): void {
1119    evictOldestEvents()
1120  }