transcript.ts
/**
 * Transcript parsing utilities for Codex CLI rollout JSONL files.
 *
 * Codex CLI transcripts use a RolloutLine format defined in
 * codex-rs/protocol/src/protocol.rs. Each line is:
 *   {"timestamp": "...", "type": "<variant>", "payload": {...}}
 *
 * Turns are delimited by event_msg task_started / task_complete pairs.
 *
 * References:
 * - Protocol types: github.com/openai/codex codex-rs/protocol/src/protocol.rs
 * - Rollout recorder: github.com/openai/codex codex-rs/rollout/src/recorder.rs
 */

import { readFileSync, readdirSync, existsSync } from 'node:fs';
import { join } from 'node:path';
import { homedir } from 'node:os';
import type {
  RolloutLine,
  ResponseItemPayload,
  EventMsgPayload,
  SessionMetaPayload,
  TokenUsage,
  ContentBlock,
} from './types.js';

/** Multiplier used to convert milliseconds to nanoseconds. */
export const NANOSECONDS_PER_MS = 1e6;

/**
 * Read and parse a Codex JSONL transcript file.
 *
 * Blank lines are ignored. Lines that are not valid JSON, or whose JSON value
 * is not a plain object, are skipped rather than aborting the whole read, so a
 * truncated or corrupted line does not discard the rest of the transcript.
 *
 * @param path - Path of the JSONL file to read.
 * @returns The parsed records, in file order.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
export function readTranscript(path: string): RolloutLine[] {
  const content = readFileSync(path, 'utf-8');
  const records: RolloutLine[] = [];

  for (const line of content.split('\n')) {
    if (!line.trim()) {
      continue;
    }
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Malformed line — skip it and keep the records we can read.
      continue;
    }
    // Downstream helpers read `.type` / `.payload`, so only keep JSON objects.
    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
      records.push(parsed as RolloutLine);
    }
  }

  return records;
}

/**
 * Parse an ISO timestamp string to nanoseconds since Unix epoch.
 *
 * The result is a JavaScript number derived from a millisecond value, so it
 * carries millisecond resolution only and, for present-day dates, exceeds
 * `Number.MAX_SAFE_INTEGER` (it may be rounded to the nearest representable
 * double).
 *
 * @param timestamp - Timestamp string; `undefined`, `null` and `''` are accepted.
 * @returns Nanoseconds since the epoch, or `null` when the input is empty or
 *   cannot be parsed as a date.
 */
export function parseTimestampToNs(timestamp: string | undefined | null): number | null {
  if (!timestamp) {
    return null;
  }
  // `new Date(string)` does not throw on bad input; it yields an Invalid Date
  // whose time value is NaN.
  const ms = new Date(timestamp).getTime();
  return Number.isNaN(ms) ? null : ms * NANOSECONDS_PER_MS;
}

/**
 * Extract text from a response_item content field.
 *
 * @param content - Either a plain string (returned unchanged) or an array of
 *   ContentBlock objects.
 * @returns The `text` of every block whose type is "input_text" or
 *   "output_text", joined with newlines; `''` when content is missing or is
 *   neither a string nor an array.
 */
export function extractTextFromContent(content: ContentBlock[] | string | undefined): string {
  if (!content) {
    return '';
  }
  if (typeof content === 'string') {
    return content;
  }
  if (!Array.isArray(content)) {
    return '';
  }
  return content
    .filter((block) => block.type === 'input_text' || block.type === 'output_text')
    .map((block) => block.text)
    .join('\n');
}

/**
 * Find the last user prompt in the transcript.
 *
 * User prompts are response_item records with payload.type=message and
 * payload.role=user. Records whose extracted text is empty or starts with `<`
 * are skipped, on the assumption that XML-like leading tags mark
 * system/developer context injections.
 *
 * @param records - Transcript records to search, scanned from the end.
 * @returns The prompt text and its index in `records`, or `null` if none qualifies.
 */
export function findLastUserPrompt(records: RolloutLine[]): { text: string; index: number } | null {
  for (let i = records.length - 1; i >= 0; i--) {
    const record = records[i];
    if (!record || record.type !== 'response_item') {
      continue;
    }
    const payload = record.payload as ResponseItemPayload;
    if (payload.type !== 'message' || payload.role !== 'user') {
      continue;
    }

    const text = extractTextFromContent(payload.content);
    // Skip system/developer context injections (start with XML-like tags)
    if (text && !text.startsWith('<')) {
      return { text, index: i };
    }
  }
  return null;
}

/**
 * Extract records belonging to the last turn.
 *
 * Turns are delimited by event_msg records with type=task_started /
 * task_complete. The slice runs from the last task_started through the last
 * task_complete (inclusive).
 *
 * @param records - All transcript records.
 * @returns The last turn's records; the entire input when no task_started
 *   event is present.
 */
export function getLastTurnRecords(records: RolloutLine[]): RolloutLine[] {
  let lastStart: number | null = null;
  let lastEnd: number | null = null;

  records.forEach((record, i) => {
    if (record.type !== 'event_msg') {
      return;
    }
    const payload = record.payload as EventMsgPayload;
    if (payload.type === 'task_started') {
      lastStart = i;
    } else if (payload.type === 'task_complete') {
      lastEnd = i;
    }
  });

  if (lastStart === null) {
    return records;
  }
  // If lastEnd is before lastStart (or missing), the turn is in-progress — slice to end of file
  const end = lastEnd !== null && lastEnd >= lastStart ? lastEnd + 1 : records.length;
  return records.slice(lastStart, end);
}

/**
 * Extract token usage from the last token_count event in a set of records.
 *
 * The value returned is the event's `info.last_token_usage`; token_count
 * events without that field are ignored.
 * NOTE(review): the field name suggests this is the usage of the most recent
 * request rather than a session-cumulative total — confirm against the
 * protocol definition if a cumulative figure is required.
 *
 * @param records - Records to scan, in order.
 * @returns The `last_token_usage` of the final qualifying event, or `null`.
 */
export function getTokenUsage(records: RolloutLine[]): TokenUsage | null {
  let usage: TokenUsage | null = null;
  for (const record of records) {
    if (record.type !== 'event_msg') {
      continue;
    }
    const payload = record.payload as EventMsgPayload;
    if (payload.type !== 'token_count') {
      continue;
    }
    if (payload.info?.last_token_usage) {
      // Keep overwriting so the last qualifying event wins.
      usage = payload.info.last_token_usage;
    }
  }
  return usage;
}

/**
 * Extract model name from session_meta or turn_context records.
 *
 * @param records - Records to scan, in order.
 * @returns The first string-valued `payload.model` found on a session_meta or
 *   turn_context record, or `'unknown'` when there is none.
 */
export function getModel(records: RolloutLine[]): string {
  for (const record of records) {
    if (record.type === 'session_meta' || record.type === 'turn_context') {
      const model = (record.payload as Record<string, unknown>).model;
      if (typeof model === 'string') {
        return model;
      }
    }
  }
  return 'unknown';
}

/**
 * Extract session ID from the session_meta record.
 *
 * @param records - Records to scan, in order.
 * @returns `payload.id` of the first session_meta record, or `null` when that
 *   record is absent or has no id.
 */
export function getSessionId(records: RolloutLine[]): string | null {
  for (const record of records) {
    if (record.type === 'session_meta') {
      return (record.payload as SessionMetaPayload).id ?? null;
    }
  }
  return null;
}

/**
 * Build a map from function_call call_id to function_call_output output.
 *
 * @param records - Records to scan, in order.
 * @returns An object keyed by call_id. A missing output is stored as `''`, and
 *   a later output for the same call_id overwrites an earlier one.
 */
export function buildToolResultMap(records: RolloutLine[]): Record<string, string> {
  const results: Record<string, string> = {};
  for (const record of records) {
    if (record.type !== 'response_item') {
      continue;
    }
    const payload = record.payload as ResponseItemPayload;
    if (payload.type === 'function_call_output' && payload.call_id) {
      results[payload.call_id] = payload.output ?? '';
    }
  }
  return results;
}

/**
 * List a directory's entry names in descending lexicographic order, returning
 * an empty list when the path cannot be read as a directory.
 */
function listEntriesDescending(dir: string): string[] {
  try {
    return readdirSync(dir).sort().reverse();
  } catch {
    // Not a directory, vanished, or unreadable — nothing to walk here.
    return [];
  }
}

/**
 * Look in a single directory for a `.jsonl` file whose name contains the
 * thread ID, returning its full path or `null`.
 */
function findRolloutInDir(dir: string, threadId: string): string | null {
  let names: string[];
  try {
    names = readdirSync(dir);
  } catch {
    return null;
  }
  const matches = names.filter((f) => f.endsWith('.jsonl') && f.includes(threadId)).sort();
  // Directory order is OS-dependent; pick the lexicographically last match so
  // the result is deterministic.
  const chosen = matches[matches.length - 1];
  return chosen === undefined ? null : join(dir, chosen);
}

/**
 * Find the transcript rollout file for a given thread ID.
 *
 * Codex stores transcripts at:
 *   ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<thread-id>.jsonl
 *
 * This is optional enrichment — if not found, tracing still works
 * from the notify payload alone.
 *
 * @param threadId - Thread ID expected to appear in the rollout file name.
 * @returns Full path of a matching `.jsonl` file, or `null` when the ID is
 *   empty, nothing matches, or the lookup fails. Never throws.
 */
export function findTranscriptForThread(threadId: string): string | null {
  // An empty ID would match every file name via `includes('')`.
  if (!threadId) {
    return null;
  }

  try {
    const sessionsDir = join(homedir(), '.codex', 'sessions');
    if (!existsSync(sessionsDir)) {
      return null;
    }

    // Fast path: check today's directory first since the hook fires
    // right after a turn completes — the transcript is almost always
    // from the current date.
    const now = new Date();
    const year = String(now.getFullYear());
    const month = String(now.getMonth() + 1).padStart(2, '0');
    const day = String(now.getDate()).padStart(2, '0');
    const todayDir = join(sessionsDir, year, month, day);

    if (existsSync(todayDir)) {
      const todayHit = findRolloutInDir(todayDir, threadId);
      if (todayHit !== null) {
        return todayHit;
      }
    }

    // Slow path: walk year/month/day directories in reverse order.
    // Only needed if the session started before midnight and the hook
    // fires after, or the clock is off. Entries that are not readable
    // directories are skipped instead of aborting the walk.
    for (const y of listEntriesDescending(sessionsDir)) {
      const yearDir = join(sessionsDir, y);
      for (const m of listEntriesDescending(yearDir)) {
        const monthDir = join(yearDir, m);
        for (const d of listEntriesDescending(monthDir)) {
          // Skip today's dir — already checked above
          if (y === year && m === month && d === day) {
            continue;
          }
          const hit = findRolloutInDir(join(monthDir, d), threadId);
          if (hit !== null) {
            return hit;
          }
        }
      }
    }
  } catch {
    // Transcript lookup is best-effort
  }
  return null;
}