// diagnostic.ts
/**
 * Structured diagnostic output for AI-driven adapter repair.
 *
 * When OPENCLI_DIAGNOSTIC=1, failed commands emit a JSON RepairContext to stderr
 * containing the error, adapter source, and browser state (DOM snapshot, network
 * requests, console errors). AI Agents consume this to diagnose and fix adapters.
 *
 * Safety boundaries:
 * - Sensitive headers/cookies are redacted before emission
 * - Individual fields are capped to prevent unbounded output
 * - Captured network bodies are truncated, depth-limited, and scrubbed of inline secrets
 * - Total output is capped (best effort) to MAX_DIAGNOSTIC_BYTES
 */

import * as fs from 'node:fs';
import * as path from 'node:path';
import type { IPage } from './types.js';
import { CliError, getErrorMessage } from './errors.js';
import type { InternalCliCommand } from './registry.js';
import { fullName } from './registry.js';

// ── Size budgets ─────────────────────────────────────────────────────────────

/** Maximum bytes for the entire diagnostic JSON output. */
export const MAX_DIAGNOSTIC_BYTES = 256 * 1024; // 256 KB
/** Maximum characters for DOM snapshot. */
const MAX_SNAPSHOT_CHARS = 100_000;
/** Maximum characters for adapter source. */
const MAX_SOURCE_CHARS = 50_000;
/** Maximum number of network requests to include. */
const MAX_NETWORK_REQUESTS = 50;
/** Maximum number of captured interceptor payloads to include. */
const MAX_CAPTURED_PAYLOADS = 20;
/** Maximum characters for a single network request body. */
const MAX_REQUEST_BODY_CHARS = 4_000;
/** Maximum characters for error stack trace. */
const MAX_STACK_CHARS = 5_000;
/** Maximum nesting depth for arbitrary captured payloads. */
const MAX_CAPTURED_DEPTH = 4;
/** Maximum object keys or array items to keep per nesting level. */
const MAX_CAPTURED_CHILDREN = 20;

// ── Sensitive data patterns ──────────────────────────────────────────────────

/** Lower-cased header names whose values are always replaced with a placeholder. */
const SENSITIVE_HEADERS = new Set([
  'authorization',
  'cookie',
  'set-cookie',
  'x-csrf-token',
  'x-xsrf-token',
  'proxy-authorization',
  'x-api-key',
  'x-auth-token',
]);

/**
 * Query parameters whose values are redacted in URLs.
 * Group 1 is the `?`/`&` delimiter and group 2 the parameter name, so callers
 * can rebuild the pair as `$1$2=[REDACTED]`.
 */
const SENSITIVE_URL_PARAMS = /([?&])(token|key|secret|password|auth|access_token|api_key|session_id|csrf)=[^&]*/gi;

/** Patterns that match inline secrets in free-text strings (error messages, stack traces, console output, DOM). */
const SENSITIVE_TEXT_PATTERNS: Array<{ pattern: RegExp; replacement: string }> = [
  // Bearer tokens
  { pattern: /Bearer\s+[A-Za-z0-9\-._~+/]+=*/gi, replacement: 'Bearer [REDACTED]' },
  // Generic "token=...", "key: ...", etc. in non-URL text. The optional quote and
  // whitespace before the separator also cover JSON-style pairs such as
  // `"access_token": "..."`, where the key's closing quote precedes the colon.
  {
    pattern: /(token|secret|password|api_key|apikey|access_token|session_id)(?:['"]?\s*[=:]\s*['"]?)[A-Za-z0-9\-._~+/]{8,}['"]?/gi,
    replacement: '$1=[REDACTED]',
  },
  // Cookie header values (key=value pairs)
  { pattern: /(cookie[=:]\s*)[^\n;]{10,}/gi, replacement: '$1[REDACTED]' },
  // JWT-like tokens (three base64 segments separated by dots)
  { pattern: /eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}/g, replacement: '[REDACTED_JWT]' },
];

// ── Types ────────────────────────────────────────────────────────────────────

/** JSON payload emitted to stderr for a failed command when diagnostics are enabled. */
export interface RepairContext {
  /** The failure being reported. */
  error: {
    code: string;
    message: string;
    hint?: string;
    stack?: string;
  };
  /** The adapter command that failed and, when readable, its source. */
  adapter: {
    site: string;
    command: string;
    sourcePath?: string;
    source?: string;
  };
  /** Browser state at failure time; absent when no page was available or collection failed. */
  page?: {
    url: string;
    snapshot: string;
    networkRequests: unknown[];
    capturedPayloads?: unknown[];
    consoleErrors: unknown[];
  };
  /** ISO-8601 timestamp of when the context was built. */
  timestamp: string;
}

// ── Redaction helpers ────────────────────────────────────────────────────────
/**
 * Truncate a string to at most `maxLen` UTF-16 code units, appending a marker
 * that states how many code units were dropped.
 *
 * The cut never lands between the two halves of a surrogate pair, so the
 * result does not end in a lone surrogate. Negative or NaN limits are treated
 * as 0 and fractional limits are rounded toward zero.
 *
 * @param str - Text to cap.
 * @param maxLen - Maximum number of code units to keep from `str`.
 * @returns `str` unchanged when it fits, otherwise the kept prefix plus the marker.
 */
export function truncate(str: string, maxLen: number): string {
  const limit = Math.max(0, Math.trunc(maxLen) || 0);
  if (str.length <= limit) return str;

  let cut = limit;
  if (cut > 0) {
    const before = str.charCodeAt(cut - 1);
    const after = str.charCodeAt(cut);
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    // Back off one unit rather than keep half of an astral character.
    if (splitsPair) cut -= 1;
  }
  return str.slice(0, cut) + `\n...[truncated, ${str.length - cut} chars omitted]`;
}

/**
 * Redact sensitive query parameters from a URL.
 *
 * @param url - URL (or any string containing query parameters).
 * @returns The input with every SENSITIVE_URL_PARAMS value replaced by `[REDACTED]`.
 */
export function redactUrl(url: string): string {
  return url.replace(SENSITIVE_URL_PARAMS, '$1$2=[REDACTED]');
}

/**
 * Redact inline secrets from free-text strings (error messages, stack traces, console output, DOM).
 *
 * Every entry of SENSITIVE_TEXT_PATTERNS is applied in order to the output of
 * the previous one.
 *
 * @param text - Arbitrary text.
 * @returns The text with all pattern matches replaced.
 */
export function redactText(text: string): string {
  // String.prototype.replace resets lastIndex for global regexps, so the shared
  // pattern objects need no manual reset between calls.
  return SENSITIVE_TEXT_PATTERNS.reduce(
    (scrubbed, { pattern, replacement }) => scrubbed.replace(pattern, replacement),
    text,
  );
}

/**
 * Redact sensitive headers from a headers object.
 *
 * @param headers - Header map; `undefined` and non-object inputs are returned unchanged.
 * @returns A new object with the same keys, where values of headers listed in
 *          SENSITIVE_HEADERS (compared case-insensitively) become `[REDACTED]`.
 */
function redactHeaders(headers: Record<string, string> | undefined): Record<string, string> | undefined {
  if (!headers || typeof headers !== 'object') return headers;
  const cleaned: Record<string, string> = {};
  for (const [name, value] of Object.entries(headers)) {
    cleaned[name] = SENSITIVE_HEADERS.has(name.toLowerCase()) ? '[REDACTED]' : value;
  }
  return cleaned;
}
/**
 * Property names (matched whole, case-insensitively) whose string or number
 * values are masked outright in captured payloads. Free-text patterns cannot
 * catch these because in a parsed object the key and value are separate.
 */
const SENSITIVE_CAPTURED_KEYS =
  /^(?:authorization|proxy-authorization|cookie|set-cookie|password|passwd|secret|client_secret|token|access_token|refresh_token|id_token|auth_token|api_key|apikey|x-api-key|x-auth-token|x-csrf-token|x-xsrf-token|session_id|sessionid|csrf|csrf_token)$/i;

/**
 * Recursively sanitize arbitrary captured response content for diagnostic output.
 *
 * - Strings are redacted with `redactText` and then capped to
 *   MAX_REQUEST_BODY_CHARS. Redaction runs first so a secret straddling the cut
 *   cannot escape the patterns by being split.
 * - `null`, numbers and booleans pass through; bigints become decimal strings
 *   because JSON.stringify throws on them.
 * - Arrays and objects keep at most MAX_CAPTURED_CHILDREN entries per level and
 *   are replaced by a marker string once `depth` reaches MAX_CAPTURED_DEPTH.
 * - String/number values stored under a key matching SENSITIVE_CAPTURED_KEYS
 *   become `[REDACTED]`.
 *
 * @param value - Any captured value.
 * @param depth - Current nesting level; callers normally omit it.
 * @returns A bounded, redacted copy of `value` (primitives are returned as-is).
 */
function sanitizeCapturedValue(value: unknown, depth: number = 0): unknown {
  if (typeof value === 'string') {
    return truncate(redactText(value), MAX_REQUEST_BODY_CHARS);
  }
  if (value === null || typeof value === 'number' || typeof value === 'boolean') {
    return value;
  }
  if (typeof value === 'bigint') {
    return value.toString();
  }
  if (depth >= MAX_CAPTURED_DEPTH) {
    return '[truncated: max depth reached]';
  }

  if (Array.isArray(value)) {
    const kept = value
      .slice(0, MAX_CAPTURED_CHILDREN)
      .map(item => sanitizeCapturedValue(item, depth + 1));
    const dropped = value.length - MAX_CAPTURED_CHILDREN;
    if (dropped > 0) {
      kept.push(`[truncated, ${dropped} items omitted]`);
    }
    return kept;
  }

  // undefined, functions and symbols: nothing to sanitize.
  if (!value || typeof value !== 'object') {
    return value;
  }

  const entries = Object.entries(value);
  const result: Record<string, unknown> = {};
  for (const [key, child] of entries.slice(0, MAX_CAPTURED_CHILDREN)) {
    const holdsSecret =
      SENSITIVE_CAPTURED_KEYS.test(key) && (typeof child === 'string' || typeof child === 'number');
    result[key] = holdsSecret ? '[REDACTED]' : sanitizeCapturedValue(child, depth + 1);
  }
  if (entries.length > MAX_CAPTURED_CHILDREN) {
    result.__truncated__ = `[${entries.length - MAX_CAPTURED_CHILDREN} fields omitted]`;
  }
  return result;
}
/**
 * Redact sensitive data from a single network request entry.
 *
 * Works on a shallow copy, so the caller's object is not mutated:
 * - `url` (string) has sensitive query parameters masked;
 * - `headers`, `requestHeaders`, `responseHeaders` (objects) go through `redactHeaders`;
 * - `body`, `responseBody`, `responsePreview` (when present) go through
 *   `sanitizeCapturedValue`, which redacts and caps strings and bounds nested
 *   structures. `body` is sanitized even when it is not a string.
 *
 * @param req - A network request record of unknown shape.
 * @returns The redacted copy, or `req` itself when it is not an object.
 */
function redactNetworkRequest(req: unknown): unknown {
  if (!req || typeof req !== 'object') return req;
  const redacted: Record<string, unknown> = { ...(req as Record<string, unknown>) };

  if (typeof redacted.url === 'string') {
    redacted.url = redactUrl(redacted.url);
  }

  for (const field of ['headers', 'requestHeaders', 'responseHeaders']) {
    const headers = redacted[field];
    if (headers && typeof headers === 'object') {
      redacted[field] = redactHeaders(headers as Record<string, string>);
    }
  }

  for (const field of ['body', 'responseBody', 'responsePreview']) {
    if (field in redacted) {
      redacted[field] = sanitizeCapturedValue(redacted[field]);
    }
  }

  return redacted;
}

// ── Timeout helper ───────────────────────────────────────────────────────────

/** Timeout for page state collection (prevents hang when CDP connection is stuck). */
const PAGE_STATE_TIMEOUT_MS = 5_000;

/**
 * Resolve with `fallback` if `promise` has not settled within `ms` milliseconds.
 *
 * The timer is cleared as soon as the race settles, so a fast `promise` does
 * not leave a pending timer that keeps the process alive for the remaining
 * time. Rejections of `promise` propagate unchanged.
 *
 * @param promise - The operation to bound.
 * @param ms - Time limit in milliseconds.
 * @param fallback - Value to resolve with on timeout.
 */
function withTimeout<T>(promise: Promise<T>, ms: number, fallback: T): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<T>(resolve => {
    timer = setTimeout(() => resolve(fallback), ms);
  });
  return Promise.race([promise, timeout]).finally(() => {
    if (timer !== undefined) clearTimeout(timer);
  });
}

// ── Source path resolution ───────────────────────────────────────────────────
/**
 * Resolve the editable source file path for an adapter.
 *
 * Candidates, in priority order:
 *   1. cmd.source — skipped when it is a `manifest:`-prefixed pseudo-path
 *      (YAML commands inlined in the manifest have no file to edit)
 *   2. cmd._modulePath
 *
 * @param cmd - Command metadata.
 * @returns The first candidate that exists on disk; otherwise the first
 *          candidate as a best guess; `undefined` when there are no candidates.
 */
export function resolveAdapterSourcePath(cmd: InternalCliCommand): string | undefined {
  const candidates: string[] = [];

  // cmd.source may be a real file path or 'manifest:site/name'
  if (cmd.source && !cmd.source.startsWith('manifest:')) {
    candidates.push(cmd.source);
  }
  if (cmd._modulePath) {
    candidates.push(cmd._modulePath);
  }

  // Fall back to the best guess even if the file doesn't exist.
  return candidates.find(candidate => fs.existsSync(candidate)) ?? candidates[0];
}

// ── Diagnostic collection ────────────────────────────────────────────────────

/** Maximum number of console error entries to include. */
const MAX_CONSOLE_ERRORS = 50;

/** Whether diagnostic mode is enabled (environment variable OPENCLI_DIAGNOSTIC is exactly `1`). */
export function isDiagnosticEnabled(): boolean {
  return process.env.OPENCLI_DIAGNOSTIC === '1';
}

/** Coerce a collector result to an array so one malformed result cannot discard the whole page state. */
function asUnknownArray(value: unknown): unknown[] {
  return Array.isArray(value) ? value : [];
}

/**
 * Wrap intercepted response bodies as `{ source: 'interceptor', responseBody }`
 * records, keeping at most MAX_CAPTURED_PAYLOADS entries and sanitizing each body.
 */
function normalizeInterceptedRequests(interceptedRequests: unknown[]): unknown[] {
  return interceptedRequests.slice(0, MAX_CAPTURED_PAYLOADS).map(responseBody => ({
    source: 'interceptor',
    responseBody: sanitizeCapturedValue(responseBody),
  }));
}

/**
 * Safely collect page diagnostic state with redaction, size caps, and timeout.
 *
 * All five page queries run concurrently and each has its own fallback, so a
 * single failing query does not lose the others. Text is redacted before it is
 * truncated so a secret cannot slip through by being cut in half. Console
 * entries of any shape go through `sanitizeCapturedValue`.
 *
 * @param page - The page to inspect.
 * @returns The redacted page state, or `undefined` when collection throws or
 *          exceeds PAGE_STATE_TIMEOUT_MS.
 */
async function collectPageState(page: IPage): Promise<RepairContext['page'] | undefined> {
  const collect = async (): Promise<RepairContext['page'] | undefined> => {
    try {
      const [url, snapshot, networkRequests, interceptedRequests, consoleErrors] = await Promise.all([
        page.getCurrentUrl?.().catch(() => null) ?? Promise.resolve(null),
        page.snapshot().catch(() => '(snapshot unavailable)'),
        page.networkRequests().catch(() => []),
        page.getInterceptedRequests().catch(() => []),
        page.consoleMessages('error').catch(() => []),
      ]);

      const rawUrl = url ?? 'unknown';
      return {
        url: redactUrl(rawUrl),
        snapshot: truncate(redactText(snapshot), MAX_SNAPSHOT_CHARS),
        networkRequests: asUnknownArray(networkRequests)
          .slice(0, MAX_NETWORK_REQUESTS)
          .map(request => redactNetworkRequest(request)),
        capturedPayloads: normalizeInterceptedRequests(asUnknownArray(interceptedRequests)),
        // Explicit arrow: map's index argument must not be taken as the depth parameter.
        consoleErrors: asUnknownArray(consoleErrors)
          .slice(0, MAX_CONSOLE_ERRORS)
          .map(entry => sanitizeCapturedValue(entry)),
      };
    } catch {
      // Best effort: diagnostics must never fail the command a second time.
      return undefined;
    }
  };

  return withTimeout(collect(), PAGE_STATE_TIMEOUT_MS, undefined);
}

/**
 * Read adapter source file content with size cap.
 *
 * @param sourcePath - File to read; `undefined` yields `undefined`.
 * @returns The UTF-8 content capped to MAX_SOURCE_CHARS, or `undefined` when
 *          the file cannot be read.
 */
function readAdapterSource(sourcePath: string | undefined): string | undefined {
  if (!sourcePath) return undefined;
  try {
    return truncate(fs.readFileSync(sourcePath, 'utf-8'), MAX_SOURCE_CHARS);
  } catch {
    return undefined;
  }
}

/**
 * Build a RepairContext from an error, command metadata, and optional page state.
 *
 * @param err - The thrown value. `CliError` contributes its code and hint; any
 *              other value is reported with code `UNKNOWN`.
 * @param cmd - The command that failed.
 * @param pageState - Already-redacted page state, if any.
 * @returns The context with message, hint and stack redacted, and the stack
 *          capped to MAX_STACK_CHARS after redaction.
 */
export function buildRepairContext(
  err: unknown,
  cmd: InternalCliCommand,
  pageState?: RepairContext['page'],
): RepairContext {
  const isCliError = err instanceof CliError;
  const sourcePath = resolveAdapterSourcePath(cmd);
  return {
    error: {
      code: isCliError ? err.code : 'UNKNOWN',
      message: redactText(getErrorMessage(err)),
      hint: isCliError && err.hint ? redactText(err.hint) : undefined,
      stack: err instanceof Error ? truncate(redactText(err.stack ?? ''), MAX_STACK_CHARS) : undefined,
    },
    adapter: {
      site: cmd.site,
      command: fullName(cmd),
      sourcePath,
      source: readAdapterSource(sourcePath),
    },
    page: pageState,
    timestamp: new Date().toISOString(),
  };
}
/**
 * Collect full diagnostic context including page state (with timeout).
 *
 * @param err - The thrown value being reported.
 * @param cmd - The command that failed.
 * @param page - The active page, or `null` when the command had none; page
 *               state is only collected when a page is given.
 * @returns The assembled RepairContext.
 */
export async function collectDiagnostic(
  err: unknown,
  cmd: InternalCliCommand,
  page: IPage | null,
): Promise<RepairContext> {
  const pageState = page ? await collectPageState(page) : undefined;
  return buildRepairContext(err, cmd, pageState);
}

/** JSON.stringify that reports failure (circular structure, BigInt, ...) as `undefined` instead of throwing. */
function tryStringify(value: unknown): string | undefined {
  try {
    return JSON.stringify(value);
  } catch {
    return undefined;
  }
}

/**
 * Emit diagnostic JSON to stderr, enforcing total size cap.
 *
 * The context is serialized in progressively smaller forms until one fits
 * within MAX_DIAGNOSTIC_BYTES, measured in UTF-8 bytes rather than string length:
 *   1. the full context;
 *   2. page state with snapshot, network requests and captured payloads dropped;
 *   3. no page state;
 *   4. no page state, adapter source or stack, with message and hint clipped.
 * If even the last form is over budget it is emitted anyway (best effort).
 * Forms that cannot be serialized are skipped, so emission does not throw.
 *
 * Output format: a blank line, the marker line, one line of JSON, the marker line.
 */
export function emitDiagnostic(ctx: RepairContext): void {
  const marker = '___OPENCLI_DIAGNOSTIC___';
  const omitted = '[omitted: over size budget]';
  const lastResortChars = 2_000;
  const clip = (text: string): string =>
    text.length > lastResortChars ? `${text.slice(0, lastResortChars)}...[truncated]` : text;

  const stages: Array<() => unknown> = [() => ctx];
  const { page } = ctx;
  if (page) {
    stages.push(
      // Page state is the largest section, so shrink it first...
      () => ({ ...ctx, page: { ...page, snapshot: omitted, networkRequests: [], capturedPayloads: [] } }),
      // ...then drop it entirely.
      () => ({ ...ctx, page: undefined }),
    );
  }
  stages.push(() => ({
    ...ctx,
    error: {
      ...ctx.error,
      message: clip(ctx.error.message),
      hint: ctx.error.hint === undefined ? undefined : clip(ctx.error.hint),
      stack: undefined,
    },
    adapter: { ...ctx.adapter, source: undefined },
    page: undefined,
  }));

  let json: string | undefined;
  for (const build of stages) {
    const candidate = tryStringify(build());
    if (candidate === undefined) continue;
    json = candidate; // remember the smallest serializable form seen so far
    if (Buffer.byteLength(candidate, 'utf8') <= MAX_DIAGNOSTIC_BYTES) break;
  }
  if (json === undefined) {
    json = JSON.stringify({
      error: { code: 'UNKNOWN', message: 'diagnostic context could not be serialized' },
      timestamp: new Date().toISOString(),
    });
  }

  process.stderr.write(`\n${marker}\n${json}\n${marker}\n`);
}