/ src / diagnostic.ts
diagnostic.ts
  1  /**
  2   * Structured diagnostic output for AI-driven adapter repair.
  3   *
  4   * When OPENCLI_DIAGNOSTIC=1, failed commands emit a JSON RepairContext to stderr
  5   * containing the error, adapter source, and browser state (DOM snapshot, network
  6   * requests, console errors). AI Agents consume this to diagnose and fix adapters.
  7   *
  8   * Safety boundaries:
  9   * - Sensitive headers/cookies are redacted before emission
 10   * - Individual fields are capped to prevent unbounded output
 11   * - Captured network bodies are redacted for inline secrets and truncated (bodies are not currently stripped wholesale, even for authenticated requests)
 12   * - Total output is capped to MAX_DIAGNOSTIC_BYTES
 13   */
 14  
 15  import * as fs from 'node:fs';
 16  import * as path from 'node:path';
 17  import type { IPage } from './types.js';
 18  import { CliError, getErrorMessage } from './errors.js';
 19  import type { InternalCliCommand } from './registry.js';
 20  import { fullName } from './registry.js';
 21  
 22  // ── Size budgets ─────────────────────────────────────────────────────────────
 23  
 24  /** Maximum bytes for the entire diagnostic JSON output. */
 25  export const MAX_DIAGNOSTIC_BYTES = 256 * 1024; // 256 KB
 26  /** Maximum characters for DOM snapshot. */
 27  const MAX_SNAPSHOT_CHARS = 100_000;
 28  /** Maximum characters for adapter source. */
 29  const MAX_SOURCE_CHARS = 50_000;
 30  /** Maximum number of network requests to include. */
 31  const MAX_NETWORK_REQUESTS = 50;
 32  /** Maximum number of captured interceptor payloads to include. */
 33  const MAX_CAPTURED_PAYLOADS = 20;
 34  /** Maximum characters for a single network request body. */
 35  const MAX_REQUEST_BODY_CHARS = 4_000;
 36  /** Maximum characters for error stack trace. */
 37  const MAX_STACK_CHARS = 5_000;
 38  /** Maximum nesting depth for arbitrary captured payloads. */
 39  const MAX_CAPTURED_DEPTH = 4;
 40  /** Maximum object keys or array items to keep per nesting level. */
 41  const MAX_CAPTURED_CHILDREN = 20;
 42  
 43  // ── Sensitive data patterns ──────────────────────────────────────────────────
 44  
 45  const SENSITIVE_HEADERS = new Set([
 46    'authorization',
 47    'cookie',
 48    'set-cookie',
 49    'x-csrf-token',
 50    'x-xsrf-token',
 51    'proxy-authorization',
 52    'x-api-key',
 53    'x-auth-token',
 54  ]);
 55  
 56  const SENSITIVE_URL_PARAMS = /([?&])(token|key|secret|password|auth|access_token|api_key|session_id|csrf)=[^&]*/gi;
 57  
 58  /** Patterns that match inline secrets in free-text strings (error messages, stack traces, console output, DOM). */
 59  const SENSITIVE_TEXT_PATTERNS: Array<{ pattern: RegExp; replacement: string }> = [
 60    // Bearer tokens
 61    { pattern: /Bearer\s+[A-Za-z0-9\-._~+/]+=*/gi, replacement: 'Bearer [REDACTED]' },
 62    // Generic "token=...", "key=...", etc. in non-URL text
 63    { pattern: /(token|secret|password|api_key|apikey|access_token|session_id)[=:]\s*['"]?[A-Za-z0-9\-._~+/]{8,}['"]?/gi, replacement: '$1=[REDACTED]' },
 64    // Cookie header values (key=value pairs)
 65    { pattern: /(cookie[=:]\s*)[^\n;]{10,}/gi, replacement: '$1[REDACTED]' },
 66    // JWT-like tokens (three base64 segments separated by dots)
 67    { pattern: /eyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}/g, replacement: '[REDACTED_JWT]' },
 68  ];
 69  
 70  // ── Types ────────────────────────────────────────────────────────────────────
 71  
 72  export interface RepairContext {
 73    error: {
 74      code: string;
 75      message: string;
 76      hint?: string;
 77      stack?: string;
 78    };
 79    adapter: {
 80      site: string;
 81      command: string;
 82      sourcePath?: string;
 83      source?: string;
 84    };
 85    page?: {
 86      url: string;
 87      snapshot: string;
 88      networkRequests: unknown[];
 89      capturedPayloads?: unknown[];
 90      consoleErrors: unknown[];
 91    };
 92    timestamp: string;
 93  }
 94  
 95  // ── Redaction helpers ────────────────────────────────────────────────────────
 96  
 97  /** Truncate a string to maxLen, appending a truncation marker. */
 98  export function truncate(str: string, maxLen: number): string {
 99    if (str.length <= maxLen) return str;
100    return str.slice(0, maxLen) + `\n...[truncated, ${str.length - maxLen} chars omitted]`;
101  }
102  
/**
 * Replace the values of sensitive parameters in a URL with `[REDACTED]`.
 *
 * @param url - URL (or URL-like string) to clean.
 * @returns The URL with every match of `SENSITIVE_URL_PARAMS` rewritten so that
 *   the delimiter and parameter name are kept and the value is masked.
 */
export function redactUrl(url: string): string {
  const maskValue = (_match: string, delimiter: string, name: string): string =>
    `${delimiter}${name}=[REDACTED]`;
  return url.replace(SENSITIVE_URL_PARAMS, maskValue);
}
107  
/**
 * Mask inline secrets in free text (error messages, stack traces, console output, DOM).
 *
 * @param text - Text to clean.
 * @returns `text` after every entry of `SENSITIVE_TEXT_PATTERNS` has been applied in order.
 */
export function redactText(text: string): string {
  return SENSITIVE_TEXT_PATTERNS.reduce((cleaned, rule) => {
    // The patterns are shared global regexps; start each scan from the beginning.
    rule.pattern.lastIndex = 0;
    return cleaned.replace(rule.pattern, rule.replacement);
  }, text);
}
118  
/**
 * Produce a copy of a headers object with sensitive values masked.
 *
 * @param headers - Header name/value map; may be absent.
 * @returns The input itself when it is falsy or not an object; otherwise a new
 *   object in which every header whose lower-cased name is in `SENSITIVE_HEADERS`
 *   has the value `'[REDACTED]'`.
 */
function redactHeaders(headers: Record<string, string> | undefined): Record<string, string> | undefined {
  if (!headers || typeof headers !== 'object') return headers;

  const cleaned: Record<string, string> = {};
  Object.entries(headers).forEach(([name, headerValue]) => {
    if (SENSITIVE_HEADERS.has(name.toLowerCase())) {
      cleaned[name] = '[REDACTED]';
    } else {
      cleaned[name] = headerValue;
    }
  });
  return cleaned;
}
128  
/**
 * Object keys whose string values are treated as credentials no matter what they
 * contain. Needed because `redactText` only sees the bare value of a structured
 * payload and therefore has no `name=` context to match on.
 */
const SENSITIVE_PAYLOAD_KEYS =
  /^(?:authorization|cookie|set-cookie|passw(?:or)?d|secret|token|api[_-]?key|(?:access|refresh|id|auth|csrf|xsrf)[_-]?token|session[_-]?id|client[_-]?secret)$/i;

/**
 * Recursively sanitize arbitrary captured response content for diagnostic output.
 *
 * - Strings are redacted and then capped to `MAX_REQUEST_BODY_CHARS`.
 * - `bigint` values are converted to strings (JSON.stringify cannot serialize them).
 * - Other non-object values (numbers, booleans, `null`, `undefined`, ...) are returned as-is.
 * - Arrays and objects keep at most `MAX_CAPTURED_CHILDREN` children per level and are
 *   replaced by a marker string once `MAX_CAPTURED_DEPTH` is reached.
 * - Non-empty string values stored under a credential-like key are replaced by `'[REDACTED]'`.
 *
 * @param value - Captured value of unknown shape.
 * @param depth - Current nesting level; callers outside this function use the default.
 * @returns A sanitized copy (containers are never returned by reference).
 */
function sanitizeCapturedValue(value: unknown, depth: number = 0): unknown {
  if (typeof value === 'string') {
    // Redact before truncating: cutting first can split a secret so that the
    // redaction patterns no longer recognise the part that is left.
    return truncate(redactText(value), MAX_REQUEST_BODY_CHARS);
  }
  if (typeof value === 'bigint') {
    return value.toString();
  }
  if (value === null || typeof value !== 'object') {
    // Nothing to traverse or redact; the depth limit only applies to containers.
    return value;
  }
  if (depth >= MAX_CAPTURED_DEPTH) {
    return '[truncated: max depth reached]';
  }
  if (Array.isArray(value)) {
    const items = value
      .slice(0, MAX_CAPTURED_CHILDREN)
      .map(item => sanitizeCapturedValue(item, depth + 1));
    if (value.length > MAX_CAPTURED_CHILDREN) {
      items.push(`[truncated, ${value.length - MAX_CAPTURED_CHILDREN} items omitted]`);
    }
    return items;
  }

  const entries = Object.entries(value);
  const result: Record<string, unknown> = {};
  for (const [key, child] of entries.slice(0, MAX_CAPTURED_CHILDREN)) {
    const isNamedSecret =
      typeof child === 'string' && child.length > 0 && SENSITIVE_PAYLOAD_KEYS.test(key);
    result[key] = isNamedSecret ? '[REDACTED]' : sanitizeCapturedValue(child, depth + 1);
  }
  if (entries.length > MAX_CAPTURED_CHILDREN) {
    result.__truncated__ = `[${entries.length - MAX_CAPTURED_CHILDREN} fields omitted]`;
  }
  return result;
}
163  
/** Fields of a network entry that may hold a header map. */
const NETWORK_HEADER_FIELDS = ['headers', 'requestHeaders', 'responseHeaders'] as const;

/**
 * Fields of a network entry that may hold body content.
 * NOTE(review): `requestBody` and `postData` are included defensively; confirm
 * against the entry shape actually produced by the page implementation.
 */
const NETWORK_BODY_FIELDS = ['body', 'requestBody', 'postData', 'responseBody', 'responsePreview'] as const;

/**
 * Redact sensitive data from a single network request entry.
 *
 * The entry is shallow-copied; its `url` is passed through `redactUrl`, header maps
 * through `redactHeaders`, and every body-like field through `sanitizeCapturedValue`
 * so that string AND structured bodies are both redacted and size-capped.
 * All other fields are copied unchanged.
 *
 * @param req - Network entry of unknown shape.
 * @returns `req` itself when it is not an object; otherwise the redacted copy.
 */
function redactNetworkRequest(req: unknown): unknown {
  if (!req || typeof req !== 'object') return req;
  const redacted: Record<string, unknown> = { ...(req as Record<string, unknown>) };

  if (typeof redacted.url === 'string') {
    redacted.url = redactUrl(redacted.url);
  }

  for (const field of NETWORK_HEADER_FIELDS) {
    const headers = redacted[field];
    if (headers && typeof headers === 'object') {
      redacted[field] = redactHeaders(headers as Record<string, string>);
    }
  }

  for (const field of NETWORK_BODY_FIELDS) {
    if (field in redacted) {
      redacted[field] = sanitizeCapturedValue(redacted[field]);
    }
  }

  return redacted;
}
199  
200  // ── Timeout helper ───────────────────────────────────────────────────────────
201  
/** Timeout for page state collection (prevents hang when CDP connection is stuck). */
const PAGE_STATE_TIMEOUT_MS = 5_000;

/**
 * Settle with `promise`, or with `fallback` if `promise` has not settled after `ms`.
 *
 * The internal timer is cleared as soon as the race is decided, so a fast `promise`
 * does not leave a pending timer behind that keeps the process alive for `ms`.
 *
 * @param promise - Operation to wait for. It is not cancelled when the timeout wins.
 * @param ms - Timeout in milliseconds.
 * @param fallback - Value to resolve with when the timeout elapses first.
 * @returns The result of `promise`, or `fallback`; rejects if `promise` rejects first.
 */
function withTimeout<T>(promise: Promise<T>, ms: number, fallback: T): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<T>(resolve => {
    timer = setTimeout(() => resolve(fallback), ms);
  });
  return Promise.race([promise, timeout]).finally(() => {
    if (timer !== undefined) clearTimeout(timer);
  });
}
211  
212  // ── Source path resolution ───────────────────────────────────────────────────
213  
/**
 * Work out which file on disk holds an adapter's editable source.
 *
 * Candidates, in order of preference:
 * 1. `cmd.source`, unless it is a `manifest:`-prefixed pseudo-path
 * 2. `cmd._modulePath`
 *
 * @param cmd - Command whose source location is wanted.
 * @returns The first candidate that exists on disk; if none exists, the first
 *   candidate as a best guess; `undefined` when there are no candidates at all.
 */
export function resolveAdapterSourcePath(cmd: InternalCliCommand): string | undefined {
  const { source, _modulePath: modulePath } = cmd;

  const ordered = [
    source && !source.startsWith('manifest:') ? source : undefined,
    modulePath,
  ].filter((entry): entry is string => Boolean(entry));

  const existing = ordered.find(entry => fs.existsSync(entry));
  return existing ?? ordered[0];
}
240  
241  // ── Diagnostic collection ────────────────────────────────────────────────────
242  
/**
 * Report whether diagnostic mode is switched on.
 *
 * @returns `true` only when the `OPENCLI_DIAGNOSTIC` environment variable is exactly `'1'`.
 */
export function isDiagnosticEnabled(): boolean {
  const flag = process.env['OPENCLI_DIAGNOSTIC'];
  return flag === '1';
}
247  
/**
 * Wrap raw interceptor payloads into tagged, sanitized entries.
 *
 * @param interceptedRequests - Payloads captured by the request interceptor.
 * @returns At most `MAX_CAPTURED_PAYLOADS` entries of the form
 *   `{ source: 'interceptor', responseBody }` with the body sanitized.
 */
function normalizeInterceptedRequests(interceptedRequests: unknown[]): unknown[] {
  const toEntry = (payload: unknown) => {
    const responseBody = sanitizeCapturedValue(payload);
    return { source: 'interceptor', responseBody };
  };
  const kept = interceptedRequests.slice(0, MAX_CAPTURED_PAYLOADS);
  return kept.map(toEntry);
}
254  
/** Maximum number of console error entries to include. */
const MAX_CONSOLE_ERRORS = 50;

/**
 * Safely collect page diagnostic state with redaction, size caps, and timeout.
 *
 * Each page query falls back to a placeholder when it rejects, and any other failure
 * while assembling the result yields `undefined`. The whole collection is bounded by
 * `PAGE_STATE_TIMEOUT_MS`; on timeout `undefined` is returned.
 *
 * @param page - Page to inspect.
 * @returns The redacted page state, or `undefined` when it could not be collected in time.
 */
async function collectPageState(page: IPage): Promise<RepairContext['page'] | undefined> {
  const collect = async (): Promise<RepairContext['page'] | undefined> => {
    try {
      const [url, snapshot, networkRequests, interceptedRequests, consoleErrors] = await Promise.all([
        page.getCurrentUrl?.().catch(() => null) ?? Promise.resolve(null),
        page.snapshot().catch(() => '(snapshot unavailable)'),
        page.networkRequests().catch(() => []),
        page.getInterceptedRequests().catch(() => []),
        page.consoleMessages('error').catch(() => []),
      ]);

      const rawUrl = url ?? 'unknown';
      const capturedResponses = normalizeInterceptedRequests(interceptedRequests as unknown[]);
      return {
        url: redactUrl(rawUrl),
        // Redact before truncating so a secret straddling the cut is still recognised.
        snapshot: truncate(redactText(snapshot), MAX_SNAPSHOT_CHARS),
        networkRequests: (networkRequests as unknown[])
          .slice(0, MAX_NETWORK_REQUESTS)
          .map(redactNetworkRequest),
        capturedPayloads: capturedResponses,
        consoleErrors: (consoleErrors as unknown[])
          .slice(0, MAX_CONSOLE_ERRORS)
          // Non-string entries (e.g. structured console records) are sanitized too, so
          // they cannot bypass redaction or the per-field size caps. The explicit arrow
          // keeps the map index from being passed as the `depth` argument.
          .map(entry => (typeof entry === 'string' ? redactText(entry) : sanitizeCapturedValue(entry))),
      };
    } catch {
      return undefined;
    }
  };

  return withTimeout(collect(), PAGE_STATE_TIMEOUT_MS, undefined);
}
287  
/**
 * Load the adapter's source text, capped to `MAX_SOURCE_CHARS`.
 *
 * @param sourcePath - File to read; may be absent.
 * @returns The (possibly truncated) UTF-8 file content, or `undefined` when no path
 *   was given or the file could not be read.
 */
function readAdapterSource(sourcePath: string | undefined): string | undefined {
  if (!sourcePath) {
    return undefined;
  }

  let raw: string;
  try {
    raw = fs.readFileSync(sourcePath, { encoding: 'utf-8' });
  } catch {
    // Unreadable or missing file: the diagnostic is simply emitted without source.
    return undefined;
  }
  return truncate(raw, MAX_SOURCE_CHARS);
}
298  
/**
 * Build a RepairContext from an error, command metadata, and optional page state.
 *
 * All free-text error fields (message, hint, stack) are redacted and then capped to
 * `MAX_STACK_CHARS`, so no single error field can grow without bound.
 *
 * @param err - The value that was thrown; any type is accepted.
 * @param cmd - Command that failed.
 * @param pageState - Already-collected (and already-redacted) page state, if any.
 * @returns The assembled context, timestamped with the current time.
 */
export function buildRepairContext(
  err: unknown,
  cmd: InternalCliCommand,
  pageState?: RepairContext['page'],
): RepairContext {
  const cliError = err instanceof CliError ? err : undefined;
  const sourcePath = resolveAdapterSourcePath(cmd);

  // Redact first, then cap: truncating first can cut a secret in half so that the
  // redaction patterns no longer match the part that remains.
  const sanitize = (text: string): string => truncate(redactText(text), MAX_STACK_CHARS);

  return {
    error: {
      code: cliError ? cliError.code : 'UNKNOWN',
      message: sanitize(getErrorMessage(err)),
      hint: cliError && cliError.hint ? sanitize(cliError.hint) : undefined,
      stack: err instanceof Error ? sanitize(err.stack ?? '') : undefined,
    },
    adapter: {
      site: cmd.site,
      command: fullName(cmd),
      sourcePath,
      source: readAdapterSource(sourcePath),
    },
    page: pageState,
    timestamp: new Date().toISOString(),
  };
}
324  
/**
 * Gather everything needed for a repair context, including live page state.
 *
 * @param err - The value that was thrown.
 * @param cmd - Command that failed.
 * @param page - Browser page to inspect, or `null` when none is available.
 * @returns The repair context; its `page` section is absent when `page` is `null`
 *   or page state collection produced nothing.
 */
export async function collectDiagnostic(
  err: unknown,
  cmd: InternalCliCommand,
  page: IPage | null,
): Promise<RepairContext> {
  let pageState: RepairContext['page'] = undefined;
  if (page) {
    pageState = await collectPageState(page);
  }
  return buildRepairContext(err, cmd, pageState);
}
334  
/**
 * Emit diagnostic JSON to stderr, enforcing the total size cap.
 *
 * The budget is measured in UTF-8 bytes (what is actually written), not in string
 * length. When the document is too large, progressively smaller variants are tried:
 * 1. page state without snapshot, network requests and captured payloads
 * 2. no page state at all
 * 3. no page state and no adapter source text
 * The last variant is written even if it should still exceed the budget.
 *
 * @param ctx - Context to serialize; it is not modified.
 */
export function emitDiagnostic(ctx: RepairContext): void {
  const marker = '___OPENCLI_DIAGNOSTIC___';
  const omitted = '[omitted: over size budget]';
  const overBudget = (serialized: string): boolean =>
    Buffer.byteLength(serialized, 'utf8') > MAX_DIAGNOSTIC_BYTES;

  let json = JSON.stringify(ctx);

  // Drop the bulky parts of the page state (largest section) first.
  if (overBudget(json) && ctx.page) {
    const trimmed = {
      ...ctx,
      page: {
        ...ctx.page,
        snapshot: omitted,
        networkRequests: [],
        capturedPayloads: [],
      },
    };
    json = JSON.stringify(trimmed);
  }
  // If still over budget, drop page entirely.
  if (overBudget(json)) {
    json = JSON.stringify({ ...ctx, page: undefined });
  }
  // Last resort: also drop the adapter source text (sourcePath is kept).
  if (overBudget(json)) {
    const minimal = {
      ...ctx,
      page: undefined,
      adapter: {
        ...ctx.adapter,
        source: ctx.adapter.source === undefined ? undefined : omitted,
      },
    };
    json = JSON.stringify(minimal);
  }

  process.stderr.write(`\n${marker}\n${json}\n${marker}\n`);
}