#!/usr/bin/env node
/**
 * run-agent-regression-suite.ts
 *
 * Command-line entry point that runs the agent regression suite, prints a JSON
 * summary to stdout and, when the agent record carries a string
 * `threadSessionId`, appends a Markdown report to that session.
 *
 * Flags:
 *   --agent <id>              Agent to test (falls back to "default").
 *   --modes <a,b,...>         Comma-separated approval modes: manual, auto, off.
 *   --scenarios <a,b,...>     Comma-separated scenario ids.
 *   --extension-mode <mode>   Either "agent" or "scenario".
 *   --fail-on-regression      Set exit code 1 when score < maxScore.
 *
 * Valued flags may be written as `--name value` or `--name=value`.
 */
import { runAgentRegressionSuite, type RegressionApprovalMode, type RegressionExtensionMode } from '../src/lib/server/eval/agent-regression'
import { appendSessionNote } from '../src/lib/server/session-note'
import { loadAgents } from '../src/lib/server/storage'

/** Maximum number of failed scenarios itemised in the session note. */
const MAX_REPORTED_FAILURES = 6

/**
 * Reads the value of a command-line flag from `process.argv`.
 *
 * Only the first occurrence of the flag is considered.
 *
 * @param name - Flag name including leading dashes, e.g. `--agent`.
 * @returns The flag's value, or `null` when the flag is absent or has no value.
 */
function readFlag(name: string): string | null {
  const inlinePrefix = `${name}=`
  for (const [index, arg] of process.argv.entries()) {
    if (arg.startsWith(inlinePrefix)) {
      // `--name=value` form; an empty value counts as missing.
      return arg.slice(inlinePrefix.length) || null
    }
    if (arg === name) {
      const next = process.argv[index + 1]
      // A following `--other` token is another flag, not this flag's value.
      if (next === undefined || next === '' || next.startsWith('--')) return null
      return next
    }
  }
  return null
}

/**
 * Reports whether a boolean flag is present on the command line.
 *
 * @param name - Flag name including leading dashes.
 */
function hasFlag(name: string): boolean {
  return process.argv.includes(name)
}

/** Splits a comma-separated string into trimmed, non-empty entries. */
function splitCommaList(raw: string): string[] {
  return raw
    .split(',')
    .map((value) => value.trim())
    .filter(Boolean)
}

/** Type guard for the approval modes this script accepts. */
function isApprovalMode(value: string): value is RegressionApprovalMode {
  return value === 'manual' || value === 'auto' || value === 'off'
}

/**
 * Parses the `--modes` value into a list of approval modes.
 *
 * Unrecognised entries are dropped with a warning on stderr.
 *
 * @param raw - Raw comma-separated flag value, or `null` when not supplied.
 * @returns The recognised modes, or `undefined` when none were given.
 */
function parseApprovalModes(raw: string | null): RegressionApprovalMode[] | undefined {
  if (!raw) return undefined
  const parsed = splitCommaList(raw)
  const valid = parsed.filter(isApprovalMode)
  const rejected = parsed.filter((value) => !isApprovalMode(value))
  if (rejected.length) {
    // stderr keeps the JSON payload on stdout machine-readable.
    console.warn(`Ignoring unknown approval mode(s): ${rejected.join(', ')}`)
  }
  return valid.length ? valid : undefined
}

/**
 * Parses a comma-separated flag value into a list of strings.
 *
 * @param raw - Raw flag value, or `null` when not supplied.
 * @returns The non-empty entries, or `undefined` when there are none.
 */
function parseList(raw: string | null): string[] | undefined {
  if (!raw) return undefined
  const parsed = splitCommaList(raw)
  return parsed.length ? parsed : undefined
}

/**
 * Parses the `--extension-mode` value.
 *
 * @param raw - Raw flag value, or `null` when not supplied.
 * @returns `'agent'` or `'scenario'`, or `undefined` for anything else
 *          (an unrecognised value is reported on stderr).
 */
function parseExtensionMode(raw: string | null): RegressionExtensionMode | undefined {
  if (!raw) return undefined
  if (raw === 'agent' || raw === 'scenario') return raw
  console.warn(`Ignoring unknown extension mode: ${raw}`)
  return undefined
}

/**
 * Runs the suite with options taken from the command line, prints the summary,
 * optionally appends a session note, and sets the exit code when
 * `--fail-on-regression` is given and the score is below the maximum.
 */
async function main(): Promise<void> {
  const result = await runAgentRegressionSuite({
    agentId: readFlag('--agent') || 'default',
    approvalModes: parseApprovalModes(readFlag('--modes')),
    scenarioIds: parseList(readFlag('--scenarios')),
    extensionMode: parseExtensionMode(readFlag('--extension-mode')),
  })

  const payload = {
    id: result.id,
    agentId: result.agentId,
    approvalModes: result.approvalModes,
    score: result.score,
    maxScore: result.maxScore,
    resultsPath: result.resultsPath,
    scenarios: result.scenarios.map((scenario) => ({
      scenarioId: scenario.scenarioId,
      approvalMode: scenario.approvalMode,
      extensionMode: scenario.extensionMode,
      status: scenario.status,
      score: scenario.score,
      maxScore: scenario.maxScore,
      missingExtensions: scenario.missingExtensions,
      failedAssertions: scenario.assertions.filter((assertion) => !assertion.passed).map((assertion) => assertion.name),
    })),
  }

  console.log(JSON.stringify(payload, null, 2))

  // NOTE(review): the double assertion assumes loadAgents() returns a map keyed by agent id — confirm against storage.
  const agent = (loadAgents() as unknown as Record<string, Record<string, unknown>>)[result.agentId]
  const threadSessionId = typeof agent?.threadSessionId === 'string' ? agent.threadSessionId : ''
  if (threadSessionId) {
    const failedScenarios = result.scenarios.filter((scenario) => scenario.status !== 'passed')
    const lines = [
      '## Live Test Report',
      '',
      `Regression suite for **${result.agentId}** completed.`,
      '',
      `- Suite ID: \`${result.id}\``,
      `- Score: **${result.score}/${result.maxScore}**`,
      `- Approval modes: ${result.approvalModes.join(', ')}`,
      `- Results: \`${result.resultsPath}\``,
    ]
    if (failedScenarios.length) {
      lines.push('', '### Attention', '')
      for (const scenario of failedScenarios.slice(0, MAX_REPORTED_FAILURES)) {
        const failures = scenario.assertions.filter((assertion) => !assertion.passed).map((assertion) => assertion.name)
        lines.push(`- **${scenario.scenarioId}** (${scenario.approvalMode}, ${scenario.status}): ${failures.join(', ') || 'Check results file for details'}`)
      }
      const omitted = failedScenarios.length - MAX_REPORTED_FAILURES
      if (omitted > 0) {
        // Make the truncation visible so the list is not mistaken for complete.
        lines.push(`- ...and ${omitted} more; check results file for details`)
      }
    }
    appendSessionNote({
      sessionId: threadSessionId,
      text: lines.join('\n'),
      role: 'assistant',
      kind: 'system',
    })
  }

  if (hasFlag('--fail-on-regression') && result.score < result.maxScore) {
    process.exitCode = 1
  }
}

main().catch((error: unknown) => {
  const message = error instanceof Error ? error.stack || error.message : String(error)
  console.error(message)
  process.exit(1)
})