/ scripts / run-agent-regression-suite.ts
run-agent-regression-suite.ts
  1  #!/usr/bin/env node
  2  import { runAgentRegressionSuite, type RegressionApprovalMode, type RegressionExtensionMode } from '../src/lib/server/eval/agent-regression'
  3  import { appendSessionNote } from '../src/lib/server/session-note'
  4  import { loadAgents } from '../src/lib/server/storage'
  5  
  6  function readFlag(name: string): string | null {
  7    const index = process.argv.indexOf(name)
  8    if (index === -1) return null
  9    return process.argv[index + 1] || null
 10  }
 11  
 12  function hasFlag(name: string): boolean {
 13    return process.argv.includes(name)
 14  }
 15  
 16  function parseApprovalModes(raw: string | null): RegressionApprovalMode[] | undefined {
 17    if (!raw) return undefined
 18    const parsed = raw.split(',').map((value) => value.trim()).filter(Boolean)
 19    const valid = parsed.filter((value): value is RegressionApprovalMode => value === 'manual' || value === 'auto' || value === 'off')
 20    return valid.length ? valid : undefined
 21  }
 22  
 23  function parseList(raw: string | null): string[] | undefined {
 24    if (!raw) return undefined
 25    const parsed = raw.split(',').map((value) => value.trim()).filter(Boolean)
 26    return parsed.length ? parsed : undefined
 27  }
 28  
 29  function parseExtensionMode(raw: string | null): RegressionExtensionMode | undefined {
 30    if (!raw) return undefined
 31    return raw === 'agent' ? 'agent' : raw === 'scenario' ? 'scenario' : undefined
 32  }
 33  
 34  async function main() {
 35    const result = await runAgentRegressionSuite({
 36      agentId: readFlag('--agent') || 'default',
 37      approvalModes: parseApprovalModes(readFlag('--modes')),
 38      scenarioIds: parseList(readFlag('--scenarios')),
 39      extensionMode: parseExtensionMode(readFlag('--extension-mode')),
 40    })
 41  
 42    const payload = {
 43      id: result.id,
 44      agentId: result.agentId,
 45      approvalModes: result.approvalModes,
 46      score: result.score,
 47      maxScore: result.maxScore,
 48      resultsPath: result.resultsPath,
 49      scenarios: result.scenarios.map((scenario) => ({
 50        scenarioId: scenario.scenarioId,
 51        approvalMode: scenario.approvalMode,
 52        extensionMode: scenario.extensionMode,
 53        status: scenario.status,
 54        score: scenario.score,
 55        maxScore: scenario.maxScore,
 56        missingExtensions: scenario.missingExtensions,
 57        failedAssertions: scenario.assertions.filter((assertion) => !assertion.passed).map((assertion) => assertion.name),
 58      })),
 59    }
 60  
 61    console.log(JSON.stringify(payload, null, 2))
 62  
 63    const agent = (loadAgents() as unknown as Record<string, Record<string, unknown>>)[result.agentId]
 64    const threadSessionId = typeof agent?.threadSessionId === 'string' ? agent.threadSessionId : ''
 65    if (threadSessionId) {
 66      const failedScenarios = result.scenarios.filter((scenario) => scenario.status !== 'passed')
 67      const lines = [
 68        '## Live Test Report',
 69        '',
 70        `Regression suite for **${result.agentId}** completed.`,
 71        '',
 72        `- Suite ID: \`${result.id}\``,
 73        `- Score: **${result.score}/${result.maxScore}**`,
 74        `- Approval modes: ${result.approvalModes.join(', ')}`,
 75        `- Results: \`${result.resultsPath}\``,
 76      ]
 77      if (failedScenarios.length) {
 78        lines.push('', '### Attention', '')
 79        for (const scenario of failedScenarios.slice(0, 6)) {
 80          const failures = scenario.assertions.filter((assertion) => !assertion.passed).map((assertion) => assertion.name)
 81          lines.push(`- **${scenario.scenarioId}** (${scenario.approvalMode}, ${scenario.status}): ${failures.join(', ') || 'Check results file for details'}`)
 82        }
 83      }
 84      appendSessionNote({
 85        sessionId: threadSessionId,
 86        text: lines.join('\n'),
 87        role: 'assistant',
 88        kind: 'system',
 89      })
 90    }
 91  
 92    if (hasFlag('--fail-on-regression') && result.score < result.maxScore) {
 93      process.exitCode = 1
 94    }
 95  }
 96  
 97  main().catch((error: unknown) => {
 98    const message = error instanceof Error ? error.stack || error.message : String(error)
 99    console.error(message)
100    process.exit(1)
101  })