benchmark-autonomy-harness.mjs
1 #!/usr/bin/env node 2 3 import fs from 'node:fs' 4 import path from 'node:path' 5 import Database from 'better-sqlite3' 6 7 const DEFAULT_BASE_URL = process.env.SWARMCLAW_URL || 'http://localhost:3456' 8 const DEFAULT_OUT_DIR = path.join(process.cwd(), 'data', 'autonomy-benchmarks') 9 const DEFAULT_MIN_SCORE = Number.parseFloat(process.env.AUTONOMY_BENCH_MIN_SCORE || '70') 10 const DEFAULT_PROBE_PROFILE = String(process.env.AUTONOMY_BENCH_PROFILE || 'full').trim() || 'full' 11 12 function supportsChildWrites(dir) { 13 try { 14 fs.mkdirSync(dir, { recursive: true }) 15 const probeDir = fs.mkdtempSync(path.join(dir, '.autonomy-bench-probe-')) 16 fs.rmSync(probeDir, { recursive: true, force: true }) 17 return true 18 } catch { 19 return false 20 } 21 } 22 23 function resolveWorkspaceRoot() { 24 if (process.env.WORKSPACE_DIR) return process.env.WORKSPACE_DIR 25 const external = path.join(process.env.HOME || '', '.swarmclaw', 'workspace') 26 if (external && supportsChildWrites(external)) return external 27 return path.join(process.cwd(), 'data', 'workspace') 28 } 29 30 const WORKSPACE_ROOT = resolveWorkspaceRoot() 31 32 const TOOL_ALIAS_GROUPS = [ 33 ['shell', 'execute_command', 'process_tool', 'process'], 34 ['files', 'read_file', 'write_file', 'list_files', 'copy_file', 'move_file', 'delete_file', 'send_file'], 35 ['edit_file'], 36 ['web', 'web_search', 'web_fetch'], 37 ['browser', 'openclaw_browser'], 38 ['delegate', 'claude_code', 'codex_cli', 'opencode_cli', 'gemini_cli', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli'], 39 ['manage_platform', 'manage_agents', 'manage_projects', 'manage_tasks', 'manage_schedules', 'manage_skills', 'manage_documents', 'manage_webhooks', 'manage_secrets', 'manage_sessions'], 40 ['manage_connectors', 'connectors', 'connector_message_tool'], 41 ['manage_chatrooms', 'chatroom'], 42 ['spawn_subagent', 'subagent', 'delegate_to_agent'], 43 ['manage_sessions', 'session_info', 
'sessions_tool', 'whoami_tool', 'search_history_tool'], 44 ['schedule', 'schedule_wake'], 45 ['http', 'http_request'], 46 ['memory', 'memory_tool'], 47 ['execute', 'sandbox'], 48 ['wallet', 'wallet_tool'], 49 ['monitor', 'monitor_tool'], 50 ['sample_ui', 'show_extension_card'], 51 ['context_mgmt', 'context_status', 'context_summarize'], 52 ['openclaw_workspace'], 53 ['openclaw_nodes'], 54 ['image_gen', 'generate_image'], 55 ['email', 'send_email'], 56 ['calendar', 'calendar_events'], 57 ['replicate', 'replicate_run', 'replicate_models'], 58 ] 59 60 const TOOL_CANONICAL_MAP = (() => { 61 const map = new Map() 62 for (const group of TOOL_ALIAS_GROUPS) { 63 const normalized = group.map((entry) => String(entry || '').trim().toLowerCase()).filter(Boolean) 64 const canonical = normalized[0] 65 if (!canonical) continue 66 for (const entry of normalized) map.set(entry, canonical) 67 } 68 return map 69 })() 70 71 const PROBE_BASE_TOOLS = [ 72 'shell', 73 'execute', 74 'process', 75 'files', 76 'edit_file', 77 'web', 78 'manage_connectors', 79 'manage_sessions', 80 'memory', 81 'browser', 82 'delegate', 83 'claude_code', 84 'codex_cli', 85 'opencode_cli', 86 ] 87 88 const PROJECT_OPERATION_TOOLS = [ 89 'manage_projects', 90 'manage_schedules', 91 'manage_secrets', 92 ] 93 94 const PROBE_TOOL_PROFILES = { 95 full: [...PROBE_BASE_TOOLS, 'manage_tasks'], 96 no_task_management: [...PROBE_BASE_TOOLS], 97 full_project_context: [...PROBE_BASE_TOOLS, 'manage_tasks', ...PROJECT_OPERATION_TOOLS], 98 project_context_only: [...PROBE_BASE_TOOLS, ...PROJECT_OPERATION_TOOLS], 99 } 100 101 const OPENCLAW_SCENARIOS = [ 102 { 103 id: 'openclaw_companion', 104 prompt: 'Briefly introduce yourself and tell me one concrete way you can help me right now.', 105 timeoutMs: 120_000, 106 }, 107 { 108 id: 'openclaw_action_request', 109 prompt: 'Create a short 3-step plan to research and build a simple app with me, then execute step 1.', 110 timeoutMs: 180_000, 111 }, 112 ] 113 114 function usage() { 
115 console.log([ 116 'Usage: node scripts/benchmark-autonomy-harness.mjs [options]', 117 '', 118 'Local-only benchmark for SwarmClaw autonomy harness.', 119 'This benchmark is intended to be run manually pre-release, not in CI.', 120 '', 121 'Options:', 122 ' --base-url <url> SwarmClaw base URL (default: http://localhost:3456)', 123 ' --access-key <key> Access key (fallback: SWARMCLAW_ACCESS_KEY, then .env.local ACCESS_KEY)', 124 ' --out-dir <dir> Output directory for benchmark reports', 125 ' --min-score <0-100> Exit non-zero when score is below this threshold (default: 70)', 126 ' --profile <name> Probe tool profile: full | no-task-management | full-project-context | project-context-only (default: full)', 127 ' --session-scenarios <ids> Comma-separated session scenario IDs to run', 128 ' --skip-chatrooms Skip chatroom collaboration scenarios', 129 ' --no-openclaw Skip optional OpenClaw comparison probe', 130 ' --keep-created Keep created benchmark agent/session/chatrooms for inspection', 131 ' --help Show this help', 132 ].join('\n')) 133 } 134 135 function parseArgs(argv) { 136 const options = { 137 baseUrl: DEFAULT_BASE_URL, 138 accessKey: '', 139 outDir: DEFAULT_OUT_DIR, 140 minScore: Number.isFinite(DEFAULT_MIN_SCORE) ? 
DEFAULT_MIN_SCORE : 70, 141 profile: DEFAULT_PROBE_PROFILE, 142 sessionScenarios: [], 143 skipChatrooms: false, 144 includeOpenclaw: true, 145 keepCreated: false, 146 } 147 148 for (let i = 0; i < argv.length; i++) { 149 const arg = argv[i] 150 if (arg === '--help') { 151 usage() 152 process.exit(0) 153 } 154 if (arg === '--base-url') { 155 options.baseUrl = String(argv[++i] || '').trim() 156 continue 157 } 158 if (arg === '--access-key') { 159 options.accessKey = String(argv[++i] || '').trim() 160 continue 161 } 162 if (arg === '--out-dir') { 163 options.outDir = String(argv[++i] || '').trim() 164 continue 165 } 166 if (arg === '--min-score') { 167 const value = Number.parseFloat(String(argv[++i] || '')) 168 if (!Number.isFinite(value) || value < 0 || value > 100) { 169 throw new Error('--min-score must be a number between 0 and 100') 170 } 171 options.minScore = value 172 continue 173 } 174 if (arg === '--profile') { 175 options.profile = String(argv[++i] || '').trim() 176 continue 177 } 178 if (arg === '--session-scenarios') { 179 options.sessionScenarios = String(argv[++i] || '') 180 .split(',') 181 .map((value) => value.trim()) 182 .filter(Boolean) 183 continue 184 } 185 if (arg === '--skip-chatrooms') { 186 options.skipChatrooms = true 187 continue 188 } 189 if (arg === '--no-openclaw') { 190 options.includeOpenclaw = false 191 continue 192 } 193 if (arg === '--keep-created') { 194 options.keepCreated = true 195 continue 196 } 197 throw new Error(`Unknown argument: ${arg}`) 198 } 199 200 return options 201 } 202 203 function normalizeProbeProfileName(value) { 204 return String(value || '') 205 .trim() 206 .toLowerCase() 207 .replace(/[^a-z0-9]+/g, '_') 208 .replace(/^_+|_+$/g, '') 209 } 210 211 function resolveProbeProfile(value) { 212 const normalized = normalizeProbeProfileName(value) || 'full' 213 if (normalized === 'full') { 214 return { 215 id: 'full', 216 label: 'Full tool profile', 217 tools: [...PROBE_TOOL_PROFILES.full], 218 hasTaskManagement: true, 
219 hasProjectContext: false, 220 hasProjectTool: false, 221 hasProjectOperations: false, 222 } 223 } 224 if (normalized === 'no_task_management' || normalized === 'taskless') { 225 return { 226 id: 'no_task_management', 227 label: 'No task management tool', 228 tools: [...PROBE_TOOL_PROFILES.no_task_management], 229 hasTaskManagement: false, 230 hasProjectContext: false, 231 hasProjectTool: false, 232 hasProjectOperations: false, 233 } 234 } 235 if (normalized === 'full_project_context' || normalized === 'project_context' || normalized === 'task_and_project_context') { 236 return { 237 id: 'full_project_context', 238 label: 'Task management with active project context', 239 tools: [...PROBE_TOOL_PROFILES.full_project_context], 240 hasTaskManagement: true, 241 hasProjectContext: true, 242 hasProjectTool: true, 243 hasProjectOperations: true, 244 } 245 } 246 if (normalized === 'project_context_only' || normalized === 'project_only' || normalized === 'taskless_project_context') { 247 return { 248 id: 'project_context_only', 249 label: 'Active project context without task management', 250 tools: [...PROBE_TOOL_PROFILES.project_context_only], 251 hasTaskManagement: false, 252 hasProjectContext: true, 253 hasProjectTool: true, 254 hasProjectOperations: true, 255 } 256 } 257 throw new Error(`Unknown --profile value "${value}". Valid values: full, no-task-management, full-project-context, project-context-only`) 258 } 259 260 function loadAccessKey(explicitKey) { 261 if (explicitKey) return explicitKey 262 if (process.env.SWARMCLAW_ACCESS_KEY) return process.env.SWARMCLAW_ACCESS_KEY 263 const envPath = path.join(process.cwd(), '.env.local') 264 if (!fs.existsSync(envPath)) { 265 throw new Error('Access key missing. 
Pass --access-key or set SWARMCLAW_ACCESS_KEY/.env.local ACCESS_KEY') 266 } 267 const raw = fs.readFileSync(envPath, 'utf8') 268 const line = raw.split('\n').find((entry) => entry.startsWith('ACCESS_KEY=')) 269 if (!line) { 270 throw new Error('ACCESS_KEY not found in .env.local') 271 } 272 const key = line.slice('ACCESS_KEY='.length).trim() 273 if (!key) { 274 throw new Error('ACCESS_KEY is empty in .env.local') 275 } 276 return key 277 } 278 279 function toSlug(value) { 280 return value 281 .toLowerCase() 282 .replace(/[^a-z0-9]+/g, '-') 283 .replace(/^-+|-+$/g, '') 284 .slice(0, 40) 285 } 286 287 function nowSlug() { 288 return new Date().toISOString().replace(/[^\d]/g, '').slice(0, 14) 289 } 290 291 function summarize(text, max = 220) { 292 const compact = String(text || '').replace(/\s+/g, ' ').trim() 293 return compact.length > max ? `${compact.slice(0, max - 3)}...` : compact 294 } 295 296 function stripRunNoise(text) { 297 let cleaned = String(text || '').trim() 298 // Session SSE can prefix multiple {"run":...} status envelopes before the actual assistant text. 299 while (cleaned.startsWith('{"run":')) { 300 const end = cleaned.indexOf('}}') 301 if (end === -1) break 302 cleaned = cleaned.slice(end + 2).trim() 303 } 304 return cleaned 305 } 306 307 function sleep(ms) { 308 return new Promise((resolve) => setTimeout(resolve, ms)) 309 } 310 311 function round1(value) { 312 return Math.round(value * 10) / 10 313 } 314 315 function normalizeToolName(value) { 316 return typeof value === 'string' ? 
value.trim().toLowerCase() : '' 317 } 318 319 function canonicalizeToolName(value) { 320 const normalized = normalizeToolName(value) 321 if (!normalized) return '' 322 return TOOL_CANONICAL_MAP.get(normalized) || normalized 323 } 324 325 function canonicalizeToolList(values) { 326 if (!Array.isArray(values)) return [] 327 return [...new Set(values.map((value) => canonicalizeToolName(value)).filter(Boolean))] 328 } 329 330 function getAgentTools(agent) { 331 if (Array.isArray(agent?.extensions) && agent.extensions.length > 0) return agent.extensions 332 if (Array.isArray(agent?.tools) && agent.tools.length > 0) return agent.tools 333 return [] 334 } 335 336 function ensureDir(dir) { 337 fs.mkdirSync(dir, { recursive: true }) 338 } 339 340 function writeTextFile(filePath, content) { 341 ensureDir(path.dirname(filePath)) 342 fs.writeFileSync(filePath, content) 343 } 344 345 function extractFirstId(text) { 346 const match = String(text || '').match(/\b([a-f0-9]{8})\b/i) 347 return match ? match[1] : null 348 } 349 350 function extractUploadUrls(text) { 351 const matches = String(text || '').match(/\/api\/uploads\/[^\s)"'`]+/g) || [] 352 return [...new Set(matches)] 353 } 354 355 function gradeForScore(score) { 356 if (score >= 90) return 'A' 357 if (score >= 80) return 'B' 358 if (score >= 70) return 'C' 359 if (score >= 60) return 'D' 360 return 'F' 361 } 362 363 async function fetchJson(client, method, route, body, timeoutMs = 25_000) { 364 const controller = new AbortController() 365 const timer = setTimeout(() => controller.abort(), timeoutMs) 366 try { 367 const res = await fetch(`${client.baseUrl}${route}`, { 368 method, 369 headers: { 370 'content-type': 'application/json', 371 'x-access-key': client.accessKey, 372 }, 373 body: body === undefined ? 
undefined : JSON.stringify(body), 374 signal: controller.signal, 375 }) 376 const text = await res.text() 377 if (!res.ok) { 378 throw new Error(`${method} ${route} failed (${res.status}): ${summarize(text, 280)}`) 379 } 380 if (!text) return null 381 try { 382 return JSON.parse(text) 383 } catch { 384 return text 385 } 386 } finally { 387 clearTimeout(timer) 388 } 389 } 390 391 async function postSse(client, route, body, timeoutMs = 180_000) { 392 const controller = new AbortController() 393 const timer = setTimeout(() => controller.abort(), timeoutMs) 394 const startedAt = Date.now() 395 try { 396 const res = await fetch(`${client.baseUrl}${route}`, { 397 method: 'POST', 398 headers: { 399 'content-type': 'application/json', 400 'x-access-key': client.accessKey, 401 }, 402 body: JSON.stringify(body), 403 signal: controller.signal, 404 }) 405 406 if (!res.ok || !res.body) { 407 const text = await res.text().catch(() => '') 408 throw new Error(`POST ${route} failed (${res.status}): ${summarize(text, 280)}`) 409 } 410 411 const decoder = new TextDecoder() 412 const reader = res.body.getReader() 413 let buffer = '' 414 let textAcc = '' 415 let replacedText = '' 416 const events = [] 417 418 while (true) { 419 const { done, value } = await reader.read() 420 if (done) break 421 buffer += decoder.decode(value, { stream: true }) 422 let idx = buffer.indexOf('\n\n') 423 while (idx !== -1) { 424 const chunk = buffer.slice(0, idx) 425 buffer = buffer.slice(idx + 2) 426 const line = chunk 427 .split('\n') 428 .map((entry) => entry.trim()) 429 .find((entry) => entry.startsWith('data: ')) 430 if (line) { 431 try { 432 const ev = JSON.parse(line.slice(6)) 433 events.push(ev) 434 if ((ev?.t === 'd' || ev?.t === 'md') && typeof ev.text === 'string') textAcc += ev.text 435 if (ev?.t === 'r' && typeof ev.text === 'string') replacedText = ev.text 436 } catch { 437 // ignore malformed event chunk 438 } 439 } 440 idx = buffer.indexOf('\n\n') 441 } 442 } 443 444 return { 445 events, 
446 durationMs: Date.now() - startedAt, 447 text: replacedText || textAcc, 448 } 449 } finally { 450 clearTimeout(timer) 451 } 452 } 453 454 function setChatroomHarnessFlags(chatroomId, { chatMode, autoAddress }) { 455 const dbPath = path.join(process.cwd(), 'data', 'swarmclaw.db') 456 if (!fs.existsSync(dbPath)) return false 457 let db = null 458 try { 459 db = new Database(dbPath) 460 const row = db.prepare('SELECT data FROM chatrooms WHERE id = ?').get(chatroomId) 461 if (!row?.data) return false 462 const parsed = JSON.parse(row.data) 463 parsed.chatMode = chatMode 464 parsed.autoAddress = autoAddress 465 parsed.updatedAt = Date.now() 466 db.prepare('UPDATE chatrooms SET data = ? WHERE id = ?').run(JSON.stringify(parsed), chatroomId) 467 return true 468 } catch { 469 return false 470 } finally { 471 if (db) db.close() 472 } 473 } 474 475 function collectToolStats(events) { 476 const toolCalls = events 477 .filter((event) => event?.t === 'tool_call') 478 .map((event) => String(event.toolName || 'unknown')) 479 480 const toolErrors = events 481 .filter((event) => event?.t === 'tool_result') 482 .map((event) => event?.toolOutput) 483 .filter((value) => typeof value === 'string' && /^Error:/i.test(value.trim())) 484 .map((value) => summarize(value, 180)) 485 486 const streamErrors = events 487 .filter((event) => event?.t === 'err') 488 .map((event) => summarize(event.text || 'unknown error', 180)) 489 490 return { toolCalls, toolErrors, streamErrors } 491 } 492 493 function containsEmpathy(text) { 494 return /\b(i hear you|i understand|that sounds hard|you are not alone|i am here|with you|sorry you|overwhelmed)\b/i.test(String(text || '')) 495 } 496 497 function containsActionableStep(text) { 498 return /\b(next step|first step|we can|let's|right now|today|do this)\b/i.test(String(text || '')) 499 } 500 501 function isoToLongDate(isoDate) { 502 const stamp = new Date(`${isoDate}T00:00:00Z`) 503 if (Number.isNaN(stamp.getTime())) return isoDate 504 return 
stamp.toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric', timeZone: 'UTC' }) 505 } 506 507 function mentionsDate(text, isoDate) { 508 const source = String(text || '').toLowerCase() 509 const iso = String(isoDate || '').toLowerCase() 510 const longForm = isoToLongDate(isoDate).toLowerCase() 511 return source.includes(iso) || source.includes(longForm) 512 } 513 514 function listBenchmarkTasksByTitle(tasks, titleIncludes) { 515 return Object.values(tasks || {}).filter((row) => 516 row 517 && typeof row === 'object' 518 && String(row.title || '').includes(titleIncludes), 519 ) 520 } 521 522 async function waitForBenchmarkTasks(client, titleIncludes, predicate, timeoutMs = 90_000) { 523 const startedAt = Date.now() 524 let lastMatching = [] 525 while (Date.now() - startedAt < timeoutMs) { 526 const tasks = await fetchJson(client, 'GET', '/api/tasks') 527 lastMatching = listBenchmarkTasksByTitle(tasks, titleIncludes) 528 if (predicate(lastMatching)) return lastMatching 529 await sleep(1500) 530 } 531 return lastMatching 532 } 533 534 function countMemoriesContaining(needle) { 535 const dbPath = path.join(process.cwd(), 'data', 'memory.db') 536 if (!fs.existsSync(dbPath)) return 0 537 let db = null 538 try { 539 db = new Database(dbPath, { readonly: true }) 540 const like = `%${needle}%` 541 const row = db 542 .prepare('SELECT COUNT(*) AS count FROM memories WHERE content LIKE ? 
OR title LIKE ?') 543 .get(like, like) 544 return Number(row?.count || 0) 545 } catch { 546 return 0 547 } finally { 548 if (db) db.close() 549 } 550 } 551 552 async function prepareWorkspaceFixture(client, runTag, profile, createdIds) { 553 const metadata = { 554 projectName: 'HarborPilot Dispatch', 555 objective: 'Reduce incident handoff chaos and prove a lightweight operator workflow that can expand into inbox-driven ops automation.', 556 targetUser: 'marina operations managers', 557 pilotPriorities: ['SMS outage handling', 'dock reassignment'], 558 openObjectives: [ 559 'publish a triage research brief', 560 'prepare credential bootstrap for ops inbox workflows', 561 ], 562 capabilityHints: [ 563 'research', 564 'build', 565 'web browsing', 566 'credential bootstrapping', 567 'goal tracking', 568 ], 569 successMetrics: [ 570 'publish a handoff summary within 5 minutes of an incident update', 571 'prepare one reusable operator playbook each pilot week', 572 ], 573 credentialRequirements: [ 574 'mockmail app password for operator inbox automation', 575 'harbor metrics api token for pilot reporting', 576 ], 577 heartbeatPrompt: 'Review active pilot risks, inbox blockers, and the next operator action.', 578 heartbeatIntervalSec: 1800, 579 projectDescription: [ 580 'HarborPilot Dispatch is a B2B dock-operations workspace for marina operations managers.', 581 'The first pilot is focused on SMS outage handling and dock reassignment during busy charter turnover.', 582 ].join(' '), 583 color: '#0f766e', 584 } 585 586 let project = null 587 let workspaceRoot = '' 588 if (profile.hasProjectContext) { 589 project = await fetchJson(client, 'POST', '/api/projects', { 590 name: metadata.projectName, 591 description: metadata.projectDescription, 592 color: metadata.color, 593 objective: metadata.objective, 594 audience: metadata.targetUser, 595 priorities: metadata.pilotPriorities, 596 openObjectives: metadata.openObjectives, 597 capabilityHints: metadata.capabilityHints, 598 
successMetrics: metadata.successMetrics, 599 credentialRequirements: metadata.credentialRequirements, 600 heartbeatPrompt: metadata.heartbeatPrompt, 601 heartbeatIntervalSec: metadata.heartbeatIntervalSec, 602 }) 603 createdIds.projects.push(project.id) 604 workspaceRoot = path.join(WORKSPACE_ROOT, 'projects', project.id) 605 } else { 606 workspaceRoot = path.join(WORKSPACE_ROOT, `autonomy-benchmark-${runTag}-${profile.id}`) 607 } 608 609 createdIds.workspaces.push(workspaceRoot) 610 ensureDir(workspaceRoot) 611 612 writeTextFile(path.join(workspaceRoot, 'README.md'), [ 613 '# Workspace Seed', 614 '', 615 'This workspace intentionally starts with partial product notes.', 616 'Inspect the files, create concrete artifacts, and prefer moving the work forward over talking about it.', 617 ].join('\n')) 618 619 writeTextFile(path.join(workspaceRoot, 'docs', 'problem-notes.md'), [ 620 '# Problem Notes', 621 '', 622 '- Operators lose time during incident escalation and shift handoff.', 623 '- Internal checklists are inconsistent, especially under time pressure.', 624 '- The first deliverables should stay narrow and execution-oriented.', 625 ].join('\n')) 626 627 writeTextFile(path.join(workspaceRoot, 'docs', 'constraints.md'), [ 628 '# Constraints', 629 '', 630 '- Keep deliverables lightweight and testable.', 631 '- Bias toward artifacts that can be reused in a kickoff or pilot review.', 632 '- Assume the team needs clear assumptions and explicit risks, not generic strategy language.', 633 ].join('\n')) 634 635 writeTextFile(path.join(workspaceRoot, 'docs', 'interview-snippets.md'), [ 636 '# Interview Snippets', 637 '', 638 '> We keep losing context during handoff, and then incidents drag on longer than they should.', 639 '> If a tool helps us recover faster, I care more about clarity than fancy dashboards.', 640 ].join('\n')) 641 642 return { 643 workspaceRoot, 644 project, 645 projectName: metadata.projectName, 646 targetUser: metadata.targetUser, 647 pilotPriorities: 
[...metadata.pilotPriorities], 648 openObjectives: [...metadata.openObjectives], 649 capabilityHints: [...metadata.capabilityHints], 650 successMetrics: [...metadata.successMetrics], 651 credentialRequirements: [...metadata.credentialRequirements], 652 heartbeatPrompt: metadata.heartbeatPrompt, 653 heartbeatIntervalSec: metadata.heartbeatIntervalSec, 654 objective: metadata.objective, 655 projectDescription: metadata.projectDescription, 656 paths: { 657 moneyPlanPath: 'plans/money-plan.md', 658 backlogPath: 'plans/project-backlog.md', 659 researchBriefPath: 'docs/research-brief.md', 660 launchDraftPath: 'docs/launch-brief-draft.md', 661 launchCritiquePath: 'docs/launch-brief-critique.md', 662 launchFinalPath: 'docs/launch-brief-final.md', 663 inboxOpsPath: 'ops/inbox-ops-playbook.md', 664 marketWatchPath: 'plans/risk-bounded-market-watch.md', 665 }, 666 } 667 } 668 669 function buildSessionScenarios(runTag, delegateAgentId, profile, fixture) { 670 const moneyTaskPrefix = `[Autonomy Probe ${runTag}] money-` 671 const deliveryTaskPrefix = `[Autonomy Probe ${runTag}] harbor-` 672 const inboxTaskTitle = `[Autonomy Probe ${runTag}] inbox-triage-playbook` 673 const marketTaskPrefix = `[Autonomy Probe ${runTag}] market-` 674 const resumeTaskPrefix = `[Autonomy Probe ${runTag}] resume-` 675 const resumeSourceTaskTitle = `${resumeTaskPrefix}source` 676 const resumeFollowupTaskTitle = `${resumeTaskPrefix}followup` 677 const birthday = '2031-04-17' 678 const anniversary = '2031-10-02' 679 const recurringBug = `ws reconnect loop ${runTag}` 680 const moneyPlanPath = fixture.paths.moneyPlanPath 681 const backlogPath = fixture.paths.backlogPath 682 const researchBriefPath = fixture.paths.researchBriefPath 683 const launchDraftPath = fixture.paths.launchDraftPath 684 const launchCritiquePath = fixture.paths.launchCritiquePath 685 const launchFinalPath = fixture.paths.launchFinalPath 686 const inboxOpsPath = fixture.paths.inboxOpsPath 687 const marketWatchPath = 
fixture.paths.marketWatchPath 688 const resumeSourcePath = 'docs/task-continuation-source.md' 689 const resumeFollowupPath = 'docs/task-continuation-followup.md' 690 const hasTaskManagement = Boolean(profile?.hasTaskManagement) 691 const hasProjectContext = Boolean(profile?.hasProjectContext) 692 const hasProjectTool = Boolean(profile?.hasProjectTool) 693 const hasProjectOperations = Boolean(profile?.hasProjectOperations) 694 const projectOpsPath = 'plans/project-ops-brief.md' 695 const credentialPlanPath = 'ops/credential-bootstrap.md' 696 const heartbeatPlanPath = 'ops/project-heartbeat.md' 697 const activeProjectId = fixture.project?.id || null 698 return [ 699 { 700 id: 'task_continuation_resume', 701 skill: 'task_followup_continuation', 702 weight: 8, 703 timeoutMs: 220_000, 704 requiresTool: true, 705 expectedTools: hasTaskManagement ? ['manage_tasks'] : ['files'], 706 prompt: hasTaskManagement 707 ? [ 708 'Set up a two-step continuation workflow using task management.', 709 `Create exactly two queued tasks titled "${resumeSourceTaskTitle}" and "${resumeFollowupTaskTitle}".`, 710 `Assign both tasks to agent "${delegateAgentId}".`, 711 hasProjectContext ? 'If an active project exists, let the active project be used by default for both tasks.' 
: 'If no active project exists, do not fabricate a project link.', 712 `The "${resumeSourceTaskTitle}" task should create "${resumeSourcePath}" with sections "Context" and "Next Step".`, 713 `The "${resumeFollowupTaskTitle}" task should create "${resumeFollowupPath}" with sections "Continuation" and "Inherited Context", and it must mention "${resumeSourcePath}" inside the file.`, 714 `Use "continueFromTaskId" on "${resumeFollowupTaskTitle}" so it follows "${resumeSourceTaskTitle}" and reuses the earlier task context when possible.`, 715 'Confirm both task ids.', 716 ].join(' ') 717 : [ 718 'Task management is unavailable in this session.', 719 `Write "${resumeSourcePath}" with sections "Context" and "Next Step".`, 720 `Then write "${resumeFollowupPath}" with sections "Continuation" and "Inherited Context", and mention "${resumeSourcePath}" inside the file.`, 721 'Confirm both file paths.', 722 ].join(' '), 723 semanticCheck: hasTaskManagement 724 ? (result) => extractFirstId(result.response) !== null && /\btask\b/i.test(result.response) 725 : (result) => result.response.includes(resumeSourcePath) && result.response.includes(resumeFollowupPath), 726 externalCheckWeight: 0.35, 727 postRunCheck: hasTaskManagement 728 ? async ({ client }) => { 729 const matching = await waitForBenchmarkTasks( 730 client, 731 resumeTaskPrefix, 732 (rows) => rows.length >= 2 && rows.every((row) => ['completed', 'failed'].includes(String(row.status || ''))), 733 120_000, 734 ) 735 const sourceTask = matching.find((row) => String(row?.title || '') === resumeSourceTaskTitle) || null 736 const followupTask = matching.find((row) => String(row?.title || '') === resumeFollowupTaskTitle) || null 737 const sourceAbs = path.join(fixture.workspaceRoot, resumeSourcePath) 738 const followupAbs = path.join(fixture.workspaceRoot, resumeFollowupPath) 739 const followupText = fs.existsSync(followupAbs) ? 
fs.readFileSync(followupAbs, 'utf8') : '' 740 const sameSession = Boolean(sourceTask?.sessionId && followupTask?.sessionId && sourceTask.sessionId === followupTask.sessionId) 741 const reusedPriorSession = /reusing prior session/i.test(String(followupTask?.checkpoint?.note || '')) 742 const inheritedContinuationContext = sameSession || reusedPriorSession 743 const projectLinked = !hasProjectContext || !activeProjectId 744 ? true 745 : sourceTask?.projectId === activeProjectId && followupTask?.projectId === activeProjectId 746 return { 747 name: 'task_continuation_workflow_completed', 748 passed: sourceTask?.status === 'completed' 749 && followupTask?.status === 'completed' 750 && Array.isArray(followupTask?.blockedBy) 751 && followupTask.blockedBy.includes(sourceTask.id) 752 && inheritedContinuationContext 753 && fs.existsSync(sourceAbs) 754 && fs.existsSync(followupAbs) 755 && followupText.includes(resumeSourcePath) 756 && projectLinked, 757 details: { 758 sourceTaskId: sourceTask?.id || null, 759 followupTaskId: followupTask?.id || null, 760 sourceStatus: sourceTask?.status || null, 761 followupStatus: followupTask?.status || null, 762 sameSession, 763 reusedPriorSession, 764 inheritedContinuationContext, 765 projectLinked, 766 sourceExists: fs.existsSync(sourceAbs), 767 followupExists: fs.existsSync(followupAbs), 768 }, 769 } 770 } 771 : async () => { 772 const sourceAbs = path.join(fixture.workspaceRoot, resumeSourcePath) 773 const followupAbs = path.join(fixture.workspaceRoot, resumeFollowupPath) 774 const followupText = fs.existsSync(followupAbs) ? 
fs.readFileSync(followupAbs, 'utf8') : '' 775 return { 776 name: 'continuation_files_written', 777 passed: fs.existsSync(sourceAbs) 778 && fs.existsSync(followupAbs) 779 && followupText.includes(resumeSourcePath), 780 details: { 781 sourceExists: fs.existsSync(sourceAbs), 782 followupExists: fs.existsSync(followupAbs), 783 }, 784 } 785 }, 786 }, 787 { 788 id: 'money_goal_workflow', 789 skill: 'autonomous_goal_execution', 790 weight: 8, 791 timeoutMs: 220_000, 792 requiresTool: true, 793 expectedTools: hasTaskManagement ? ['web', 'manage_tasks', 'files'] : ['web', 'files'], 794 prompt: hasTaskManagement 795 ? [ 796 'A user says: "I want you to help me make money legally."', 797 'Do one concrete research step.', 798 `Create exactly two backlog tasks (not queued) titled "${moneyTaskPrefix}research" and "${moneyTaskPrefix}mvp" using the task-management capability.`, 799 `Assign both tasks to agent "${delegateAgentId}" instead of yourself.`, 800 hasProjectContext ? 'If an active project exists, link both tasks to it.' : 'If no active project exists, do not invent one.', 801 `Write a short plan to "${moneyPlanPath}", and then confirm task IDs and file path.`, 802 ].join(' ') 803 : [ 804 'A user says: "I want you to help me make money legally."', 805 'Do one concrete research step.', 806 `Task management is intentionally unavailable in this session, so do not claim to create tasks.`, 807 `Instead, write a short plan to "${moneyPlanPath}" with a "Backlog" section that contains exactly two bullet items titled "${moneyTaskPrefix}research" and "${moneyTaskPrefix}mvp".`, 808 'Then confirm the file path and the two backlog item titles.', 809 ].join(' '), 810 semanticCheck: hasTaskManagement 811 ? 
(result) => 812 /money-plan\.md/i.test(result.response) && 813 extractFirstId(result.response) !== null && 814 /\b(task|backlog)\b/i.test(result.response) 815 : (result) => 816 /money-plan\.md/i.test(result.response) && 817 result.response.includes(`${moneyTaskPrefix}research`) && 818 result.response.includes(`${moneyTaskPrefix}mvp`), 819 externalCheckWeight: hasTaskManagement ? 0.2 : 0.2, 820 postRunCheck: hasTaskManagement 821 ? async ({ client }) => { 822 const tasks = await fetchJson(client, 'GET', '/api/tasks') 823 const matching = listBenchmarkTasksByTitle(tasks, moneyTaskPrefix) 824 const projectLinkedCount = hasProjectContext && fixture.project?.id 825 ? matching.filter((row) => row.projectId === fixture.project.id).length 826 : null 827 return { 828 name: 'money_tasks_created', 829 passed: matching.length >= 2 && (!hasProjectContext || projectLinkedCount >= 2), 830 details: { 831 taskCount: matching.length, 832 expectedAtLeast: 2, 833 projectLinkedCount, 834 expectedProjectId: fixture.project?.id || null, 835 }, 836 } 837 } 838 : async () => { 839 const planPath = path.join(fixture.workspaceRoot, moneyPlanPath) 840 const planText = fs.existsSync(planPath) ? fs.readFileSync(planPath, 'utf8') : '' 841 return { 842 name: 'money_plan_backlog_written', 843 passed: planText.includes(`${moneyTaskPrefix}research`) && planText.includes(`${moneyTaskPrefix}mvp`), 844 details: { 845 exists: fs.existsSync(planPath), 846 planPath, 847 }, 848 } 849 }, 850 }, 851 { 852 id: 'project_delivery_execution', 853 skill: 'project_execution', 854 weight: 10, 855 timeoutMs: 220_000, 856 requiresTool: true, 857 expectedTools: hasTaskManagement ? ['manage_tasks', 'files'] : ['files'], 858 prompt: hasTaskManagement 859 ? 
[ 860 'You are working in the current workspace.', 861 'Inspect the existing files before acting.', 862 `Create exactly three backlog tasks titled "${deliveryTaskPrefix}research-brief", "${deliveryTaskPrefix}launch-checklist", and "${deliveryTaskPrefix}qa-pass".`, 863 `Assign all three tasks to agent "${delegateAgentId}" instead of yourself.`, 864 hasProjectContext ? 'If an active project exists, link all three tasks to it.' : 'If no active project exists, do not fabricate a project link.', 865 `Then execute the first step immediately by creating "${researchBriefPath}" with sections "Target User", "Primary Pain", "Assumptions", and "Risks".`, 866 'Under "Risks", include a markdown table with at least two rows.', 867 'Finally confirm the file path and the task ids.', 868 ].join(' ') 869 : [ 870 'You are working in the current workspace.', 871 'Inspect the existing files before acting.', 872 'Task management is intentionally unavailable in this session, so do not claim to create tasks.', 873 `Write "${backlogPath}" with exactly three bullet items titled "${deliveryTaskPrefix}research-brief", "${deliveryTaskPrefix}launch-checklist", and "${deliveryTaskPrefix}qa-pass".`, 874 `Then execute the first step immediately by creating "${researchBriefPath}" with sections "Target User", "Primary Pain", "Assumptions", and "Risks".`, 875 'Under "Risks", include a markdown table with at least two rows.', 876 'Finally confirm the backlog file path and the research brief path.', 877 ].join(' '), 878 semanticCheck: hasTaskManagement 879 ? (result) => 880 result.response.includes(researchBriefPath) && 881 extractFirstId(result.response) !== null 882 : (result) => 883 result.response.includes(backlogPath) && 884 result.response.includes(researchBriefPath), 885 externalCheckWeight: 0.15, 886 postRunCheck: hasTaskManagement 887 ? 
async ({ client }) => { 888 const tasks = await fetchJson(client, 'GET', '/api/tasks') 889 const matching = listBenchmarkTasksByTitle(tasks, deliveryTaskPrefix) 890 const researchBriefAbs = path.join(fixture.workspaceRoot, researchBriefPath) 891 const researchBrief = fs.existsSync(researchBriefAbs) ? fs.readFileSync(researchBriefAbs, 'utf8') : '' 892 const projectLinkedCount = hasProjectContext && fixture.project?.id 893 ? matching.filter((row) => row.projectId === fixture.project.id).length 894 : null 895 return { 896 name: 'project_tasks_and_brief_created', 897 passed: matching.length >= 3 898 && researchBrief.includes('## Target User') 899 && researchBrief.includes('## Risks') 900 && /\|.+\|.+\|/.test(researchBrief) 901 && (!hasProjectContext || projectLinkedCount >= 3), 902 details: { 903 taskCount: matching.length, 904 expectedAtLeast: 3, 905 projectLinkedCount, 906 expectedProjectId: fixture.project?.id || null, 907 researchBriefExists: fs.existsSync(researchBriefAbs), 908 }, 909 } 910 } 911 : async () => { 912 const backlogAbs = path.join(fixture.workspaceRoot, backlogPath) 913 const researchBriefAbs = path.join(fixture.workspaceRoot, researchBriefPath) 914 const backlogText = fs.existsSync(backlogAbs) ? fs.readFileSync(backlogAbs, 'utf8') : '' 915 const researchBrief = fs.existsSync(researchBriefAbs) ? 
fs.readFileSync(researchBriefAbs, 'utf8') : '' 916 return { 917 name: 'project_backlog_and_brief_written', 918 passed: backlogText.includes(`${deliveryTaskPrefix}research-brief`) 919 && backlogText.includes(`${deliveryTaskPrefix}launch-checklist`) 920 && backlogText.includes(`${deliveryTaskPrefix}qa-pass`) 921 && researchBrief.includes('## Target User') 922 && researchBrief.includes('## Risks') 923 && /\|.+\|.+\|/.test(researchBrief), 924 details: { 925 backlogExists: fs.existsSync(backlogAbs), 926 researchBriefExists: fs.existsSync(researchBriefAbs), 927 backlogAbs, 928 researchBriefAbs, 929 }, 930 } 931 }, 932 }, 933 { 934 id: 'open_ended_iteration', 935 skill: 'deliverable_iteration', 936 weight: 10, 937 timeoutMs: 220_000, 938 requiresTool: true, 939 expectedTools: ['files'], 940 prompt: [ 941 'Create a first draft launch brief for the current workspace at', 942 `"${launchDraftPath}".`, 943 'Then write a short critique at', 944 `"${launchCritiquePath}" that names at least two weaknesses in the draft.`, 945 'Then revise the brief into', 946 `"${launchFinalPath}" and make at least one concrete change because of that critique.`, 947 'Inspect any existing files you need first.', 948 'Report all three file paths and one specific thing you changed in the final version.', 949 ].join(' '), 950 semanticCheck: (result) => 951 result.response.includes(launchDraftPath) 952 && result.response.includes(launchCritiquePath) 953 && result.response.includes(launchFinalPath) 954 && /\b(changed|revised|updated)\b/i.test(result.response), 955 externalCheckWeight: 0.25, 956 postRunCheck: async () => { 957 const draftAbs = path.join(fixture.workspaceRoot, launchDraftPath) 958 const critiqueAbs = path.join(fixture.workspaceRoot, launchCritiquePath) 959 const finalAbs = path.join(fixture.workspaceRoot, launchFinalPath) 960 const draftText = fs.existsSync(draftAbs) ? fs.readFileSync(draftAbs, 'utf8') : '' 961 const critiqueText = fs.existsSync(critiqueAbs) ? 
fs.readFileSync(critiqueAbs, 'utf8') : '' 962 const finalText = fs.existsSync(finalAbs) ? fs.readFileSync(finalAbs, 'utf8') : '' 963 const critiqueLineCount = critiqueText 964 .split('\n') 965 .map((line) => line.trim()) 966 .filter((line) => line.startsWith('- ') || /^\d+\./.test(line)) 967 .length 968 return { 969 name: 'iteration_artifacts_created', 970 passed: fs.existsSync(draftAbs) 971 && fs.existsSync(critiqueAbs) 972 && fs.existsSync(finalAbs) 973 && critiqueLineCount >= 2 974 && draftText.trim().length > 0 975 && finalText.trim().length > 0 976 && draftText !== finalText, 977 details: { 978 draftExists: fs.existsSync(draftAbs), 979 critiqueExists: fs.existsSync(critiqueAbs), 980 finalExists: fs.existsSync(finalAbs), 981 critiqueLineCount, 982 }, 983 } 984 }, 985 }, 986 { 987 id: 'project_operating_system', 988 skill: 'project_context', 989 weight: 8, 990 timeoutMs: 180_000, 991 requiresTool: true, 992 expectedTools: hasProjectTool ? ['manage_projects', 'files'] : ['files'], 993 prompt: hasProjectTool && hasProjectContext 994 ? 
[ 995 'Use the active project-management tool to strengthen the current project record before doing anything else.', 996 `Set the project objective to "${fixture.objective}".`, 997 `Set the open objectives to "${fixture.openObjectives[0]}" and "${fixture.openObjectives[1]}".`, 998 `Set the operating modes to "${fixture.capabilityHints.join('", "')}".`, 999 `Set the credential requirements to "${fixture.credentialRequirements.join('", "')}".`, 1000 `Set the preferred heartbeat prompt to "${fixture.heartbeatPrompt}" and heartbeat interval to ${fixture.heartbeatIntervalSec} seconds.`, 1001 `Then write "${projectOpsPath}" with sections "Objective", "Open Objectives", "Operating Modes", "Credential Requirements", and "Heartbeat".`, 1002 'Confirm the active project id and the file path.', 1003 ].join(' ') 1004 : [ 1005 'Project-management tooling is unavailable in this session.', 1006 `Write "${projectOpsPath}" with sections "Objective", "Open Objectives", "Operating Modes", "Credential Requirements", and "Heartbeat".`, 1007 `Use these exact values: objective "${fixture.objective}", open objectives "${fixture.openObjectives[0]}" and "${fixture.openObjectives[1]}", operating modes "${fixture.capabilityHints.join('", "')}", credential requirements "${fixture.credentialRequirements.join('", "')}", and heartbeat "${fixture.heartbeatPrompt}" every ${fixture.heartbeatIntervalSec} seconds.`, 1008 'Confirm the file path.', 1009 ].join(' '), 1010 semanticCheck: hasProjectTool && hasProjectContext 1011 ? (result) => result.response.includes(projectOpsPath) && (activeProjectId ? result.response.includes(activeProjectId) : true) 1012 : (result) => result.response.includes(projectOpsPath), 1013 externalCheckWeight: 0.3, 1014 postRunCheck: hasProjectTool && hasProjectContext 1015 ? 
async ({ client }) => { 1016 const project = await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`) 1017 const projectOpsAbs = path.join(fixture.workspaceRoot, projectOpsPath) 1018 const text = fs.existsSync(projectOpsAbs) ? fs.readFileSync(projectOpsAbs, 'utf8') : '' 1019 return { 1020 name: 'project_record_enriched', 1021 passed: project?.objective === fixture.objective 1022 && Array.isArray(project?.openObjectives) 1023 && project.openObjectives.includes(fixture.openObjectives[0]) 1024 && Array.isArray(project?.credentialRequirements) 1025 && project.credentialRequirements.includes(fixture.credentialRequirements[0]) 1026 && project?.heartbeatPrompt === fixture.heartbeatPrompt 1027 && Number(project?.heartbeatIntervalSec) === fixture.heartbeatIntervalSec 1028 && text.includes('## Objective') 1029 && text.includes('## Credential Requirements'), 1030 details: { 1031 projectId: activeProjectId, 1032 projectOpsExists: fs.existsSync(projectOpsAbs), 1033 }, 1034 } 1035 } 1036 : async () => { 1037 const projectOpsAbs = path.join(fixture.workspaceRoot, projectOpsPath) 1038 const text = fs.existsSync(projectOpsAbs) ? fs.readFileSync(projectOpsAbs, 'utf8') : '' 1039 return { 1040 name: 'project_ops_brief_written', 1041 passed: text.includes('## Objective') 1042 && text.includes(fixture.objective) 1043 && text.includes(fixture.credentialRequirements[0]) 1044 && text.includes(fixture.heartbeatPrompt), 1045 details: { 1046 projectOpsExists: fs.existsSync(projectOpsAbs), 1047 }, 1048 } 1049 }, 1050 }, 1051 { 1052 id: 'project_credentials_and_heartbeat', 1053 skill: 'project_operations', 1054 weight: 8, 1055 timeoutMs: 180_000, 1056 requiresTool: true, 1057 expectedTools: hasProjectOperations ? ['manage_secrets', 'manage_schedules', 'files'] : ['files'], 1058 prompt: hasProjectOperations && hasProjectContext 1059 ? 
[ 1060 'Bootstrap lightweight project operations for the active project.', 1061 `Create one project-linked secret named "MockMail App Password ${runTag}" with service "mockmail" and value "${runTag}-mockmail-secret".`, 1062 `Create one active interval schedule named "Pilot heartbeat ${runTag}" with intervalMs ${fixture.heartbeatIntervalSec * 1000} and taskPrompt "Review active project goals, inbox blockers, and next operator action."`, 1063 'Omit projectId when possible so the active project is used by default.', 1064 `Then write "${credentialPlanPath}" with sections "Services", "Secrets", and "Heartbeat" that summarize what you configured.`, 1065 'Confirm the secret id, schedule id, and file path.', 1066 ].join(' ') 1067 : [ 1068 'Project secret and schedule tooling is unavailable in this session.', 1069 `Write "${credentialPlanPath}" with sections "Services", "Secrets", and "Heartbeat" describing the credentials and recurring follow-up needed for an inbox-oriented operator workflow.`, 1070 `Also write "${heartbeatPlanPath}" with a recurring heartbeat recommendation every ${fixture.heartbeatIntervalSec} seconds and mention "${fixture.heartbeatPrompt}".`, 1071 'Confirm both file paths.', 1072 ].join(' '), 1073 semanticCheck: hasProjectOperations && hasProjectContext 1074 ? (result) => result.response.includes(credentialPlanPath) && extractFirstId(result.response) !== null 1075 : (result) => result.response.includes(credentialPlanPath) && result.response.includes(heartbeatPlanPath), 1076 externalCheckWeight: 0.3, 1077 postRunCheck: hasProjectOperations && hasProjectContext 1078 ? 
async ({ client }) => { 1079 const secrets = await fetchJson(client, 'GET', '/api/secrets') 1080 const schedules = await fetchJson(client, 'GET', '/api/schedules') 1081 const secretMatch = Object.values(secrets || {}).find((row) => String(row?.name || '') === `MockMail App Password ${runTag}`) 1082 const scheduleMatch = Object.values(schedules || {}).find((row) => String(row?.name || '') === `Pilot heartbeat ${runTag}`) 1083 const credentialPlanAbs = path.join(fixture.workspaceRoot, credentialPlanPath) 1084 const text = fs.existsSync(credentialPlanAbs) ? fs.readFileSync(credentialPlanAbs, 'utf8') : '' 1085 return { 1086 name: 'project_secret_and_schedule_created', 1087 passed: Boolean(secretMatch) 1088 && Boolean(scheduleMatch) 1089 && secretMatch?.projectId === activeProjectId 1090 && scheduleMatch?.projectId === activeProjectId 1091 && text.includes('## Secrets') 1092 && text.includes('## Heartbeat'), 1093 details: { 1094 secretId: secretMatch?.id || null, 1095 scheduleId: scheduleMatch?.id || null, 1096 projectId: activeProjectId, 1097 credentialPlanExists: fs.existsSync(credentialPlanAbs), 1098 }, 1099 } 1100 } 1101 : async () => { 1102 const credentialPlanAbs = path.join(fixture.workspaceRoot, credentialPlanPath) 1103 const heartbeatPlanAbs = path.join(fixture.workspaceRoot, heartbeatPlanPath) 1104 const credentialText = fs.existsSync(credentialPlanAbs) ? fs.readFileSync(credentialPlanAbs, 'utf8') : '' 1105 const heartbeatText = fs.existsSync(heartbeatPlanAbs) ? 
fs.readFileSync(heartbeatPlanAbs, 'utf8') : '' 1106 return { 1107 name: 'credential_and_heartbeat_docs_written', 1108 passed: credentialText.includes('## Secrets') 1109 && heartbeatText.includes(fixture.heartbeatPrompt) 1110 && heartbeatText.includes(String(fixture.heartbeatIntervalSec)), 1111 details: { 1112 credentialPlanExists: fs.existsSync(credentialPlanAbs), 1113 heartbeatPlanExists: fs.existsSync(heartbeatPlanAbs), 1114 }, 1115 } 1116 }, 1117 }, 1118 { 1119 id: 'inbox_operations_kickoff', 1120 skill: 'project_operations', 1121 weight: 8, 1122 timeoutMs: 200_000, 1123 requiresTool: true, 1124 expectedTools: hasProjectOperations ? ['manage_projects', 'manage_secrets', 'manage_schedules', 'files'] : ['files'], 1125 prompt: hasProjectOperations && hasProjectContext 1126 ? [ 1127 'Treat the active project as an inbox-operations system.', 1128 'Add the capability hint "inbox triage" and the open objective "stand up inbox triage workflow".', 1129 `Create one project-linked secret named "Inbox OAuth Refresh ${runTag}" with service "mockmail" and value "${runTag}-inbox-refresh".`, 1130 `Create one active interval schedule named "Inbox triage review ${runTag}" with intervalMs 900000 and taskPrompt "Review unread inbox items, blockers, and next reply actions."`, 1131 hasTaskManagement 1132 ? `Also create exactly one backlog task titled "${inboxTaskTitle}" assigned to agent "${delegateAgentId}". 
Omit projectId so the active project is used by default.` 1133 : 'Task management is unavailable in this session, so do not claim to create tasks.', 1134 `Then write "${inboxOpsPath}" with sections "Inbox Goals", "Credential Bootstrap", "Heartbeat Cadence", and "Failure Modes".`, 1135 'Confirm the file path and any created ids.', 1136 ].join(' ') 1137 : [ 1138 'Project operations tooling is unavailable in this session.', 1139 'Do not claim to create real secrets, schedules, or project updates.', 1140 `Write "${inboxOpsPath}" with sections "Inbox Goals", "Credential Bootstrap", "Heartbeat Cadence", and "Failure Modes" for a lightweight operator inbox workflow.`, 1141 'Confirm the file path.', 1142 ].join(' '), 1143 semanticCheck: (result) => result.response.includes(inboxOpsPath), 1144 externalCheckWeight: 0.3, 1145 postRunCheck: hasProjectOperations && hasProjectContext 1146 ? async ({ client }) => { 1147 const projects = await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`) 1148 const secrets = await fetchJson(client, 'GET', '/api/secrets') 1149 const schedules = await fetchJson(client, 'GET', '/api/schedules') 1150 const tasks = hasTaskManagement ? await fetchJson(client, 'GET', '/api/tasks') : null 1151 const secretMatch = Object.values(secrets || {}).find((row) => String(row?.name || '') === `Inbox OAuth Refresh ${runTag}`) 1152 const scheduleMatch = Object.values(schedules || {}).find((row) => String(row?.name || '') === `Inbox triage review ${runTag}`) 1153 const taskMatch = hasTaskManagement 1154 ? Object.values(tasks || {}).find((row) => String(row?.title || '') === inboxTaskTitle) 1155 : null 1156 const inboxOpsAbs = path.join(fixture.workspaceRoot, inboxOpsPath) 1157 const text = fs.existsSync(inboxOpsAbs) ? 
fs.readFileSync(inboxOpsAbs, 'utf8') : '' 1158 return { 1159 name: 'inbox_ops_seeded', 1160 passed: Array.isArray(projects?.capabilityHints) 1161 && projects.capabilityHints.includes('inbox triage') 1162 && Array.isArray(projects?.openObjectives) 1163 && projects.openObjectives.includes('stand up inbox triage workflow') 1164 && secretMatch?.projectId === activeProjectId 1165 && scheduleMatch?.projectId === activeProjectId 1166 && (!hasTaskManagement || (taskMatch?.projectId === activeProjectId)) 1167 && text.includes('## Inbox Goals') 1168 && text.includes('## Credential Bootstrap') 1169 && text.includes('## Heartbeat Cadence') 1170 && text.includes('## Failure Modes'), 1171 details: { 1172 projectId: activeProjectId, 1173 secretId: secretMatch?.id || null, 1174 scheduleId: scheduleMatch?.id || null, 1175 taskId: taskMatch?.id || null, 1176 inboxOpsExists: fs.existsSync(inboxOpsAbs), 1177 }, 1178 } 1179 } 1180 : async () => { 1181 const inboxOpsAbs = path.join(fixture.workspaceRoot, inboxOpsPath) 1182 const text = fs.existsSync(inboxOpsAbs) ? fs.readFileSync(inboxOpsAbs, 'utf8') : '' 1183 return { 1184 name: 'inbox_ops_playbook_written', 1185 passed: text.includes('## Inbox Goals') 1186 && text.includes('## Credential Bootstrap') 1187 && text.includes('## Heartbeat Cadence') 1188 && text.includes('## Failure Modes'), 1189 details: { 1190 inboxOpsExists: fs.existsSync(inboxOpsAbs), 1191 }, 1192 } 1193 }, 1194 }, 1195 { 1196 id: 'market_watch_planning', 1197 skill: 'autonomous_goal_execution', 1198 weight: 8, 1199 timeoutMs: 190_000, 1200 requiresTool: true, 1201 expectedTools: hasTaskManagement 1202 ? (hasProjectTool ? ['manage_projects', 'manage_tasks', 'files'] : ['manage_tasks', 'files']) 1203 : (hasProjectTool ? ['manage_projects', 'files'] : ['files']), 1204 prompt: hasProjectTool && hasProjectContext 1205 ? 
[ 1206 'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.', 1207 'Use the active project as the durable goal tracker.', 1208 'Add the open objective "maintain a research-only market watch" and the success metric "publish one risk-bounded market memo per week".', 1209 hasTaskManagement 1210 ? `Create exactly two backlog tasks titled "${marketTaskPrefix}signal-review" and "${marketTaskPrefix}memo". Assign both to agent "${delegateAgentId}" and let the active project be used by default.` 1211 : 'Task management is unavailable in this session, so do not claim to create tasks.', 1212 `Then write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`, 1213 'Confirm the file path and any created ids.', 1214 ].join(' ') 1215 : hasTaskManagement 1216 ? [ 1217 'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.', 1218 `Create exactly two backlog tasks titled "${marketTaskPrefix}signal-review" and "${marketTaskPrefix}memo".`, 1219 `Assign both to agent "${delegateAgentId}".`, 1220 `Then write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`, 1221 'Confirm the task ids and file path.', 1222 ].join(' ') 1223 : [ 1224 'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.', 1225 `Write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`, 1226 'Confirm the file path.', 1227 ].join(' '), 1228 semanticCheck: (result) => result.response.includes(marketWatchPath), 1229 externalCheckWeight: 0.3, 1230 postRunCheck: async ({ client }) => { 1231 const marketWatchAbs = path.join(fixture.workspaceRoot, marketWatchPath) 1232 const text = fs.existsSync(marketWatchAbs) ? 
fs.readFileSync(marketWatchAbs, 'utf8') : '' 1233 const tasks = hasTaskManagement ? await fetchJson(client, 'GET', '/api/tasks') : null 1234 const matchingTasks = hasTaskManagement ? listBenchmarkTasksByTitle(tasks, marketTaskPrefix) : [] 1235 const project = hasProjectTool && hasProjectContext && activeProjectId 1236 ? await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`) 1237 : null 1238 const projectLinkedCount = hasProjectContext && activeProjectId 1239 ? matchingTasks.filter((row) => row.projectId === activeProjectId).length 1240 : null 1241 return { 1242 name: 'market_watch_plan_seeded', 1243 passed: text.includes('## Guardrails') 1244 && text.includes('## Signals') 1245 && text.includes('## Research Cadence') 1246 && text.includes('## Next Review') 1247 && (!hasTaskManagement || matchingTasks.length >= 2) 1248 && (!hasProjectTool || !hasProjectContext || ( 1249 Array.isArray(project?.openObjectives) 1250 && project.openObjectives.includes('maintain a research-only market watch') 1251 && Array.isArray(project?.successMetrics) 1252 && project.successMetrics.includes('publish one risk-bounded market memo per week') 1253 && (!hasTaskManagement || projectLinkedCount >= 2) 1254 )), 1255 details: { 1256 marketWatchExists: fs.existsSync(marketWatchAbs), 1257 taskCount: matchingTasks.length, 1258 projectLinkedCount, 1259 projectId: activeProjectId, 1260 }, 1261 } 1262 }, 1263 }, 1264 { 1265 id: 'news_media_delivery', 1266 skill: 'research_delivery', 1267 weight: 8, 1268 timeoutMs: 220_000, 1269 requiresTool: true, 1270 expectedTools: ['web', 'browser', 'manage_connectors'], 1271 prompt: [ 1272 'A user asks:', 1273 '"Can you tell me more if there is any news related to the US-Iran war, and can you send me some screenshots and give me a summary and maybe send me a voice note about it?"', 1274 'Use live web research first.', 1275 'Then use the browser tool to capture at least one relevant screenshot from a source page.', 1276 'Give a 
concise summary of the latest relevant developments.', 1277 'If outbound delivery is possible, send the screenshot and a short voice note update through connector_message_tool.', 1278 'If no running connector is available, explicitly check that and report the delivery blocker instead of claiming the capability does not exist.', 1279 'In your final answer, include the screenshot upload URL exactly and say whether the voice note was sent or blocked after checking connectors.', 1280 ].join(' '), 1281 semanticCheck: (result) => 1282 /\b(us|u\.s\.)\b/i.test(result.response) 1283 && /\biran\b/i.test(result.response) 1284 && /\b(summary|summarized|latest|update|updates|reported|developments)\b/i.test(result.response) 1285 && /\/api\/uploads\/[^\s)"'`]+\.(png|jpg|jpeg|webp)/i.test(result.response) 1286 && /\b(voice[\s-]?note|voice_sent|blocked|no running connectors|connector)\b/i.test(result.response), 1287 externalCheckWeight: 0.35, 1288 postRunCheck: async ({ client, row }) => { 1289 const screenshotUrls = extractUploadUrls(row.response) 1290 .filter((url) => /\.(png|jpg|jpeg|webp)(?:[?#].*)?$/i.test(url)) 1291 .filter((url) => /\/api\/uploads\/(?:screenshot-|browser-)/i.test(url)) 1292 const screenshotReachability = await Promise.all( 1293 screenshotUrls.slice(0, 3).map(async (url) => { 1294 try { 1295 const res = await fetch(`${client.baseUrl}${url}`, { 1296 headers: { 'x-access-key': client.accessKey }, 1297 }) 1298 return res.ok 1299 } catch { 1300 return false 1301 } 1302 }), 1303 ) 1304 const connectorOutcome = /\b(voice[\s-]?note sent|voice_sent|no running connectors|set one up in the connectors panel|delivery blocker|delivery blocked|could not send (?:the )?voice(?:[\s-]?note)?|unable to send (?:the )?voice(?:[\s-]?note)?)\b/i.test(row.response) 1305 return { 1306 name: 'news_media_delivery_checked', 1307 passed: screenshotReachability.some(Boolean) && connectorOutcome, 1308 details: { 1309 screenshotUrls, 1310 reachableScreenshots: 
screenshotReachability.filter(Boolean).length, 1311 connectorOutcome, 1312 }, 1313 } 1314 }, 1315 }, 1316 { 1317 id: 'project_context_alignment', 1318 skill: 'project_context', 1319 weight: 6, 1320 timeoutMs: 120_000, 1321 requiresTool: false, 1322 expectedTools: [], 1323 prompt: 'Without reading files or browsing the web, tell me the active project\'s exact name, objective, who it is for, the first two pilot priorities, and the first open objective. If there is no active project context, say that plainly.', 1324 semanticCheck: hasProjectContext 1325 ? (result) => 1326 result.response.includes(fixture.projectName) 1327 && result.response.toLowerCase().includes(fixture.objective.toLowerCase()) 1328 && result.response.toLowerCase().includes(fixture.targetUser.toLowerCase()) 1329 && result.response.toLowerCase().includes(fixture.pilotPriorities[0].toLowerCase()) 1330 && result.response.toLowerCase().includes(fixture.pilotPriorities[1].toLowerCase()) 1331 && result.response.toLowerCase().includes(fixture.openObjectives[0].toLowerCase()) 1332 : (result) => /\b(no active project|no current project|do not have active project context|no active project context)\b/i.test(result.response), 1333 }, 1334 { 1335 id: 'session_history_recall', 1336 skill: 'session_management', 1337 weight: 8, 1338 timeoutMs: 140_000, 1339 requiresTool: true, 1340 expectedTools: ['manage_sessions'], 1341 prompt: `Use the session-management tool to inspect the recent history of this current session. 
Then tell me the exact "${moneyPlanPath}", "${researchBriefPath}", and "${launchFinalPath}" file paths created earlier in this chat, and mention that you checked session history.`, 1342 semanticCheck: (result) => 1343 result.response.includes(moneyPlanPath) && 1344 result.response.includes(researchBriefPath) && 1345 result.response.includes(launchFinalPath) && 1346 /\b(history|session history|recent history)\b/i.test(result.response), 1347 }, 1348 { 1349 id: 'memory_significant_store', 1350 skill: 'memory', 1351 weight: 4, 1352 timeoutMs: 140_000, 1353 requiresTool: true, 1354 expectedTools: ['memory'], 1355 prompt: [ 1356 'Store significant long-term memory for this user:', 1357 `birthday ${birthday}, anniversary ${anniversary}, recurring bug "${recurringBug}".`, 1358 'Save it explicitly as durable memory and confirm what was saved.', 1359 ].join(' '), 1360 semanticCheck: (result) => 1361 mentionsDate(result.response, birthday) && 1362 mentionsDate(result.response, anniversary) && 1363 result.response.toLowerCase().includes(recurringBug.toLowerCase()), 1364 externalCheckWeight: 0.4, 1365 postRunCheck: async () => { 1366 const memoryCount = countMemoriesContaining(runTag) 1367 return { 1368 name: 'memory_rows_created', 1369 passed: memoryCount >= 1, 1370 details: { memoryCount, expectedAtLeast: 1 }, 1371 } 1372 }, 1373 }, 1374 { 1375 id: 'memory_significant_recall', 1376 skill: 'memory', 1377 weight: 4, 1378 timeoutMs: 120_000, 1379 requiresTool: false, 1380 expectedTools: [], 1381 prompt: 'What significant personal details and recurring bug did I ask you to remember earlier in this conversation? 
Answer with exact values.', 1382 semanticCheck: (result) => 1383 mentionsDate(result.response, birthday) && 1384 mentionsDate(result.response, anniversary) && 1385 result.response.toLowerCase().includes(recurringBug.toLowerCase()), 1386 }, 1387 ] 1388 } 1389 1390 const CHATROOM_SCENARIOS = [ 1391 { 1392 id: 'sequential_project_split_execute', 1393 mode: 'sequential', 1394 autoAddress: true, 1395 weight: 10, 1396 timeoutMs: 240_000, 1397 requireAction: true, 1398 prompt: '@all We need to research and build a tiny app together. Split responsibilities by role and perform one concrete action now.', 1399 }, 1400 { 1401 id: 'parallel_cross_delegate', 1402 mode: 'parallel', 1403 autoAddress: true, 1404 weight: 10, 1405 timeoutMs: 240_000, 1406 requireAction: true, 1407 requireDelegation: true, 1408 prompt: '@all Work as a team: each of you delegate one subtask to another specific agent and execute one concrete action now.', 1409 }, 1410 { 1411 id: 'sequential_companion_team', 1412 mode: 'sequential', 1413 autoAddress: true, 1414 weight: 10, 1415 timeoutMs: 240_000, 1416 requireAction: true, 1417 requireEmpathy: true, 1418 prompt: '@all User says they are overwhelmed and lonely while trying to build a startup. Respond empathetically and provide one concrete next step each.', 1419 }, 1420 ] 1421 1422 function evaluateSessionScenario(scenario, result, postCheck = null) { 1423 const called = new Set(canonicalizeToolList(result.toolCalls)) 1424 const expected = canonicalizeToolList(scenario.expectedTools || []) 1425 const expectedMatched = expected.filter((toolName) => called.has(toolName)).length 1426 const toolCoverage = expected.length > 0 1427 ? expectedMatched / expected.length 1428 : (scenario.requiresTool ? (result.toolCalls.length > 0 ? 1 : 0) : 1) 1429 const noErrors = result.toolErrors.length === 0 && result.streamErrors.length === 0 ? 1 : 0 1430 const semantic = scenario.semanticCheck(result) ? 1 : 0 1431 const timely = result.durationMs <= scenario.timeoutMs ? 
1 : 0 1432 const external = postCheck ? (postCheck.passed ? 1 : 0) : 1 1433 const externalWeight = Number.isFinite(Number(scenario.externalCheckWeight)) 1434 ? Math.max(0, Math.min(0.5, Number(scenario.externalCheckWeight))) 1435 : 0 1436 const primaryScore = (toolCoverage * 0.5) + (noErrors * 0.2) + (semantic * 0.2) + (timely * 0.1) 1437 const blended = externalWeight > 0 1438 ? ((primaryScore * (1 - externalWeight)) + (external * externalWeight)) 1439 : primaryScore 1440 1441 let score = scenario.weight * blended 1442 if (scenario.requiresTool && result.toolCalls.length === 0) { 1443 score *= 0.35 1444 } 1445 score = round1(score) 1446 1447 return { 1448 id: scenario.id, 1449 skill: scenario.skill, 1450 weight: scenario.weight, 1451 score, 1452 passed: score >= scenario.weight * 0.7, 1453 durationMs: result.durationMs, 1454 checks: { 1455 toolCoverage: round1(toolCoverage * 100), 1456 noErrors: Boolean(noErrors), 1457 semantic: Boolean(semantic), 1458 timely: Boolean(timely), 1459 external: postCheck ? Boolean(postCheck.passed) : null, 1460 }, 1461 toolCalls: result.toolCalls, 1462 toolErrors: result.toolErrors, 1463 streamErrors: result.streamErrors, 1464 response: result.responseSummary || summarize(result.response, 340), 1465 postCheck, 1466 } 1467 } 1468 1469 function evaluateChatroomScenario(scenario, result, expectedAgentIds) { 1470 const expected = expectedAgentIds.length 1471 const participation = expected > 0 ? Math.min(1, result.respondedAgentIds.length / expected) : 0 1472 const combinedText = result.newMessages 1473 .filter((msg) => msg.senderId !== 'user' && msg.senderId !== 'system') 1474 .map((msg) => msg.text) 1475 .join('\n') 1476 const splitSignal = /\b(assign|split|role|research|build|verify|delegate|owner)\b/i.test(combinedText) ? 
1 : 0 1477 const actionSignal = result.toolCalls.length > 0 1478 || /\b(created|started|ran|executed|wrote|searched|configured|checked|listed|implemented|launched)\b/i.test(combinedText) 1479 || containsActionableStep(combinedText) 1480 ? 1 : 0 1481 const delegationSignal = result.newMessages.some((msg) => 1482 msg.senderId !== 'user' && 1483 msg.senderId !== 'system' && 1484 ( 1485 (Array.isArray(msg.mentions) && msg.mentions.length > 0) 1486 || /@\w+/.test(String(msg.text || '')) 1487 || /\bdelegate\b/i.test(String(msg.text || '')) 1488 ) 1489 ) ? 1 : 0 1490 const empathySignal = containsEmpathy(combinedText) ? 1 : 0 1491 const noErrors = result.errors.length === 0 ? 1 : 0 1492 1493 let score = 0 1494 if (scenario.requireEmpathy) { 1495 score = round1(scenario.weight * ( 1496 (participation * 0.35) + 1497 (empathySignal * 0.3) + 1498 (actionSignal * 0.2) + 1499 (noErrors * 0.15) 1500 )) 1501 } else if (scenario.requireDelegation) { 1502 score = round1(scenario.weight * ( 1503 (participation * 0.35) + 1504 (delegationSignal * 0.25) + 1505 (splitSignal * 0.2) + 1506 (actionSignal * 0.1) + 1507 (noErrors * 0.1) 1508 )) 1509 } else { 1510 score = round1(scenario.weight * ( 1511 (participation * 0.4) + 1512 (splitSignal * 0.25) + 1513 (actionSignal * 0.25) + 1514 (noErrors * 0.1) 1515 )) 1516 } 1517 1518 return { 1519 id: scenario.id, 1520 mode: scenario.mode, 1521 weight: scenario.weight, 1522 score, 1523 passed: score >= scenario.weight * 0.7, 1524 durationMs: result.durationMs, 1525 checks: { 1526 participation: round1(participation * 100), 1527 splitSignal: Boolean(splitSignal), 1528 actionSignal: Boolean(actionSignal), 1529 delegationSignal: Boolean(delegationSignal), 1530 empathySignal: Boolean(empathySignal), 1531 noErrors: Boolean(noErrors), 1532 }, 1533 respondedAgentIds: result.respondedAgentIds, 1534 toolCalls: result.toolCalls, 1535 errors: result.errors, 1536 sampleMessages: result.newMessages.slice(0, 8).map((msg) => ({ 1537 senderName: msg.senderName, 
1538 text: summarize(msg.text, 180), 1539 })), 1540 } 1541 } 1542 1543 function evaluateModelDiversity(participantAgents) { 1544 const normalizeTools = (tools) => { 1545 if (!Array.isArray(tools)) return '' 1546 return [...new Set(tools.map((tool) => String(tool || '').trim()).filter(Boolean))].sort().join(',') 1547 } 1548 const modelFamily = (model) => String(model || '').toLowerCase().split(/[:/@]/)[0] || String(model || '').toLowerCase() 1549 const uniqueModelKeys = new Set( 1550 participantAgents.map((agent) => `${agent.provider || 'unknown'}:${agent.model || 'unknown'}`) 1551 ) 1552 const uniqueFamilyKeys = new Set( 1553 participantAgents.map((agent) => `${agent.provider || 'unknown'}:${modelFamily(agent.model || 'unknown')}`) 1554 ) 1555 const uniqueCapabilityProfiles = new Set( 1556 participantAgents.map((agent) => [ 1557 String(agent.provider || 'unknown').toLowerCase(), 1558 String(agent.model || 'unknown').toLowerCase(), 1559 normalizeTools(getAgentTools(agent)), 1560 agent.credentialId ? 'cred' : 'nocred', 1561 agent.apiEndpoint ? 
'custom-endpoint' : 'default-endpoint', 1562 ].join('|')) 1563 ) 1564 const uniqueToolProfiles = new Set( 1565 participantAgents.map((agent) => normalizeTools(getAgentTools(agent))) 1566 ) 1567 const agentCount = Math.max(1, participantAgents.length) 1568 const modelDiversity = Math.min(1, uniqueModelKeys.size / agentCount) 1569 const familyDiversity = Math.min(1, uniqueFamilyKeys.size / agentCount) 1570 const capabilityDiversity = Math.min(1, uniqueCapabilityProfiles.size / agentCount) 1571 const toolProfileDiversity = Math.min(1, uniqueToolProfiles.size / agentCount) 1572 const roleHints = participantAgents.filter((agent) => { 1573 const text = `${agent.name || ''} ${agent.description || ''}`.toLowerCase() 1574 return /(research|build|assistant|planner|coder|qa|ops|orchestr)/.test(text) 1575 }).length 1576 const specialization = Math.min(1, roleHints / agentCount) 1577 const score = round1(10 * ( 1578 (modelDiversity * 0.2) 1579 + (familyDiversity * 0.2) 1580 + (capabilityDiversity * 0.35) 1581 + (toolProfileDiversity * 0.1) 1582 + (specialization * 0.15) 1583 )) 1584 1585 return { 1586 weight: 10, 1587 score, 1588 passed: score >= 5, 1589 checks: { 1590 uniqueModels: uniqueModelKeys.size, 1591 uniqueModelFamilies: uniqueFamilyKeys.size, 1592 uniqueCapabilityProfiles: uniqueCapabilityProfiles.size, 1593 uniqueToolProfiles: uniqueToolProfiles.size, 1594 agentCount, 1595 diversityPct: round1(modelDiversity * 100), 1596 familyDiversityPct: round1(familyDiversity * 100), 1597 capabilityDiversityPct: round1(capabilityDiversity * 100), 1598 toolProfileDiversityPct: round1(toolProfileDiversity * 100), 1599 specializationPct: round1(specialization * 100), 1600 }, 1601 participants: participantAgents.map((agent) => ({ 1602 id: agent.id, 1603 name: agent.name, 1604 provider: agent.provider, 1605 model: agent.model, 1606 tools: getAgentTools(agent), 1607 hasCredential: Boolean(agent.credentialId), 1608 hasEndpoint: Boolean(agent.apiEndpoint), 1609 })), 1610 } 1611 } 1612 
/**
 * Summarizes the optional OpenClaw comparison turns.
 *
 * @param {Array<{streamErrors: Array<string>, response: string}>} results Rows from runSessionTurn.
 * @returns {object} `not_configured` when there are no rows, `unreachable` when a
 *   connection-refused error was seen and no turn was healthy, otherwise `available`
 *   with healthy/total turn counts. A turn is healthy when it has no stream errors
 *   and a trimmed response of at least 20 characters.
 */
function evaluateOpenclawComparison(results) {
  if (!results || results.length === 0) {
    return { status: 'not_configured', available: false, notes: 'No OpenClaw agent configured.' }
  }
  const hasConnectionRefused = results.some((row) =>
    row.streamErrors.some((error) => /econnrefused/i.test(error))
  )
  const healthyTurns = results.filter((row) =>
    row.streamErrors.length === 0 && row.response && row.response.trim().length >= 20
  ).length
  if (hasConnectionRefused && healthyTurns === 0) {
    return { status: 'unreachable', available: false, notes: 'OpenClaw provider unreachable (connection refused).' }
  }
  return {
    status: 'available',
    available: true,
    healthyTurns,
    totalTurns: results.length,
    notes: healthyTurns === results.length
      ? 'OpenClaw comparison completed.'
      : 'OpenClaw comparison partially completed with errors.',
  }
}

/**
 * Loads the most recent readable benchmark report for a profile.
 *
 * Candidates are files named `autonomy-benchmark-<profileId>-*.json`, ordered by
 * file name. A report that cannot be read or parsed (for example one truncated by
 * an interrupted run) is skipped in favour of the next older one, so a single bad
 * file does not hide the previous-run comparison.
 *
 * @param {string} outDir Directory containing earlier reports.
 * @param {string} profileId Probe profile id embedded in the file name.
 * @returns {{path: string, report: any} | null} Newest parseable report, or null.
 */
function readLatestBenchmark(outDir, profileId) {
  if (!fs.existsSync(outDir)) return null
  const prefix = `autonomy-benchmark-${profileId}-`
  const files = fs.readdirSync(outDir)
    .filter((name) => name.startsWith(prefix) && name.endsWith('.json'))
    .sort()
  for (let index = files.length - 1; index >= 0; index -= 1) {
    const candidate = path.join(outDir, files[index])
    try {
      return { path: candidate, report: JSON.parse(fs.readFileSync(candidate, 'utf8')) }
    } catch {
      // Unreadable or corrupt report: fall through to the next older file.
    }
  }
  return null
}

/**
 * Restricts scenario rows to the requested ids.
 *
 * @param {Array<{id: string}>} rows All available scenarios.
 * @param {Array<string>} [requestedIds] Ids to keep; empty or non-array means "keep all".
 * @returns {Array<{id: string}>} The same array when unfiltered, otherwise a filtered copy.
 */
function filterScenarioIds(rows, requestedIds) {
  if (!Array.isArray(requestedIds) || requestedIds.length === 0) return rows
  const wanted = new Set(requestedIds)
  return rows.filter((row) => wanted.has(row.id))
}

/**
 * Renders the benchmark report as a Markdown document.
 *
 * @param {object} report Report object as assembled by main().
 * @returns {string} Markdown text terminated by a newline.
 */
function renderMarkdown(report) {
  const yesNo = (flag) => (flag ? 'yes' : 'no')
  const { summary, categoryScores } = report
  const categoryRow = (label, category) =>
    `| ${label} | ${category.score} | ${category.max} | ${yesNo(category.passed)} |`

  const lines = []
  lines.push(
    '# Autonomy Harness Benchmark',
    '',
    `- Generated: ${report.generatedAt}`,
    `- Base URL: ${report.baseUrl}`,
    `- Grade: **${summary.grade}** (${summary.totalScore}/${summary.maxScore})`,
    `- Min Score Threshold: ${summary.minScore}`,
    `- Result: ${summary.passed ? 'PASS' : 'FAIL'}`,
    '',
    '## Category Scores',
    '',
    '| Category | Score | Max | Pass |',
    '| --- | ---: | ---: | :---: |',
    categoryRow('Session Skills', categoryScores.session),
    categoryRow('Chatroom Collaboration', categoryScores.chatroom),
    categoryRow('Collaboration Diversity', categoryScores.modelDiversity),
    '',
    '## Session Skills',
    '',
    '| Scenario | Skill | Score | Tool Coverage | Semantic | External | Errors |',
    '| --- | --- | ---: | ---: | :---: | :---: | :---: |',
  )
  for (const row of report.sessionScenarios) {
    // A null external check means the scenario has none; the last column prints
    // the noErrors check, so "yes" there means the run was clean.
    const external = row.checks.external === null ? 'n/a' : yesNo(row.checks.external)
    lines.push(`| ${row.id} | ${row.skill} | ${row.score}/${row.weight} | ${row.checks.toolCoverage}% | ${yesNo(row.checks.semantic)} | ${external} | ${yesNo(row.checks.noErrors)} |`)
  }
  lines.push(
    '',
    '## Chatroom Collaboration',
    '',
    '| Scenario | Mode | Score | Participation | Action | Delegation | Empathy | Errors |',
    '| --- | --- | ---: | ---: | :---: | :---: | :---: | :---: |',
  )
  for (const row of report.chatroomScenarios) {
    lines.push(`| ${row.id} | ${row.mode} | ${row.score}/${row.weight} | ${row.checks.participation}% | ${yesNo(row.checks.actionSignal)} | ${yesNo(row.checks.delegationSignal)} | ${yesNo(row.checks.empathySignal)} | ${yesNo(row.checks.noErrors)} |`)
  }
  lines.push(
    '',
    '## OpenClaw Comparison',
    '',
    `- Status: ${report.openclaw.status}`,
    `- Notes: ${report.openclaw.notes}`,
    '',
  )
  if (report.previous) {
    lines.push(
      '## Previous Run Delta',
      '',
      `- Previous: ${report.previous.path}`,
      `- Score Change: ${report.previous.deltaScore > 0 ? '+' : ''}${report.previous.deltaScore}`,
      `- Grade Change: ${report.previous.prevGrade} -> ${summary.grade}`,
      '',
    )
  }
  return `${lines.join('\n')}\n`
}

/**
 * Sends one scenario prompt to a chat session over SSE and collects the outcome.
 *
 * @param {{baseUrl: string, accessKey: string}} client API client settings.
 * @param {string} sessionId Target chat session id.
 * @param {{id: string, prompt: string, timeoutMs: number}} scenario Scenario to run.
 * @returns {Promise<object>} Scenario id, duration, tool calls/errors, stream errors,
 *   the response after stripRunNoise, and a 340-character summary of it.
 */
async function runSessionTurn(client, sessionId, scenario) {
  const sse = await postSse(
    client,
    `/api/chats/${encodeURIComponent(sessionId)}/chat`,
    { message: scenario.prompt },
    scenario.timeoutMs,
  )
  const { toolCalls, toolErrors, streamErrors } = collectToolStats(sse.events)
  const cleanedResponse = stripRunNoise(sse.text)
  return {
    id: scenario.id,
    durationMs: sse.durationMs,
    toolCalls,
    toolErrors,
    streamErrors,
    response: cleanedResponse,
    responseSummary: summarize(cleanedResponse, 340),
  }
}

/**
 * Posts one scenario prompt into a chatroom and collects the messages it produced.
 *
 * The room is read before and after the SSE request; every message past the
 * earlier message count is treated as new.
 *
 * @param {{baseUrl: string, accessKey: string}} client API client settings.
 * @param {string} chatroomId Target chatroom id.
 * @param {{id: string, prompt: string, timeoutMs: number}} scenario Scenario to run.
 * @param {Array<string>} expectedAgentIds Agents that were placed in the room.
 * @returns {Promise<object>} Duration, responding agent ids, tool calls, `err`
 *   event summaries and the new messages.
 */
async function runChatroomTurn(client, chatroomId, scenario, expectedAgentIds) {
  const roomPath = `/api/chatrooms/${encodeURIComponent(chatroomId)}`
  const before = await fetchJson(client, 'GET', roomPath)
  const previousCount = Array.isArray(before?.messages) ? before.messages.length : 0
  const sse = await postSse(
    client,
    `${roomPath}/chat`,
    { senderId: 'user', text: scenario.prompt },
    scenario.timeoutMs,
  )
  const after = await fetchJson(client, 'GET', roomPath)
  const messages = Array.isArray(after?.messages) ? after.messages : []
  // Drop empty slots up front so the id extraction and the final projection
  // agree on which entries are safe to dereference.
  const newMessages = messages.slice(previousCount).filter(Boolean)
  const respondedAgentIds = [...new Set(
    newMessages
      .filter((msg) => msg.senderId && msg.senderId !== 'user' && msg.senderId !== 'system')
      .map((msg) => String(msg.senderId)),
  )]
  const { toolCalls } = collectToolStats(sse.events)
  const errors = sse.events
    .filter((event) => event?.t === 'err')
    .map((event) => summarize(event.text || 'unknown error', 180))

  return {
    id: scenario.id,
    durationMs: sse.durationMs,
    expectedAgentIds,
    respondedAgentIds,
    toolCalls,
    errors,
    newMessages: newMessages.map((msg) => ({
      senderId: msg.senderId,
      senderName: msg.senderName,
      text: msg.text,
      mentions: Array.isArray(msg.mentions) ? msg.mentions : [],
    })),
  }
}

/**
 * Picks the agents that share a benchmark chatroom with the probe agent.
 *
 * The probe agent comes first, followed by at most two collaborators: the
 * `default` agent when it is usable, then the candidates that differ most from
 * the probe (model +4, tool set +3, credential flag +1, endpoint flag +1,
 * role keyword in name/description +1).
 *
 * @param {Object<string, object>} agents Agent records keyed by id.
 * @param {object} probeAgent The temporary benchmark agent.
 * @returns {Array<string>} Up to three agent ids.
 */
function selectChatroomAgentIds(agents, probeAgent) {
  const pool = agents || {}
  const probeAgentId = probeAgent?.id
  const selected = []
  const normalizeTools = (tools) => {
    if (!Array.isArray(tools)) return ''
    return [...new Set(tools.map((tool) => String(tool || '').trim()).filter(Boolean))].sort().join(',')
  }
  const probeModelKey = `${probeAgent?.provider || ''}:${probeAgent?.model || ''}`.toLowerCase()
  const probeToolKey = normalizeTools(getAgentTools(probeAgent))
  const probeHasCred = Boolean(probeAgent?.credentialId)
  const probeHasEndpoint = Boolean(probeAgent?.apiEndpoint)
  // Usable means: has a provider, is not OpenClaw, and has what the provider
  // needs here (endpoint for ollama, nothing extra for *-cli, credential otherwise).
  const isHealthyCandidate = (agent) => {
    const provider = String(agent?.provider || '').toLowerCase()
    if (!provider) return false
    if (provider === 'openclaw') return false
    if (provider === 'ollama') return Boolean(agent?.apiEndpoint)
    if (provider.endsWith('-cli')) return true
    return Boolean(agent?.credentialId)
  }

  const candidates = Object.entries(pool)
    .filter(([id, agent]) => {
      if (!agent || id === probeAgentId) return false
      if (!isHealthyCandidate(agent)) return false
      // Skip leftovers from earlier benchmark runs.
      const name = String(agent.name || '').toLowerCase()
      return !(name.includes('probe autonomy') || name.includes('[autonomy probe'))
    })
    .map(([id, agent]) => {
      const modelKey = `${agent.provider || ''}:${agent.model || ''}`.toLowerCase()
      const toolKey = normalizeTools(getAgentTools(agent))
      let score = 0
      if (modelKey !== probeModelKey) score += 4
      if (toolKey !== probeToolKey) score += 3
      if (Boolean(agent.credentialId) !== probeHasCred) score += 1
      if (Boolean(agent.apiEndpoint) !== probeHasEndpoint) score += 1
      const text = `${agent.name || ''} ${agent.description || ''}`.toLowerCase()
      if (/(research|build|assistant|planner|coder|qa|ops|orchestr)/.test(text)) score += 1
      return { id, score }
    })
    .sort((a, b) => b.score - a.score)

  // Keep assistant as first collaborator when available for consistent baseline UX.
  if (pool.default && probeAgentId !== 'default' && isHealthyCandidate(pool.default)) selected.push('default')

  for (const candidate of candidates) {
    if (selected.includes(candidate.id)) continue
    selected.push(candidate.id)
    if (selected.length >= 2) break
  }

  return [probeAgentId, ...selected].filter(Boolean).slice(0, 3)
}

/**
 * Best-effort removal of everything a benchmark run created.
 *
 * Deletes chatrooms, sessions and agents, then every task whose title contains
 * `[Autonomy Probe <runTag>]`, then projects, then workspace directories on disk.
 * Failures never throw; each one becomes a warning string, and one failed
 * deletion does not stop the remaining ones.
 *
 * @param {{baseUrl: string, accessKey: string}} client API client settings.
 * @param {{agents?: string[], sessions?: string[], chatrooms?: string[], projects?: string[], workspaces?: string[]}} ids
 *   Ids (and workspace paths) recorded while the run created them.
 * @param {string} runTag Tag embedded in the titles of tasks created by this run.
 * @returns {Promise<Array<string>>} Warnings describing what could not be cleaned up.
 */
async function cleanupBenchmarkArtifacts(client, ids, runTag) {
  const warnings = []
  const reason = (err) => (err instanceof Error ? err.message : String(err))
  const removeAll = async (label, list, apiPrefix) => {
    for (const id of list || []) {
      try {
        await fetchJson(client, 'DELETE', `${apiPrefix}/${encodeURIComponent(id)}`)
      } catch (err) {
        warnings.push(`cleanup ${label} ${id}: ${reason(err)}`)
      }
    }
  }

  await removeAll('chatroom', ids.chatrooms, '/api/chatrooms')
  await removeAll('session', ids.sessions, '/api/chats')
  await removeAll('agent', ids.agents, '/api/agents')

  try {
    const tasks = await fetchJson(client, 'GET', '/api/tasks')
    for (const row of Object.values(tasks || {})) {
      if (!row || typeof row !== 'object') continue
      const id = row.id
      const title = String(row.title || '')
      if (!id || !title.includes(`[Autonomy Probe ${runTag}]`)) continue
      // Isolate each delete so one failure does not strand the remaining tasks.
      try {
        await fetchJson(client, 'DELETE', `/api/tasks/${encodeURIComponent(id)}`)
      } catch (err) {
        warnings.push(`cleanup task ${id}: ${reason(err)}`)
      }
    }
  } catch (err) {
    warnings.push(`cleanup benchmark tasks: ${reason(err)}`)
  }

  await removeAll('project', ids.projects, '/api/projects')

  for (const workspaceRoot of ids.workspaces || []) {
    try {
      fs.rmSync(workspaceRoot, { recursive: true, force: true })
    } catch (err) {
      warnings.push(`cleanup workspace ${workspaceRoot}: ${reason(err)}`)
    }
  }

  return warnings
}

/**
 * Runs the whole benchmark: creates a temporary probe agent and sessions, plays
 * the session, chatroom and optional OpenClaw scenarios, scores them, removes
 * the temporary artifacts, and writes JSON + Markdown reports to the output
 * directory.
 *
 * Artifacts are removed on failure as well as on success unless
 * `options.keepCreated` is set. Exits the process with code 2 when the
 * normalized score is below the minimum.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs(process.argv.slice(2))
  const profile = resolveProbeProfile(options.profile)
  const accessKey = loadAccessKey(options.accessKey)
  const client = { baseUrl: options.baseUrl, accessKey }
  const runTag = toSlug(nowSlug())
  const probeTitle = `[Autonomy Probe ${runTag}]`
  const createdIds = { agents: [], sessions: [], chatrooms: [], projects: [], workspaces: [] }
  const warnings = []
  const reason = (err) => (err instanceof Error ? err.message : String(err))
  const sumBy = (rows, pick) => rows.reduce((sum, row) => sum + pick(row), 0)

  // Runs at most once: before the report is built on success (so its warnings
  // are included), or from the finally block when the run aborts early.
  let artifactsReleased = false
  const releaseArtifacts = async () => {
    if (artifactsReleased || options.keepCreated) return
    artifactsReleased = true
    const cleanupWarnings = await cleanupBenchmarkArtifacts(client, createdIds, runTag)
    warnings.push(...cleanupWarnings)
  }

  ensureDir(options.outDir)
  const previous = readLatestBenchmark(options.outDir, profile.id)

  await fetchJson(client, 'GET', '/api/auth')
  const agents = await fetchJson(client, 'GET', '/api/agents')
  const defaultAgent = agents?.default || Object.values(agents || {})[0]
  if (!defaultAgent) {
    throw new Error('No agent found. Configure at least one agent before running benchmark.')
  }

  try {
    const workspaceFixture = await prepareWorkspaceFixture(client, runTag, profile, createdIds)

    const probeAgent = await fetchJson(client, 'POST', '/api/agents', {
      name: `${probeTitle} Agent`,
      description: 'Temporary autonomy benchmark agent',
      systemPrompt: defaultAgent.systemPrompt || '',
      provider: defaultAgent.provider || 'openai',
      model: defaultAgent.model || 'gpt-4o',
      credentialId: defaultAgent.credentialId || null,
      apiEndpoint: defaultAgent.apiEndpoint || null,
      tools: profile.tools,
      delegationEnabled: true,
      delegationTargetMode: 'all',
      delegationTargetAgentIds: [],
      memoryScopeMode: profile.hasProjectContext ? 'project' : 'auto',
      projectId: workspaceFixture.project?.id || undefined,
    })
    createdIds.agents.push(probeAgent.id)

    // Both probe sessions share the same settings and differ only by name.
    const createProbeSession = async (name) => {
      const session = await fetchJson(client, 'POST', '/api/chats', {
        name,
        agentId: probeAgent.id,
        provider: probeAgent.provider,
        model: probeAgent.model,
        credentialId: probeAgent.credentialId || null,
        apiEndpoint: probeAgent.apiEndpoint || null,
        extensions: getAgentTools(probeAgent),
        user: 'benchmark',
        cwd: workspaceFixture.workspaceRoot,
      })
      createdIds.sessions.push(session.id)
      return session
    }
    const probeSession = await createProbeSession(`${probeTitle} Session`)
    const memoryRecallSession = await createProbeSession(`${probeTitle} Memory recall`)

    const sessionScenarios = filterScenarioIds(
      buildSessionScenarios(runTag, defaultAgent.id, profile, workspaceFixture),
      options.sessionScenarios,
    )
    if (sessionScenarios.length === 0) {
      throw new Error('No session scenarios selected. Check --session-scenarios values.')
    }
    const sessionResults = []
    const sessionEvaluated = []
    for (const scenario of sessionScenarios) {
      // The recall scenario runs in its own session, separate from the one
      // that handled the other prompts.
      const targetSessionId = scenario.id === 'memory_significant_recall'
        ? memoryRecallSession.id
        : probeSession.id
      const row = await runSessionTurn(client, targetSessionId, scenario)
      let postCheck = null
      if (typeof scenario.postRunCheck === 'function') {
        try {
          postCheck = await scenario.postRunCheck({
            client,
            runTag,
            sessionId: targetSessionId,
            row,
          })
        } catch (err) {
          postCheck = {
            name: 'post_check_error',
            passed: false,
            details: { error: reason(err) },
          }
        }
      }
      sessionResults.push(row)
      sessionEvaluated.push(evaluateSessionScenario(scenario, row, postCheck))
      await sleep(250)
    }

    const roomAgentIds = options.skipChatrooms ? [] : selectChatroomAgentIds(agents, probeAgent)
    // The agent list was fetched before the probe agent existed, so resolve the
    // probe id from the freshly created record.
    const roomAgents = roomAgentIds
      .map((id) => agents[id] || (id === probeAgent.id ? probeAgent : null))
      .filter(Boolean)
    const chatroomResults = []
    const chatroomEvaluated = []

    if (!options.skipChatrooms) {
      for (const scenario of CHATROOM_SCENARIOS) {
        const room = await fetchJson(client, 'POST', '/api/chatrooms', {
          name: `${probeTitle} ${scenario.mode} room`,
          description: `${scenario.mode} benchmark room`,
          agentIds: roomAgentIds,
        })
        createdIds.chatrooms.push(room.id)
        const modeSet = setChatroomHarnessFlags(room.id, {
          chatMode: scenario.mode,
          autoAddress: scenario.autoAddress,
        })
        if (!modeSet) {
          warnings.push(`Could not set chatroom mode flags for ${room.id}; benchmark fell back to room defaults.`)
        }
        const row = await runChatroomTurn(client, room.id, scenario, roomAgentIds)
        chatroomResults.push(row)
        chatroomEvaluated.push(evaluateChatroomScenario(scenario, row, roomAgentIds))
        await sleep(250)
      }
    } else {
      warnings.push('Chatroom scenarios skipped by --skip-chatrooms.')
    }

    const openclawAgent = options.includeOpenclaw
      ? Object.values(agents || {}).find((agent) => agent && String(agent.provider || '').toLowerCase() === 'openclaw')
      : null
    let openclawSession = null
    const openclawResults = []
    if (openclawAgent) {
      try {
        openclawSession = await fetchJson(client, 'POST', '/api/chats', {
          name: `${probeTitle} OpenClaw compare`,
          agentId: openclawAgent.id,
          provider: openclawAgent.provider,
          model: openclawAgent.model,
          credentialId: openclawAgent.credentialId || null,
          apiEndpoint: openclawAgent.apiEndpoint || null,
          extensions: getAgentTools(openclawAgent),
          user: 'benchmark',
        })
        createdIds.sessions.push(openclawSession.id)
        for (const scenario of OPENCLAW_SCENARIOS) {
          const row = await runSessionTurn(client, openclawSession.id, scenario)
          openclawResults.push(row)
          await sleep(250)
        }
      } catch (err) {
        // The comparison is optional: record why it stopped and keep the main results.
        warnings.push(`OpenClaw comparison skipped: ${reason(err)}`)
      }
    }

    const modelDiversity = options.skipChatrooms
      ? {
          weight: 0,
          score: 0,
          passed: true,
          checks: {
            uniqueModels: 0,
            uniqueModelFamilies: 0,
            uniqueCapabilityProfiles: 0,
            uniqueToolProfiles: 0,
            agentCount: 0,
            diversityPct: 0,
            familyDiversityPct: 0,
            capabilityDiversityPct: 0,
            toolProfileDiversityPct: 0,
            specializationPct: 0,
          },
          participants: [],
        }
      : evaluateModelDiversity(roomAgents)
    const sessionScore = round1(sumBy(sessionEvaluated, (row) => row.score))
    const sessionMax = round1(sumBy(sessionEvaluated, (row) => row.weight))
    const chatroomScore = round1(sumBy(chatroomEvaluated, (row) => row.score))
    const chatroomMax = round1(sumBy(chatroomEvaluated, (row) => row.weight))
    const totalScore = round1(sessionScore + chatroomScore + modelDiversity.score)
    const maxScore = round1(sessionMax + chatroomMax + modelDiversity.weight)
    const normalizedScore = maxScore > 0 ? round1((totalScore / maxScore) * 100) : 0
    const grade = gradeForScore(normalizedScore)
    const openclawSummary = evaluateOpenclawComparison(openclawResults)
    const totalDurationMs = sumBy(sessionResults, (row) => row.durationMs)
      + sumBy(chatroomResults, (row) => row.durationMs)
    const totalToolCalls = sumBy(sessionResults, (row) => row.toolCalls.length)
      + sumBy(chatroomResults, (row) => row.toolCalls.length)

    let previousSummary = null
    if (previous?.report?.summary?.totalScore !== undefined) {
      const prevScore = Number(previous.report.summary.totalScore) || 0
      const deltaScore = round1(totalScore - prevScore)
      previousSummary = {
        path: previous.path,
        prevScore: round1(prevScore),
        prevGrade: String(previous.report.summary.grade || '?'),
        deltaScore,
      }
    }

    // Clean up before the report is assembled so cleanup warnings are part of it.
    await releaseArtifacts()

    const report = {
      schemaVersion: 1,
      generatedAt: new Date().toISOString(),
      baseUrl: client.baseUrl,
      runTag,
      profile: {
        id: profile.id,
        label: profile.label,
        tools: [...profile.tools],
        hasTaskManagement: profile.hasTaskManagement,
        hasProjectContext: profile.hasProjectContext,
        hasProjectTool: profile.hasProjectTool,
        hasProjectOperations: profile.hasProjectOperations,
        notes: profile.hasProjectContext
          ? 'Project context uses a real Project record, a workspace under WORKSPACE_ROOT/projects/<projectId>, structured project metadata, and project-linked tasks/schedules/secrets when those tools are enabled.'
          : 'This profile does not enable project context; comparisons isolate task management against file-based fallback workflows.',
      },
      options: {
        sessionScenarioIds: sessionScenarios.map((scenario) => scenario.id),
        skipChatrooms: options.skipChatrooms,
        includeOpenclaw: options.includeOpenclaw,
      },
      summary: {
        totalScore,
        maxScore,
        normalizedScore,
        grade,
        minScore: options.minScore,
        passed: normalizedScore >= options.minScore,
        totalDurationMs,
        totalToolCalls,
      },
      categoryScores: {
        session: {
          score: sessionScore,
          max: sessionMax,
          passed: sessionScore >= sessionMax * 0.7,
        },
        chatroom: {
          score: chatroomScore,
          max: chatroomMax,
          passed: chatroomScore >= chatroomMax * 0.7,
        },
        modelDiversity: {
          score: modelDiversity.score,
          max: modelDiversity.weight,
          passed: modelDiversity.passed,
        },
      },
      probe: {
        profileId: profile.id,
        probeAgent: { id: probeAgent.id, name: probeAgent.name, provider: probeAgent.provider, model: probeAgent.model },
        probeSession: { id: probeSession.id, name: probeSession.name },
        workspaceRoot: workspaceFixture.workspaceRoot,
        project: workspaceFixture.project ? {
          id: workspaceFixture.project.id,
          name: workspaceFixture.project.name,
          description: workspaceFixture.project.description,
        } : null,
        chatroomAgentIds: roomAgentIds,
      },
      sessionScenarios: sessionEvaluated,
      sessionRaw: sessionResults,
      chatroomScenarios: chatroomEvaluated,
      chatroomRaw: chatroomResults,
      modelDiversity,
      openclaw: {
        ...openclawSummary,
        sessionId: openclawSession?.id || null,
        results: openclawResults,
      },
      previous: previousSummary,
      warnings: [...warnings],
    }

    const fileStem = `autonomy-benchmark-${profile.id}-${runTag}`
    const jsonPath = path.join(options.outDir, `${fileStem}.json`)
    const markdownPath = path.join(options.outDir, `${fileStem}.md`)
    fs.writeFileSync(jsonPath, JSON.stringify(report, null, 2))
    fs.writeFileSync(markdownPath, renderMarkdown(report))

    const summaryLine = `${report.summary.passed ? 'PASS' : 'FAIL'} ${report.summary.grade} ${report.summary.normalizedScore}/100`
    console.log(JSON.stringify({
      summary: summaryLine,
      jsonPath,
      markdownPath,
      openclaw: report.openclaw.status,
      warnings: report.warnings,
    }, null, 2))

    if (!report.summary.passed) {
      process.exit(2)
    }
  } finally {
    // Failure path: a scenario or API call threw after artifacts were created.
    // Remove them anyway so aborted runs do not leave probe agents, sessions,
    // rooms or workspaces behind. This is a no-op after a successful cleanup.
    try {
      await releaseArtifacts()
    } catch {
      // Best effort only: the error that aborted the run is the one to surface.
    }
  }
}

main().catch((err) => {
  const message = err instanceof Error ? err.message : String(err)
  console.error(JSON.stringify({ error: message }, null, 2))
  process.exit(1)
})