/ scripts / benchmark-autonomy-harness.mjs
benchmark-autonomy-harness.mjs
   1  #!/usr/bin/env node
   2  
   3  import fs from 'node:fs'
   4  import path from 'node:path'
   5  import Database from 'better-sqlite3'
   6  
// Base URL of the SwarmClaw server; SWARMCLAW_URL overrides the localhost default.
const DEFAULT_BASE_URL = process.env.SWARMCLAW_URL || 'http://localhost:3456'
// Default report directory, resolved against the current working directory.
const DEFAULT_OUT_DIR = path.join(process.cwd(), 'data', 'autonomy-benchmarks')
// Parsed from AUTONOMY_BENCH_MIN_SCORE; may be NaN when the variable is not numeric.
const DEFAULT_MIN_SCORE = Number.parseFloat(process.env.AUTONOMY_BENCH_MIN_SCORE || '70')
// Probe profile name from AUTONOMY_BENCH_PROFILE; blank or whitespace-only values fall back to 'full'.
const DEFAULT_PROBE_PROFILE = String(process.env.AUTONOMY_BENCH_PROFILE || 'full').trim() || 'full'
  11  
  12  function supportsChildWrites(dir) {
  13    try {
  14      fs.mkdirSync(dir, { recursive: true })
  15      const probeDir = fs.mkdtempSync(path.join(dir, '.autonomy-bench-probe-'))
  16      fs.rmSync(probeDir, { recursive: true, force: true })
  17      return true
  18    } catch {
  19      return false
  20    }
  21  }
  22  
  23  function resolveWorkspaceRoot() {
  24    if (process.env.WORKSPACE_DIR) return process.env.WORKSPACE_DIR
  25    const external = path.join(process.env.HOME || '', '.swarmclaw', 'workspace')
  26    if (external && supportsChildWrites(external)) return external
  27    return path.join(process.cwd(), 'data', 'workspace')
  28  }
  29  
// Workspace root resolved once at module load (see resolveWorkspaceRoot).
const WORKSPACE_ROOT = resolveWorkspaceRoot()
  31  
// Groups of tool names that are treated as aliases of one another. The first
// entry of each group is the canonical name that TOOL_CANONICAL_MAP assigns to
// every member. Group order matters: 'manage_sessions' appears in two groups,
// and since the map is filled in array order, the later group (where it is the
// first entry) decides its canonical name.
const TOOL_ALIAS_GROUPS = [
  ['shell', 'execute_command', 'process_tool', 'process'],
  ['files', 'read_file', 'write_file', 'list_files', 'copy_file', 'move_file', 'delete_file', 'send_file'],
  ['edit_file'],
  ['web', 'web_search', 'web_fetch'],
  ['browser', 'openclaw_browser'],
  ['delegate', 'claude_code', 'codex_cli', 'opencode_cli', 'gemini_cli', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli'],
  ['manage_platform', 'manage_agents', 'manage_projects', 'manage_tasks', 'manage_schedules', 'manage_skills', 'manage_documents', 'manage_webhooks', 'manage_secrets', 'manage_sessions'],
  ['manage_connectors', 'connectors', 'connector_message_tool'],
  ['manage_chatrooms', 'chatroom'],
  ['spawn_subagent', 'subagent', 'delegate_to_agent'],
  ['manage_sessions', 'session_info', 'sessions_tool', 'whoami_tool', 'search_history_tool'],
  ['schedule', 'schedule_wake'],
  ['http', 'http_request'],
  ['memory', 'memory_tool'],
  ['execute', 'sandbox'],
  ['wallet', 'wallet_tool'],
  ['monitor', 'monitor_tool'],
  ['sample_ui', 'show_extension_card'],
  ['context_mgmt', 'context_status', 'context_summarize'],
  ['openclaw_workspace'],
  ['openclaw_nodes'],
  ['image_gen', 'generate_image'],
  ['email', 'send_email'],
  ['calendar', 'calendar_events'],
  ['replicate', 'replicate_run', 'replicate_models'],
]
  59  
  60  const TOOL_CANONICAL_MAP = (() => {
  61    const map = new Map()
  62    for (const group of TOOL_ALIAS_GROUPS) {
  63      const normalized = group.map((entry) => String(entry || '').trim().toLowerCase()).filter(Boolean)
  64      const canonical = normalized[0]
  65      if (!canonical) continue
  66      for (const entry of normalized) map.set(entry, canonical)
  67    }
  68    return map
  69  })()
  70  
// Tool names included in every probe profile defined in PROBE_TOOL_PROFILES.
const PROBE_BASE_TOOLS = [
  'shell',
  'execute',
  'process',
  'files',
  'edit_file',
  'web',
  'manage_connectors',
  'manage_sessions',
  'memory',
  'browser',
  'delegate',
  'claude_code',
  'codex_cli',
  'opencode_cli',
]
  87  
// Additional tools appended only by the project-context probe profiles.
const PROJECT_OPERATION_TOOLS = [
  'manage_projects',
  'manage_schedules',
  'manage_secrets',
]
  93  
// Tool lists keyed by probe profile id. Profiles differ in whether they add
// 'manage_tasks' and/or the project operation tools on top of the base set.
const PROBE_TOOL_PROFILES = {
  full: [...PROBE_BASE_TOOLS, 'manage_tasks'],
  no_task_management: [...PROBE_BASE_TOOLS],
  full_project_context: [...PROBE_BASE_TOOLS, 'manage_tasks', ...PROJECT_OPERATION_TOOLS],
  project_context_only: [...PROBE_BASE_TOOLS, ...PROJECT_OPERATION_TOOLS],
}
 100  
// Prompt scenarios, each with an id and a per-scenario timeout in milliseconds.
// NOTE(review): presumably consumed by the optional OpenClaw comparison probe
// (see --no-openclaw in usage) — the consumer is not visible here; confirm.
const OPENCLAW_SCENARIOS = [
  {
    id: 'openclaw_companion',
    prompt: 'Briefly introduce yourself and tell me one concrete way you can help me right now.',
    timeoutMs: 120_000,
  },
  {
    id: 'openclaw_action_request',
    prompt: 'Create a short 3-step plan to research and build a simple app with me, then execute step 1.',
    timeoutMs: 180_000,
  },
]
 113  
 114  function usage() {
 115    console.log([
 116      'Usage: node scripts/benchmark-autonomy-harness.mjs [options]',
 117      '',
 118      'Local-only benchmark for SwarmClaw autonomy harness.',
 119      'This benchmark is intended to be run manually pre-release, not in CI.',
 120      '',
 121      'Options:',
 122      '  --base-url <url>        SwarmClaw base URL (default: http://localhost:3456)',
 123      '  --access-key <key>      Access key (fallback: SWARMCLAW_ACCESS_KEY, then .env.local ACCESS_KEY)',
 124      '  --out-dir <dir>         Output directory for benchmark reports',
 125      '  --min-score <0-100>     Exit non-zero when score is below this threshold (default: 70)',
 126      '  --profile <name>        Probe tool profile: full | no-task-management | full-project-context | project-context-only (default: full)',
 127      '  --session-scenarios <ids> Comma-separated session scenario IDs to run',
 128      '  --skip-chatrooms        Skip chatroom collaboration scenarios',
 129      '  --no-openclaw           Skip optional OpenClaw comparison probe',
 130      '  --keep-created          Keep created benchmark agent/session/chatrooms for inspection',
 131      '  --help                  Show this help',
 132    ].join('\n'))
 133  }
 134  
 135  function parseArgs(argv) {
 136    const options = {
 137      baseUrl: DEFAULT_BASE_URL,
 138      accessKey: '',
 139      outDir: DEFAULT_OUT_DIR,
 140      minScore: Number.isFinite(DEFAULT_MIN_SCORE) ? DEFAULT_MIN_SCORE : 70,
 141      profile: DEFAULT_PROBE_PROFILE,
 142      sessionScenarios: [],
 143      skipChatrooms: false,
 144      includeOpenclaw: true,
 145      keepCreated: false,
 146    }
 147  
 148    for (let i = 0; i < argv.length; i++) {
 149      const arg = argv[i]
 150      if (arg === '--help') {
 151        usage()
 152        process.exit(0)
 153      }
 154      if (arg === '--base-url') {
 155        options.baseUrl = String(argv[++i] || '').trim()
 156        continue
 157      }
 158      if (arg === '--access-key') {
 159        options.accessKey = String(argv[++i] || '').trim()
 160        continue
 161      }
 162      if (arg === '--out-dir') {
 163        options.outDir = String(argv[++i] || '').trim()
 164        continue
 165      }
 166      if (arg === '--min-score') {
 167        const value = Number.parseFloat(String(argv[++i] || ''))
 168        if (!Number.isFinite(value) || value < 0 || value > 100) {
 169          throw new Error('--min-score must be a number between 0 and 100')
 170        }
 171        options.minScore = value
 172        continue
 173      }
 174      if (arg === '--profile') {
 175        options.profile = String(argv[++i] || '').trim()
 176        continue
 177      }
 178      if (arg === '--session-scenarios') {
 179        options.sessionScenarios = String(argv[++i] || '')
 180          .split(',')
 181          .map((value) => value.trim())
 182          .filter(Boolean)
 183        continue
 184      }
 185      if (arg === '--skip-chatrooms') {
 186        options.skipChatrooms = true
 187        continue
 188      }
 189      if (arg === '--no-openclaw') {
 190        options.includeOpenclaw = false
 191        continue
 192      }
 193      if (arg === '--keep-created') {
 194        options.keepCreated = true
 195        continue
 196      }
 197      throw new Error(`Unknown argument: ${arg}`)
 198    }
 199  
 200    return options
 201  }
 202  
 203  function normalizeProbeProfileName(value) {
 204    return String(value || '')
 205      .trim()
 206      .toLowerCase()
 207      .replace(/[^a-z0-9]+/g, '_')
 208      .replace(/^_+|_+$/g, '')
 209  }
 210  
 211  function resolveProbeProfile(value) {
 212    const normalized = normalizeProbeProfileName(value) || 'full'
 213    if (normalized === 'full') {
 214      return {
 215        id: 'full',
 216        label: 'Full tool profile',
 217        tools: [...PROBE_TOOL_PROFILES.full],
 218        hasTaskManagement: true,
 219        hasProjectContext: false,
 220        hasProjectTool: false,
 221        hasProjectOperations: false,
 222      }
 223    }
 224    if (normalized === 'no_task_management' || normalized === 'taskless') {
 225      return {
 226        id: 'no_task_management',
 227        label: 'No task management tool',
 228        tools: [...PROBE_TOOL_PROFILES.no_task_management],
 229        hasTaskManagement: false,
 230        hasProjectContext: false,
 231        hasProjectTool: false,
 232        hasProjectOperations: false,
 233      }
 234    }
 235    if (normalized === 'full_project_context' || normalized === 'project_context' || normalized === 'task_and_project_context') {
 236      return {
 237        id: 'full_project_context',
 238        label: 'Task management with active project context',
 239        tools: [...PROBE_TOOL_PROFILES.full_project_context],
 240        hasTaskManagement: true,
 241        hasProjectContext: true,
 242        hasProjectTool: true,
 243        hasProjectOperations: true,
 244      }
 245    }
 246    if (normalized === 'project_context_only' || normalized === 'project_only' || normalized === 'taskless_project_context') {
 247      return {
 248        id: 'project_context_only',
 249        label: 'Active project context without task management',
 250        tools: [...PROBE_TOOL_PROFILES.project_context_only],
 251        hasTaskManagement: false,
 252        hasProjectContext: true,
 253        hasProjectTool: true,
 254        hasProjectOperations: true,
 255      }
 256    }
 257    throw new Error(`Unknown --profile value "${value}". Valid values: full, no-task-management, full-project-context, project-context-only`)
 258  }
 259  
 260  function loadAccessKey(explicitKey) {
 261    if (explicitKey) return explicitKey
 262    if (process.env.SWARMCLAW_ACCESS_KEY) return process.env.SWARMCLAW_ACCESS_KEY
 263    const envPath = path.join(process.cwd(), '.env.local')
 264    if (!fs.existsSync(envPath)) {
 265      throw new Error('Access key missing. Pass --access-key or set SWARMCLAW_ACCESS_KEY/.env.local ACCESS_KEY')
 266    }
 267    const raw = fs.readFileSync(envPath, 'utf8')
 268    const line = raw.split('\n').find((entry) => entry.startsWith('ACCESS_KEY='))
 269    if (!line) {
 270      throw new Error('ACCESS_KEY not found in .env.local')
 271    }
 272    const key = line.slice('ACCESS_KEY='.length).trim()
 273    if (!key) {
 274      throw new Error('ACCESS_KEY is empty in .env.local')
 275    }
 276    return key
 277  }
 278  
 279  function toSlug(value) {
 280    return value
 281      .toLowerCase()
 282      .replace(/[^a-z0-9]+/g, '-')
 283      .replace(/^-+|-+$/g, '')
 284      .slice(0, 40)
 285  }
 286  
 287  function nowSlug() {
 288    return new Date().toISOString().replace(/[^\d]/g, '').slice(0, 14)
 289  }
 290  
 291  function summarize(text, max = 220) {
 292    const compact = String(text || '').replace(/\s+/g, ' ').trim()
 293    return compact.length > max ? `${compact.slice(0, max - 3)}...` : compact
 294  }
 295  
 296  function stripRunNoise(text) {
 297    let cleaned = String(text || '').trim()
 298    // Session SSE can prefix multiple {"run":...} status envelopes before the actual assistant text.
 299    while (cleaned.startsWith('{"run":')) {
 300      const end = cleaned.indexOf('}}')
 301      if (end === -1) break
 302      cleaned = cleaned.slice(end + 2).trim()
 303    }
 304    return cleaned
 305  }
 306  
 307  function sleep(ms) {
 308    return new Promise((resolve) => setTimeout(resolve, ms))
 309  }
 310  
 311  function round1(value) {
 312    return Math.round(value * 10) / 10
 313  }
 314  
 315  function normalizeToolName(value) {
 316    return typeof value === 'string' ? value.trim().toLowerCase() : ''
 317  }
 318  
 319  function canonicalizeToolName(value) {
 320    const normalized = normalizeToolName(value)
 321    if (!normalized) return ''
 322    return TOOL_CANONICAL_MAP.get(normalized) || normalized
 323  }
 324  
 325  function canonicalizeToolList(values) {
 326    if (!Array.isArray(values)) return []
 327    return [...new Set(values.map((value) => canonicalizeToolName(value)).filter(Boolean))]
 328  }
 329  
 330  function getAgentTools(agent) {
 331    if (Array.isArray(agent?.extensions) && agent.extensions.length > 0) return agent.extensions
 332    if (Array.isArray(agent?.tools) && agent.tools.length > 0) return agent.tools
 333    return []
 334  }
 335  
 336  function ensureDir(dir) {
 337    fs.mkdirSync(dir, { recursive: true })
 338  }
 339  
 340  function writeTextFile(filePath, content) {
 341    ensureDir(path.dirname(filePath))
 342    fs.writeFileSync(filePath, content)
 343  }
 344  
 345  function extractFirstId(text) {
 346    const match = String(text || '').match(/\b([a-f0-9]{8})\b/i)
 347    return match ? match[1] : null
 348  }
 349  
 350  function extractUploadUrls(text) {
 351    const matches = String(text || '').match(/\/api\/uploads\/[^\s)"'`]+/g) || []
 352    return [...new Set(matches)]
 353  }
 354  
 355  function gradeForScore(score) {
 356    if (score >= 90) return 'A'
 357    if (score >= 80) return 'B'
 358    if (score >= 70) return 'C'
 359    if (score >= 60) return 'D'
 360    return 'F'
 361  }
 362  
 363  async function fetchJson(client, method, route, body, timeoutMs = 25_000) {
 364    const controller = new AbortController()
 365    const timer = setTimeout(() => controller.abort(), timeoutMs)
 366    try {
 367      const res = await fetch(`${client.baseUrl}${route}`, {
 368        method,
 369        headers: {
 370          'content-type': 'application/json',
 371          'x-access-key': client.accessKey,
 372        },
 373        body: body === undefined ? undefined : JSON.stringify(body),
 374        signal: controller.signal,
 375      })
 376      const text = await res.text()
 377      if (!res.ok) {
 378        throw new Error(`${method} ${route} failed (${res.status}): ${summarize(text, 280)}`)
 379      }
 380      if (!text) return null
 381      try {
 382        return JSON.parse(text)
 383      } catch {
 384        return text
 385      }
 386    } finally {
 387      clearTimeout(timer)
 388    }
 389  }
 390  
 391  async function postSse(client, route, body, timeoutMs = 180_000) {
 392    const controller = new AbortController()
 393    const timer = setTimeout(() => controller.abort(), timeoutMs)
 394    const startedAt = Date.now()
 395    try {
 396      const res = await fetch(`${client.baseUrl}${route}`, {
 397        method: 'POST',
 398        headers: {
 399          'content-type': 'application/json',
 400          'x-access-key': client.accessKey,
 401        },
 402        body: JSON.stringify(body),
 403        signal: controller.signal,
 404      })
 405  
 406      if (!res.ok || !res.body) {
 407        const text = await res.text().catch(() => '')
 408        throw new Error(`POST ${route} failed (${res.status}): ${summarize(text, 280)}`)
 409      }
 410  
 411      const decoder = new TextDecoder()
 412      const reader = res.body.getReader()
 413      let buffer = ''
 414      let textAcc = ''
 415      let replacedText = ''
 416      const events = []
 417  
 418      while (true) {
 419        const { done, value } = await reader.read()
 420        if (done) break
 421        buffer += decoder.decode(value, { stream: true })
 422        let idx = buffer.indexOf('\n\n')
 423        while (idx !== -1) {
 424          const chunk = buffer.slice(0, idx)
 425          buffer = buffer.slice(idx + 2)
 426          const line = chunk
 427            .split('\n')
 428            .map((entry) => entry.trim())
 429            .find((entry) => entry.startsWith('data: '))
 430          if (line) {
 431            try {
 432              const ev = JSON.parse(line.slice(6))
 433              events.push(ev)
 434              if ((ev?.t === 'd' || ev?.t === 'md') && typeof ev.text === 'string') textAcc += ev.text
 435              if (ev?.t === 'r' && typeof ev.text === 'string') replacedText = ev.text
 436            } catch {
 437              // ignore malformed event chunk
 438            }
 439          }
 440          idx = buffer.indexOf('\n\n')
 441        }
 442      }
 443  
 444      return {
 445        events,
 446        durationMs: Date.now() - startedAt,
 447        text: replacedText || textAcc,
 448      }
 449    } finally {
 450      clearTimeout(timer)
 451    }
 452  }
 453  
 454  function setChatroomHarnessFlags(chatroomId, { chatMode, autoAddress }) {
 455    const dbPath = path.join(process.cwd(), 'data', 'swarmclaw.db')
 456    if (!fs.existsSync(dbPath)) return false
 457    let db = null
 458    try {
 459      db = new Database(dbPath)
 460      const row = db.prepare('SELECT data FROM chatrooms WHERE id = ?').get(chatroomId)
 461      if (!row?.data) return false
 462      const parsed = JSON.parse(row.data)
 463      parsed.chatMode = chatMode
 464      parsed.autoAddress = autoAddress
 465      parsed.updatedAt = Date.now()
 466      db.prepare('UPDATE chatrooms SET data = ? WHERE id = ?').run(JSON.stringify(parsed), chatroomId)
 467      return true
 468    } catch {
 469      return false
 470    } finally {
 471      if (db) db.close()
 472    }
 473  }
 474  
 475  function collectToolStats(events) {
 476    const toolCalls = events
 477      .filter((event) => event?.t === 'tool_call')
 478      .map((event) => String(event.toolName || 'unknown'))
 479  
 480    const toolErrors = events
 481      .filter((event) => event?.t === 'tool_result')
 482      .map((event) => event?.toolOutput)
 483      .filter((value) => typeof value === 'string' && /^Error:/i.test(value.trim()))
 484      .map((value) => summarize(value, 180))
 485  
 486    const streamErrors = events
 487      .filter((event) => event?.t === 'err')
 488      .map((event) => summarize(event.text || 'unknown error', 180))
 489  
 490    return { toolCalls, toolErrors, streamErrors }
 491  }
 492  
 493  function containsEmpathy(text) {
 494    return /\b(i hear you|i understand|that sounds hard|you are not alone|i am here|with you|sorry you|overwhelmed)\b/i.test(String(text || ''))
 495  }
 496  
 497  function containsActionableStep(text) {
 498    return /\b(next step|first step|we can|let's|right now|today|do this)\b/i.test(String(text || ''))
 499  }
 500  
 501  function isoToLongDate(isoDate) {
 502    const stamp = new Date(`${isoDate}T00:00:00Z`)
 503    if (Number.isNaN(stamp.getTime())) return isoDate
 504    return stamp.toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric', timeZone: 'UTC' })
 505  }
 506  
 507  function mentionsDate(text, isoDate) {
 508    const source = String(text || '').toLowerCase()
 509    const iso = String(isoDate || '').toLowerCase()
 510    const longForm = isoToLongDate(isoDate).toLowerCase()
 511    return source.includes(iso) || source.includes(longForm)
 512  }
 513  
 514  function listBenchmarkTasksByTitle(tasks, titleIncludes) {
 515    return Object.values(tasks || {}).filter((row) =>
 516      row
 517      && typeof row === 'object'
 518      && String(row.title || '').includes(titleIncludes),
 519    )
 520  }
 521  
 522  async function waitForBenchmarkTasks(client, titleIncludes, predicate, timeoutMs = 90_000) {
 523    const startedAt = Date.now()
 524    let lastMatching = []
 525    while (Date.now() - startedAt < timeoutMs) {
 526      const tasks = await fetchJson(client, 'GET', '/api/tasks')
 527      lastMatching = listBenchmarkTasksByTitle(tasks, titleIncludes)
 528      if (predicate(lastMatching)) return lastMatching
 529      await sleep(1500)
 530    }
 531    return lastMatching
 532  }
 533  
 534  function countMemoriesContaining(needle) {
 535    const dbPath = path.join(process.cwd(), 'data', 'memory.db')
 536    if (!fs.existsSync(dbPath)) return 0
 537    let db = null
 538    try {
 539      db = new Database(dbPath, { readonly: true })
 540      const like = `%${needle}%`
 541      const row = db
 542        .prepare('SELECT COUNT(*) AS count FROM memories WHERE content LIKE ? OR title LIKE ?')
 543        .get(like, like)
 544      return Number(row?.count || 0)
 545    } catch {
 546      return 0
 547    } finally {
 548      if (db) db.close()
 549    }
 550  }
 551  
 552  async function prepareWorkspaceFixture(client, runTag, profile, createdIds) {
 553    const metadata = {
 554      projectName: 'HarborPilot Dispatch',
 555      objective: 'Reduce incident handoff chaos and prove a lightweight operator workflow that can expand into inbox-driven ops automation.',
 556      targetUser: 'marina operations managers',
 557      pilotPriorities: ['SMS outage handling', 'dock reassignment'],
 558      openObjectives: [
 559        'publish a triage research brief',
 560        'prepare credential bootstrap for ops inbox workflows',
 561      ],
 562      capabilityHints: [
 563        'research',
 564        'build',
 565        'web browsing',
 566        'credential bootstrapping',
 567        'goal tracking',
 568      ],
 569      successMetrics: [
 570        'publish a handoff summary within 5 minutes of an incident update',
 571        'prepare one reusable operator playbook each pilot week',
 572      ],
 573      credentialRequirements: [
 574        'mockmail app password for operator inbox automation',
 575        'harbor metrics api token for pilot reporting',
 576      ],
 577      heartbeatPrompt: 'Review active pilot risks, inbox blockers, and the next operator action.',
 578      heartbeatIntervalSec: 1800,
 579      projectDescription: [
 580        'HarborPilot Dispatch is a B2B dock-operations workspace for marina operations managers.',
 581        'The first pilot is focused on SMS outage handling and dock reassignment during busy charter turnover.',
 582      ].join(' '),
 583      color: '#0f766e',
 584    }
 585  
 586    let project = null
 587    let workspaceRoot = ''
 588    if (profile.hasProjectContext) {
 589      project = await fetchJson(client, 'POST', '/api/projects', {
 590        name: metadata.projectName,
 591        description: metadata.projectDescription,
 592        color: metadata.color,
 593        objective: metadata.objective,
 594        audience: metadata.targetUser,
 595        priorities: metadata.pilotPriorities,
 596        openObjectives: metadata.openObjectives,
 597        capabilityHints: metadata.capabilityHints,
 598        successMetrics: metadata.successMetrics,
 599        credentialRequirements: metadata.credentialRequirements,
 600        heartbeatPrompt: metadata.heartbeatPrompt,
 601        heartbeatIntervalSec: metadata.heartbeatIntervalSec,
 602      })
 603      createdIds.projects.push(project.id)
 604      workspaceRoot = path.join(WORKSPACE_ROOT, 'projects', project.id)
 605    } else {
 606      workspaceRoot = path.join(WORKSPACE_ROOT, `autonomy-benchmark-${runTag}-${profile.id}`)
 607    }
 608  
 609    createdIds.workspaces.push(workspaceRoot)
 610    ensureDir(workspaceRoot)
 611  
 612    writeTextFile(path.join(workspaceRoot, 'README.md'), [
 613      '# Workspace Seed',
 614      '',
 615      'This workspace intentionally starts with partial product notes.',
 616      'Inspect the files, create concrete artifacts, and prefer moving the work forward over talking about it.',
 617    ].join('\n'))
 618  
 619    writeTextFile(path.join(workspaceRoot, 'docs', 'problem-notes.md'), [
 620      '# Problem Notes',
 621      '',
 622      '- Operators lose time during incident escalation and shift handoff.',
 623      '- Internal checklists are inconsistent, especially under time pressure.',
 624      '- The first deliverables should stay narrow and execution-oriented.',
 625    ].join('\n'))
 626  
 627    writeTextFile(path.join(workspaceRoot, 'docs', 'constraints.md'), [
 628      '# Constraints',
 629      '',
 630      '- Keep deliverables lightweight and testable.',
 631      '- Bias toward artifacts that can be reused in a kickoff or pilot review.',
 632      '- Assume the team needs clear assumptions and explicit risks, not generic strategy language.',
 633    ].join('\n'))
 634  
 635    writeTextFile(path.join(workspaceRoot, 'docs', 'interview-snippets.md'), [
 636      '# Interview Snippets',
 637      '',
 638      '> We keep losing context during handoff, and then incidents drag on longer than they should.',
 639      '> If a tool helps us recover faster, I care more about clarity than fancy dashboards.',
 640    ].join('\n'))
 641  
 642    return {
 643      workspaceRoot,
 644      project,
 645      projectName: metadata.projectName,
 646      targetUser: metadata.targetUser,
 647      pilotPriorities: [...metadata.pilotPriorities],
 648      openObjectives: [...metadata.openObjectives],
 649      capabilityHints: [...metadata.capabilityHints],
 650      successMetrics: [...metadata.successMetrics],
 651      credentialRequirements: [...metadata.credentialRequirements],
 652      heartbeatPrompt: metadata.heartbeatPrompt,
 653      heartbeatIntervalSec: metadata.heartbeatIntervalSec,
 654      objective: metadata.objective,
 655      projectDescription: metadata.projectDescription,
 656      paths: {
 657        moneyPlanPath: 'plans/money-plan.md',
 658        backlogPath: 'plans/project-backlog.md',
 659        researchBriefPath: 'docs/research-brief.md',
 660        launchDraftPath: 'docs/launch-brief-draft.md',
 661        launchCritiquePath: 'docs/launch-brief-critique.md',
 662        launchFinalPath: 'docs/launch-brief-final.md',
 663        inboxOpsPath: 'ops/inbox-ops-playbook.md',
 664        marketWatchPath: 'plans/risk-bounded-market-watch.md',
 665      },
 666    }
 667  }
 668  
 669  function buildSessionScenarios(runTag, delegateAgentId, profile, fixture) {
 670    const moneyTaskPrefix = `[Autonomy Probe ${runTag}] money-`
 671    const deliveryTaskPrefix = `[Autonomy Probe ${runTag}] harbor-`
 672    const inboxTaskTitle = `[Autonomy Probe ${runTag}] inbox-triage-playbook`
 673    const marketTaskPrefix = `[Autonomy Probe ${runTag}] market-`
 674    const resumeTaskPrefix = `[Autonomy Probe ${runTag}] resume-`
 675    const resumeSourceTaskTitle = `${resumeTaskPrefix}source`
 676    const resumeFollowupTaskTitle = `${resumeTaskPrefix}followup`
 677    const birthday = '2031-04-17'
 678    const anniversary = '2031-10-02'
 679    const recurringBug = `ws reconnect loop ${runTag}`
 680    const moneyPlanPath = fixture.paths.moneyPlanPath
 681    const backlogPath = fixture.paths.backlogPath
 682    const researchBriefPath = fixture.paths.researchBriefPath
 683    const launchDraftPath = fixture.paths.launchDraftPath
 684    const launchCritiquePath = fixture.paths.launchCritiquePath
 685    const launchFinalPath = fixture.paths.launchFinalPath
 686    const inboxOpsPath = fixture.paths.inboxOpsPath
 687    const marketWatchPath = fixture.paths.marketWatchPath
 688    const resumeSourcePath = 'docs/task-continuation-source.md'
 689    const resumeFollowupPath = 'docs/task-continuation-followup.md'
 690    const hasTaskManagement = Boolean(profile?.hasTaskManagement)
 691    const hasProjectContext = Boolean(profile?.hasProjectContext)
 692    const hasProjectTool = Boolean(profile?.hasProjectTool)
 693    const hasProjectOperations = Boolean(profile?.hasProjectOperations)
 694    const projectOpsPath = 'plans/project-ops-brief.md'
 695    const credentialPlanPath = 'ops/credential-bootstrap.md'
 696    const heartbeatPlanPath = 'ops/project-heartbeat.md'
 697    const activeProjectId = fixture.project?.id || null
 698    return [
 699      {
 700        id: 'task_continuation_resume',
 701        skill: 'task_followup_continuation',
 702        weight: 8,
 703        timeoutMs: 220_000,
 704        requiresTool: true,
 705        expectedTools: hasTaskManagement ? ['manage_tasks'] : ['files'],
 706        prompt: hasTaskManagement
 707          ? [
 708              'Set up a two-step continuation workflow using task management.',
 709              `Create exactly two queued tasks titled "${resumeSourceTaskTitle}" and "${resumeFollowupTaskTitle}".`,
 710              `Assign both tasks to agent "${delegateAgentId}".`,
 711              hasProjectContext ? 'If an active project exists, let the active project be used by default for both tasks.' : 'If no active project exists, do not fabricate a project link.',
 712              `The "${resumeSourceTaskTitle}" task should create "${resumeSourcePath}" with sections "Context" and "Next Step".`,
 713              `The "${resumeFollowupTaskTitle}" task should create "${resumeFollowupPath}" with sections "Continuation" and "Inherited Context", and it must mention "${resumeSourcePath}" inside the file.`,
 714              `Use "continueFromTaskId" on "${resumeFollowupTaskTitle}" so it follows "${resumeSourceTaskTitle}" and reuses the earlier task context when possible.`,
 715              'Confirm both task ids.',
 716            ].join(' ')
 717          : [
 718              'Task management is unavailable in this session.',
 719              `Write "${resumeSourcePath}" with sections "Context" and "Next Step".`,
 720              `Then write "${resumeFollowupPath}" with sections "Continuation" and "Inherited Context", and mention "${resumeSourcePath}" inside the file.`,
 721              'Confirm both file paths.',
 722            ].join(' '),
 723        semanticCheck: hasTaskManagement
 724          ? (result) => extractFirstId(result.response) !== null && /\btask\b/i.test(result.response)
 725          : (result) => result.response.includes(resumeSourcePath) && result.response.includes(resumeFollowupPath),
 726        externalCheckWeight: 0.35,
 727        postRunCheck: hasTaskManagement
 728          ? async ({ client }) => {
 729              const matching = await waitForBenchmarkTasks(
 730                client,
 731                resumeTaskPrefix,
 732                (rows) => rows.length >= 2 && rows.every((row) => ['completed', 'failed'].includes(String(row.status || ''))),
 733                120_000,
 734              )
 735              const sourceTask = matching.find((row) => String(row?.title || '') === resumeSourceTaskTitle) || null
 736              const followupTask = matching.find((row) => String(row?.title || '') === resumeFollowupTaskTitle) || null
 737              const sourceAbs = path.join(fixture.workspaceRoot, resumeSourcePath)
 738              const followupAbs = path.join(fixture.workspaceRoot, resumeFollowupPath)
 739              const followupText = fs.existsSync(followupAbs) ? fs.readFileSync(followupAbs, 'utf8') : ''
 740              const sameSession = Boolean(sourceTask?.sessionId && followupTask?.sessionId && sourceTask.sessionId === followupTask.sessionId)
 741              const reusedPriorSession = /reusing prior session/i.test(String(followupTask?.checkpoint?.note || ''))
 742              const inheritedContinuationContext = sameSession || reusedPriorSession
 743              const projectLinked = !hasProjectContext || !activeProjectId
 744                ? true
 745                : sourceTask?.projectId === activeProjectId && followupTask?.projectId === activeProjectId
 746              return {
 747                name: 'task_continuation_workflow_completed',
 748                passed: sourceTask?.status === 'completed'
 749                  && followupTask?.status === 'completed'
 750                  && Array.isArray(followupTask?.blockedBy)
 751                  && followupTask.blockedBy.includes(sourceTask.id)
 752                  && inheritedContinuationContext
 753                  && fs.existsSync(sourceAbs)
 754                  && fs.existsSync(followupAbs)
 755                  && followupText.includes(resumeSourcePath)
 756                  && projectLinked,
 757                details: {
 758                  sourceTaskId: sourceTask?.id || null,
 759                  followupTaskId: followupTask?.id || null,
 760                  sourceStatus: sourceTask?.status || null,
 761                  followupStatus: followupTask?.status || null,
 762                  sameSession,
 763                  reusedPriorSession,
 764                  inheritedContinuationContext,
 765                  projectLinked,
 766                  sourceExists: fs.existsSync(sourceAbs),
 767                  followupExists: fs.existsSync(followupAbs),
 768                },
 769              }
 770            }
 771          : async () => {
 772              const sourceAbs = path.join(fixture.workspaceRoot, resumeSourcePath)
 773              const followupAbs = path.join(fixture.workspaceRoot, resumeFollowupPath)
 774              const followupText = fs.existsSync(followupAbs) ? fs.readFileSync(followupAbs, 'utf8') : ''
 775              return {
 776                name: 'continuation_files_written',
 777                passed: fs.existsSync(sourceAbs)
 778                  && fs.existsSync(followupAbs)
 779                  && followupText.includes(resumeSourcePath),
 780                details: {
 781                  sourceExists: fs.existsSync(sourceAbs),
 782                  followupExists: fs.existsSync(followupAbs),
 783                },
 784              }
 785            },
 786      },
 787      {
 788        id: 'money_goal_workflow',
 789        skill: 'autonomous_goal_execution',
 790        weight: 8,
 791        timeoutMs: 220_000,
 792        requiresTool: true,
 793        expectedTools: hasTaskManagement ? ['web', 'manage_tasks', 'files'] : ['web', 'files'],
 794        prompt: hasTaskManagement
 795          ? [
 796              'A user says: "I want you to help me make money legally."',
 797              'Do one concrete research step.',
 798              `Create exactly two backlog tasks (not queued) titled "${moneyTaskPrefix}research" and "${moneyTaskPrefix}mvp" using the task-management capability.`,
 799              `Assign both tasks to agent "${delegateAgentId}" instead of yourself.`,
 800              hasProjectContext ? 'If an active project exists, link both tasks to it.' : 'If no active project exists, do not invent one.',
 801              `Write a short plan to "${moneyPlanPath}", and then confirm task IDs and file path.`,
 802            ].join(' ')
 803          : [
 804              'A user says: "I want you to help me make money legally."',
 805              'Do one concrete research step.',
 806              `Task management is intentionally unavailable in this session, so do not claim to create tasks.`,
 807              `Instead, write a short plan to "${moneyPlanPath}" with a "Backlog" section that contains exactly two bullet items titled "${moneyTaskPrefix}research" and "${moneyTaskPrefix}mvp".`,
 808              'Then confirm the file path and the two backlog item titles.',
 809            ].join(' '),
 810        semanticCheck: hasTaskManagement
 811          ? (result) =>
 812              /money-plan\.md/i.test(result.response) &&
 813              extractFirstId(result.response) !== null &&
 814              /\b(task|backlog)\b/i.test(result.response)
 815          : (result) =>
 816              /money-plan\.md/i.test(result.response) &&
 817              result.response.includes(`${moneyTaskPrefix}research`) &&
 818              result.response.includes(`${moneyTaskPrefix}mvp`),
 819        externalCheckWeight: hasTaskManagement ? 0.2 : 0.2,
 820        postRunCheck: hasTaskManagement
 821          ? async ({ client }) => {
 822              const tasks = await fetchJson(client, 'GET', '/api/tasks')
 823              const matching = listBenchmarkTasksByTitle(tasks, moneyTaskPrefix)
 824              const projectLinkedCount = hasProjectContext && fixture.project?.id
 825                ? matching.filter((row) => row.projectId === fixture.project.id).length
 826                : null
 827              return {
 828                name: 'money_tasks_created',
 829                passed: matching.length >= 2 && (!hasProjectContext || projectLinkedCount >= 2),
 830                details: {
 831                  taskCount: matching.length,
 832                  expectedAtLeast: 2,
 833                  projectLinkedCount,
 834                  expectedProjectId: fixture.project?.id || null,
 835                },
 836              }
 837            }
 838          : async () => {
 839              const planPath = path.join(fixture.workspaceRoot, moneyPlanPath)
 840              const planText = fs.existsSync(planPath) ? fs.readFileSync(planPath, 'utf8') : ''
 841              return {
 842                name: 'money_plan_backlog_written',
 843                passed: planText.includes(`${moneyTaskPrefix}research`) && planText.includes(`${moneyTaskPrefix}mvp`),
 844                details: {
 845                  exists: fs.existsSync(planPath),
 846                  planPath,
 847                },
 848              }
 849            },
 850      },
 851      {
 852        id: 'project_delivery_execution',
 853        skill: 'project_execution',
 854        weight: 10,
 855        timeoutMs: 220_000,
 856        requiresTool: true,
 857        expectedTools: hasTaskManagement ? ['manage_tasks', 'files'] : ['files'],
 858        prompt: hasTaskManagement
 859          ? [
 860              'You are working in the current workspace.',
 861              'Inspect the existing files before acting.',
 862              `Create exactly three backlog tasks titled "${deliveryTaskPrefix}research-brief", "${deliveryTaskPrefix}launch-checklist", and "${deliveryTaskPrefix}qa-pass".`,
 863              `Assign all three tasks to agent "${delegateAgentId}" instead of yourself.`,
 864              hasProjectContext ? 'If an active project exists, link all three tasks to it.' : 'If no active project exists, do not fabricate a project link.',
 865              `Then execute the first step immediately by creating "${researchBriefPath}" with sections "Target User", "Primary Pain", "Assumptions", and "Risks".`,
 866              'Under "Risks", include a markdown table with at least two rows.',
 867              'Finally confirm the file path and the task ids.',
 868            ].join(' ')
 869          : [
 870              'You are working in the current workspace.',
 871              'Inspect the existing files before acting.',
 872              'Task management is intentionally unavailable in this session, so do not claim to create tasks.',
 873              `Write "${backlogPath}" with exactly three bullet items titled "${deliveryTaskPrefix}research-brief", "${deliveryTaskPrefix}launch-checklist", and "${deliveryTaskPrefix}qa-pass".`,
 874              `Then execute the first step immediately by creating "${researchBriefPath}" with sections "Target User", "Primary Pain", "Assumptions", and "Risks".`,
 875              'Under "Risks", include a markdown table with at least two rows.',
 876              'Finally confirm the backlog file path and the research brief path.',
 877            ].join(' '),
 878        semanticCheck: hasTaskManagement
 879          ? (result) =>
 880              result.response.includes(researchBriefPath) &&
 881              extractFirstId(result.response) !== null
 882          : (result) =>
 883              result.response.includes(backlogPath) &&
 884              result.response.includes(researchBriefPath),
 885        externalCheckWeight: 0.15,
 886        postRunCheck: hasTaskManagement
 887          ? async ({ client }) => {
 888              const tasks = await fetchJson(client, 'GET', '/api/tasks')
 889              const matching = listBenchmarkTasksByTitle(tasks, deliveryTaskPrefix)
 890              const researchBriefAbs = path.join(fixture.workspaceRoot, researchBriefPath)
 891              const researchBrief = fs.existsSync(researchBriefAbs) ? fs.readFileSync(researchBriefAbs, 'utf8') : ''
 892              const projectLinkedCount = hasProjectContext && fixture.project?.id
 893                ? matching.filter((row) => row.projectId === fixture.project.id).length
 894                : null
 895              return {
 896                name: 'project_tasks_and_brief_created',
 897                passed: matching.length >= 3
 898                  && researchBrief.includes('## Target User')
 899                  && researchBrief.includes('## Risks')
 900                  && /\|.+\|.+\|/.test(researchBrief)
 901                  && (!hasProjectContext || projectLinkedCount >= 3),
 902                details: {
 903                  taskCount: matching.length,
 904                  expectedAtLeast: 3,
 905                  projectLinkedCount,
 906                  expectedProjectId: fixture.project?.id || null,
 907                  researchBriefExists: fs.existsSync(researchBriefAbs),
 908                },
 909              }
 910            }
 911          : async () => {
 912              const backlogAbs = path.join(fixture.workspaceRoot, backlogPath)
 913              const researchBriefAbs = path.join(fixture.workspaceRoot, researchBriefPath)
 914              const backlogText = fs.existsSync(backlogAbs) ? fs.readFileSync(backlogAbs, 'utf8') : ''
 915              const researchBrief = fs.existsSync(researchBriefAbs) ? fs.readFileSync(researchBriefAbs, 'utf8') : ''
 916              return {
 917                name: 'project_backlog_and_brief_written',
 918                passed: backlogText.includes(`${deliveryTaskPrefix}research-brief`)
 919                  && backlogText.includes(`${deliveryTaskPrefix}launch-checklist`)
 920                  && backlogText.includes(`${deliveryTaskPrefix}qa-pass`)
 921                  && researchBrief.includes('## Target User')
 922                  && researchBrief.includes('## Risks')
 923                  && /\|.+\|.+\|/.test(researchBrief),
 924                details: {
 925                  backlogExists: fs.existsSync(backlogAbs),
 926                  researchBriefExists: fs.existsSync(researchBriefAbs),
 927                  backlogAbs,
 928                  researchBriefAbs,
 929                },
 930              }
 931            },
 932      },
 933      {
 934        id: 'open_ended_iteration',
 935        skill: 'deliverable_iteration',
 936        weight: 10,
 937        timeoutMs: 220_000,
 938        requiresTool: true,
 939        expectedTools: ['files'],
 940        prompt: [
 941          'Create a first draft launch brief for the current workspace at',
 942          `"${launchDraftPath}".`,
 943          'Then write a short critique at',
 944          `"${launchCritiquePath}" that names at least two weaknesses in the draft.`,
 945          'Then revise the brief into',
 946          `"${launchFinalPath}" and make at least one concrete change because of that critique.`,
 947          'Inspect any existing files you need first.',
 948          'Report all three file paths and one specific thing you changed in the final version.',
 949        ].join(' '),
 950        semanticCheck: (result) =>
 951          result.response.includes(launchDraftPath)
 952          && result.response.includes(launchCritiquePath)
 953          && result.response.includes(launchFinalPath)
 954          && /\b(changed|revised|updated)\b/i.test(result.response),
 955        externalCheckWeight: 0.25,
 956        postRunCheck: async () => {
 957          const draftAbs = path.join(fixture.workspaceRoot, launchDraftPath)
 958          const critiqueAbs = path.join(fixture.workspaceRoot, launchCritiquePath)
 959          const finalAbs = path.join(fixture.workspaceRoot, launchFinalPath)
 960          const draftText = fs.existsSync(draftAbs) ? fs.readFileSync(draftAbs, 'utf8') : ''
 961          const critiqueText = fs.existsSync(critiqueAbs) ? fs.readFileSync(critiqueAbs, 'utf8') : ''
 962          const finalText = fs.existsSync(finalAbs) ? fs.readFileSync(finalAbs, 'utf8') : ''
 963          const critiqueLineCount = critiqueText
 964            .split('\n')
 965            .map((line) => line.trim())
 966            .filter((line) => line.startsWith('- ') || /^\d+\./.test(line))
 967            .length
 968          return {
 969            name: 'iteration_artifacts_created',
 970            passed: fs.existsSync(draftAbs)
 971              && fs.existsSync(critiqueAbs)
 972              && fs.existsSync(finalAbs)
 973              && critiqueLineCount >= 2
 974              && draftText.trim().length > 0
 975              && finalText.trim().length > 0
 976              && draftText !== finalText,
 977            details: {
 978              draftExists: fs.existsSync(draftAbs),
 979              critiqueExists: fs.existsSync(critiqueAbs),
 980              finalExists: fs.existsSync(finalAbs),
 981              critiqueLineCount,
 982            },
 983          }
 984        },
 985      },
 986      {
 987        id: 'project_operating_system',
 988        skill: 'project_context',
 989        weight: 8,
 990        timeoutMs: 180_000,
 991        requiresTool: true,
 992        expectedTools: hasProjectTool ? ['manage_projects', 'files'] : ['files'],
 993        prompt: hasProjectTool && hasProjectContext
 994          ? [
 995              'Use the active project-management tool to strengthen the current project record before doing anything else.',
 996              `Set the project objective to "${fixture.objective}".`,
 997              `Set the open objectives to "${fixture.openObjectives[0]}" and "${fixture.openObjectives[1]}".`,
 998              `Set the operating modes to "${fixture.capabilityHints.join('", "')}".`,
 999              `Set the credential requirements to "${fixture.credentialRequirements.join('", "')}".`,
1000              `Set the preferred heartbeat prompt to "${fixture.heartbeatPrompt}" and heartbeat interval to ${fixture.heartbeatIntervalSec} seconds.`,
1001              `Then write "${projectOpsPath}" with sections "Objective", "Open Objectives", "Operating Modes", "Credential Requirements", and "Heartbeat".`,
1002              'Confirm the active project id and the file path.',
1003            ].join(' ')
1004          : [
1005              'Project-management tooling is unavailable in this session.',
1006              `Write "${projectOpsPath}" with sections "Objective", "Open Objectives", "Operating Modes", "Credential Requirements", and "Heartbeat".`,
1007              `Use these exact values: objective "${fixture.objective}", open objectives "${fixture.openObjectives[0]}" and "${fixture.openObjectives[1]}", operating modes "${fixture.capabilityHints.join('", "')}", credential requirements "${fixture.credentialRequirements.join('", "')}", and heartbeat "${fixture.heartbeatPrompt}" every ${fixture.heartbeatIntervalSec} seconds.`,
1008              'Confirm the file path.',
1009            ].join(' '),
1010        semanticCheck: hasProjectTool && hasProjectContext
1011          ? (result) => result.response.includes(projectOpsPath) && (activeProjectId ? result.response.includes(activeProjectId) : true)
1012          : (result) => result.response.includes(projectOpsPath),
1013        externalCheckWeight: 0.3,
1014        postRunCheck: hasProjectTool && hasProjectContext
1015          ? async ({ client }) => {
1016              const project = await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`)
1017              const projectOpsAbs = path.join(fixture.workspaceRoot, projectOpsPath)
1018              const text = fs.existsSync(projectOpsAbs) ? fs.readFileSync(projectOpsAbs, 'utf8') : ''
1019              return {
1020                name: 'project_record_enriched',
1021                passed: project?.objective === fixture.objective
1022                  && Array.isArray(project?.openObjectives)
1023                  && project.openObjectives.includes(fixture.openObjectives[0])
1024                  && Array.isArray(project?.credentialRequirements)
1025                  && project.credentialRequirements.includes(fixture.credentialRequirements[0])
1026                  && project?.heartbeatPrompt === fixture.heartbeatPrompt
1027                  && Number(project?.heartbeatIntervalSec) === fixture.heartbeatIntervalSec
1028                  && text.includes('## Objective')
1029                  && text.includes('## Credential Requirements'),
1030                details: {
1031                  projectId: activeProjectId,
1032                  projectOpsExists: fs.existsSync(projectOpsAbs),
1033                },
1034              }
1035            }
1036          : async () => {
1037              const projectOpsAbs = path.join(fixture.workspaceRoot, projectOpsPath)
1038              const text = fs.existsSync(projectOpsAbs) ? fs.readFileSync(projectOpsAbs, 'utf8') : ''
1039              return {
1040                name: 'project_ops_brief_written',
1041                passed: text.includes('## Objective')
1042                  && text.includes(fixture.objective)
1043                  && text.includes(fixture.credentialRequirements[0])
1044                  && text.includes(fixture.heartbeatPrompt),
1045                details: {
1046                  projectOpsExists: fs.existsSync(projectOpsAbs),
1047                },
1048              }
1049            },
1050      },
1051      {
1052        id: 'project_credentials_and_heartbeat',
1053        skill: 'project_operations',
1054        weight: 8,
1055        timeoutMs: 180_000,
1056        requiresTool: true,
1057        expectedTools: hasProjectOperations ? ['manage_secrets', 'manage_schedules', 'files'] : ['files'],
1058        prompt: hasProjectOperations && hasProjectContext
1059          ? [
1060              'Bootstrap lightweight project operations for the active project.',
1061              `Create one project-linked secret named "MockMail App Password ${runTag}" with service "mockmail" and value "${runTag}-mockmail-secret".`,
1062              `Create one active interval schedule named "Pilot heartbeat ${runTag}" with intervalMs ${fixture.heartbeatIntervalSec * 1000} and taskPrompt "Review active project goals, inbox blockers, and next operator action."`,
1063              'Omit projectId when possible so the active project is used by default.',
1064              `Then write "${credentialPlanPath}" with sections "Services", "Secrets", and "Heartbeat" that summarize what you configured.`,
1065              'Confirm the secret id, schedule id, and file path.',
1066            ].join(' ')
1067          : [
1068              'Project secret and schedule tooling is unavailable in this session.',
1069              `Write "${credentialPlanPath}" with sections "Services", "Secrets", and "Heartbeat" describing the credentials and recurring follow-up needed for an inbox-oriented operator workflow.`,
1070              `Also write "${heartbeatPlanPath}" with a recurring heartbeat recommendation every ${fixture.heartbeatIntervalSec} seconds and mention "${fixture.heartbeatPrompt}".`,
1071              'Confirm both file paths.',
1072            ].join(' '),
1073        semanticCheck: hasProjectOperations && hasProjectContext
1074          ? (result) => result.response.includes(credentialPlanPath) && extractFirstId(result.response) !== null
1075          : (result) => result.response.includes(credentialPlanPath) && result.response.includes(heartbeatPlanPath),
1076        externalCheckWeight: 0.3,
1077        postRunCheck: hasProjectOperations && hasProjectContext
1078          ? async ({ client }) => {
1079              const secrets = await fetchJson(client, 'GET', '/api/secrets')
1080              const schedules = await fetchJson(client, 'GET', '/api/schedules')
1081              const secretMatch = Object.values(secrets || {}).find((row) => String(row?.name || '') === `MockMail App Password ${runTag}`)
1082              const scheduleMatch = Object.values(schedules || {}).find((row) => String(row?.name || '') === `Pilot heartbeat ${runTag}`)
1083              const credentialPlanAbs = path.join(fixture.workspaceRoot, credentialPlanPath)
1084              const text = fs.existsSync(credentialPlanAbs) ? fs.readFileSync(credentialPlanAbs, 'utf8') : ''
1085              return {
1086                name: 'project_secret_and_schedule_created',
1087                passed: Boolean(secretMatch)
1088                  && Boolean(scheduleMatch)
1089                  && secretMatch?.projectId === activeProjectId
1090                  && scheduleMatch?.projectId === activeProjectId
1091                  && text.includes('## Secrets')
1092                  && text.includes('## Heartbeat'),
1093                details: {
1094                  secretId: secretMatch?.id || null,
1095                  scheduleId: scheduleMatch?.id || null,
1096                  projectId: activeProjectId,
1097                  credentialPlanExists: fs.existsSync(credentialPlanAbs),
1098                },
1099              }
1100            }
1101          : async () => {
1102              const credentialPlanAbs = path.join(fixture.workspaceRoot, credentialPlanPath)
1103              const heartbeatPlanAbs = path.join(fixture.workspaceRoot, heartbeatPlanPath)
1104              const credentialText = fs.existsSync(credentialPlanAbs) ? fs.readFileSync(credentialPlanAbs, 'utf8') : ''
1105              const heartbeatText = fs.existsSync(heartbeatPlanAbs) ? fs.readFileSync(heartbeatPlanAbs, 'utf8') : ''
1106              return {
1107                name: 'credential_and_heartbeat_docs_written',
1108                passed: credentialText.includes('## Secrets')
1109                  && heartbeatText.includes(fixture.heartbeatPrompt)
1110                  && heartbeatText.includes(String(fixture.heartbeatIntervalSec)),
1111                details: {
1112                  credentialPlanExists: fs.existsSync(credentialPlanAbs),
1113                  heartbeatPlanExists: fs.existsSync(heartbeatPlanAbs),
1114                },
1115              }
1116            },
1117      },
1118      {
1119        id: 'inbox_operations_kickoff',
1120        skill: 'project_operations',
1121        weight: 8,
1122        timeoutMs: 200_000,
1123        requiresTool: true,
1124        expectedTools: hasProjectOperations ? ['manage_projects', 'manage_secrets', 'manage_schedules', 'files'] : ['files'],
1125        prompt: hasProjectOperations && hasProjectContext
1126          ? [
1127              'Treat the active project as an inbox-operations system.',
1128              'Add the capability hint "inbox triage" and the open objective "stand up inbox triage workflow".',
1129              `Create one project-linked secret named "Inbox OAuth Refresh ${runTag}" with service "mockmail" and value "${runTag}-inbox-refresh".`,
1130              `Create one active interval schedule named "Inbox triage review ${runTag}" with intervalMs 900000 and taskPrompt "Review unread inbox items, blockers, and next reply actions."`,
1131              hasTaskManagement
1132                ? `Also create exactly one backlog task titled "${inboxTaskTitle}" assigned to agent "${delegateAgentId}". Omit projectId so the active project is used by default.`
1133                : 'Task management is unavailable in this session, so do not claim to create tasks.',
1134              `Then write "${inboxOpsPath}" with sections "Inbox Goals", "Credential Bootstrap", "Heartbeat Cadence", and "Failure Modes".`,
1135              'Confirm the file path and any created ids.',
1136            ].join(' ')
1137          : [
1138              'Project operations tooling is unavailable in this session.',
1139              'Do not claim to create real secrets, schedules, or project updates.',
1140              `Write "${inboxOpsPath}" with sections "Inbox Goals", "Credential Bootstrap", "Heartbeat Cadence", and "Failure Modes" for a lightweight operator inbox workflow.`,
1141              'Confirm the file path.',
1142            ].join(' '),
1143        semanticCheck: (result) => result.response.includes(inboxOpsPath),
1144        externalCheckWeight: 0.3,
1145        postRunCheck: hasProjectOperations && hasProjectContext
1146          ? async ({ client }) => {
1147              const projects = await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`)
1148              const secrets = await fetchJson(client, 'GET', '/api/secrets')
1149              const schedules = await fetchJson(client, 'GET', '/api/schedules')
1150              const tasks = hasTaskManagement ? await fetchJson(client, 'GET', '/api/tasks') : null
1151              const secretMatch = Object.values(secrets || {}).find((row) => String(row?.name || '') === `Inbox OAuth Refresh ${runTag}`)
1152              const scheduleMatch = Object.values(schedules || {}).find((row) => String(row?.name || '') === `Inbox triage review ${runTag}`)
1153              const taskMatch = hasTaskManagement
1154                ? Object.values(tasks || {}).find((row) => String(row?.title || '') === inboxTaskTitle)
1155                : null
1156              const inboxOpsAbs = path.join(fixture.workspaceRoot, inboxOpsPath)
1157              const text = fs.existsSync(inboxOpsAbs) ? fs.readFileSync(inboxOpsAbs, 'utf8') : ''
1158              return {
1159                name: 'inbox_ops_seeded',
1160                passed: Array.isArray(projects?.capabilityHints)
1161                  && projects.capabilityHints.includes('inbox triage')
1162                  && Array.isArray(projects?.openObjectives)
1163                  && projects.openObjectives.includes('stand up inbox triage workflow')
1164                  && secretMatch?.projectId === activeProjectId
1165                  && scheduleMatch?.projectId === activeProjectId
1166                  && (!hasTaskManagement || (taskMatch?.projectId === activeProjectId))
1167                  && text.includes('## Inbox Goals')
1168                  && text.includes('## Credential Bootstrap')
1169                  && text.includes('## Heartbeat Cadence')
1170                  && text.includes('## Failure Modes'),
1171                details: {
1172                  projectId: activeProjectId,
1173                  secretId: secretMatch?.id || null,
1174                  scheduleId: scheduleMatch?.id || null,
1175                  taskId: taskMatch?.id || null,
1176                  inboxOpsExists: fs.existsSync(inboxOpsAbs),
1177                },
1178              }
1179            }
1180          : async () => {
1181              const inboxOpsAbs = path.join(fixture.workspaceRoot, inboxOpsPath)
1182              const text = fs.existsSync(inboxOpsAbs) ? fs.readFileSync(inboxOpsAbs, 'utf8') : ''
1183              return {
1184                name: 'inbox_ops_playbook_written',
1185                passed: text.includes('## Inbox Goals')
1186                  && text.includes('## Credential Bootstrap')
1187                  && text.includes('## Heartbeat Cadence')
1188                  && text.includes('## Failure Modes'),
1189                details: {
1190                  inboxOpsExists: fs.existsSync(inboxOpsAbs),
1191                },
1192              }
1193            },
1194      },
1195      {
1196        id: 'market_watch_planning',
1197        skill: 'autonomous_goal_execution',
1198        weight: 8,
1199        timeoutMs: 190_000,
1200        requiresTool: true,
1201        expectedTools: hasTaskManagement
1202          ? (hasProjectTool ? ['manage_projects', 'manage_tasks', 'files'] : ['manage_tasks', 'files'])
1203          : (hasProjectTool ? ['manage_projects', 'files'] : ['files']),
1204        prompt: hasProjectTool && hasProjectContext
1205          ? [
1206              'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.',
1207              'Use the active project as the durable goal tracker.',
1208              'Add the open objective "maintain a research-only market watch" and the success metric "publish one risk-bounded market memo per week".',
1209              hasTaskManagement
1210                ? `Create exactly two backlog tasks titled "${marketTaskPrefix}signal-review" and "${marketTaskPrefix}memo". Assign both to agent "${delegateAgentId}" and let the active project be used by default.`
1211                : 'Task management is unavailable in this session, so do not claim to create tasks.',
1212              `Then write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`,
1213              'Confirm the file path and any created ids.',
1214            ].join(' ')
1215          : hasTaskManagement
1216            ? [
1217                'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.',
1218                `Create exactly two backlog tasks titled "${marketTaskPrefix}signal-review" and "${marketTaskPrefix}memo".`,
1219                `Assign both to agent "${delegateAgentId}".`,
1220                `Then write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`,
1221                'Confirm the task ids and file path.',
1222              ].join(' ')
1223            : [
1224                'A user says "manage my trading research for me", but keep the work research-only and do not place trades or claim real execution.',
1225                `Write "${marketWatchPath}" with sections "Guardrails", "Signals", "Research Cadence", and "Next Review".`,
1226                'Confirm the file path.',
1227              ].join(' '),
1228        semanticCheck: (result) => result.response.includes(marketWatchPath),
1229        externalCheckWeight: 0.3,
1230        postRunCheck: async ({ client }) => {
1231          const marketWatchAbs = path.join(fixture.workspaceRoot, marketWatchPath)
1232          const text = fs.existsSync(marketWatchAbs) ? fs.readFileSync(marketWatchAbs, 'utf8') : ''
1233          const tasks = hasTaskManagement ? await fetchJson(client, 'GET', '/api/tasks') : null
1234          const matchingTasks = hasTaskManagement ? listBenchmarkTasksByTitle(tasks, marketTaskPrefix) : []
1235          const project = hasProjectTool && hasProjectContext && activeProjectId
1236            ? await fetchJson(client, 'GET', `/api/projects/${encodeURIComponent(activeProjectId)}`)
1237            : null
1238          const projectLinkedCount = hasProjectContext && activeProjectId
1239            ? matchingTasks.filter((row) => row.projectId === activeProjectId).length
1240            : null
1241          return {
1242            name: 'market_watch_plan_seeded',
1243            passed: text.includes('## Guardrails')
1244              && text.includes('## Signals')
1245              && text.includes('## Research Cadence')
1246              && text.includes('## Next Review')
1247              && (!hasTaskManagement || matchingTasks.length >= 2)
1248              && (!hasProjectTool || !hasProjectContext || (
1249                Array.isArray(project?.openObjectives)
1250                && project.openObjectives.includes('maintain a research-only market watch')
1251                && Array.isArray(project?.successMetrics)
1252                && project.successMetrics.includes('publish one risk-bounded market memo per week')
1253                && (!hasTaskManagement || projectLinkedCount >= 2)
1254              )),
1255            details: {
1256              marketWatchExists: fs.existsSync(marketWatchAbs),
1257              taskCount: matchingTasks.length,
1258              projectLinkedCount,
1259              projectId: activeProjectId,
1260            },
1261          }
1262        },
1263      },
1264      {
1265        id: 'news_media_delivery',
1266        skill: 'research_delivery',
1267        weight: 8,
1268        timeoutMs: 220_000,
1269        requiresTool: true,
1270        expectedTools: ['web', 'browser', 'manage_connectors'],
1271        prompt: [
1272          'A user asks:',
1273          '"Can you tell me more if there is any news related to the US-Iran war, and can you send me some screenshots and give me a summary and maybe send me a voice note about it?"',
1274          'Use live web research first.',
1275          'Then use the browser tool to capture at least one relevant screenshot from a source page.',
1276          'Give a concise summary of the latest relevant developments.',
1277          'If outbound delivery is possible, send the screenshot and a short voice note update through connector_message_tool.',
1278          'If no running connector is available, explicitly check that and report the delivery blocker instead of claiming the capability does not exist.',
1279          'In your final answer, include the screenshot upload URL exactly and say whether the voice note was sent or blocked after checking connectors.',
1280        ].join(' '),
1281        semanticCheck: (result) =>
1282          /\b(us|u\.s\.)\b/i.test(result.response)
1283          && /\biran\b/i.test(result.response)
1284          && /\b(summary|summarized|latest|update|updates|reported|developments)\b/i.test(result.response)
1285          && /\/api\/uploads\/[^\s)"'`]+\.(png|jpg|jpeg|webp)/i.test(result.response)
1286          && /\b(voice[\s-]?note|voice_sent|blocked|no running connectors|connector)\b/i.test(result.response),
1287        externalCheckWeight: 0.35,
1288        postRunCheck: async ({ client, row }) => {
1289          const screenshotUrls = extractUploadUrls(row.response)
1290            .filter((url) => /\.(png|jpg|jpeg|webp)(?:[?#].*)?$/i.test(url))
1291            .filter((url) => /\/api\/uploads\/(?:screenshot-|browser-)/i.test(url))
1292          const screenshotReachability = await Promise.all(
1293            screenshotUrls.slice(0, 3).map(async (url) => {
1294              try {
1295                const res = await fetch(`${client.baseUrl}${url}`, {
1296                  headers: { 'x-access-key': client.accessKey },
1297                })
1298                return res.ok
1299              } catch {
1300                return false
1301              }
1302            }),
1303          )
1304          const connectorOutcome = /\b(voice[\s-]?note sent|voice_sent|no running connectors|set one up in the connectors panel|delivery blocker|delivery blocked|could not send (?:the )?voice(?:[\s-]?note)?|unable to send (?:the )?voice(?:[\s-]?note)?)\b/i.test(row.response)
1305          return {
1306            name: 'news_media_delivery_checked',
1307            passed: screenshotReachability.some(Boolean) && connectorOutcome,
1308            details: {
1309              screenshotUrls,
1310              reachableScreenshots: screenshotReachability.filter(Boolean).length,
1311              connectorOutcome,
1312            },
1313          }
1314        },
1315      },
1316      {
1317        id: 'project_context_alignment',
1318        skill: 'project_context',
1319        weight: 6,
1320        timeoutMs: 120_000,
1321        requiresTool: false,
1322        expectedTools: [],
1323        prompt: 'Without reading files or browsing the web, tell me the active project\'s exact name, objective, who it is for, the first two pilot priorities, and the first open objective. If there is no active project context, say that plainly.',
1324        semanticCheck: hasProjectContext
1325          ? (result) =>
1326              result.response.includes(fixture.projectName)
1327              && result.response.toLowerCase().includes(fixture.objective.toLowerCase())
1328              && result.response.toLowerCase().includes(fixture.targetUser.toLowerCase())
1329              && result.response.toLowerCase().includes(fixture.pilotPriorities[0].toLowerCase())
1330              && result.response.toLowerCase().includes(fixture.pilotPriorities[1].toLowerCase())
1331              && result.response.toLowerCase().includes(fixture.openObjectives[0].toLowerCase())
1332          : (result) => /\b(no active project|no current project|do not have active project context|no active project context)\b/i.test(result.response),
1333      },
1334      {
1335        id: 'session_history_recall',
1336        skill: 'session_management',
1337        weight: 8,
1338        timeoutMs: 140_000,
1339        requiresTool: true,
1340        expectedTools: ['manage_sessions'],
1341        prompt: `Use the session-management tool to inspect the recent history of this current session. Then tell me the exact "${moneyPlanPath}", "${researchBriefPath}", and "${launchFinalPath}" file paths created earlier in this chat, and mention that you checked session history.`,
1342        semanticCheck: (result) =>
1343          result.response.includes(moneyPlanPath) &&
1344          result.response.includes(researchBriefPath) &&
1345          result.response.includes(launchFinalPath) &&
1346          /\b(history|session history|recent history)\b/i.test(result.response),
1347      },
1348      {
1349        id: 'memory_significant_store',
1350        skill: 'memory',
1351        weight: 4,
1352        timeoutMs: 140_000,
1353        requiresTool: true,
1354        expectedTools: ['memory'],
1355        prompt: [
1356          'Store significant long-term memory for this user:',
1357          `birthday ${birthday}, anniversary ${anniversary}, recurring bug "${recurringBug}".`,
1358          'Save it explicitly as durable memory and confirm what was saved.',
1359        ].join(' '),
1360        semanticCheck: (result) =>
1361          mentionsDate(result.response, birthday) &&
1362          mentionsDate(result.response, anniversary) &&
1363          result.response.toLowerCase().includes(recurringBug.toLowerCase()),
1364        externalCheckWeight: 0.4,
1365        postRunCheck: async () => {
1366          const memoryCount = countMemoriesContaining(runTag)
1367          return {
1368            name: 'memory_rows_created',
1369            passed: memoryCount >= 1,
1370            details: { memoryCount, expectedAtLeast: 1 },
1371          }
1372        },
1373      },
1374      {
1375        id: 'memory_significant_recall',
1376        skill: 'memory',
1377        weight: 4,
1378        timeoutMs: 120_000,
1379        requiresTool: false,
1380        expectedTools: [],
1381        prompt: 'What significant personal details and recurring bug did I ask you to remember earlier in this conversation? Answer with exact values.',
1382        semanticCheck: (result) =>
1383          mentionsDate(result.response, birthday) &&
1384          mentionsDate(result.response, anniversary) &&
1385          result.response.toLowerCase().includes(recurringBug.toLowerCase()),
1386      },
1387    ]
1388  }
1389  
/**
 * Chatroom collaboration probes, one entry per scenario.
 *
 * All scenarios share the same weight (10), timeout (240s) and the
 * `autoAddress` / `requireAction` flags; only the turn mode, the prompt and an
 * optional `requireDelegation` / `requireEmpathy` flag differ between them.
 * `evaluateChatroomScenario` reads those two optional flags to choose its
 * score weighting.
 */
const CHATROOM_SCENARIOS = [
  {
    id: 'sequential_project_split_execute',
    mode: 'sequential',
    flags: {},
    prompt: '@all We need to research and build a tiny app together. Split responsibilities by role and perform one concrete action now.',
  },
  {
    id: 'parallel_cross_delegate',
    mode: 'parallel',
    flags: { requireDelegation: true },
    prompt: '@all Work as a team: each of you delegate one subtask to another specific agent and execute one concrete action now.',
  },
  {
    id: 'sequential_companion_team',
    mode: 'sequential',
    flags: { requireEmpathy: true },
    prompt: '@all User says they are overwhelmed and lonely while trying to build a startup. Respond empathetically and provide one concrete next step each.',
  },
].map(({ id, mode, flags, prompt }) => ({
  id,
  mode,
  // Fields shared by every chatroom scenario.
  autoAddress: true,
  weight: 10,
  timeoutMs: 240_000,
  requireAction: true,
  ...flags,
  prompt,
}))
1421  
/**
 * Scores a single session scenario turn.
 *
 * The primary score blends tool coverage (50%), absence of tool/stream errors
 * (20%), the scenario's semantic check (20%) and finishing within the timeout
 * (10%). When the scenario declares an `externalCheckWeight`, that fraction
 * (clamped to 0..0.5) of the score comes from the post-run check instead.
 * A scenario that requires a tool but made no tool call keeps only 35% of its
 * score.
 *
 * @param {object} scenario - Scenario definition (weight, expectedTools, semanticCheck, ...).
 * @param {object} result - Turn result with toolCalls, toolErrors, streamErrors, durationMs, response.
 * @param {object|null} [postCheck=null] - Outcome of the scenario's post-run check, if any.
 * @returns {object} Evaluation row with score, pass flag and per-check details.
 */
function evaluateSessionScenario(scenario, result, postCheck = null) {
  const calledTools = new Set(canonicalizeToolList(result.toolCalls))
  const expectedTools = canonicalizeToolList(scenario.expectedTools || [])

  let matchedCount = 0
  for (const toolName of expectedTools) {
    if (calledTools.has(toolName)) matchedCount += 1
  }

  // Without an explicit expectation, coverage only depends on whether a
  // required tool call happened at all.
  let toolCoverage = 1
  if (expectedTools.length > 0) {
    toolCoverage = matchedCount / expectedTools.length
  } else if (scenario.requiresTool) {
    toolCoverage = result.toolCalls.length > 0 ? 1 : 0
  }

  let noErrors = 0
  if (result.toolErrors.length === 0 && result.streamErrors.length === 0) {
    noErrors = 1
  }
  const semantic = scenario.semanticCheck(result) ? 1 : 0
  const timely = result.durationMs <= scenario.timeoutMs ? 1 : 0

  // A missing post-check counts as a pass for the external component.
  let external = 1
  if (postCheck) {
    external = postCheck.passed ? 1 : 0
  }

  const requestedExternalWeight = Number(scenario.externalCheckWeight)
  const externalWeight = Number.isFinite(requestedExternalWeight)
    ? Math.max(0, Math.min(0.5, requestedExternalWeight))
    : 0

  const primaryScore = (toolCoverage * 0.5) + (noErrors * 0.2) + (semantic * 0.2) + (timely * 0.1)
  let blended = primaryScore
  if (externalWeight > 0) {
    blended = (primaryScore * (1 - externalWeight)) + (external * externalWeight)
  }

  let rawScore = scenario.weight * blended
  if (scenario.requiresTool && result.toolCalls.length === 0) {
    rawScore *= 0.35
  }
  const score = round1(rawScore)

  const checks = {
    toolCoverage: round1(toolCoverage * 100),
    noErrors: Boolean(noErrors),
    semantic: Boolean(semantic),
    timely: Boolean(timely),
    external: postCheck ? Boolean(postCheck.passed) : null,
  }

  return {
    id: scenario.id,
    skill: scenario.skill,
    weight: scenario.weight,
    score,
    passed: score >= scenario.weight * 0.7,
    durationMs: result.durationMs,
    checks,
    toolCalls: result.toolCalls,
    toolErrors: result.toolErrors,
    streamErrors: result.streamErrors,
    response: result.responseSummary || summarize(result.response, 340),
    postCheck,
  }
}
1468  
/**
 * Scores one chatroom scenario turn from agent participation and text signals.
 *
 * Signals are derived from the new non-user, non-system messages: whether work
 * was split, whether an action was taken, whether delegation happened, and
 * whether the replies were empathetic. The weighting of those signals depends
 * on `scenario.requireEmpathy` / `scenario.requireDelegation`.
 *
 * @param {object} scenario - Chatroom scenario definition.
 * @param {object} result - Turn result (respondedAgentIds, newMessages, toolCalls, errors, durationMs).
 * @param {string[]} expectedAgentIds - Agents expected to respond.
 * @returns {object} Evaluation row with score, pass flag, checks and sample messages.
 */
function evaluateChatroomScenario(scenario, result, expectedAgentIds) {
  const isAgentMessage = (msg) => msg.senderId !== 'user' && msg.senderId !== 'system'

  const expectedCount = expectedAgentIds.length
  let participation = 0
  if (expectedCount > 0) {
    participation = Math.min(1, result.respondedAgentIds.length / expectedCount)
  }

  const agentTexts = []
  for (const msg of result.newMessages) {
    if (isAgentMessage(msg)) agentTexts.push(msg.text)
  }
  const combinedText = agentTexts.join('\n')

  const splitSignal = /\b(assign|split|role|research|build|verify|delegate|owner)\b/i.test(combinedText) ? 1 : 0

  // Any tool call counts as an action; otherwise fall back to text heuristics.
  const tookAction = result.toolCalls.length > 0
    || /\b(created|started|ran|executed|wrote|searched|configured|checked|listed|implemented|launched)\b/i.test(combinedText)
    || containsActionableStep(combinedText)
  const actionSignal = tookAction ? 1 : 0

  const showsDelegation = (msg) => {
    if (!isAgentMessage(msg)) return false
    if (Array.isArray(msg.mentions) && msg.mentions.length > 0) return true
    const text = String(msg.text || '')
    return /@\w+/.test(text) || /\bdelegate\b/i.test(text)
  }
  const delegationSignal = result.newMessages.some(showsDelegation) ? 1 : 0

  const empathySignal = containsEmpathy(combinedText) ? 1 : 0
  const noErrors = result.errors.length === 0 ? 1 : 0

  // [signal, weight] pairs; the selected mix is summed left to right.
  let mix
  if (scenario.requireEmpathy) {
    mix = [
      [participation, 0.35],
      [empathySignal, 0.3],
      [actionSignal, 0.2],
      [noErrors, 0.15],
    ]
  } else if (scenario.requireDelegation) {
    mix = [
      [participation, 0.35],
      [delegationSignal, 0.25],
      [splitSignal, 0.2],
      [actionSignal, 0.1],
      [noErrors, 0.1],
    ]
  } else {
    mix = [
      [participation, 0.4],
      [splitSignal, 0.25],
      [actionSignal, 0.25],
      [noErrors, 0.1],
    ]
  }
  const blended = mix.reduce((sum, [signal, share]) => sum + (signal * share), 0)
  const score = round1(scenario.weight * blended)

  const sampleMessages = result.newMessages.slice(0, 8).map((msg) => ({
    senderName: msg.senderName,
    text: summarize(msg.text, 180),
  }))

  return {
    id: scenario.id,
    mode: scenario.mode,
    weight: scenario.weight,
    score,
    passed: score >= scenario.weight * 0.7,
    durationMs: result.durationMs,
    checks: {
      participation: round1(participation * 100),
      splitSignal: Boolean(splitSignal),
      actionSignal: Boolean(actionSignal),
      delegationSignal: Boolean(delegationSignal),
      empathySignal: Boolean(empathySignal),
      noErrors: Boolean(noErrors),
    },
    respondedAgentIds: result.respondedAgentIds,
    toolCalls: result.toolCalls,
    errors: result.errors,
    sampleMessages,
  }
}
1542  
/**
 * Scores how varied the chatroom participants are, on a 0-10 scale.
 *
 * Counts distinct provider/model pairs, provider/model-family pairs, full
 * capability profiles (provider, model, tools, credential, endpoint) and tool
 * profiles, each relative to the number of agents, plus the share of agents
 * whose name/description hints at a role. An empty participant list is treated
 * as one agent for the ratios.
 *
 * @param {object[]} participantAgents - Agents taking part in the chatroom.
 * @returns {object} Score (weight 10, pass at >= 5), counts/percentages and a participant summary.
 */
function evaluateModelDiversity(participantAgents) {
  // Order-independent, de-duplicated tool list rendered as a comparable string.
  const toolSignature = (tools) => {
    if (!Array.isArray(tools)) return ''
    const names = new Set()
    for (const tool of tools) {
      const name = String(tool || '').trim()
      if (name) names.add(name)
    }
    return [...names].sort().join(',')
  }
  // Text before the first ':', '/' or '@' of the lower-cased model id.
  const modelFamily = (model) => {
    const lowered = String(model || '').toLowerCase()
    return lowered.split(/[:/@]/)[0] || lowered
  }
  const countDistinct = (keyOf) => new Set(participantAgents.map((agent) => keyOf(agent))).size

  const uniqueModels = countDistinct(
    (agent) => `${agent.provider || 'unknown'}:${agent.model || 'unknown'}`
  )
  const uniqueModelFamilies = countDistinct(
    (agent) => `${agent.provider || 'unknown'}:${modelFamily(agent.model || 'unknown')}`
  )
  const uniqueCapabilityProfiles = countDistinct((agent) => {
    const provider = String(agent.provider || 'unknown').toLowerCase()
    const model = String(agent.model || 'unknown').toLowerCase()
    const tools = toolSignature(getAgentTools(agent))
    const credential = agent.credentialId ? 'cred' : 'nocred'
    const endpoint = agent.apiEndpoint ? 'custom-endpoint' : 'default-endpoint'
    return `${provider}|${model}|${tools}|${credential}|${endpoint}`
  })
  const uniqueToolProfiles = countDistinct((agent) => toolSignature(getAgentTools(agent)))

  const agentCount = Math.max(1, participantAgents.length)
  const ratio = (count) => Math.min(1, count / agentCount)
  const modelDiversity = ratio(uniqueModels)
  const familyDiversity = ratio(uniqueModelFamilies)
  const capabilityDiversity = ratio(uniqueCapabilityProfiles)
  const toolProfileDiversity = ratio(uniqueToolProfiles)

  let roleHints = 0
  for (const agent of participantAgents) {
    const text = `${agent.name || ''} ${agent.description || ''}`.toLowerCase()
    if (/(research|build|assistant|planner|coder|qa|ops|orchestr)/.test(text)) roleHints += 1
  }
  const specialization = ratio(roleHints)

  const weighted = (modelDiversity * 0.2)
    + (familyDiversity * 0.2)
    + (capabilityDiversity * 0.35)
    + (toolProfileDiversity * 0.1)
    + (specialization * 0.15)
  const score = round1(10 * weighted)

  const participants = participantAgents.map((agent) => ({
    id: agent.id,
    name: agent.name,
    provider: agent.provider,
    model: agent.model,
    tools: getAgentTools(agent),
    hasCredential: Boolean(agent.credentialId),
    hasEndpoint: Boolean(agent.apiEndpoint),
  }))

  return {
    weight: 10,
    score,
    passed: score >= 5,
    checks: {
      uniqueModels,
      uniqueModelFamilies,
      uniqueCapabilityProfiles,
      uniqueToolProfiles,
      agentCount,
      diversityPct: round1(modelDiversity * 100),
      familyDiversityPct: round1(familyDiversity * 100),
      capabilityDiversityPct: round1(capabilityDiversity * 100),
      toolProfileDiversityPct: round1(toolProfileDiversity * 100),
      specializationPct: round1(specialization * 100),
    },
    participants,
  }
}
1612  
/**
 * Summarises the availability of the OpenClaw comparison run.
 *
 * @param {object[]|null|undefined} results - Turn results, each with `streamErrors` and `response`.
 * @returns {object} `not_configured` when there are no results, `unreachable`
 *   when a connection-refused error was seen and no turn was healthy, otherwise
 *   `available` with healthy/total turn counts.
 */
function evaluateOpenclawComparison(results) {
  if (!results || results.length === 0) {
    return { status: 'not_configured', available: false, notes: 'No OpenClaw agent configured.' }
  }

  const isConnectionRefused = (message) => /econnrefused/i.test(message)
  const sawConnectionRefused = results.some(
    (row) => row.streamErrors.some((message) => isConnectionRefused(message))
  )

  // A healthy turn has no stream errors and a response of at least 20 trimmed characters.
  const healthyTurns = results.reduce((count, row) => {
    const isHealthy = row.streamErrors.length === 0 && row.response && row.response.trim().length >= 20
    return isHealthy ? count + 1 : count
  }, 0)

  if (sawConnectionRefused && healthyTurns === 0) {
    return { status: 'unreachable', available: false, notes: 'OpenClaw provider unreachable (connection refused).' }
  }

  const totalTurns = results.length
  const notes = healthyTurns === totalTurns
    ? 'OpenClaw comparison completed.'
    : 'OpenClaw comparison partially completed with errors.'
  return { status: 'available', available: true, healthyTurns, totalTurns, notes }
}
1636  
/**
 * Loads the most recent saved benchmark report for a profile.
 *
 * Report files are matched by the `autonomy-benchmark-<profileId>-` prefix and
 * `.json` suffix; "most recent" is the last name in lexicographic order.
 *
 * @param {string} outDir - Directory holding saved reports.
 * @param {string} profileId - Probe profile id used in the file name.
 * @returns {{path: string, report: any}|null} The parsed report and its path,
 *   or null when the directory is missing, no report matches, or the newest
 *   report cannot be read/parsed.
 */
function readLatestBenchmark(outDir, profileId) {
  if (!fs.existsSync(outDir)) return null

  const prefix = `autonomy-benchmark-${profileId}-`
  const isReportFile = (name) => name.startsWith(prefix) && name.endsWith('.json')
  const reportNames = fs.readdirSync(outDir).filter(isReportFile).sort()

  const newestName = reportNames.pop()
  if (newestName === undefined) return null

  const newestPath = path.join(outDir, newestName)
  let report
  try {
    report = JSON.parse(fs.readFileSync(newestPath, 'utf8'))
  } catch {
    // An unreadable or malformed latest report is treated as "no previous run".
    return null
  }
  return { path: newestPath, report }
}
1652  
/**
 * Restricts scenario rows to a requested set of ids.
 *
 * @param {object[]} rows - Scenario definitions, each with an `id`.
 * @param {string[]|undefined} requestedIds - Ids to keep; a missing, non-array
 *   or empty value means "keep everything".
 * @returns {object[]} The original `rows` when no selection is given, otherwise
 *   a new array with the matching rows in their original order.
 */
function filterScenarioIds(rows, requestedIds) {
  const hasSelection = Array.isArray(requestedIds) && requestedIds.length > 0
  if (!hasSelection) return rows

  const allowedIds = new Set(requestedIds)
  return rows.filter(({ id }) => allowedIds.has(id))
}
1658  
/**
 * Renders a benchmark report as a Markdown document.
 *
 * Sections, in order: header summary, category score table, session scenario
 * table, chatroom scenario table, OpenClaw status, and (only when
 * `report.previous` is set) the delta against the previous run.
 *
 * @param {object} report - Benchmark report object.
 * @returns {string} Markdown text ending with a newline.
 */
function renderMarkdown(report) {
  const yesNo = (flag) => (flag ? 'yes' : 'no')
  const categoryRow = (label, entry) =>
    `| ${label} | ${entry.score} | ${entry.max} | ${yesNo(entry.passed)} |`

  const doc = [
    '# Autonomy Harness Benchmark',
    '',
    `- Generated: ${report.generatedAt}`,
    `- Base URL: ${report.baseUrl}`,
    `- Grade: **${report.summary.grade}** (${report.summary.totalScore}/${report.summary.maxScore})`,
    `- Min Score Threshold: ${report.summary.minScore}`,
    `- Result: ${report.summary.passed ? 'PASS' : 'FAIL'}`,
    '',
    '## Category Scores',
    '',
    '| Category | Score | Max | Pass |',
    '| --- | ---: | ---: | :---: |',
    categoryRow('Session Skills', report.categoryScores.session),
    categoryRow('Chatroom Collaboration', report.categoryScores.chatroom),
    categoryRow('Collaboration Diversity', report.categoryScores.modelDiversity),
    '',
    '## Session Skills',
    '',
    '| Scenario | Skill | Score | Tool Coverage | Semantic | External | Errors |',
    '| --- | --- | ---: | ---: | :---: | :---: | :---: |',
  ]

  for (const { id, skill, score, weight, checks } of report.sessionScenarios) {
    // `external` is null when the scenario has no post-run check.
    const external = checks.external === null ? 'n/a' : yesNo(checks.external)
    doc.push(`| ${id} | ${skill} | ${score}/${weight} | ${checks.toolCoverage}% | ${yesNo(checks.semantic)} | ${external} | ${yesNo(checks.noErrors)} |`)
  }

  doc.push(
    '',
    '## Chatroom Collaboration',
    '',
    '| Scenario | Mode | Score | Participation | Action | Delegation | Empathy | Errors |',
    '| --- | --- | ---: | ---: | :---: | :---: | :---: | :---: |',
  )

  for (const { id, mode, score, weight, checks } of report.chatroomScenarios) {
    doc.push(`| ${id} | ${mode} | ${score}/${weight} | ${checks.participation}% | ${yesNo(checks.actionSignal)} | ${yesNo(checks.delegationSignal)} | ${yesNo(checks.empathySignal)} | ${yesNo(checks.noErrors)} |`)
  }

  doc.push(
    '',
    '## OpenClaw Comparison',
    '',
    `- Status: ${report.openclaw.status}`,
    `- Notes: ${report.openclaw.notes}`,
    '',
  )

  if (report.previous) {
    // Positive deltas get an explicit '+'; negatives already carry their sign.
    const sign = report.previous.deltaScore > 0 ? '+' : ''
    doc.push(
      '## Previous Run Delta',
      '',
      `- Previous: ${report.previous.path}`,
      `- Score Change: ${sign}${report.previous.deltaScore}`,
      `- Grade Change: ${report.previous.prevGrade} -> ${report.summary.grade}`,
      '',
    )
  }

  return `${doc.join('\n')}\n`
}
1709  
/**
 * Sends one scenario prompt to a chat session and collects the streamed result.
 *
 * @param {object} client - API client ({ baseUrl, accessKey }).
 * @param {string} sessionId - Target chat session id.
 * @param {object} scenario - Scenario supplying `id`, `prompt` and `timeoutMs`.
 * @returns {Promise<object>} Turn row: id, duration, tool calls/errors, stream
 *   errors, the noise-stripped response and a 340-character summary of it.
 */
async function runSessionTurn(client, sessionId, scenario) {
  const chatEndpoint = `/api/chats/${encodeURIComponent(sessionId)}/chat`
  const payload = { message: scenario.prompt }
  const stream = await postSse(client, chatEndpoint, payload, scenario.timeoutMs)

  const { toolCalls, toolErrors, streamErrors } = collectToolStats(stream.events)
  const response = stripRunNoise(stream.text)
  const responseSummary = summarize(response, 340)

  return {
    id: scenario.id,
    durationMs: stream.durationMs,
    toolCalls,
    toolErrors,
    streamErrors,
    response,
    responseSummary,
  }
}
1729  
/**
 * Posts one scenario prompt to a chatroom and captures what changed.
 *
 * The chatroom is fetched before and after the streamed turn; messages beyond
 * the earlier message count are treated as the turn's new messages.
 *
 * @param {object} client - API client ({ baseUrl, accessKey }).
 * @param {string} chatroomId - Target chatroom id.
 * @param {object} scenario - Scenario supplying `id`, `prompt` and `timeoutMs`.
 * @param {string[]} expectedAgentIds - Agents expected to respond (echoed back).
 * @returns {Promise<object>} Turn row with responders, tool calls, stream
 *   errors and the new messages.
 */
async function runChatroomTurn(client, chatroomId, scenario, expectedAgentIds) {
  const roomPath = `/api/chatrooms/${encodeURIComponent(chatroomId)}`
  const loadMessages = async () => {
    const room = await fetchJson(client, 'GET', roomPath)
    return Array.isArray(room?.messages) ? room.messages : []
  }

  const baselineCount = (await loadMessages()).length
  const stream = await postSse(
    client,
    `${roomPath}/chat`,
    { senderId: 'user', text: scenario.prompt },
    scenario.timeoutMs,
  )
  const freshMessages = (await loadMessages()).slice(baselineCount)

  // Distinct agent senders, in order of first appearance.
  const responders = new Set()
  for (const msg of freshMessages) {
    if (msg && msg.senderId && msg.senderId !== 'user' && msg.senderId !== 'system') {
      responders.add(String(msg.senderId))
    }
  }

  const { toolCalls } = collectToolStats(stream.events)
  const errors = stream.events.flatMap((event) => (
    event?.t === 'err' ? [summarize(event.text || 'unknown error', 180)] : []
  ))

  return {
    id: scenario.id,
    durationMs: stream.durationMs,
    expectedAgentIds,
    respondedAgentIds: [...responders],
    toolCalls,
    errors,
    newMessages: freshMessages.map(({ senderId, senderName, text, mentions }) => ({
      senderId,
      senderName,
      text,
      mentions: Array.isArray(mentions) ? mentions : [],
    })),
  }
}
1767  
/**
 * Picks up to three chatroom participants: the probe agent plus at most two
 * collaborators.
 *
 * Collaborators must look usable (see `isHealthyCandidate`), must not be the
 * probe agent and must not be named like an autonomy probe agent. They are
 * ranked by how much they differ from the probe agent (model, tools,
 * credential, endpoint) with a small bonus for role-like names/descriptions.
 *
 * @param {Object<string, object>} agents - Agents keyed by id.
 * @param {object} probeAgent - The benchmark's probe agent.
 * @returns {string[]} Selected agent ids, probe agent first.
 */
function selectChatroomAgentIds(agents, probeAgent) {
  // Order-independent, de-duplicated tool list rendered as a comparable string.
  const toolSignature = (tools) => {
    if (!Array.isArray(tools)) return ''
    const names = new Set()
    for (const tool of tools) {
      const name = String(tool || '').trim()
      if (name) names.add(name)
    }
    return [...names].sort().join(',')
  }

  const probeAgentId = probeAgent?.id
  const probeModelKey = `${probeAgent?.provider || ''}:${probeAgent?.model || ''}`.toLowerCase()
  const probeToolKey = toolSignature(getAgentTools(probeAgent))
  const probeHasCred = Boolean(probeAgent?.credentialId)
  const probeHasEndpoint = Boolean(probeAgent?.apiEndpoint)

  // openclaw is never selected; ollama needs an endpoint; *-cli providers need
  // nothing extra; every other provider needs a credential.
  const isHealthyCandidate = (agent) => {
    const provider = String(agent?.provider || '').toLowerCase()
    if (provider === '' || provider === 'openclaw') return false
    if (provider === 'ollama') return Boolean(agent?.apiEndpoint)
    return provider.endsWith('-cli') || Boolean(agent?.credentialId)
  }

  const isEligible = (id, agent) => {
    if (!agent || id === probeAgentId) return false
    if (!isHealthyCandidate(agent)) return false
    const name = String(agent.name || '').toLowerCase()
    return !(name.includes('probe autonomy') || name.includes('[autonomy probe'))
  }

  const differenceScore = (agent) => {
    const modelKey = `${agent.provider || ''}:${agent.model || ''}`.toLowerCase()
    const toolKey = toolSignature(getAgentTools(agent))
    const text = `${agent.name || ''} ${agent.description || ''}`.toLowerCase()
    const awards = [
      [modelKey !== probeModelKey, 4],
      [toolKey !== probeToolKey, 3],
      [Boolean(agent.credentialId) !== probeHasCred, 1],
      [Boolean(agent.apiEndpoint) !== probeHasEndpoint, 1],
      [/(research|build|assistant|planner|coder|qa|ops|orchestr)/.test(text), 1],
    ]
    return awards.reduce((total, [earned, points]) => (earned ? total + points : total), 0)
  }

  const rankedIds = Object.entries(agents)
    .filter(([id, agent]) => isEligible(id, agent))
    .map(([id, agent]) => ({ id, score: differenceScore(agent) }))
    .sort((a, b) => b.score - a.score)
    .map((candidate) => candidate.id)

  // Keep assistant as first collaborator when available for consistent baseline UX.
  const collaborators = []
  if (agents.default && probeAgentId !== 'default' && isHealthyCandidate(agents.default)) {
    collaborators.push('default')
  }

  const remainingSlots = 2 - collaborators.length
  const extras = rankedIds
    .filter((id) => !collaborators.includes(id))
    .slice(0, remainingSlots)

  return [probeAgentId, ...collaborators, ...extras].filter(Boolean).slice(0, 3)
}
1820  
/**
 * Best-effort removal of everything a benchmark run created.
 *
 * Deletes, in order: chatrooms, sessions, agents, tasks whose title contains
 * `[Autonomy Probe <runTag>]`, projects, and finally workspace directories on
 * disk. Failures never throw; each one is recorded as a warning string and the
 * cleanup continues with the next item.
 *
 * @param {object} client - API client ({ baseUrl, accessKey }).
 * @param {object} ids - Created resource ids: `chatrooms`, `sessions`, `agents`,
 *   `projects` and `workspaces` (directory paths). Missing lists are treated as empty.
 * @param {string} runTag - Tag identifying this benchmark run's tasks.
 * @returns {Promise<string[]>} Warnings for every cleanup step that failed.
 */
async function cleanupBenchmarkArtifacts(client, ids, runTag) {
  const warnings = []
  const describeError = (err) => (err instanceof Error ? err.message : String(err))

  // Deletes each id under `basePath`, recording (not throwing) individual failures.
  const deleteEach = async (label, basePath, resourceIds) => {
    for (const resourceId of resourceIds || []) {
      try {
        await fetchJson(client, 'DELETE', `${basePath}/${encodeURIComponent(resourceId)}`)
      } catch (err) {
        warnings.push(`cleanup ${label} ${resourceId}: ${describeError(err)}`)
      }
    }
  }

  await deleteEach('chatroom', '/api/chatrooms', ids.chatrooms)
  await deleteEach('session', '/api/chats', ids.sessions)
  await deleteEach('agent', '/api/agents', ids.agents)

  // Listing tasks can fail as a whole; in that case there is nothing to iterate.
  let taskRows = []
  try {
    const tasks = await fetchJson(client, 'GET', '/api/tasks')
    taskRows = Object.values(tasks || {})
  } catch (err) {
    warnings.push(`cleanup benchmark tasks: ${describeError(err)}`)
  }

  const probeMarker = `[Autonomy Probe ${runTag}]`
  for (const row of taskRows) {
    if (!row || typeof row !== 'object') continue
    const id = row.id
    const title = String(row.title || '')
    if (!id || !title.includes(probeMarker)) continue
    // Each task is deleted in its own try/catch so that one failed DELETE does
    // not leave the remaining benchmark tasks behind.
    try {
      await fetchJson(client, 'DELETE', `/api/tasks/${encodeURIComponent(id)}`)
    } catch (err) {
      warnings.push(`cleanup task ${id}: ${describeError(err)}`)
    }
  }

  await deleteEach('project', '/api/projects', ids.projects)

  for (const workspaceRoot of ids.workspaces || []) {
    try {
      fs.rmSync(workspaceRoot, { recursive: true, force: true })
    } catch (err) {
      warnings.push(`cleanup workspace ${workspaceRoot}: ${describeError(err)}`)
    }
  }

  return warnings
}
1880  
1881  async function main() {
1882    const options = parseArgs(process.argv.slice(2))
1883    const profile = resolveProbeProfile(options.profile)
1884    const accessKey = loadAccessKey(options.accessKey)
1885    const client = { baseUrl: options.baseUrl, accessKey }
1886    const runTag = toSlug(nowSlug())
1887    const probeTitle = `[Autonomy Probe ${runTag}]`
1888    const createdIds = { agents: [], sessions: [], chatrooms: [], projects: [], workspaces: [] }
1889    const warnings = []
1890  
1891    ensureDir(options.outDir)
1892    const previous = readLatestBenchmark(options.outDir, profile.id)
1893  
1894    await fetchJson(client, 'GET', '/api/auth')
1895    const agents = await fetchJson(client, 'GET', '/api/agents')
1896    const defaultAgent = agents?.default || Object.values(agents || {})[0]
1897    if (!defaultAgent) {
1898      throw new Error('No agent found. Configure at least one agent before running benchmark.')
1899    }
1900  
1901    const workspaceFixture = await prepareWorkspaceFixture(client, runTag, profile, createdIds)
1902  
1903    const probeAgent = await fetchJson(client, 'POST', '/api/agents', {
1904      name: `${probeTitle} Agent`,
1905      description: 'Temporary autonomy benchmark agent',
1906      systemPrompt: defaultAgent.systemPrompt || '',
1907      provider: defaultAgent.provider || 'openai',
1908      model: defaultAgent.model || 'gpt-4o',
1909      credentialId: defaultAgent.credentialId || null,
1910      apiEndpoint: defaultAgent.apiEndpoint || null,
1911      tools: profile.tools,
1912      delegationEnabled: true,
1913      delegationTargetMode: 'all',
1914      delegationTargetAgentIds: [],
1915      memoryScopeMode: profile.hasProjectContext ? 'project' : 'auto',
1916      projectId: workspaceFixture.project?.id || undefined,
1917    })
1918    createdIds.agents.push(probeAgent.id)
1919  
1920    const probeSession = await fetchJson(client, 'POST', '/api/chats', {
1921      name: `${probeTitle} Session`,
1922      agentId: probeAgent.id,
1923      provider: probeAgent.provider,
1924      model: probeAgent.model,
1925      credentialId: probeAgent.credentialId || null,
1926      apiEndpoint: probeAgent.apiEndpoint || null,
1927      extensions: getAgentTools(probeAgent),
1928      user: 'benchmark',
1929      cwd: workspaceFixture.workspaceRoot,
1930    })
1931    createdIds.sessions.push(probeSession.id)
1932  
1933    const memoryRecallSession = await fetchJson(client, 'POST', '/api/chats', {
1934      name: `${probeTitle} Memory recall`,
1935      agentId: probeAgent.id,
1936      provider: probeAgent.provider,
1937      model: probeAgent.model,
1938      credentialId: probeAgent.credentialId || null,
1939      apiEndpoint: probeAgent.apiEndpoint || null,
1940      extensions: getAgentTools(probeAgent),
1941      user: 'benchmark',
1942      cwd: workspaceFixture.workspaceRoot,
1943    })
1944    createdIds.sessions.push(memoryRecallSession.id)
1945  
1946    const sessionScenarios = filterScenarioIds(
1947      buildSessionScenarios(runTag, defaultAgent.id, profile, workspaceFixture),
1948      options.sessionScenarios,
1949    )
1950    if (sessionScenarios.length === 0) {
1951      throw new Error('No session scenarios selected. Check --session-scenarios values.')
1952    }
1953    const sessionResults = []
1954    const sessionEvaluated = []
1955    for (const scenario of sessionScenarios) {
1956      const targetSessionId = scenario.id === 'memory_significant_recall'
1957        ? memoryRecallSession.id
1958        : probeSession.id
1959      const row = await runSessionTurn(client, targetSessionId, scenario)
1960      let postCheck = null
1961      if (typeof scenario.postRunCheck === 'function') {
1962        try {
1963          postCheck = await scenario.postRunCheck({
1964            client,
1965            runTag,
1966            sessionId: targetSessionId,
1967            row,
1968          })
1969        } catch (err) {
1970          postCheck = {
1971            name: 'post_check_error',
1972            passed: false,
1973            details: { error: err instanceof Error ? err.message : String(err) },
1974          }
1975        }
1976      }
1977      sessionResults.push(row)
1978      sessionEvaluated.push(evaluateSessionScenario(scenario, row, postCheck))
1979      await sleep(250)
1980    }
1981  
1982    const roomAgentIds = options.skipChatrooms ? [] : selectChatroomAgentIds(agents, probeAgent)
1983    const roomAgents = options.skipChatrooms
1984      ? []
1985      : roomAgentIds.map((id) => agents[id] || (id === probeAgent.id ? probeAgent : null)).filter(Boolean)
1986    const chatroomResults = []
1987    const chatroomEvaluated = []
1988  
1989    if (!options.skipChatrooms) {
1990      for (const scenario of CHATROOM_SCENARIOS) {
1991        const room = await fetchJson(client, 'POST', '/api/chatrooms', {
1992          name: `${probeTitle} ${scenario.mode} room`,
1993          description: `${scenario.mode} benchmark room`,
1994          agentIds: roomAgentIds,
1995        })
1996        createdIds.chatrooms.push(room.id)
1997        const modeSet = setChatroomHarnessFlags(room.id, {
1998          chatMode: scenario.mode,
1999          autoAddress: scenario.autoAddress,
2000        })
2001        if (!modeSet) {
2002          warnings.push(`Could not set chatroom mode flags for ${room.id}; benchmark fell back to room defaults.`)
2003        }
2004        const row = await runChatroomTurn(client, room.id, scenario, roomAgentIds)
2005        chatroomResults.push(row)
2006        chatroomEvaluated.push(evaluateChatroomScenario(scenario, row, roomAgentIds))
2007        await sleep(250)
2008      }
2009    } else {
2010      warnings.push('Chatroom scenarios skipped by --skip-chatrooms.')
2011    }
2012  
2013    const openclawAgent = options.includeOpenclaw
2014      ? Object.values(agents || {}).find((agent) => agent && String(agent.provider || '').toLowerCase() === 'openclaw')
2015      : null
2016    let openclawSession = null
2017    const openclawResults = []
2018    if (openclawAgent) {
2019      try {
2020        openclawSession = await fetchJson(client, 'POST', '/api/chats', {
2021          name: `${probeTitle} OpenClaw compare`,
2022          agentId: openclawAgent.id,
2023          provider: openclawAgent.provider,
2024          model: openclawAgent.model,
2025          credentialId: openclawAgent.credentialId || null,
2026          apiEndpoint: openclawAgent.apiEndpoint || null,
2027          extensions: getAgentTools(openclawAgent),
2028          user: 'benchmark',
2029        })
2030        createdIds.sessions.push(openclawSession.id)
2031        for (const scenario of OPENCLAW_SCENARIOS) {
2032          const row = await runSessionTurn(client, openclawSession.id, scenario)
2033          openclawResults.push(row)
2034          await sleep(250)
2035        }
2036      } catch (err) {
2037        warnings.push(`OpenClaw comparison skipped: ${err instanceof Error ? err.message : String(err)}`)
2038      }
2039    }
2040  
2041    const modelDiversity = options.skipChatrooms
2042      ? {
2043          weight: 0,
2044          score: 0,
2045          passed: true,
2046          checks: {
2047            uniqueModels: 0,
2048            uniqueModelFamilies: 0,
2049            uniqueCapabilityProfiles: 0,
2050            uniqueToolProfiles: 0,
2051            agentCount: 0,
2052            diversityPct: 0,
2053            familyDiversityPct: 0,
2054            capabilityDiversityPct: 0,
2055            toolProfileDiversityPct: 0,
2056            specializationPct: 0,
2057          },
2058          participants: [],
2059        }
2060      : evaluateModelDiversity(roomAgents)
2061    const sessionScore = round1(sessionEvaluated.reduce((sum, row) => sum + row.score, 0))
2062    const sessionMax = round1(sessionEvaluated.reduce((sum, row) => sum + row.weight, 0))
2063    const chatroomScore = round1(chatroomEvaluated.reduce((sum, row) => sum + row.score, 0))
2064    const chatroomMax = round1(chatroomEvaluated.reduce((sum, row) => sum + row.weight, 0))
2065    const totalScore = round1(sessionScore + chatroomScore + modelDiversity.score)
2066    const maxScore = round1(sessionMax + chatroomMax + modelDiversity.weight)
2067    const normalizedScore = maxScore > 0 ? round1((totalScore / maxScore) * 100) : 0
2068    const grade = gradeForScore(normalizedScore)
2069    const openclawSummary = evaluateOpenclawComparison(openclawResults)
2070    const totalDurationMs = sessionResults.reduce((sum, row) => sum + row.durationMs, 0)
2071      + chatroomResults.reduce((sum, row) => sum + row.durationMs, 0)
2072    const totalToolCalls = sessionResults.reduce((sum, row) => sum + row.toolCalls.length, 0)
2073      + chatroomResults.reduce((sum, row) => sum + row.toolCalls.length, 0)
2074  
2075    let previousSummary = null
2076    if (previous?.report?.summary?.totalScore !== undefined) {
2077      const prevScore = Number(previous.report.summary.totalScore) || 0
2078      const deltaScore = round1(totalScore - prevScore)
2079      previousSummary = {
2080        path: previous.path,
2081        prevScore: round1(prevScore),
2082        prevGrade: String(previous.report.summary.grade || '?'),
2083        deltaScore,
2084      }
2085    }
2086  
2087    if (!options.keepCreated) {
2088      const cleanupWarnings = await cleanupBenchmarkArtifacts(client, createdIds, runTag)
2089      warnings.push(...cleanupWarnings)
2090    }
2091  
2092    const report = {
2093      schemaVersion: 1,
2094      generatedAt: new Date().toISOString(),
2095      baseUrl: client.baseUrl,
2096      runTag,
2097      profile: {
2098        id: profile.id,
2099        label: profile.label,
2100        tools: [...profile.tools],
2101        hasTaskManagement: profile.hasTaskManagement,
2102        hasProjectContext: profile.hasProjectContext,
2103        hasProjectTool: profile.hasProjectTool,
2104        hasProjectOperations: profile.hasProjectOperations,
2105        notes: profile.hasProjectContext
2106          ? 'Project context uses a real Project record, a workspace under WORKSPACE_ROOT/projects/<projectId>, structured project metadata, and project-linked tasks/schedules/secrets when those tools are enabled.'
2107          : 'This profile does not enable project context; comparisons isolate task management against file-based fallback workflows.',
2108      },
2109      options: {
2110        sessionScenarioIds: sessionScenarios.map((scenario) => scenario.id),
2111        skipChatrooms: options.skipChatrooms,
2112        includeOpenclaw: options.includeOpenclaw,
2113      },
2114      summary: {
2115        totalScore,
2116        maxScore,
2117        normalizedScore,
2118        grade,
2119        minScore: options.minScore,
2120        passed: normalizedScore >= options.minScore,
2121        totalDurationMs,
2122        totalToolCalls,
2123      },
2124      categoryScores: {
2125        session: {
2126          score: sessionScore,
2127          max: sessionMax,
2128          passed: sessionScore >= sessionMax * 0.7,
2129        },
2130        chatroom: {
2131          score: chatroomScore,
2132          max: chatroomMax,
2133          passed: chatroomScore >= chatroomMax * 0.7,
2134        },
2135        modelDiversity: {
2136          score: modelDiversity.score,
2137          max: modelDiversity.weight,
2138          passed: modelDiversity.passed,
2139        },
2140      },
2141      probe: {
2142        profileId: profile.id,
2143        probeAgent: { id: probeAgent.id, name: probeAgent.name, provider: probeAgent.provider, model: probeAgent.model },
2144        probeSession: { id: probeSession.id, name: probeSession.name },
2145        workspaceRoot: workspaceFixture.workspaceRoot,
2146        project: workspaceFixture.project ? {
2147          id: workspaceFixture.project.id,
2148          name: workspaceFixture.project.name,
2149          description: workspaceFixture.project.description,
2150        } : null,
2151        chatroomAgentIds: roomAgentIds,
2152      },
2153      sessionScenarios: sessionEvaluated,
2154      sessionRaw: sessionResults,
2155      chatroomScenarios: chatroomEvaluated,
2156      chatroomRaw: chatroomResults,
2157      modelDiversity,
2158      openclaw: {
2159        ...openclawSummary,
2160        sessionId: openclawSession?.id || null,
2161        results: openclawResults,
2162      },
2163      previous: previousSummary,
2164      warnings: [...warnings],
2165    }
2166  
2167    const fileStem = `autonomy-benchmark-${profile.id}-${runTag}`
2168    const jsonPath = path.join(options.outDir, `${fileStem}.json`)
2169    const markdownPath = path.join(options.outDir, `${fileStem}.md`)
2170    fs.writeFileSync(jsonPath, JSON.stringify(report, null, 2))
2171    fs.writeFileSync(markdownPath, renderMarkdown(report))
2172  
2173    const summaryLine = `${report.summary.passed ? 'PASS' : 'FAIL'} ${report.summary.grade} ${report.summary.normalizedScore}/100`
2174    console.log(JSON.stringify({
2175      summary: summaryLine,
2176      jsonPath,
2177      markdownPath,
2178      openclaw: report.openclaw.status,
2179      warnings: report.warnings,
2180    }, null, 2))
2181  
2182    if (!report.summary.passed) {
2183      process.exit(2)
2184    }
2185  }
2186  
// Script entry point. Any rejection escaping main() is reported on stderr as a
// pretty-printed JSON object of the form { "error": "<message>" }, and the
// process then terminates with exit status 1. (main() itself exits with
// status 2 when the benchmark report does not pass, so the two outcomes stay
// distinguishable to callers.)
main().catch((failure) => {
  // Thrown values are not guaranteed to be Error instances; stringify the rest.
  let text
  if (failure instanceof Error) {
    text = failure.message
  } else {
    text = String(failure)
  }
  const payload = JSON.stringify({ error: text }, null, 2)
  console.error(payload)
  process.exit(1)
})