// supervisor-reflection.test.ts
import assert from 'node:assert/strict'
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import { spawnSync } from 'node:child_process'
import { describe, it } from 'node:test'
import { fileURLToPath } from 'node:url'

import { assessAutonomyRun } from '@/lib/server/autonomy/supervisor-reflection'
import type { Session } from '@/types'

// Resolve the repository root relative to this test file. fileURLToPath is used
// instead of URL#pathname so that percent-encoded characters (e.g. spaces) and
// Windows drive letters are converted into a real filesystem path.
const repoRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../../../..')

/**
 * Runs `script` as an ES module in a child Node process (with the `tsx` loader)
 * whose DATA_DIR and WORKSPACE_DIR point at a fresh temporary directory, then
 * returns the last stdout line that starts with `{`, parsed as JSON.
 *
 * The temporary directory is removed afterwards, whether or not the run succeeds.
 *
 * @param script - Module source passed to `node --eval`; it is expected to print
 *   its result as a single-line JSON object on stdout.
 * @returns The parsed JSON object printed by the script.
 * @throws AssertionError when the subprocess cannot be spawned, times out, exits
 *   non-zero, or prints no JSON object line.
 */
function runWithTempDataDir(script: string) {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'swarmclaw-supervisor-reflection-'))
  try {
    const result = spawnSync(
      process.execPath,
      ['--import', 'tsx', '--input-type=module', '--eval', script],
      {
        cwd: repoRoot,
        env: {
          ...process.env,
          DATA_DIR: tempDir,
          WORKSPACE_DIR: path.join(tempDir, 'workspace'),
          SWARMCLAW_BUILD_MODE: '1',
        },
        encoding: 'utf-8',
        timeout: 20000,
      },
    )

    // On spawn failure or timeout `status` is null and the cause is only visible
    // through `error` / `signal`, so surface those alongside the captured output.
    const failureDetail = [
      result.error ? `spawn error: ${result.error.message}` : '',
      result.signal ? `terminated by signal ${result.signal}` : '',
      result.stderr || result.stdout || '',
    ]
      .filter(Boolean)
      .join('\n') || 'subprocess failed'
    assert.equal(result.status, 0, failureDetail)

    const lines = (result.stdout || '')
      .trim()
      .split('\n')
      .map((line) => line.trim())
      .filter(Boolean)
    // The script may log other output first; the result is the last JSON object line.
    const jsonLine = [...lines].reverse().find((line) => line.startsWith('{'))
    // Fail here with the raw stdout rather than returning `{}` and letting the
    // caller's assertions fail on `undefined` fields.
    assert.ok(
      jsonLine,
      `subprocess printed no JSON result line; stdout was:\n${result.stdout || '(empty)'}`,
    )
    return JSON.parse(jsonLine) as Record<string, unknown>
  } finally {
    fs.rmSync(tempDir, { recursive: true, force: true })
  }
}

describe('supervisor-reflection', () => {
  // Three identical shell tool events with supervisorRepeatedToolLimit = 3 should
  // yield a repeated_tool incident and a non-blocking intervention prompt.
  it('recommends an automatic supervisor recovery step for repeated tool thrash', () => {
    const session: Session = {
      id: 'session-1',
      name: 'Autonomy Test',
      cwd: process.cwd(),
      user: 'tester',
      provider: 'openai',
      model: 'gpt-test',
      claudeSessionId: null,
      messages: [],
      createdAt: Date.now(),
      lastActiveAt: Date.now(),
    }
    const assessment = assessAutonomyRun({
      runId: 'run-1',
      sessionId: 'session-1',
      source: 'chat',
      status: 'completed',
      resultText: 'Retried the same shell command and got the same output.',
      toolEvents: [
        { name: 'shell', input: '{"cmd":"npm test"}' },
        { name: 'shell', input: '{"cmd":"npm test"}' },
        { name: 'shell', input: '{"cmd":"npm test"}' },
      ],
      mainLoopState: {
        followupChainCount: 1,
        summary: 'Retried the same shell command and got the same output.',
      },
      settings: {
        supervisorEnabled: true,
        supervisorRuntimeScope: 'both',
        supervisorRepeatedToolLimit: 3,
        supervisorNoProgressLimit: 2,
        reflectionEnabled: true,
        reflectionAutoWriteMemory: true,
      },
      session,
    })

    assert.ok(assessment.incidents.some((incident) => incident.kind === 'repeated_tool'))
    assert.match(String(assessment.interventionPrompt || ''), /stop repeating shell/i)
    assert.equal(assessment.shouldBlock, false)
  })

  // Runs observeAutonomyRunOutcome in a subprocess against temporary storage with
  // a stubbed generateText, then checks the stored reflection and the memory
  // entries whose metadata.origin is 'autonomy-reflection'.
  it('persists reflections and auto-written reflection memory', () => {
    const output = runWithTempDataDir(`
      const storageMod = await import('@/lib/server/storage')
      const storage = storageMod.default || storageMod['module.exports'] || storageMod
      const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
      const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod
      const memoryDbMod = await import('@/lib/server/memory/memory-db')
      const memoryMod = memoryDbMod.default || memoryDbMod['module.exports'] || memoryDbMod

      storage.saveAgents({
        'agent-a': {
          id: 'agent-a',
          name: 'Agent A',
          provider: 'openai',
          model: 'gpt-test',
        },
      })

      storage.saveSessions({
        s1: {
          id: 's1',
          name: 'Autonomy Session',
          cwd: process.cwd(),
          user: 'tester',
          provider: 'openai',
          model: 'gpt-test',
          claudeSessionId: null,
          messages: [
            { role: 'user', text: 'Repair the deployment workflow and keep notes for later.', time: 1 },
            { role: 'assistant', text: 'I retried the same shell path and nothing changed.', time: 2 },
          ],
          createdAt: 1,
          lastActiveAt: 2,
          sessionType: 'human',
          agentId: 'agent-a',
        },
      })

      storage.saveSettings({
        supervisorEnabled: true,
        supervisorRuntimeScope: 'both',
        supervisorNoProgressLimit: 2,
        supervisorRepeatedToolLimit: 3,
        reflectionEnabled: true,
        reflectionAutoWriteMemory: true,
      })

      const result = await mod.observeAutonomyRunOutcome({
        runId: 'run-1',
        sessionId: 's1',
        agentId: 'agent-a',
        source: 'chat',
        status: 'completed',
        resultText: 'I retried the same shell path and nothing changed.',
        toolEvents: [
          { name: 'shell', input: '{"cmd":"npm test"}' },
          { name: 'shell', input: '{"cmd":"npm test"}' },
          { name: 'shell', input: '{"cmd":"npm test"}' },
        ],
        mainLoopState: {
          followupChainCount: 2,
          summary: 'I retried the same shell path and nothing changed.',
        },
        sourceMessage: 'Repair the deployment workflow and keep notes for later.',
      }, {
        generateText: async () => JSON.stringify({
          summary: 'Deployment repair reflection',
          invariants: ['Verify changed files and command output before marking the task complete.'],
          derived: ['Switch recovery strategy after two identical shell failures in a row.'],
          failures: ['Repeated shell retries without changing inputs waste budget.'],
          lessons: ['Capture a short recovery brief before continuing a stuck run.'],
          communication: ['Keep execution updates concise when reporting repair progress.'],
          relationship: ['Treat the user as wanting decisive recovery rather than repeated status chatter.'],
          significant_events: ['The deployment workflow is currently broken and needs a confirmed repair path.'],
          profile: ['The user is directly responsible for the deployment workflow.'],
          boundaries: ['Do not claim the repair is complete without concrete verification evidence.'],
          open_loops: ['Follow up with the final verification result once the repair path succeeds.'],
        }),
      })

      const memories = memoryMod.getMemoryDb().list(undefined, 50)
        .filter((entry) => entry.metadata && entry.metadata.origin === 'autonomy-reflection')

      console.log(JSON.stringify({
        incidentKinds: result.incidents.map((incident) => incident.kind).sort(),
        reflectionSummary: result.reflection?.summary ?? null,
        reflectionCount: mod.listRunReflections({ sessionId: 's1' }).length,
        autoMemoryCount: result.reflection?.autoMemoryIds?.length ?? 0,
        memoryCategories: memories.map((entry) => entry.category).sort(),
        profileNotes: result.reflection?.profileNotes ?? [],
        boundaryNotes: result.reflection?.boundaryNotes ?? [],
        openLoopNotes: result.reflection?.openLoopNotes ?? [],
      }))
    `)

    assert.deepEqual(output.incidentKinds, ['no_progress', 'repeated_tool'])
    assert.equal(output.reflectionSummary, 'Deployment repair reflection')
    assert.equal(output.reflectionCount, 1)
    // One auto-written memory per note category supplied by the stubbed generateText.
    assert.equal(output.autoMemoryCount, 10)
    assert.deepEqual(output.profileNotes, ['The user is directly responsible for the deployment workflow.'])
    assert.deepEqual(output.boundaryNotes, ['Do not claim the repair is complete without concrete verification evidence.'])
    assert.deepEqual(output.openLoopNotes, ['Follow up with the final verification result once the repair path succeeds.'])
    assert.deepEqual(output.memoryCategories, [
      'reflection/boundary',
      'reflection/communication',
      'reflection/derived',
      'reflection/failure',
      'reflection/invariant',
      'reflection/lesson',
      'reflection/open_loop',
      'reflection/profile',
      'reflection/relationship',
      'reflection/significant_event',
    ])
  })

  // A two-message human session with no tool events: the user message carries
  // semantics flags (hasHumanSignals / hasSignificantEvent) and the run is still
  // expected to produce a reflection from the stubbed generateText.
  it('reflects short human chats when they contain durable personal context', () => {
    const output = runWithTempDataDir(`
      const storageMod = await import('@/lib/server/storage')
      const storage = storageMod.default || storageMod['module.exports'] || storageMod
      const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
      const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod

      storage.saveAgents({
        'agent-a': {
          id: 'agent-a',
          name: 'Agent A',
          provider: 'openai',
          model: 'gpt-test',
        },
      })

      storage.saveSessions({
        s2: {
          id: 's2',
          name: 'Human Context Session',
          cwd: process.cwd(),
          user: 'tester',
          provider: 'openai',
          model: 'gpt-test',
          claudeSessionId: null,
          messages: [
            {
              role: 'user',
              text: 'I am moving to Lisbon next month and prefer short check-ins while I am juggling the move.',
              time: 1,
              semantics: {
                taskIntent: 'general',
                workType: 'general',
                isDeliverableTask: false,
                isBroadGoal: false,
                isResearchSynthesis: false,
                hasHumanSignals: true,
                hasSignificantEvent: true,
                wantsScreenshots: false,
                wantsOutboundDelivery: false,
                wantsVoiceDelivery: false,
                explicitToolRequests: [],
                confidence: 0.98,
              },
            },
            { role: 'assistant', text: 'Understood. I will keep updates tight and remember the move timing.', time: 2 },
          ],
          createdAt: 1,
          lastActiveAt: 2,
          sessionType: 'human',
          agentId: 'agent-a',
        },
      })

      storage.saveSettings({
        supervisorEnabled: true,
        supervisorRuntimeScope: 'both',
        supervisorNoProgressLimit: 2,
        supervisorRepeatedToolLimit: 3,
        reflectionEnabled: true,
        reflectionAutoWriteMemory: true,
      })

      const result = await mod.observeAutonomyRunOutcome({
        runId: 'run-human',
        sessionId: 's2',
        agentId: 'agent-a',
        source: 'chat',
        status: 'completed',
        resultText: 'I will keep updates tight and remember the move timing.',
        sourceMessage: 'I am moving to Lisbon next month and prefer short check-ins while I am juggling the move.',
      }, {
        generateText: async () => JSON.stringify({
          summary: 'Human context reflection',
          communication: ['Prefer short check-ins while the move is in progress.'],
          significant_events: ['Moving to Lisbon next month.'],
          open_loops: ['Check in again once the move is complete.'],
          profile: ['Currently planning a move to Lisbon.'],
        }),
      })

      console.log(JSON.stringify({
        reflectionSummary: result.reflection?.summary ?? null,
        communicationNotes: result.reflection?.communicationNotes ?? [],
        significantEventNotes: result.reflection?.significantEventNotes ?? [],
        openLoopNotes: result.reflection?.openLoopNotes ?? [],
      }))
    `)

    assert.equal(output.reflectionSummary, 'Human context reflection')
    assert.deepEqual(output.communicationNotes, ['Prefer short check-ins while the move is in progress.'])
    assert.deepEqual(output.significantEventNotes, ['Moving to Lisbon next month.'])
    assert.deepEqual(output.openLoopNotes, ['Check in again once the move is complete.'])
  })

  // The stored session has an older assistant message with five browser tool
  // events; the observed run passes no toolEvents, and no incidents are expected.
  it('does not reuse stale assistant tool events when the current chat turn used none', () => {
    const output = runWithTempDataDir(`
      const storageMod = await import('@/lib/server/storage')
      const storage = storageMod.default || storageMod['module.exports'] || storageMod
      const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
      const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod

      storage.saveAgents({
        'agent-a': {
          id: 'agent-a',
          name: 'Agent A',
          provider: 'openai',
          model: 'gpt-test',
        },
      })

      storage.saveSessions({
        s3: {
          id: 's3',
          name: 'Noise Check',
          cwd: process.cwd(),
          user: 'tester',
          provider: 'openai',
          model: 'gpt-test',
          claudeSessionId: null,
          messages: [
            {
              role: 'assistant',
              text: 'Old heartbeat reply with browser-heavy history.',
              time: 1,
              toolEvents: [
                { name: 'browser', input: '{"action":"open"}' },
                { name: 'browser', input: '{"action":"click"}' },
                { name: 'browser', input: '{"action":"click"}' },
                { name: 'browser', input: '{"action":"type"}' },
                { name: 'browser', input: '{"action":"wait"}' },
              ],
            },
            { role: 'user', text: 'Hello?', time: 2 },
            { role: 'assistant', text: 'Hello! I am here and ready to help.', time: 3 },
          ],
          createdAt: 1,
          lastActiveAt: 3,
          sessionType: 'human',
          agentId: 'agent-a',
        },
      })

      storage.saveSettings({
        supervisorEnabled: true,
        supervisorRuntimeScope: 'both',
        supervisorNoProgressLimit: 2,
        supervisorRepeatedToolLimit: 3,
        reflectionEnabled: false,
        reflectionAutoWriteMemory: false,
      })

      const result = await mod.observeAutonomyRunOutcome({
        runId: 'run-no-tools',
        sessionId: 's3',
        agentId: 'agent-a',
        source: 'chat',
        status: 'completed',
        resultText: 'Hello! I am here and ready to help.',
        sourceMessage: 'Hello?',
      })

      console.log(JSON.stringify({
        incidentKinds: result.incidents.map((incident) => incident.kind),
        repeatedSummaries: result.incidents.filter((incident) => incident.kind === 'repeated_tool').map((incident) => incident.summary),
      }))
    `)

    assert.deepEqual(output.incidentKinds, [])
    assert.deepEqual(output.repeatedSummaries, [])
  })

  // A failed run whose resultText is a full HTML document: the run_error incident
  // details should mention an HTML error payload and keep the inner error text,
  // without echoing the raw doctype markup.
  it('sanitizes captured HTML error payloads in incident details', () => {
    const assessment = assessAutonomyRun({
      runId: 'run-html',
      sessionId: 'session-html',
      source: 'chat',
      status: 'failed',
      error: 'Connection error.',
      resultText: '<!DOCTYPE html><html><head><script src="/_next/static/chunks/main.js"></script></head><body>ReferenceError: Singleton is not defined</body></html>',
      toolEvents: [],
      settings: {
        supervisorEnabled: true,
        supervisorRuntimeScope: 'both',
        reflectionEnabled: false,
        reflectionAutoWriteMemory: false,
      },
      session: {
        id: 'session-html',
        name: 'HTML Error Session',
        cwd: process.cwd(),
        user: 'tester',
        provider: 'openai',
        model: 'gpt-test',
        claudeSessionId: null,
        messages: [],
        createdAt: Date.now(),
        lastActiveAt: Date.now(),
      },
    })

    const incident = assessment.incidents.find((entry) => entry.kind === 'run_error')
    assert.ok(incident)
    assert.match(String(incident?.details || ''), /html error payload/i)
    assert.doesNotMatch(String(incident?.details || ''), /<!doctype html>/i)
    assert.match(String(incident?.details || ''), /singleton is not defined/i)
  })
})