Cradicle Explorer

/ src / lib / server / autonomy / supervisor-reflection.test.ts
supervisor-reflection.test.ts
import assert from 'node:assert/strict'
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import { spawnSync } from 'node:child_process'
import { describe, it } from 'node:test'
import { fileURLToPath } from 'node:url'

import { assessAutonomyRun } from '@/lib/server/autonomy/supervisor-reflection'
import type { Session } from '@/types'
 10  
 11  const repoRoot = path.resolve(path.dirname(new URL(import.meta.url).pathname), '../../../..')
 12  
 13  function runWithTempDataDir(script: string) {
 14    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'swarmclaw-supervisor-reflection-'))
 15    try {
 16      const result = spawnSync(
 17        process.execPath,
 18        ['--import', 'tsx', '--input-type=module', '--eval', script],
 19        {
 20          cwd: repoRoot,
 21          env: {
 22            ...process.env,
 23            DATA_DIR: tempDir,
 24            WORKSPACE_DIR: path.join(tempDir, 'workspace'),
 25            SWARMCLAW_BUILD_MODE: '1',
 26          },
 27          encoding: 'utf-8',
 28          timeout: 20000,
 29        },
 30      )
 31      assert.equal(result.status, 0, result.stderr || result.stdout || 'subprocess failed')
 32      const lines = (result.stdout || '')
 33        .trim()
 34        .split('\n')
 35        .map((line) => line.trim())
 36        .filter(Boolean)
 37      const jsonLine = [...lines].reverse().find((line) => line.startsWith('{'))
 38      return JSON.parse(jsonLine || '{}') as Record<string, unknown>
 39    } finally {
 40      fs.rmSync(tempDir, { recursive: true, force: true })
 41    }
 42  }
 43  
 44  describe('supervisor-reflection', () => {
 45    it('recommends an automatic supervisor recovery step for repeated tool thrash', () => {
 46      const session: Session = {
 47        id: 'session-1',
 48        name: 'Autonomy Test',
 49        cwd: process.cwd(),
 50        user: 'tester',
 51        provider: 'openai',
 52        model: 'gpt-test',
 53        claudeSessionId: null,
 54        messages: [],
 55        createdAt: Date.now(),
 56        lastActiveAt: Date.now(),
 57      }
 58      const assessment = assessAutonomyRun({
 59        runId: 'run-1',
 60        sessionId: 'session-1',
 61        source: 'chat',
 62        status: 'completed',
 63        resultText: 'Retried the same shell command and got the same output.',
 64        toolEvents: [
 65          { name: 'shell', input: '{"cmd":"npm test"}' },
 66          { name: 'shell', input: '{"cmd":"npm test"}' },
 67          { name: 'shell', input: '{"cmd":"npm test"}' },
 68        ],
 69        mainLoopState: {
 70          followupChainCount: 1,
 71          summary: 'Retried the same shell command and got the same output.',
 72        },
 73        settings: {
 74          supervisorEnabled: true,
 75          supervisorRuntimeScope: 'both',
 76          supervisorRepeatedToolLimit: 3,
 77          supervisorNoProgressLimit: 2,
 78          reflectionEnabled: true,
 79          reflectionAutoWriteMemory: true,
 80        },
 81        session,
 82      })
 83  
 84      assert.ok(assessment.incidents.some((incident) => incident.kind === 'repeated_tool'))
 85      assert.match(String(assessment.interventionPrompt || ''), /stop repeating shell/i)
 86      assert.equal(assessment.shouldBlock, false)
 87    })
 88  
 89    it('persists reflections and auto-written reflection memory', () => {
 90      const output = runWithTempDataDir(`
 91        const storageMod = await import('@/lib/server/storage')
 92        const storage = storageMod.default || storageMod['module.exports'] || storageMod
 93        const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
 94        const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod
 95        const memoryDbMod = await import('@/lib/server/memory/memory-db')
 96        const memoryMod = memoryDbMod.default || memoryDbMod['module.exports'] || memoryDbMod
 97  
 98        storage.saveAgents({
 99          'agent-a': {
100            id: 'agent-a',
101            name: 'Agent A',
102            provider: 'openai',
103            model: 'gpt-test',
104          },
105        })
106  
107        storage.saveSessions({
108          s1: {
109            id: 's1',
110            name: 'Autonomy Session',
111            cwd: process.cwd(),
112            user: 'tester',
113            provider: 'openai',
114            model: 'gpt-test',
115            claudeSessionId: null,
116            messages: [
117              { role: 'user', text: 'Repair the deployment workflow and keep notes for later.', time: 1 },
118              { role: 'assistant', text: 'I retried the same shell path and nothing changed.', time: 2 },
119            ],
120            createdAt: 1,
121            lastActiveAt: 2,
122            sessionType: 'human',
123            agentId: 'agent-a',
124          },
125        })
126  
127        storage.saveSettings({
128          supervisorEnabled: true,
129          supervisorRuntimeScope: 'both',
130          supervisorNoProgressLimit: 2,
131          supervisorRepeatedToolLimit: 3,
132          reflectionEnabled: true,
133          reflectionAutoWriteMemory: true,
134        })
135  
136        const result = await mod.observeAutonomyRunOutcome({
137          runId: 'run-1',
138          sessionId: 's1',
139          agentId: 'agent-a',
140          source: 'chat',
141          status: 'completed',
142          resultText: 'I retried the same shell path and nothing changed.',
143          toolEvents: [
144            { name: 'shell', input: '{"cmd":"npm test"}' },
145            { name: 'shell', input: '{"cmd":"npm test"}' },
146            { name: 'shell', input: '{"cmd":"npm test"}' },
147          ],
148          mainLoopState: {
149            followupChainCount: 2,
150            summary: 'I retried the same shell path and nothing changed.',
151          },
152          sourceMessage: 'Repair the deployment workflow and keep notes for later.',
153        }, {
154          generateText: async () => JSON.stringify({
155            summary: 'Deployment repair reflection',
156            invariants: ['Verify changed files and command output before marking the task complete.'],
157            derived: ['Switch recovery strategy after two identical shell failures in a row.'],
158            failures: ['Repeated shell retries without changing inputs waste budget.'],
159            lessons: ['Capture a short recovery brief before continuing a stuck run.'],
160            communication: ['Keep execution updates concise when reporting repair progress.'],
161            relationship: ['Treat the user as wanting decisive recovery rather than repeated status chatter.'],
162            significant_events: ['The deployment workflow is currently broken and needs a confirmed repair path.'],
163            profile: ['The user is directly responsible for the deployment workflow.'],
164            boundaries: ['Do not claim the repair is complete without concrete verification evidence.'],
165            open_loops: ['Follow up with the final verification result once the repair path succeeds.'],
166          }),
167        })
168  
169        const memories = memoryMod.getMemoryDb().list(undefined, 50)
170          .filter((entry) => entry.metadata && entry.metadata.origin === 'autonomy-reflection')
171  
172        console.log(JSON.stringify({
173          incidentKinds: result.incidents.map((incident) => incident.kind).sort(),
174          reflectionSummary: result.reflection?.summary ?? null,
175          reflectionCount: mod.listRunReflections({ sessionId: 's1' }).length,
176          autoMemoryCount: result.reflection?.autoMemoryIds?.length ?? 0,
177          memoryCategories: memories.map((entry) => entry.category).sort(),
178          profileNotes: result.reflection?.profileNotes ?? [],
179          boundaryNotes: result.reflection?.boundaryNotes ?? [],
180          openLoopNotes: result.reflection?.openLoopNotes ?? [],
181        }))
182      `)
183  
184      assert.deepEqual(output.incidentKinds, ['no_progress', 'repeated_tool'])
185      assert.equal(output.reflectionSummary, 'Deployment repair reflection')
186      assert.equal(output.reflectionCount, 1)
187      assert.equal(output.autoMemoryCount, 10)
188      assert.deepEqual(output.profileNotes, ['The user is directly responsible for the deployment workflow.'])
189      assert.deepEqual(output.boundaryNotes, ['Do not claim the repair is complete without concrete verification evidence.'])
190      assert.deepEqual(output.openLoopNotes, ['Follow up with the final verification result once the repair path succeeds.'])
191      assert.deepEqual(output.memoryCategories, [
192        'reflection/boundary',
193        'reflection/communication',
194        'reflection/derived',
195        'reflection/failure',
196        'reflection/invariant',
197        'reflection/lesson',
198        'reflection/open_loop',
199        'reflection/profile',
200        'reflection/relationship',
201        'reflection/significant_event',
202      ])
203    })
204  
205    it('reflects short human chats when they contain durable personal context', () => {
206      const output = runWithTempDataDir(`
207        const storageMod = await import('@/lib/server/storage')
208        const storage = storageMod.default || storageMod['module.exports'] || storageMod
209        const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
210        const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod
211  
212        storage.saveAgents({
213          'agent-a': {
214            id: 'agent-a',
215            name: 'Agent A',
216            provider: 'openai',
217            model: 'gpt-test',
218          },
219        })
220  
221        storage.saveSessions({
222          s2: {
223            id: 's2',
224            name: 'Human Context Session',
225            cwd: process.cwd(),
226            user: 'tester',
227            provider: 'openai',
228            model: 'gpt-test',
229            claudeSessionId: null,
230            messages: [
231              {
232                role: 'user',
233                text: 'I am moving to Lisbon next month and prefer short check-ins while I am juggling the move.',
234                time: 1,
235                semantics: {
236                  taskIntent: 'general',
237                  workType: 'general',
238                  isDeliverableTask: false,
239                  isBroadGoal: false,
240                  isResearchSynthesis: false,
241                  hasHumanSignals: true,
242                  hasSignificantEvent: true,
243                  wantsScreenshots: false,
244                  wantsOutboundDelivery: false,
245                  wantsVoiceDelivery: false,
246                  explicitToolRequests: [],
247                  confidence: 0.98,
248                },
249              },
250              { role: 'assistant', text: 'Understood. I will keep updates tight and remember the move timing.', time: 2 },
251            ],
252            createdAt: 1,
253            lastActiveAt: 2,
254            sessionType: 'human',
255            agentId: 'agent-a',
256          },
257        })
258  
259        storage.saveSettings({
260          supervisorEnabled: true,
261          supervisorRuntimeScope: 'both',
262          supervisorNoProgressLimit: 2,
263          supervisorRepeatedToolLimit: 3,
264          reflectionEnabled: true,
265          reflectionAutoWriteMemory: true,
266        })
267  
268        const result = await mod.observeAutonomyRunOutcome({
269          runId: 'run-human',
270          sessionId: 's2',
271          agentId: 'agent-a',
272          source: 'chat',
273          status: 'completed',
274          resultText: 'I will keep updates tight and remember the move timing.',
275          sourceMessage: 'I am moving to Lisbon next month and prefer short check-ins while I am juggling the move.',
276        }, {
277          generateText: async () => JSON.stringify({
278            summary: 'Human context reflection',
279            communication: ['Prefer short check-ins while the move is in progress.'],
280            significant_events: ['Moving to Lisbon next month.'],
281            open_loops: ['Check in again once the move is complete.'],
282            profile: ['Currently planning a move to Lisbon.'],
283          }),
284        })
285  
286        console.log(JSON.stringify({
287          reflectionSummary: result.reflection?.summary ?? null,
288          communicationNotes: result.reflection?.communicationNotes ?? [],
289          significantEventNotes: result.reflection?.significantEventNotes ?? [],
290          openLoopNotes: result.reflection?.openLoopNotes ?? [],
291        }))
292      `)
293  
294      assert.equal(output.reflectionSummary, 'Human context reflection')
295      assert.deepEqual(output.communicationNotes, ['Prefer short check-ins while the move is in progress.'])
296      assert.deepEqual(output.significantEventNotes, ['Moving to Lisbon next month.'])
297      assert.deepEqual(output.openLoopNotes, ['Check in again once the move is complete.'])
298    })
299  
300    it('does not reuse stale assistant tool events when the current chat turn used none', () => {
301      const output = runWithTempDataDir(`
302        const storageMod = await import('@/lib/server/storage')
303        const storage = storageMod.default || storageMod['module.exports'] || storageMod
304        const reflectionMod = await import('@/lib/server/autonomy/supervisor-reflection')
305        const mod = reflectionMod.default || reflectionMod['module.exports'] || reflectionMod
306  
307        storage.saveAgents({
308          'agent-a': {
309            id: 'agent-a',
310            name: 'Agent A',
311            provider: 'openai',
312            model: 'gpt-test',
313          },
314        })
315  
316        storage.saveSessions({
317          s3: {
318            id: 's3',
319            name: 'Noise Check',
320            cwd: process.cwd(),
321            user: 'tester',
322            provider: 'openai',
323            model: 'gpt-test',
324            claudeSessionId: null,
325            messages: [
326              {
327                role: 'assistant',
328                text: 'Old heartbeat reply with browser-heavy history.',
329                time: 1,
330                toolEvents: [
331                  { name: 'browser', input: '{"action":"open"}' },
332                  { name: 'browser', input: '{"action":"click"}' },
333                  { name: 'browser', input: '{"action":"click"}' },
334                  { name: 'browser', input: '{"action":"type"}' },
335                  { name: 'browser', input: '{"action":"wait"}' },
336                ],
337              },
338              { role: 'user', text: 'Hello?', time: 2 },
339              { role: 'assistant', text: 'Hello! I am here and ready to help.', time: 3 },
340            ],
341            createdAt: 1,
342            lastActiveAt: 3,
343            sessionType: 'human',
344            agentId: 'agent-a',
345          },
346        })
347  
348        storage.saveSettings({
349          supervisorEnabled: true,
350          supervisorRuntimeScope: 'both',
351          supervisorNoProgressLimit: 2,
352          supervisorRepeatedToolLimit: 3,
353          reflectionEnabled: false,
354          reflectionAutoWriteMemory: false,
355        })
356  
357        const result = await mod.observeAutonomyRunOutcome({
358          runId: 'run-no-tools',
359          sessionId: 's3',
360          agentId: 'agent-a',
361          source: 'chat',
362          status: 'completed',
363          resultText: 'Hello! I am here and ready to help.',
364          sourceMessage: 'Hello?',
365        })
366  
367        console.log(JSON.stringify({
368          incidentKinds: result.incidents.map((incident) => incident.kind),
369          repeatedSummaries: result.incidents.filter((incident) => incident.kind === 'repeated_tool').map((incident) => incident.summary),
370        }))
371      `)
372  
373      assert.deepEqual(output.incidentKinds, [])
374      assert.deepEqual(output.repeatedSummaries, [])
375    })
376  
377    it('sanitizes captured HTML error payloads in incident details', () => {
378      const assessment = assessAutonomyRun({
379        runId: 'run-html',
380        sessionId: 'session-html',
381        source: 'chat',
382        status: 'failed',
383        error: 'Connection error.',
384        resultText: '<!DOCTYPE html><html><head><script src="/_next/static/chunks/main.js"></script></head><body>ReferenceError: Singleton is not defined</body></html>',
385        toolEvents: [],
386        settings: {
387          supervisorEnabled: true,
388          supervisorRuntimeScope: 'both',
389          reflectionEnabled: false,
390          reflectionAutoWriteMemory: false,
391        },
392        session: {
393          id: 'session-html',
394          name: 'HTML Error Session',
395          cwd: process.cwd(),
396          user: 'tester',
397          provider: 'openai',
398          model: 'gpt-test',
399          claudeSessionId: null,
400          messages: [],
401          createdAt: Date.now(),
402          lastActiveAt: Date.now(),
403        },
404      })
405  
406      const incident = assessment.incidents.find((entry) => entry.kind === 'run_error')
407      assert.ok(incident)
408      assert.match(String(incident?.details || ''), /html error payload/i)
409      assert.doesNotMatch(String(incident?.details || ''), /<!doctype html>/i)
410      assert.match(String(incident?.details || ''), /singleton is not defined/i)
411    })
412  })