// loop_compaction_test.go
1 package agent 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "net/http" 8 "net/http/httptest" 9 "os" 10 "path/filepath" 11 "strings" 12 "sync" 13 "testing" 14 15 "github.com/Kocoro-lab/ShanClaw/internal/client" 16 ) 17 18 // TestAgentLoop_CompactionAndMemoryPersist verifies the full compaction chain: 19 // 20 // 1. Agent loop runs multiple tool-call iterations within a single Run() 21 // 2. Mock server reports growing input tokens each iteration 22 // 3. When tokens exceed 85% of context_window → compaction triggers 23 // 4. PersistLearnings fires (small tier) → writes to MEMORY.md 24 // 5. GenerateSummary fires (small tier) → creates summary 25 // 6. ShapeHistory reduces messages 26 // 27 // Uses context_window=2000 so 85% threshold = 1700 tokens. 28 // Needs ≥5 tool iterations so messages > MinShapeable (9). 29 func TestAgentLoop_CompactionAndMemoryPersist(t *testing.T) { 30 memoryDir := t.TempDir() 31 32 var mu sync.Mutex 33 var calls []string // ordered log of all calls 34 35 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 36 raw, _ := readBody(r.Body) 37 defer r.Body.Close() 38 39 var req struct { 40 ModelTier string `json:"model_tier"` 41 Messages []struct { 42 Role string `json:"role"` 43 Content json.RawMessage `json:"content"` 44 } `json:"messages"` 45 } 46 json.Unmarshal(raw, &req) 47 48 mu.Lock() 49 callNum := len(calls) + 1 50 51 // Identify small-tier calls 52 if req.ModelTier == "small" { 53 isPersist := false 54 isSummary := false 55 for _, m := range req.Messages { 56 var text string 57 json.Unmarshal(m.Content, &text) 58 if strings.Contains(text, "extracting durable knowledge") { 59 isPersist = true 60 } 61 if strings.Contains(text, "Compress the following conversation") { 62 isSummary = true 63 } 64 } 65 66 if isPersist { 67 calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum)) 68 mu.Unlock() 69 t.Logf("Call %d: [small] PersistLearnings (messages: %d)", callNum, len(req.Messages)) 70 
json.NewEncoder(w).Encode(nativeResponse( 71 "- Agent discussed system architecture\n- Testing compaction flow", 72 "end_turn", nil, 50, 30)) 73 return 74 } 75 if isSummary { 76 calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum)) 77 mu.Unlock() 78 t.Logf("Call %d: [small] GenerateSummary", callNum) 79 json.NewEncoder(w).Encode(nativeResponse( 80 "User asked about architecture. Agent reasoned through multiple steps.", 81 "end_turn", nil, 50, 30)) 82 return 83 } 84 85 calls = append(calls, fmt.Sprintf("call %d: small-other", callNum)) 86 mu.Unlock() 87 t.Logf("Call %d: [small] other", callNum) 88 json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30)) 89 return 90 } 91 92 // Main-tier calls: use message count to decide behavior. 93 // We need the loop to iterate 6+ times so messages exceed MinShapeable (9). 94 // Report input tokens that grow to exceed the 1700 threshold. 95 msgCount := len(req.Messages) 96 // Scale input tokens based on message count to simulate realistic growth 97 inputTokens := msgCount * 200 98 99 if msgCount < 12 { 100 // Keep looping with tool calls until we have enough messages 101 calls = append(calls, fmt.Sprintf("call %d: TOOL (msgs=%d, input=%d)", callNum, msgCount, inputTokens)) 102 mu.Unlock() 103 t.Logf("Call %d: [main] tool_use (msgs=%d, input_tokens=%d)", callNum, msgCount, inputTokens) 104 json.NewEncoder(w).Encode(nativeResponse( 105 "", "tool_use", 106 toolCall("think", fmt.Sprintf(`{"thought":"Analyzing step with %d messages in context"}`, msgCount)), 107 inputTokens, 100)) 108 } else { 109 calls = append(calls, fmt.Sprintf("call %d: END_TURN (msgs=%d, input=%d)", callNum, msgCount, inputTokens)) 110 mu.Unlock() 111 t.Logf("Call %d: [main] end_turn (msgs=%d, input_tokens=%d)", callNum, msgCount, inputTokens) 112 json.NewEncoder(w).Encode(nativeResponse( 113 "Here is the complete analysis based on my reasoning through all the steps.", 114 "end_turn", nil, inputTokens, 100)) 115 } 116 })) 117 defer 
server.Close() 118 119 gw := client.NewGatewayClient(server.URL, "") 120 reg := NewToolRegistry() 121 122 // Register think tool — no approval needed, keeps loop iterating 123 reg.Register(&thinkTool{}) 124 125 handler := &mockHandler{approveResult: true} 126 127 loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil) 128 loop.SetContextWindow(2000) // 85% = 1700 triggers compaction 129 loop.SetMemoryDir(memoryDir) 130 loop.SetHandler(handler) 131 132 // Run with a big message 133 result, usage, err := loop.Run(context.Background(), 134 "Explain the complete system architecture. Think through each component step by step. Be thorough.", 135 nil, nil) 136 if err != nil { 137 t.Logf("Run error (may be iteration limit): %v", err) 138 } 139 140 mu.Lock() 141 t.Logf("\n=== Call sequence (%d total) ===", len(calls)) 142 for _, c := range calls { 143 t.Logf(" %s", c) 144 } 145 146 hasPersist := false 147 hasSummary := false 148 for _, c := range calls { 149 if strings.Contains(c, "PERSIST") { 150 hasPersist = true 151 } 152 if strings.Contains(c, "SUMMARY") { 153 hasSummary = true 154 } 155 } 156 mu.Unlock() 157 158 t.Logf("Result: %d chars", len(result)) 159 t.Logf("Usage: %d LLM calls, %d input+output tokens", 160 usage.LLMCalls, usage.InputTokens+usage.OutputTokens) 161 162 // Check compaction fired 163 if !hasPersist { 164 t.Error("PersistLearnings should have fired during compaction") 165 } 166 if !hasSummary { 167 t.Error("GenerateSummary should have fired during compaction") 168 } 169 170 // Check MEMORY.md 171 memPath := filepath.Join(memoryDir, "MEMORY.md") 172 memData, err := os.ReadFile(memPath) 173 if err != nil { 174 if hasPersist { 175 t.Fatalf("MEMORY.md should exist since PersistLearnings fired: %v", err) 176 } 177 t.Logf("MEMORY.md not created — compaction didn't trigger") 178 return 179 } 180 181 memContent := string(memData) 182 t.Logf("\n=== MEMORY.md ===\n%s", memContent) 183 184 if !strings.Contains(memContent, "Auto-persisted") { 
185 t.Error("MEMORY.md should contain Auto-persisted section") 186 } 187 } 188 189 // thinkTool is a minimal think tool for the compaction test. 190 type thinkTool struct{} 191 192 func (t *thinkTool) Info() ToolInfo { 193 return ToolInfo{ 194 Name: "think", 195 Description: "Plan or reason through tasks", 196 Parameters: map[string]any{"type": "object", "properties": map[string]any{"thought": map[string]any{"type": "string"}}}, 197 Required: []string{"thought"}, 198 } 199 } 200 201 func (t *thinkTool) Run(ctx context.Context, args string) (ToolResult, error) { 202 return ToolResult{Content: "Thought recorded."}, nil 203 } 204 205 func (t *thinkTool) RequiresApproval() bool { return false } 206 207 func readBody(body interface{ Read([]byte) (int, error) }) ([]byte, error) { 208 var buf []byte 209 tmp := make([]byte, 4096) 210 for { 211 n, err := body.Read(tmp) 212 buf = append(buf, tmp[:n]...) 213 if err != nil { 214 break 215 } 216 } 217 return buf, nil 218 } 219 220 func TestTruncateHeadTail(t *testing.T) { 221 t.Run("short string unchanged", func(t *testing.T) { 222 s := "hello world" 223 got := truncateHeadTail(s, 100) 224 if got != s { 225 t.Errorf("expected unchanged, got %q", got) 226 } 227 }) 228 229 t.Run("exact limit unchanged", func(t *testing.T) { 230 s := "abcdefghij" // 10 runes 231 got := truncateHeadTail(s, 10) 232 if got != s { 233 t.Errorf("expected unchanged, got %q", got) 234 } 235 }) 236 237 t.Run("long string gets head+tail", func(t *testing.T) { 238 // 100 chars, truncate to 40 239 s := strings.Repeat("a", 50) + strings.Repeat("z", 50) 240 got := truncateHeadTail(s, 40) 241 // keepHead=30, keepTail=10 242 if !strings.HasPrefix(got, strings.Repeat("a", 30)) { 243 t.Errorf("expected head of 30 'a's, got prefix: %q", got[:40]) 244 } 245 if !strings.HasSuffix(got, strings.Repeat("z", 10)) { 246 t.Errorf("expected tail of 10 'z's, got suffix: %q", got[len(got)-20:]) 247 } 248 if !strings.Contains(got, "[... 
truncated 60 chars ...]") { 249 t.Errorf("expected truncation marker with 60 dropped chars, got: %q", got) 250 } 251 }) 252 253 t.Run("rune-safe with multibyte", func(t *testing.T) { 254 // 20 runes of 3 bytes each 255 s := strings.Repeat("日", 20) 256 got := truncateHeadTail(s, 10) 257 // keepHead=7, keepTail=2 258 runes := []rune(got) 259 // Should start with 7 日 and end with 2 日 260 if runes[0] != '日' || runes[len(runes)-1] != '日' { 261 t.Errorf("expected rune-safe truncation, got: %q", got) 262 } 263 if !strings.Contains(got, "[... truncated 10 chars ...]") { 264 t.Errorf("expected truncation marker, got: %q", got) 265 } 266 }) 267 } 268 269 func TestBuildToolCallMap(t *testing.T) { 270 messages := []client.Message{ 271 { 272 Role: "assistant", 273 Content: client.NewBlockContent([]client.ContentBlock{ 274 client.NewToolUseBlock("tu-1", "file_read", json.RawMessage(`{"path":"/tmp/foo.txt"}`)), 275 client.NewToolUseBlock("tu-2", "bash", json.RawMessage(`{"command":"echo hello"}`)), 276 }), 277 }, 278 { 279 Role: "user", 280 Content: client.NewBlockContent([]client.ContentBlock{ 281 client.NewToolResultBlock("tu-1", "file contents here", false), 282 }), 283 }, 284 } 285 286 m := buildToolCallMap(messages) 287 if len(m) != 2 { 288 t.Fatalf("expected 2 entries, got %d", len(m)) 289 } 290 if m["tu-1"].Name != "file_read" { 291 t.Errorf("expected file_read, got %q", m["tu-1"].Name) 292 } 293 if m["tu-2"].Name != "bash" { 294 t.Errorf("expected bash, got %q", m["tu-2"].Name) 295 } 296 if !strings.Contains(m["tu-1"].Args, "/tmp/foo.txt") { 297 t.Errorf("expected args to contain path, got %q", m["tu-1"].Args) 298 } 299 } 300 301 func TestBuildToolCallMap_LongArgsTruncated(t *testing.T) { 302 longArgs := `{"content":"` + strings.Repeat("x", 200) + `"}` 303 messages := []client.Message{ 304 { 305 Role: "assistant", 306 Content: client.NewBlockContent([]client.ContentBlock{ 307 client.NewToolUseBlock("tu-1", "file_write", json.RawMessage(longArgs)), 308 }), 309 }, 310 } 311 
312 m := buildToolCallMap(messages) 313 if len(m["tu-1"].Args) > 104 { // 100 + "..." 314 t.Errorf("expected args truncated to ~103 chars, got %d", len(m["tu-1"].Args)) 315 } 316 } 317 318 func TestCompressOldToolResults_TieredBehavior(t *testing.T) { 319 // Create 25 tool result pairs to exercise all three tiers with current constants: 320 // tier1Threshold=20, keepRecent passed as 8 to match compressAfter. 321 const numTools = 25 322 const keepRecent = 8 323 324 var messages []client.Message 325 messages = append(messages, client.Message{ 326 Role: "user", 327 Content: client.NewTextContent("Do some work"), 328 }) 329 330 for i := 0; i < numTools; i++ { 331 id := fmt.Sprintf("tu-%d", i) 332 name := fmt.Sprintf("tool_%d", i) 333 args := json.RawMessage(fmt.Sprintf(`{"arg":"value_%d"}`, i)) 334 content := fmt.Sprintf("Result content for tool %d: %s", i, strings.Repeat("x", 500)) 335 336 messages = append(messages, client.Message{ 337 Role: "assistant", 338 Content: client.NewBlockContent([]client.ContentBlock{ 339 client.NewToolUseBlock(id, name, args), 340 }), 341 }) 342 messages = append(messages, client.Message{ 343 Role: "user", 344 Content: client.NewBlockContent([]client.ContentBlock{ 345 client.NewToolResultBlock(id, content, false), 346 }), 347 }) 348 } 349 350 compressOldToolResults(context.Background(), messages, keepRecent, 300, nil) 351 352 for i := 0; i < numTools; i++ { 353 msgIdx := 2 + i*2 354 msg := messages[msgIdx] 355 blocks := msg.Content.Blocks() 356 if len(blocks) == 0 { 357 t.Fatalf("tool result %d: no blocks", i) 358 } 359 resultContent := "" 360 for _, b := range blocks { 361 if b.Type == "tool_result" { 362 if s, ok := b.ToolContent.(string); ok { 363 resultContent = s 364 } 365 } 366 } 367 368 distFromEnd := (numTools - 1) - i 369 370 if distFromEnd < keepRecent { 371 // Tier 3: should be full (500+ chars) 372 if len(resultContent) < 500 { 373 t.Errorf("tool %d (dist=%d): expected tier 3 full content (%d chars), got %d chars", 374 i, 
distFromEnd, 500, len(resultContent)) 375 } 376 } else if distFromEnd >= 20 { 377 // Tier 1: should contain "snipped" 378 if !strings.Contains(resultContent, "snipped") { 379 t.Errorf("tool %d (dist=%d): expected tier 1 metadata with 'snipped', got: %q", 380 i, distFromEnd, resultContent) 381 } 382 } else { 383 // Tier 2: should be truncated but not snipped (head+tail) 384 if strings.Contains(resultContent, "snipped") { 385 t.Errorf("tool %d (dist=%d): tier 2 should not contain 'snipped', got: %q", 386 i, distFromEnd, resultContent) 387 } 388 if len(resultContent) > 400 { 389 t.Errorf("tool %d (dist=%d): expected tier 2 truncated to ~300 chars, got %d", 390 i, distFromEnd, len(resultContent)) 391 } 392 if !strings.Contains(resultContent, "[... truncated") { 393 t.Errorf("tool %d (dist=%d): expected head+tail truncation marker, got: %q", 394 i, distFromEnd, resultContent) 395 } 396 } 397 } 398 } 399 400 func TestCompressOldToolResults_Tier2FloorForReadTools(t *testing.T) { 401 // Verify that file_read and grep results never degrade to Tier 1 metadata stubs, 402 // even when they would normally be old enough for Tier 1. 403 const numTools = 26 404 var messages []client.Message 405 messages = append(messages, client.Message{ 406 Role: "user", 407 Content: client.NewTextContent("Start"), 408 }) 409 410 // Tools 0-4: floor tools, 5-25: normal tools. 411 // With 26 total results, tool 5 sits exactly at distFromEnd=20, so it should 412 // hit Tier 1 and serve as the non-floor control case. 
413 for i := 0; i < numTools; i++ { 414 id := fmt.Sprintf("tu-%d", i) 415 name := "tool_other" 416 if i < 3 { 417 name = "file_read" 418 } else if i < 5 { 419 name = "grep" 420 } 421 args := json.RawMessage(fmt.Sprintf(`{"arg":"value_%d"}`, i)) 422 content := fmt.Sprintf("Result %d: %s", i, strings.Repeat("x", 500)) 423 424 messages = append(messages, client.Message{ 425 Role: "assistant", 426 Content: client.NewBlockContent([]client.ContentBlock{ 427 client.NewToolUseBlock(id, name, args), 428 }), 429 }) 430 messages = append(messages, client.Message{ 431 Role: "user", 432 Content: client.NewBlockContent([]client.ContentBlock{ 433 client.NewToolResultBlock(id, content, false), 434 }), 435 }) 436 } 437 438 compressOldToolResults(context.Background(), messages, 8, 300, nil) 439 440 // Check the oldest file_read/grep results (tools 0-4, dist 25-21 from end) 441 // These should be Tier 2 (truncated with head+tail), NOT Tier 1 (snipped). 442 for i := 0; i < 5; i++ { 443 msgIdx := 2 + i*2 444 blocks := messages[msgIdx].Content.Blocks() 445 resultContent := "" 446 for _, b := range blocks { 447 if b.Type == "tool_result" { 448 if s, ok := b.ToolContent.(string); ok { 449 resultContent = s 450 } 451 } 452 } 453 if strings.Contains(resultContent, "snipped") { 454 t.Errorf("floor tool %d: should not be Tier 1 (snipped), got: %q", i, resultContent[:80]) 455 } 456 if !strings.Contains(resultContent, "[... truncated") { 457 t.Errorf("floor tool %d: should be Tier 2 (truncated), got: %q", i, resultContent[:80]) 458 } 459 } 460 461 // Non-floor control: tool 5 is old enough for Tier 1 and should become metadata-only. 
462 normalIdx := 2 + 5*2 463 blocks := messages[normalIdx].Content.Blocks() 464 resultContent := "" 465 for _, b := range blocks { 466 if b.Type == "tool_result" { 467 if s, ok := b.ToolContent.(string); ok { 468 resultContent = s 469 } 470 } 471 } 472 if !strings.Contains(resultContent, "snipped") { 473 t.Fatalf("non-floor tool should be Tier 1 (snipped), got: %q", resultContent[:80]) 474 } 475 if strings.Contains(resultContent, "[... truncated") { 476 t.Fatalf("non-floor tool should not stay in Tier 2, got: %q", resultContent[:80]) 477 } 478 } 479 480 func TestCompressOldToolResults_EmergencyMode(t *testing.T) { 481 // Simulate emergency compaction: keepRecent=1, maxChars=100 482 var messages []client.Message 483 messages = append(messages, client.Message{ 484 Role: "user", 485 Content: client.NewTextContent("Start"), 486 }) 487 488 for i := 0; i < 5; i++ { 489 id := fmt.Sprintf("tu-%d", i) 490 content := strings.Repeat("y", 300) 491 messages = append(messages, client.Message{ 492 Role: "assistant", 493 Content: client.NewBlockContent([]client.ContentBlock{ 494 client.NewToolUseBlock(id, "bash", json.RawMessage(`{"command":"ls"}`)), 495 }), 496 }) 497 messages = append(messages, client.Message{ 498 Role: "user", 499 Content: client.NewBlockContent([]client.ContentBlock{ 500 client.NewToolResultBlock(id, content, false), 501 }), 502 }) 503 } 504 505 compressOldToolResults(context.Background(), messages, 1, 100, nil) 506 507 // Only the last tool result should be full 508 for i := 0; i < 5; i++ { 509 msgIdx := 2 + i*2 510 blocks := messages[msgIdx].Content.Blocks() 511 for _, b := range blocks { 512 if b.Type == "tool_result" { 513 s, ok := b.ToolContent.(string) 514 if !ok { 515 continue 516 } 517 if i == 4 { 518 // Last one: tier 3, full 519 if len(s) < 300 { 520 t.Errorf("last tool result should be full, got %d chars", len(s)) 521 } 522 } else { 523 // All others should be compressed 524 if len(s) >= 300 { 525 t.Errorf("tool %d should be compressed, got %d 
chars", i, len(s)) 526 } 527 } 528 } 529 } 530 } 531 } 532 533 // TestAgentLoop_ReactiveCompaction verifies the reactive compaction safety net: 534 // 535 // 1. Agent loop has enough messages to build context (6+ tool iterations) 536 // 2. Mock server returns HTTP 400 "prompt is too long" after sufficient iterations 537 // 3. Reactive compaction fires: PersistLearnings → compress → summary → ShapeHistory 538 // 4. Retry succeeds with compacted messages 539 // 5. compactionApplied flag prevents infinite retry loops 540 // 541 // The proactive compaction is bypassed by reporting low input tokens until the 542 // server triggers the 400 error, simulating the case where token counting 543 // underestimates and the API rejects the request. 544 func TestAgentLoop_ReactiveCompaction(t *testing.T) { 545 memoryDir := t.TempDir() 546 547 var mu sync.Mutex 548 var calls []string 549 550 // After 6 tool iterations (13+ messages), return a 400 context-length error 551 // on the next main-tier call, then succeed on retry. 
552 contextErrorReturned := false 553 retrySucceeded := false 554 555 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 556 raw, _ := readBody(r.Body) 557 defer r.Body.Close() 558 559 var req struct { 560 ModelTier string `json:"model_tier"` 561 Messages []struct { 562 Role string `json:"role"` 563 Content json.RawMessage `json:"content"` 564 } `json:"messages"` 565 } 566 json.Unmarshal(raw, &req) 567 568 mu.Lock() 569 callNum := len(calls) + 1 570 571 // Small-tier calls (PersistLearnings, GenerateSummary) 572 if req.ModelTier == "small" { 573 isPersist := false 574 isSummary := false 575 for _, m := range req.Messages { 576 var text string 577 json.Unmarshal(m.Content, &text) 578 if strings.Contains(text, "extracting durable knowledge") { 579 isPersist = true 580 } 581 if strings.Contains(text, "Compress the following conversation") { 582 isSummary = true 583 } 584 } 585 586 if isPersist { 587 calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum)) 588 mu.Unlock() 589 t.Logf("Call %d: [small] PersistLearnings (messages: %d)", callNum, len(req.Messages)) 590 json.NewEncoder(w).Encode(nativeResponse( 591 "- Agent was analyzing system architecture\n- Reactive compaction triggered", 592 "end_turn", nil, 50, 30)) 593 return 594 } 595 if isSummary { 596 calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum)) 597 mu.Unlock() 598 t.Logf("Call %d: [small] GenerateSummary", callNum) 599 json.NewEncoder(w).Encode(nativeResponse( 600 "User asked about architecture. 
Agent analyzed multiple components before context overflow.", 601 "end_turn", nil, 50, 30)) 602 return 603 } 604 605 calls = append(calls, fmt.Sprintf("call %d: small-other", callNum)) 606 mu.Unlock() 607 json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30)) 608 return 609 } 610 611 // Main-tier calls 612 msgCount := len(req.Messages) 613 614 if msgCount < 12 { 615 // Keep looping with tool calls, report LOW tokens so proactive 616 // compaction does NOT trigger (under 85% of 128000). 617 calls = append(calls, fmt.Sprintf("call %d: TOOL (msgs=%d)", callNum, msgCount)) 618 mu.Unlock() 619 t.Logf("Call %d: [main] tool_use (msgs=%d)", callNum, msgCount) 620 json.NewEncoder(w).Encode(nativeResponse( 621 "", "tool_use", 622 toolCall("think", fmt.Sprintf(`{"thought":"Step %d analysis"}`, msgCount)), 623 500, 100)) // Low tokens — proactive compaction won't trigger 624 return 625 } 626 627 // At 12+ messages: return 400 context-length error (once) 628 if !contextErrorReturned { 629 contextErrorReturned = true 630 calls = append(calls, fmt.Sprintf("call %d: CONTEXT_ERROR (msgs=%d)", callNum, msgCount)) 631 mu.Unlock() 632 t.Logf("Call %d: [main] → 400 prompt is too long (msgs=%d)", callNum, msgCount) 633 w.WriteHeader(http.StatusBadRequest) 634 w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"prompt is too long"}}`)) 635 return 636 } 637 638 // After reactive compaction retries: succeed 639 retrySucceeded = true 640 calls = append(calls, fmt.Sprintf("call %d: RETRY_SUCCESS (msgs=%d)", callNum, msgCount)) 641 mu.Unlock() 642 t.Logf("Call %d: [main] end_turn after reactive compaction (msgs=%d)", callNum, msgCount) 643 json.NewEncoder(w).Encode(nativeResponse( 644 "Analysis complete after reactive compaction.", 645 "end_turn", nil, 800, 100)) 646 })) 647 defer server.Close() 648 649 gw := client.NewGatewayClient(server.URL, "") 650 reg := NewToolRegistry() 651 reg.Register(&thinkTool{}) 652 653 handler := &mockHandler{approveResult: true} 
654 655 loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil) 656 loop.SetContextWindow(128000) // High window so proactive compaction doesn't trigger 657 loop.SetMemoryDir(memoryDir) 658 loop.SetHandler(handler) 659 660 result, usage, err := loop.Run(context.Background(), 661 "Analyze each component of the system. Think through every step carefully.", 662 nil, nil) 663 if err != nil { 664 t.Logf("Run error: %v", err) 665 } 666 667 mu.Lock() 668 t.Logf("\n=== Call sequence (%d total) ===", len(calls)) 669 for _, c := range calls { 670 t.Logf(" %s", c) 671 } 672 673 hasPersist := false 674 hasSummary := false 675 hasContextError := false 676 hasRetrySuccess := false 677 for _, c := range calls { 678 if strings.Contains(c, "PERSIST") { 679 hasPersist = true 680 } 681 if strings.Contains(c, "SUMMARY") { 682 hasSummary = true 683 } 684 if strings.Contains(c, "CONTEXT_ERROR") { 685 hasContextError = true 686 } 687 if strings.Contains(c, "RETRY_SUCCESS") { 688 hasRetrySuccess = true 689 } 690 } 691 mu.Unlock() 692 693 t.Logf("Result: %d chars", len(result)) 694 t.Logf("Usage: %d LLM calls", usage.LLMCalls) 695 696 // Verify reactive compaction chain 697 if !hasContextError { 698 t.Error("expected context-length 400 error to be returned by mock server") 699 } 700 if !hasPersist { 701 t.Error("PersistLearnings should fire during reactive compaction") 702 } 703 if !hasSummary { 704 t.Error("GenerateSummary should fire during reactive compaction") 705 } 706 if !hasRetrySuccess { 707 t.Error("retry after reactive compaction should succeed") 708 } 709 if !retrySucceeded { 710 t.Error("retrySucceeded flag should be true") 711 } 712 713 // Verify MEMORY.md was written 714 memPath := filepath.Join(memoryDir, "MEMORY.md") 715 memData, err := os.ReadFile(memPath) 716 if err != nil { 717 t.Fatalf("MEMORY.md should exist after reactive PersistLearnings: %v", err) 718 } 719 memContent := string(memData) 720 t.Logf("\n=== MEMORY.md ===\n%s", memContent) 721 if 
!strings.Contains(memContent, "Auto-persisted") { 722 t.Error("MEMORY.md should contain Auto-persisted section") 723 } 724 725 // Verify result came through 726 if result == "" { 727 t.Error("expected non-empty result after successful retry") 728 } 729 } 730 731 // TestAgentLoop_ReactiveCompactionNoDoubleRetry verifies the compactionApplied 732 // guard prevents infinite loops: if reactive compaction fires but the retry 733 // ALSO returns a context-length error, the loop should fail instead of retrying. 734 func TestAgentLoop_ReactiveCompactionNoDoubleRetry(t *testing.T) { 735 contextErrors := 0 736 737 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 738 raw, _ := readBody(r.Body) 739 defer r.Body.Close() 740 741 var req struct { 742 ModelTier string `json:"model_tier"` 743 Messages []struct { 744 Role string `json:"role"` 745 Content json.RawMessage `json:"content"` 746 } `json:"messages"` 747 } 748 json.Unmarshal(raw, &req) 749 750 // Small-tier: always succeed 751 if req.ModelTier == "small" { 752 for _, m := range req.Messages { 753 var text string 754 json.Unmarshal(m.Content, &text) 755 if strings.Contains(text, "extracting durable knowledge") { 756 json.NewEncoder(w).Encode(nativeResponse("learnings", "end_turn", nil, 50, 30)) 757 return 758 } 759 if strings.Contains(text, "Compress the following conversation") { 760 json.NewEncoder(w).Encode(nativeResponse("summary", "end_turn", nil, 50, 30)) 761 return 762 } 763 } 764 json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30)) 765 return 766 } 767 768 msgCount := len(req.Messages) 769 t.Logf("Main-tier call: msgs=%d, contextErrors=%d", msgCount, contextErrors) 770 771 if msgCount < 6 && contextErrors == 0 { 772 // Build up messages with tool calls until we first trigger overflow. 
773 json.NewEncoder(w).Encode(nativeResponse( 774 "", "tool_use", 775 toolCall("think", `{"thought":"building context"}`), 776 500, 100)) 777 return 778 } 779 780 // Always return context-length error once we've started — even after 781 // compaction reduces message count. This forces the double-retry guard. 782 contextErrors++ 783 t.Logf("Returning context-length error #%d (msgs=%d)", contextErrors, msgCount) 784 w.WriteHeader(http.StatusBadRequest) 785 w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"context_length_exceeded"}}`)) 786 })) 787 defer server.Close() 788 789 gw := client.NewGatewayClient(server.URL, "") 790 reg := NewToolRegistry() 791 reg.Register(&thinkTool{}) 792 793 handler := &mockHandler{approveResult: true} 794 loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil) 795 loop.SetContextWindow(128000) 796 loop.SetMemoryDir(t.TempDir()) 797 loop.SetHandler(handler) 798 799 _, _, err := loop.Run(context.Background(), "Trigger reactive compaction that fails on retry too.", nil, nil) 800 801 // Should get an error — NOT an infinite loop 802 if err == nil { 803 t.Fatal("expected error when retry after reactive compaction also fails") 804 } 805 t.Logf("Got expected error: %v", err) 806 807 // Should have seen at most 2 context-length errors (original + one retry) 808 if contextErrors > 2 { 809 t.Errorf("expected at most 2 context-length errors (original + retry), got %d — infinite loop guard may be broken", contextErrors) 810 } 811 } 812 813 func TestReactiveSummaryInput_InsertsPriorSummaryOnce(t *testing.T) { 814 messages := []client.Message{ 815 {Role: "system", Content: client.NewTextContent("system")}, 816 {Role: "user", Content: client.NewTextContent("first user")}, 817 {Role: "assistant", Content: client.NewTextContent("recent reply")}, 818 } 819 820 withSummary := reactiveSummaryInput(messages, "Earlier work happened") 821 if len(withSummary) != len(messages)+1 { 822 t.Fatalf("expected injected summary 
message, got %d messages", len(withSummary)) 823 } 824 if got := withSummary[2].Content.Text(); got != "Previous context summary: Earlier work happened" { 825 t.Fatalf("unexpected injected summary message: %q", got) 826 } 827 828 again := reactiveSummaryInput(withSummary, "Earlier work happened") 829 if len(again) != len(withSummary) { 830 t.Fatal("summary should not be injected twice") 831 } 832 } 833 834 func TestAgentLoop_ReactiveCompaction_UsesEmergencyFallbackWhenSoftStillOverBudget(t *testing.T) { 835 var mu sync.Mutex 836 var calls []string 837 mainCalls := 0 838 839 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 840 raw, _ := readBody(r.Body) 841 defer r.Body.Close() 842 843 var req struct { 844 ModelTier string `json:"model_tier"` 845 Messages []struct { 846 Role string `json:"role"` 847 Content json.RawMessage `json:"content"` 848 } `json:"messages"` 849 } 850 json.Unmarshal(raw, &req) 851 852 mu.Lock() 853 defer mu.Unlock() 854 855 if req.ModelTier == "small" { 856 calls = append(calls, "summary") 857 json.NewEncoder(w).Encode(nativeResponse( 858 "condensed summary", 859 "end_turn", nil, 50, 30)) 860 return 861 } 862 863 mainCalls++ 864 if mainCalls == 1 { 865 calls = append(calls, "context_error") 866 w.WriteHeader(http.StatusBadRequest) 867 w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"prompt is too long"}}`)) 868 return 869 } 870 871 calls = append(calls, "retry_success") 872 json.NewEncoder(w).Encode(nativeResponse( 873 "Recovered after emergency fallback.", 874 "end_turn", nil, 500, 100)) 875 })) 876 defer server.Close() 877 878 gw := client.NewGatewayClient(server.URL, "") 879 reg := NewToolRegistry() 880 reg.Register(&thinkTool{}) 881 882 loop := NewAgentLoop(gw, reg, "medium", "", 10, 2000, 200, nil, nil, nil) 883 loop.SetContextWindow(100000) 884 885 huge := strings.Repeat("x", 450000) 886 history := []client.Message{ 887 {Role: "user", Content: client.NewTextContent(huge)}, 888 
		{Role: "assistant", Content: client.NewTextContent("ack")},
		{Role: "user", Content: client.NewTextContent("second turn")},
		{Role: "assistant", Content: client.NewTextContent("second reply")},
		{Role: "user", Content: client.NewTextContent("third turn")},
		{Role: "assistant", Content: client.NewTextContent("third reply")},
	}

	result, _, err := loop.Run(context.Background(), "trigger reactive overflow", nil, history)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if result != "Recovered after emergency fallback." {
		t.Fatalf("unexpected result: %q", result)
	}

	mu.Lock()
	gotCalls := append([]string(nil), calls...)
	mu.Unlock()

	summaryCalls := 0
	for _, call := range gotCalls {
		if call == "summary" {
			summaryCalls++
		}
	}
	if summaryCalls != 2 {
		t.Fatalf("expected soft + emergency summary calls, got %d (%v)", summaryCalls, gotCalls)
	}
	if len(gotCalls) != 4 || gotCalls[0] != "context_error" || gotCalls[1] != "summary" || gotCalls[2] != "summary" || gotCalls[3] != "retry_success" {
		t.Fatalf("unexpected call order: %v", gotCalls)
	}
}

// TestAgentLoop_CompactionTriggersOnWarmCache is a regression test for the
// compaction-gate fix that sums cached tokens into the gate's input.
//
// Before the fix, lastInputTokens was assigned normalizedUsage.InputTokens —
// which Anthropic defines as *excluding* cached tokens. A long warm-cache
// session would report input_tokens of a few hundred while cache_read_tokens
// carried the real 90K+ prompt, so ShouldCompact never tripped and compaction
// never fired until the cache went cold.
//
// After the fix, totalPromptTokens(u) = input + cache_read + cache_creation,
// which reflects the real context-window consumption.
//
// This test drives the loop against a mock that always reports a small
// InputTokens but a large CacheReadTokens. Once messages grow past
// MinShapeable (9), the gate must trigger — PersistLearnings + GenerateSummary
// must both fire. If the test fails, the gate has regressed to the pre-fix
// behaviour.
func TestAgentLoop_CompactionTriggersOnWarmCache(t *testing.T) {
	memoryDir := t.TempDir()

	var mu sync.Mutex
	var calls []string // ordered log of every /completions hit, guarded by mu

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := readBody(r.Body)
		defer r.Body.Close()

		var req struct {
			ModelTier string `json:"model_tier"`
			Messages  []struct {
				Role    string          `json:"role"`
				Content json.RawMessage `json:"content"`
			} `json:"messages"`
		}
		// Best-effort decode: a malformed body simply leaves zero values,
		// which routes the request to the main-tier branch below.
		json.Unmarshal(raw, &req)

		mu.Lock()
		callNum := len(calls) + 1

		// Small-tier requests are the compaction side-calls; classify them
		// by the distinctive prompt text each one carries.
		if req.ModelTier == "small" {
			isPersist := false
			isSummary := false
			for _, m := range req.Messages {
				var text string
				json.Unmarshal(m.Content, &text)
				if strings.Contains(text, "extracting durable knowledge") {
					isPersist = true
				}
				if strings.Contains(text, "Compress the following conversation") {
					isSummary = true
				}
			}
			if isPersist {
				calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum))
				mu.Unlock()
				json.NewEncoder(w).Encode(nativeResponse(
					"- Warm-cache compaction fired correctly",
					"end_turn", nil, 50, 30))
				return
			}
			if isSummary {
				calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum))
				mu.Unlock()
				json.NewEncoder(w).Encode(nativeResponse(
					"Agent summarised cached history.", "end_turn", nil, 50, 30))
				return
			}
			calls = append(calls, fmt.Sprintf("call %d: small-other", callNum))
			mu.Unlock()
			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
			return
		}

		// Main-tier: simulate a warm cache — small InputTokens, large CacheReadTokens.
		// context_window=2000 so threshold = 1700. InputTokens alone (200) is below
		// threshold; total prompt (200 + 1800 cache_read = 2000) is above. Pre-fix
		// code reads only InputTokens and would NOT compact; post-fix reads
		// totalPromptTokens and SHOULD compact once msgCount > MinShapeable (9).
		msgCount := len(req.Messages)
		resp := client.CompletionResponse{
			Model:        "test-model",
			FinishReason: "tool_use",
			FunctionCall: nil,
			ToolCalls: []client.FunctionCall{{
				Name:      "think",
				Arguments: json.RawMessage(fmt.Sprintf(`{"thought":"step with %d msgs"}`, msgCount)),
			}},
			Usage: client.Usage{
				InputTokens:     200,
				OutputTokens:    50,
				TotalTokens:     250,
				CacheReadTokens: 1800,
			},
			RequestID: "req-test",
		}
		if msgCount >= 12 {
			// Emit end_turn so the run can terminate after compaction fires.
			resp.FinishReason = "end_turn"
			resp.ToolCalls = nil
			resp.OutputText = "Analysis complete after warm-cache compaction."
		}
		calls = append(calls, fmt.Sprintf("call %d: MAIN (msgs=%d, input=200, cache_read=1800)", callNum, msgCount))
		mu.Unlock()
		json.NewEncoder(w).Encode(resp)
	}))
	defer server.Close()

	gw := client.NewGatewayClient(server.URL, "")
	reg := NewToolRegistry()
	reg.Register(&thinkTool{})

	handler := &mockHandler{approveResult: true}

	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
	loop.SetContextWindow(2000)
	loop.SetMemoryDir(memoryDir)
	loop.SetHandler(handler)

	_, _, err := loop.Run(context.Background(),
		"Run through several reasoning steps so message count grows past MinShapeable.",
		nil, nil)
	if err != nil {
		// The run may legitimately end on the iteration cap rather than
		// end_turn; that does not invalidate the gate assertions below.
		t.Logf("Run error (iteration limit is acceptable): %v", err)
	}

	mu.Lock()
	defer mu.Unlock()
	t.Logf("\n=== Call sequence (%d total) ===", len(calls))
	for _, c := range calls {
		t.Logf(" %s", c)
	}

	hasPersist := false
	hasSummary := false
	for _, c := range calls {
		if strings.Contains(c, "PERSIST") {
			hasPersist = true
		}
		if strings.Contains(c, "SUMMARY") {
			hasSummary = true
		}
	}

	if !hasPersist {
		t.Error("PersistLearnings must fire once warm-cache total prompt exceeds 85% — gate regressed to pre-fix behavior")
	}
	if !hasSummary {
		t.Error("GenerateSummary must fire once warm-cache total prompt exceeds 85% — gate regressed to pre-fix behavior")
	}
}

// TestAgentLoop_EmptySummaryTriggersBackoff verifies two related fixes:
//
// 1. When GenerateSummary returns a non-error empty string (e.g. LLM produced
//    <analysis> only, extractSummary filtered to ""), the compaction gate
//    treats it as a failure and increments summaryFailures.
// 2. After 3 consecutive failures, the cool-off window of 5 iterations
//    really skips 5 iterations of SUMMARY attempts — regardless of when
//    the failures happen in the run. The pre-fix `(i - summaryFailures) < 5`
//    expression only yields a full 5-iter window when failures start at
//    i=0; a middle cluster at e.g. i=4,5,6 collapsed the window to 1 iter,
//    a late cluster at i=9,10,11 produced zero backoff at all.
//
// Post-fix assertions:
//   - Total SUMMARY count is ≤ 4 across the whole run (3 initial failures
//     plus at most one post-cool-off retry before the iter cap)
//   - At least 3 SUMMARY calls fire, so the breaker actually trips
//   - Between the 3rd and 4th SUMMARY there are ≥ 5 MAIN completion calls.
//     Every iteration emits exactly one MAIN call regardless of compaction
//     gating, so MAIN count between SUMMARYs is a direct measure of
//     iterations skipped by backoff. This is the key assertion: measuring
//     call-stream index differences (e.g.
"4th SUMMARY ≥ call 3rdIndex+6") 1093 // would silently accept a 3-iter backoff as if it were 5, because the 1094 // iter that retries also contributes MAIN+PERSIST calls to the stream. 1095 func TestAgentLoop_EmptySummaryTriggersBackoff(t *testing.T) { 1096 memoryDir := t.TempDir() 1097 1098 var mu sync.Mutex 1099 var calls []string 1100 1101 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 1102 raw, _ := readBody(r.Body) 1103 defer r.Body.Close() 1104 1105 var req struct { 1106 ModelTier string `json:"model_tier"` 1107 Messages []struct { 1108 Role string `json:"role"` 1109 Content json.RawMessage `json:"content"` 1110 } `json:"messages"` 1111 } 1112 json.Unmarshal(raw, &req) 1113 1114 mu.Lock() 1115 callNum := len(calls) + 1 1116 1117 if req.ModelTier == "small" { 1118 isPersist := false 1119 isSummary := false 1120 for _, m := range req.Messages { 1121 var text string 1122 json.Unmarshal(m.Content, &text) 1123 if strings.Contains(text, "extracting durable knowledge") { 1124 isPersist = true 1125 } 1126 if strings.Contains(text, "Compress the following conversation") { 1127 isSummary = true 1128 } 1129 } 1130 if isPersist { 1131 calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum)) 1132 mu.Unlock() 1133 json.NewEncoder(w).Encode(nativeResponse( 1134 "- simulated persist", "end_turn", nil, 50, 30)) 1135 return 1136 } 1137 if isSummary { 1138 calls = append(calls, fmt.Sprintf("call %d: SUMMARY(empty)", callNum)) 1139 mu.Unlock() 1140 // LLM returned <analysis> only — extractSummary strips it and returns "". 1141 // sumErr is nil; summary is "". 
1142 json.NewEncoder(w).Encode(nativeResponse( 1143 "<analysis>scratch work, no summary block produced</analysis>", 1144 "end_turn", nil, 50, 30)) 1145 return 1146 } 1147 calls = append(calls, fmt.Sprintf("call %d: small-other", callNum)) 1148 mu.Unlock() 1149 json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30)) 1150 return 1151 } 1152 1153 // Main-tier: push messages past MinShapeable (9) and keep total prompt above 1154 // context_window*0.85. With context_window=2000 threshold=1700, small input 1155 // + large cache_read (1800) makes totalPromptTokens cross every turn. 1156 msgCount := len(req.Messages) 1157 resp := client.CompletionResponse{ 1158 Model: "test-model", 1159 FinishReason: "tool_use", 1160 ToolCalls: []client.FunctionCall{{ 1161 Name: "think", 1162 Arguments: json.RawMessage(fmt.Sprintf(`{"thought":"iter with %d msgs"}`, msgCount)), 1163 }}, 1164 Usage: client.Usage{ 1165 InputTokens: 200, 1166 OutputTokens: 50, 1167 TotalTokens: 250, 1168 CacheReadTokens: 1800, // total = 2000 > 1700 threshold 1169 }, 1170 RequestID: "req-test", 1171 } 1172 if msgCount >= 30 { 1173 // Hard stop after 15 rounds so the test can't loop forever. 
1174 resp.FinishReason = "end_turn" 1175 resp.ToolCalls = nil 1176 resp.OutputText = "done" 1177 } 1178 calls = append(calls, fmt.Sprintf("call %d: MAIN (msgs=%d)", callNum, msgCount)) 1179 mu.Unlock() 1180 json.NewEncoder(w).Encode(resp) 1181 })) 1182 defer server.Close() 1183 1184 gw := client.NewGatewayClient(server.URL, "") 1185 reg := NewToolRegistry() 1186 reg.Register(&thinkTool{}) 1187 1188 handler := &mockHandler{approveResult: true} 1189 1190 loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil) 1191 loop.SetContextWindow(2000) 1192 loop.SetMemoryDir(memoryDir) 1193 loop.SetHandler(handler) 1194 1195 _, _, err := loop.Run(context.Background(), 1196 "Drive the loop past MinShapeable while reporting warm-cache tokens.", 1197 nil, nil) 1198 if err != nil { 1199 t.Logf("Run error (iteration cap is acceptable): %v", err) 1200 } 1201 1202 mu.Lock() 1203 defer mu.Unlock() 1204 t.Logf("\n=== Call sequence (%d total) ===", len(calls)) 1205 for _, c := range calls { 1206 t.Logf(" %s", c) 1207 } 1208 1209 // Extract iteration numbers of SUMMARY calls. The `calls` slice records 1210 // every /v1/completions hit with "call N: …"; the call index is our 1211 // proxy for iteration ordering since MAIN + SUMMARY + PERSIST are 1212 // serialized per iter. 1213 summaryIndices := []int{} 1214 for idx, c := range calls { 1215 if strings.Contains(c, "SUMMARY") { 1216 summaryIndices = append(summaryIndices, idx) 1217 } 1218 } 1219 1220 // Assertion 1 — empty is treated as failure, so backoff engages after 3. 1221 // Pre-fix: no backoff on empty → ≥8 SUMMARY in a 15-iter run. 1222 // Post-fix: fails on 3 then cool-off → at most 4 across the whole run 1223 // (3 initial failures + at most 1 retry after the 5-iter window closes 1224 // if the run has not yet hit the 15-iter cap). 
1225 if len(summaryIndices) > 4 { 1226 t.Errorf("empty-summary backoff did not engage: saw %d SUMMARY calls (expected ≤4)\n"+ 1227 "pre-fix behaviour resets summaryFailures when sumErr==nil && summary==\"\", "+ 1228 "defeating the backoff circuit breaker", 1229 len(summaryIndices)) 1230 } 1231 1232 // Assertion 2 — the first 3 SUMMARY calls land before the run's midpoint. 1233 // If they straddle too wide an interval it means SUMMARY was silently 1234 // skipping (shouldCompact gate closed) rather than genuinely firing. 1235 if len(summaryIndices) < 3 { 1236 t.Fatalf("expected at least 3 SUMMARY calls to trip the breaker; got %d.\n"+ 1237 "call sequence:\n %s", 1238 len(summaryIndices), strings.Join(calls, "\n ")) 1239 } 1240 1241 // Stress-adequacy soft guard — when the breaker holds to end of run 1242 // (len(summaryIndices) == 3), Assertion 3 below is skipped entirely via 1243 // its len >= 4 guard. That is a valid GREEN state only if the run was 1244 // long enough that a 4th SUMMARY could have fired had the cool-off 1245 // window been too narrow. If too few MAIN iters actually completed, the 1246 // test is passing vacuously — a future bump to MinShapeable() or the 1247 // hard-stop condition could silently hollow out Assertion 3 without any 1248 // real behavior regression. Count total MAIN iters and flag the gap. 1249 mainIterCount := 0 1250 for _, c := range calls { 1251 if strings.Contains(c, "MAIN") { 1252 mainIterCount++ 1253 } 1254 } 1255 t.Logf("ran %d MAIN iterations total", mainIterCount) 1256 if len(summaryIndices) == 3 && mainIterCount < 12 { 1257 t.Errorf("test under-stressed: only %d MAIN iters completed; the breaker holding with "+ 1258 "exactly 3 SUMMARY may be because the run ended, not because the cool-off is 5 iters. 
"+ 1259 "Raise the hard-stop condition (msgCount>=30) or maxIter so the run reaches ≥ 12 MAIN "+ 1260 "iterations — then Assertion 3 can actually measure the cool-off window.", 1261 mainIterCount) 1262 } 1263 1264 // Assertion 3 — iteration-level cool-off window. Measured by counting 1265 // MAIN calls between the 3rd and 4th SUMMARY. 1266 // 1267 // Within one iteration the call order is PERSIST → SUMMARY → MAIN. So for 1268 // a correct summaryBackoffIters=5 cool-off, calls[thirdIdx+1 : fourthIdx] 1269 // contains: 1270 // • 1 MAIN from iter F itself (same iter as the 3rd SUMMARY — MAIN 1271 // fires after SUMMARY within the iter and is not gated by backoff) 1272 // • 5 MAINs from iters F+1…F+5 (fully backed off — only MAIN fires) 1273 // • 1 PERSIST from iter F+6 (gate re-opens, right before the 4th SUMMARY) 1274 // So the expected mainBetween is 6, not 5. The threshold `< 6` strictly 1275 // rejects any regression down to summaryBackoffIters=4 (mainBetween=5). 1276 // 1277 // A previous version of this assertion used call-stream index arithmetic 1278 // (`windowEnd := thirdFailureAt + 6`). That is wrong because the iter 1279 // which emits the 4th SUMMARY also contributes MAIN+PERSIST calls to the 1280 // stream, so a 3-iter backoff and a 5-iter backoff both place the 4th 1281 // SUMMARY at roughly the same call index, hiding the regression. Counting 1282 // MAIN calls is the iter-native measure. 1283 // 1284 // This is also the assertion that fails when Task 2's three-way switch is 1285 // applied WITHOUT the `(i - summaryFailures)` → `(i - lastSummaryFailureIter)` 1286 // formula fix: mid-run failures collapse the window so only 0–2 MAIN 1287 // calls separate the 3rd and 4th SUMMARY. 
1288 const expectedMainBetween = 6 // 1 same-iter MAIN + summaryBackoffIters backed-off MAINs 1289 if len(summaryIndices) >= 4 { 1290 thirdIdx := summaryIndices[2] 1291 fourthIdx := summaryIndices[3] 1292 mainBetween := 0 1293 for _, c := range calls[thirdIdx+1 : fourthIdx] { 1294 if strings.Contains(c, "MAIN") { 1295 mainBetween++ 1296 } 1297 } 1298 if mainBetween < expectedMainBetween { 1299 t.Errorf("backoff cool-off window too narrow: only %d MAIN iterations "+ 1300 "between 3rd SUMMARY (call %d) and 4th SUMMARY (call %d); expected ≥ %d "+ 1301 "(1 same-iter MAIN + 5 backed-off MAINs).\n"+ 1302 "This is the signature of a broken cool-off window — the 4th retry "+ 1303 "fired too soon.\ncall sequence:\n %s", 1304 mainBetween, thirdIdx, fourthIdx, expectedMainBetween, strings.Join(calls, "\n ")) 1305 } 1306 } 1307 // If there is no 4th SUMMARY at all (len(summaryIndices) == 3), the 1308 // breaker held for the entire remaining run — that is also a valid 1309 // GREEN state and intentionally passes without additional checks. 1310 }