// internal/agent/loop_compaction_test.go
   1  package agent
   2  
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"testing"

	"github.com/Kocoro-lab/ShanClaw/internal/client"
)
  17  
  18  // TestAgentLoop_CompactionAndMemoryPersist verifies the full compaction chain:
  19  //
  20  //  1. Agent loop runs multiple tool-call iterations within a single Run()
  21  //  2. Mock server reports growing input tokens each iteration
  22  //  3. When tokens exceed 85% of context_window → compaction triggers
  23  //  4. PersistLearnings fires (small tier) → writes to MEMORY.md
  24  //  5. GenerateSummary fires (small tier) → creates summary
  25  //  6. ShapeHistory reduces messages
  26  //
  27  // Uses context_window=2000 so 85% threshold = 1700 tokens.
  28  // Needs ≥5 tool iterations so messages > MinShapeable (9).
func TestAgentLoop_CompactionAndMemoryPersist(t *testing.T) {
	memoryDir := t.TempDir()

	var mu sync.Mutex
	var calls []string // ordered log of all calls; guarded by mu

	// Mock gateway. It distinguishes small-tier compaction calls
	// (PersistLearnings / GenerateSummary) from main-tier loop calls by
	// model_tier plus marker phrases embedded in the prompts.
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := readBody(r.Body)
		defer r.Body.Close()

		var req struct {
			ModelTier string `json:"model_tier"`
			Messages  []struct {
				Role    string          `json:"role"`
				Content json.RawMessage `json:"content"`
			} `json:"messages"`
		}
		// Best-effort decode: a malformed body leaves the zero value, which
		// falls through to the main-tier branch below.
		json.Unmarshal(raw, &req)

		mu.Lock()
		callNum := len(calls) + 1

		// Identify small-tier calls
		if req.ModelTier == "small" {
			isPersist := false
			isSummary := false
			for _, m := range req.Messages {
				var text string
				json.Unmarshal(m.Content, &text)
				// Marker phrases come from the PersistLearnings and
				// GenerateSummary prompt templates respectively.
				if strings.Contains(text, "extracting durable knowledge") {
					isPersist = true
				}
				if strings.Contains(text, "Compress the following conversation") {
					isSummary = true
				}
			}

			if isPersist {
				calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum))
				mu.Unlock()
				t.Logf("Call %d: [small] PersistLearnings (messages: %d)", callNum, len(req.Messages))
				json.NewEncoder(w).Encode(nativeResponse(
					"- Agent discussed system architecture\n- Testing compaction flow",
					"end_turn", nil, 50, 30))
				return
			}
			if isSummary {
				calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum))
				mu.Unlock()
				t.Logf("Call %d: [small] GenerateSummary", callNum)
				json.NewEncoder(w).Encode(nativeResponse(
					"User asked about architecture. Agent reasoned through multiple steps.",
					"end_turn", nil, 50, 30))
				return
			}

			calls = append(calls, fmt.Sprintf("call %d: small-other", callNum))
			mu.Unlock()
			t.Logf("Call %d: [small] other", callNum)
			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
			return
		}

		// Main-tier calls: use message count to decide behavior.
		// We need the loop to iterate 6+ times so messages exceed MinShapeable (9).
		// Report input tokens that grow to exceed the 1700 threshold.
		msgCount := len(req.Messages)
		// Scale input tokens based on message count to simulate realistic growth
		inputTokens := msgCount * 200

		if msgCount < 12 {
			// Keep looping with tool calls until we have enough messages
			calls = append(calls, fmt.Sprintf("call %d: TOOL (msgs=%d, input=%d)", callNum, msgCount, inputTokens))
			mu.Unlock()
			t.Logf("Call %d: [main] tool_use (msgs=%d, input_tokens=%d)", callNum, msgCount, inputTokens)
			json.NewEncoder(w).Encode(nativeResponse(
				"", "tool_use",
				toolCall("think", fmt.Sprintf(`{"thought":"Analyzing step with %d messages in context"}`, msgCount)),
				inputTokens, 100))
		} else {
			calls = append(calls, fmt.Sprintf("call %d: END_TURN (msgs=%d, input=%d)", callNum, msgCount, inputTokens))
			mu.Unlock()
			t.Logf("Call %d: [main] end_turn (msgs=%d, input_tokens=%d)", callNum, msgCount, inputTokens)
			json.NewEncoder(w).Encode(nativeResponse(
				"Here is the complete analysis based on my reasoning through all the steps.",
				"end_turn", nil, inputTokens, 100))
		}
	}))
	defer server.Close()

	gw := client.NewGatewayClient(server.URL, "")
	reg := NewToolRegistry()

	// Register think tool — no approval needed, keeps loop iterating
	reg.Register(&thinkTool{})

	handler := &mockHandler{approveResult: true}

	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
	loop.SetContextWindow(2000) // 85% = 1700 triggers compaction
	loop.SetMemoryDir(memoryDir)
	loop.SetHandler(handler)

	// Run with a big message
	result, usage, err := loop.Run(context.Background(),
		"Explain the complete system architecture. Think through each component step by step. Be thorough.",
		nil, nil)
	if err != nil {
		// Hitting the iteration limit is tolerated here; the assertions
		// below check the compaction side effects, not a clean finish.
		t.Logf("Run error (may be iteration limit): %v", err)
	}

	mu.Lock()
	t.Logf("\n=== Call sequence (%d total) ===", len(calls))
	for _, c := range calls {
		t.Logf("  %s", c)
	}

	hasPersist := false
	hasSummary := false
	for _, c := range calls {
		if strings.Contains(c, "PERSIST") {
			hasPersist = true
		}
		if strings.Contains(c, "SUMMARY") {
			hasSummary = true
		}
	}
	mu.Unlock()

	t.Logf("Result: %d chars", len(result))
	t.Logf("Usage: %d LLM calls, %d input+output tokens",
		usage.LLMCalls, usage.InputTokens+usage.OutputTokens)

	// Check compaction fired
	if !hasPersist {
		t.Error("PersistLearnings should have fired during compaction")
	}
	if !hasSummary {
		t.Error("GenerateSummary should have fired during compaction")
	}

	// Check MEMORY.md
	memPath := filepath.Join(memoryDir, "MEMORY.md")
	memData, err := os.ReadFile(memPath)
	if err != nil {
		if hasPersist {
			t.Fatalf("MEMORY.md should exist since PersistLearnings fired: %v", err)
		}
		t.Logf("MEMORY.md not created — compaction didn't trigger")
		return
	}

	memContent := string(memData)
	t.Logf("\n=== MEMORY.md ===\n%s", memContent)

	if !strings.Contains(memContent, "Auto-persisted") {
		t.Error("MEMORY.md should contain Auto-persisted section")
	}
}
 188  
 189  // thinkTool is a minimal think tool for the compaction test.
 190  type thinkTool struct{}
 191  
 192  func (t *thinkTool) Info() ToolInfo {
 193  	return ToolInfo{
 194  		Name:        "think",
 195  		Description: "Plan or reason through tasks",
 196  		Parameters:  map[string]any{"type": "object", "properties": map[string]any{"thought": map[string]any{"type": "string"}}},
 197  		Required:    []string{"thought"},
 198  	}
 199  }
 200  
 201  func (t *thinkTool) Run(ctx context.Context, args string) (ToolResult, error) {
 202  	return ToolResult{Content: "Thought recorded."}, nil
 203  }
 204  
 205  func (t *thinkTool) RequiresApproval() bool { return false }
 206  
 207  func readBody(body interface{ Read([]byte) (int, error) }) ([]byte, error) {
 208  	var buf []byte
 209  	tmp := make([]byte, 4096)
 210  	for {
 211  		n, err := body.Read(tmp)
 212  		buf = append(buf, tmp[:n]...)
 213  		if err != nil {
 214  			break
 215  		}
 216  	}
 217  	return buf, nil
 218  }
 219  
 220  func TestTruncateHeadTail(t *testing.T) {
 221  	t.Run("short string unchanged", func(t *testing.T) {
 222  		s := "hello world"
 223  		got := truncateHeadTail(s, 100)
 224  		if got != s {
 225  			t.Errorf("expected unchanged, got %q", got)
 226  		}
 227  	})
 228  
 229  	t.Run("exact limit unchanged", func(t *testing.T) {
 230  		s := "abcdefghij" // 10 runes
 231  		got := truncateHeadTail(s, 10)
 232  		if got != s {
 233  			t.Errorf("expected unchanged, got %q", got)
 234  		}
 235  	})
 236  
 237  	t.Run("long string gets head+tail", func(t *testing.T) {
 238  		// 100 chars, truncate to 40
 239  		s := strings.Repeat("a", 50) + strings.Repeat("z", 50)
 240  		got := truncateHeadTail(s, 40)
 241  		// keepHead=30, keepTail=10
 242  		if !strings.HasPrefix(got, strings.Repeat("a", 30)) {
 243  			t.Errorf("expected head of 30 'a's, got prefix: %q", got[:40])
 244  		}
 245  		if !strings.HasSuffix(got, strings.Repeat("z", 10)) {
 246  			t.Errorf("expected tail of 10 'z's, got suffix: %q", got[len(got)-20:])
 247  		}
 248  		if !strings.Contains(got, "[... truncated 60 chars ...]") {
 249  			t.Errorf("expected truncation marker with 60 dropped chars, got: %q", got)
 250  		}
 251  	})
 252  
 253  	t.Run("rune-safe with multibyte", func(t *testing.T) {
 254  		// 20 runes of 3 bytes each
 255  		s := strings.Repeat("日", 20)
 256  		got := truncateHeadTail(s, 10)
 257  		// keepHead=7, keepTail=2
 258  		runes := []rune(got)
 259  		// Should start with 7 日 and end with 2 日
 260  		if runes[0] != '日' || runes[len(runes)-1] != '日' {
 261  			t.Errorf("expected rune-safe truncation, got: %q", got)
 262  		}
 263  		if !strings.Contains(got, "[... truncated 10 chars ...]") {
 264  			t.Errorf("expected truncation marker, got: %q", got)
 265  		}
 266  	})
 267  }
 268  
 269  func TestBuildToolCallMap(t *testing.T) {
 270  	messages := []client.Message{
 271  		{
 272  			Role: "assistant",
 273  			Content: client.NewBlockContent([]client.ContentBlock{
 274  				client.NewToolUseBlock("tu-1", "file_read", json.RawMessage(`{"path":"/tmp/foo.txt"}`)),
 275  				client.NewToolUseBlock("tu-2", "bash", json.RawMessage(`{"command":"echo hello"}`)),
 276  			}),
 277  		},
 278  		{
 279  			Role: "user",
 280  			Content: client.NewBlockContent([]client.ContentBlock{
 281  				client.NewToolResultBlock("tu-1", "file contents here", false),
 282  			}),
 283  		},
 284  	}
 285  
 286  	m := buildToolCallMap(messages)
 287  	if len(m) != 2 {
 288  		t.Fatalf("expected 2 entries, got %d", len(m))
 289  	}
 290  	if m["tu-1"].Name != "file_read" {
 291  		t.Errorf("expected file_read, got %q", m["tu-1"].Name)
 292  	}
 293  	if m["tu-2"].Name != "bash" {
 294  		t.Errorf("expected bash, got %q", m["tu-2"].Name)
 295  	}
 296  	if !strings.Contains(m["tu-1"].Args, "/tmp/foo.txt") {
 297  		t.Errorf("expected args to contain path, got %q", m["tu-1"].Args)
 298  	}
 299  }
 300  
 301  func TestBuildToolCallMap_LongArgsTruncated(t *testing.T) {
 302  	longArgs := `{"content":"` + strings.Repeat("x", 200) + `"}`
 303  	messages := []client.Message{
 304  		{
 305  			Role: "assistant",
 306  			Content: client.NewBlockContent([]client.ContentBlock{
 307  				client.NewToolUseBlock("tu-1", "file_write", json.RawMessage(longArgs)),
 308  			}),
 309  		},
 310  	}
 311  
 312  	m := buildToolCallMap(messages)
 313  	if len(m["tu-1"].Args) > 104 { // 100 + "..."
 314  		t.Errorf("expected args truncated to ~103 chars, got %d", len(m["tu-1"].Args))
 315  	}
 316  }
 317  
func TestCompressOldToolResults_TieredBehavior(t *testing.T) {
	// Create 25 tool result pairs to exercise all three tiers with current constants:
	// tier1Threshold=20, keepRecent passed as 8 to match compressAfter.
	const numTools = 25
	const keepRecent = 8

	var messages []client.Message
	messages = append(messages, client.Message{
		Role:    "user",
		Content: client.NewTextContent("Do some work"),
	})

	// Each tool contributes an assistant tool_use message plus a user
	// tool_result message, so tool i's result lands at index 2+i*2.
	for i := 0; i < numTools; i++ {
		id := fmt.Sprintf("tu-%d", i)
		name := fmt.Sprintf("tool_%d", i)
		args := json.RawMessage(fmt.Sprintf(`{"arg":"value_%d"}`, i))
		content := fmt.Sprintf("Result content for tool %d: %s", i, strings.Repeat("x", 500))

		messages = append(messages, client.Message{
			Role: "assistant",
			Content: client.NewBlockContent([]client.ContentBlock{
				client.NewToolUseBlock(id, name, args),
			}),
		})
		messages = append(messages, client.Message{
			Role: "user",
			Content: client.NewBlockContent([]client.ContentBlock{
				client.NewToolResultBlock(id, content, false),
			}),
		})
	}

	// Compress in place: keep 8 most recent full, cap older results at 300 chars.
	compressOldToolResults(context.Background(), messages, keepRecent, 300, nil)

	for i := 0; i < numTools; i++ {
		msgIdx := 2 + i*2
		msg := messages[msgIdx]
		blocks := msg.Content.Blocks()
		if len(blocks) == 0 {
			t.Fatalf("tool result %d: no blocks", i)
		}
		resultContent := ""
		for _, b := range blocks {
			if b.Type == "tool_result" {
				if s, ok := b.ToolContent.(string); ok {
					resultContent = s
				}
			}
		}

		// Tier selection is driven by distance from the newest result.
		distFromEnd := (numTools - 1) - i

		if distFromEnd < keepRecent {
			// Tier 3: should be full (500+ chars)
			if len(resultContent) < 500 {
				t.Errorf("tool %d (dist=%d): expected tier 3 full content (%d chars), got %d chars",
					i, distFromEnd, 500, len(resultContent))
			}
		} else if distFromEnd >= 20 {
			// Tier 1: should contain "snipped"
			if !strings.Contains(resultContent, "snipped") {
				t.Errorf("tool %d (dist=%d): expected tier 1 metadata with 'snipped', got: %q",
					i, distFromEnd, resultContent)
			}
		} else {
			// Tier 2: should be truncated but not snipped (head+tail)
			if strings.Contains(resultContent, "snipped") {
				t.Errorf("tool %d (dist=%d): tier 2 should not contain 'snipped', got: %q",
					i, distFromEnd, resultContent)
			}
			if len(resultContent) > 400 {
				t.Errorf("tool %d (dist=%d): expected tier 2 truncated to ~300 chars, got %d",
					i, distFromEnd, len(resultContent))
			}
			if !strings.Contains(resultContent, "[... truncated") {
				t.Errorf("tool %d (dist=%d): expected head+tail truncation marker, got: %q",
					i, distFromEnd, resultContent)
			}
		}
	}
}
 399  
func TestCompressOldToolResults_Tier2FloorForReadTools(t *testing.T) {
	// Verify that file_read and grep results never degrade to Tier 1 metadata stubs,
	// even when they would normally be old enough for Tier 1.
	const numTools = 26
	var messages []client.Message
	messages = append(messages, client.Message{
		Role:    "user",
		Content: client.NewTextContent("Start"),
	})

	// Tools 0-4: floor tools, 5-25: normal tools.
	// With 26 total results, tool 5 sits exactly at distFromEnd=20, so it should
	// hit Tier 1 and serve as the non-floor control case.
	for i := 0; i < numTools; i++ {
		id := fmt.Sprintf("tu-%d", i)
		name := "tool_other"
		if i < 3 {
			name = "file_read"
		} else if i < 5 {
			name = "grep"
		}
		args := json.RawMessage(fmt.Sprintf(`{"arg":"value_%d"}`, i))
		content := fmt.Sprintf("Result %d: %s", i, strings.Repeat("x", 500))

		messages = append(messages, client.Message{
			Role: "assistant",
			Content: client.NewBlockContent([]client.ContentBlock{
				client.NewToolUseBlock(id, name, args),
			}),
		})
		messages = append(messages, client.Message{
			Role: "user",
			Content: client.NewBlockContent([]client.ContentBlock{
				client.NewToolResultBlock(id, content, false),
			}),
		})
	}

	compressOldToolResults(context.Background(), messages, 8, 300, nil)

	// Check the oldest file_read/grep results (tools 0-4, dist 25-21 from end)
	// These should be Tier 2 (truncated with head+tail), NOT Tier 1 (snipped).
	for i := 0; i < 5; i++ {
		msgIdx := 2 + i*2 // tool i's result message index (pairs start at index 1)
		blocks := messages[msgIdx].Content.Blocks()
		resultContent := ""
		for _, b := range blocks {
			if b.Type == "tool_result" {
				if s, ok := b.ToolContent.(string); ok {
					resultContent = s
				}
			}
		}
		if strings.Contains(resultContent, "snipped") {
			t.Errorf("floor tool %d: should not be Tier 1 (snipped), got: %q", i, resultContent[:80])
		}
		if !strings.Contains(resultContent, "[... truncated") {
			t.Errorf("floor tool %d: should be Tier 2 (truncated), got: %q", i, resultContent[:80])
		}
	}

	// Non-floor control: tool 5 is old enough for Tier 1 and should become metadata-only.
	normalIdx := 2 + 5*2
	blocks := messages[normalIdx].Content.Blocks()
	resultContent := ""
	for _, b := range blocks {
		if b.Type == "tool_result" {
			if s, ok := b.ToolContent.(string); ok {
				resultContent = s
			}
		}
	}
	if !strings.Contains(resultContent, "snipped") {
		t.Fatalf("non-floor tool should be Tier 1 (snipped), got: %q", resultContent[:80])
	}
	if strings.Contains(resultContent, "[... truncated") {
		t.Fatalf("non-floor tool should not stay in Tier 2, got: %q", resultContent[:80])
	}
}
 479  
 480  func TestCompressOldToolResults_EmergencyMode(t *testing.T) {
 481  	// Simulate emergency compaction: keepRecent=1, maxChars=100
 482  	var messages []client.Message
 483  	messages = append(messages, client.Message{
 484  		Role:    "user",
 485  		Content: client.NewTextContent("Start"),
 486  	})
 487  
 488  	for i := 0; i < 5; i++ {
 489  		id := fmt.Sprintf("tu-%d", i)
 490  		content := strings.Repeat("y", 300)
 491  		messages = append(messages, client.Message{
 492  			Role: "assistant",
 493  			Content: client.NewBlockContent([]client.ContentBlock{
 494  				client.NewToolUseBlock(id, "bash", json.RawMessage(`{"command":"ls"}`)),
 495  			}),
 496  		})
 497  		messages = append(messages, client.Message{
 498  			Role: "user",
 499  			Content: client.NewBlockContent([]client.ContentBlock{
 500  				client.NewToolResultBlock(id, content, false),
 501  			}),
 502  		})
 503  	}
 504  
 505  	compressOldToolResults(context.Background(), messages, 1, 100, nil)
 506  
 507  	// Only the last tool result should be full
 508  	for i := 0; i < 5; i++ {
 509  		msgIdx := 2 + i*2
 510  		blocks := messages[msgIdx].Content.Blocks()
 511  		for _, b := range blocks {
 512  			if b.Type == "tool_result" {
 513  				s, ok := b.ToolContent.(string)
 514  				if !ok {
 515  					continue
 516  				}
 517  				if i == 4 {
 518  					// Last one: tier 3, full
 519  					if len(s) < 300 {
 520  						t.Errorf("last tool result should be full, got %d chars", len(s))
 521  					}
 522  				} else {
 523  					// All others should be compressed
 524  					if len(s) >= 300 {
 525  						t.Errorf("tool %d should be compressed, got %d chars", i, len(s))
 526  					}
 527  				}
 528  			}
 529  		}
 530  	}
 531  }
 532  
 533  // TestAgentLoop_ReactiveCompaction verifies the reactive compaction safety net:
 534  //
 535  //  1. Agent loop has enough messages to build context (6+ tool iterations)
 536  //  2. Mock server returns HTTP 400 "prompt is too long" after sufficient iterations
 537  //  3. Reactive compaction fires: PersistLearnings → compress → summary → ShapeHistory
 538  //  4. Retry succeeds with compacted messages
 539  //  5. compactionApplied flag prevents infinite retry loops
 540  //
 541  // The proactive compaction is bypassed by reporting low input tokens until the
 542  // server triggers the 400 error, simulating the case where token counting
 543  // underestimates and the API rejects the request.
func TestAgentLoop_ReactiveCompaction(t *testing.T) {
	memoryDir := t.TempDir()

	var mu sync.Mutex
	var calls []string // ordered log of all calls; guarded by mu

	// After 6 tool iterations (13+ messages), return a 400 context-length error
	// on the next main-tier call, then succeed on retry.
	// Both flags are only touched while mu is held inside the handler.
	contextErrorReturned := false
	retrySucceeded := false

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := readBody(r.Body)
		defer r.Body.Close()

		var req struct {
			ModelTier string `json:"model_tier"`
			Messages  []struct {
				Role    string          `json:"role"`
				Content json.RawMessage `json:"content"`
			} `json:"messages"`
		}
		// Best-effort decode; zero value falls through to the main-tier path.
		json.Unmarshal(raw, &req)

		mu.Lock()
		callNum := len(calls) + 1

		// Small-tier calls (PersistLearnings, GenerateSummary)
		if req.ModelTier == "small" {
			isPersist := false
			isSummary := false
			for _, m := range req.Messages {
				var text string
				json.Unmarshal(m.Content, &text)
				// Marker phrases from the respective prompt templates.
				if strings.Contains(text, "extracting durable knowledge") {
					isPersist = true
				}
				if strings.Contains(text, "Compress the following conversation") {
					isSummary = true
				}
			}

			if isPersist {
				calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum))
				mu.Unlock()
				t.Logf("Call %d: [small] PersistLearnings (messages: %d)", callNum, len(req.Messages))
				json.NewEncoder(w).Encode(nativeResponse(
					"- Agent was analyzing system architecture\n- Reactive compaction triggered",
					"end_turn", nil, 50, 30))
				return
			}
			if isSummary {
				calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum))
				mu.Unlock()
				t.Logf("Call %d: [small] GenerateSummary", callNum)
				json.NewEncoder(w).Encode(nativeResponse(
					"User asked about architecture. Agent analyzed multiple components before context overflow.",
					"end_turn", nil, 50, 30))
				return
			}

			calls = append(calls, fmt.Sprintf("call %d: small-other", callNum))
			mu.Unlock()
			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
			return
		}

		// Main-tier calls
		msgCount := len(req.Messages)

		if msgCount < 12 {
			// Keep looping with tool calls, report LOW tokens so proactive
			// compaction does NOT trigger (under 85% of 128000).
			calls = append(calls, fmt.Sprintf("call %d: TOOL (msgs=%d)", callNum, msgCount))
			mu.Unlock()
			t.Logf("Call %d: [main] tool_use (msgs=%d)", callNum, msgCount)
			json.NewEncoder(w).Encode(nativeResponse(
				"", "tool_use",
				toolCall("think", fmt.Sprintf(`{"thought":"Step %d analysis"}`, msgCount)),
				500, 100)) // Low tokens — proactive compaction won't trigger
			return
		}

		// At 12+ messages: return 400 context-length error (once)
		if !contextErrorReturned {
			contextErrorReturned = true
			calls = append(calls, fmt.Sprintf("call %d: CONTEXT_ERROR (msgs=%d)", callNum, msgCount))
			mu.Unlock()
			t.Logf("Call %d: [main] → 400 prompt is too long (msgs=%d)", callNum, msgCount)
			w.WriteHeader(http.StatusBadRequest)
			w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"prompt is too long"}}`))
			return
		}

		// After reactive compaction retries: succeed
		retrySucceeded = true
		calls = append(calls, fmt.Sprintf("call %d: RETRY_SUCCESS (msgs=%d)", callNum, msgCount))
		mu.Unlock()
		t.Logf("Call %d: [main] end_turn after reactive compaction (msgs=%d)", callNum, msgCount)
		json.NewEncoder(w).Encode(nativeResponse(
			"Analysis complete after reactive compaction.",
			"end_turn", nil, 800, 100))
	}))
	defer server.Close()

	gw := client.NewGatewayClient(server.URL, "")
	reg := NewToolRegistry()
	reg.Register(&thinkTool{})

	handler := &mockHandler{approveResult: true}

	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
	loop.SetContextWindow(128000) // High window so proactive compaction doesn't trigger
	loop.SetMemoryDir(memoryDir)
	loop.SetHandler(handler)

	result, usage, err := loop.Run(context.Background(),
		"Analyze each component of the system. Think through every step carefully.",
		nil, nil)
	if err != nil {
		t.Logf("Run error: %v", err)
	}

	mu.Lock()
	t.Logf("\n=== Call sequence (%d total) ===", len(calls))
	for _, c := range calls {
		t.Logf("  %s", c)
	}

	hasPersist := false
	hasSummary := false
	hasContextError := false
	hasRetrySuccess := false
	for _, c := range calls {
		if strings.Contains(c, "PERSIST") {
			hasPersist = true
		}
		if strings.Contains(c, "SUMMARY") {
			hasSummary = true
		}
		if strings.Contains(c, "CONTEXT_ERROR") {
			hasContextError = true
		}
		if strings.Contains(c, "RETRY_SUCCESS") {
			hasRetrySuccess = true
		}
	}
	mu.Unlock()

	t.Logf("Result: %d chars", len(result))
	t.Logf("Usage: %d LLM calls", usage.LLMCalls)

	// Verify reactive compaction chain
	if !hasContextError {
		t.Error("expected context-length 400 error to be returned by mock server")
	}
	if !hasPersist {
		t.Error("PersistLearnings should fire during reactive compaction")
	}
	if !hasSummary {
		t.Error("GenerateSummary should fire during reactive compaction")
	}
	if !hasRetrySuccess {
		t.Error("retry after reactive compaction should succeed")
	}
	// NOTE(review): retrySucceeded is read here without holding mu. In practice
	// Run has returned and the mu.Lock/Unlock above provides the needed
	// happens-before, but reading it under mu would be cleaner for -race.
	if !retrySucceeded {
		t.Error("retrySucceeded flag should be true")
	}

	// Verify MEMORY.md was written
	memPath := filepath.Join(memoryDir, "MEMORY.md")
	memData, err := os.ReadFile(memPath)
	if err != nil {
		t.Fatalf("MEMORY.md should exist after reactive PersistLearnings: %v", err)
	}
	memContent := string(memData)
	t.Logf("\n=== MEMORY.md ===\n%s", memContent)
	if !strings.Contains(memContent, "Auto-persisted") {
		t.Error("MEMORY.md should contain Auto-persisted section")
	}

	// Verify result came through
	if result == "" {
		t.Error("expected non-empty result after successful retry")
	}
}
 730  
 731  // TestAgentLoop_ReactiveCompactionNoDoubleRetry verifies the compactionApplied
 732  // guard prevents infinite loops: if reactive compaction fires but the retry
 733  // ALSO returns a context-length error, the loop should fail instead of retrying.
 734  func TestAgentLoop_ReactiveCompactionNoDoubleRetry(t *testing.T) {
 735  	contextErrors := 0
 736  
 737  	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 738  		raw, _ := readBody(r.Body)
 739  		defer r.Body.Close()
 740  
 741  		var req struct {
 742  			ModelTier string `json:"model_tier"`
 743  			Messages  []struct {
 744  				Role    string          `json:"role"`
 745  				Content json.RawMessage `json:"content"`
 746  			} `json:"messages"`
 747  		}
 748  		json.Unmarshal(raw, &req)
 749  
 750  		// Small-tier: always succeed
 751  		if req.ModelTier == "small" {
 752  			for _, m := range req.Messages {
 753  				var text string
 754  				json.Unmarshal(m.Content, &text)
 755  				if strings.Contains(text, "extracting durable knowledge") {
 756  					json.NewEncoder(w).Encode(nativeResponse("learnings", "end_turn", nil, 50, 30))
 757  					return
 758  				}
 759  				if strings.Contains(text, "Compress the following conversation") {
 760  					json.NewEncoder(w).Encode(nativeResponse("summary", "end_turn", nil, 50, 30))
 761  					return
 762  				}
 763  			}
 764  			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
 765  			return
 766  		}
 767  
 768  		msgCount := len(req.Messages)
 769  		t.Logf("Main-tier call: msgs=%d, contextErrors=%d", msgCount, contextErrors)
 770  
 771  		if msgCount < 6 && contextErrors == 0 {
 772  			// Build up messages with tool calls until we first trigger overflow.
 773  			json.NewEncoder(w).Encode(nativeResponse(
 774  				"", "tool_use",
 775  				toolCall("think", `{"thought":"building context"}`),
 776  				500, 100))
 777  			return
 778  		}
 779  
 780  		// Always return context-length error once we've started — even after
 781  		// compaction reduces message count. This forces the double-retry guard.
 782  		contextErrors++
 783  		t.Logf("Returning context-length error #%d (msgs=%d)", contextErrors, msgCount)
 784  		w.WriteHeader(http.StatusBadRequest)
 785  		w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"context_length_exceeded"}}`))
 786  	}))
 787  	defer server.Close()
 788  
 789  	gw := client.NewGatewayClient(server.URL, "")
 790  	reg := NewToolRegistry()
 791  	reg.Register(&thinkTool{})
 792  
 793  	handler := &mockHandler{approveResult: true}
 794  	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
 795  	loop.SetContextWindow(128000)
 796  	loop.SetMemoryDir(t.TempDir())
 797  	loop.SetHandler(handler)
 798  
 799  	_, _, err := loop.Run(context.Background(), "Trigger reactive compaction that fails on retry too.", nil, nil)
 800  
 801  	// Should get an error — NOT an infinite loop
 802  	if err == nil {
 803  		t.Fatal("expected error when retry after reactive compaction also fails")
 804  	}
 805  	t.Logf("Got expected error: %v", err)
 806  
 807  	// Should have seen at most 2 context-length errors (original + one retry)
 808  	if contextErrors > 2 {
 809  		t.Errorf("expected at most 2 context-length errors (original + retry), got %d — infinite loop guard may be broken", contextErrors)
 810  	}
 811  }
 812  
 813  func TestReactiveSummaryInput_InsertsPriorSummaryOnce(t *testing.T) {
 814  	messages := []client.Message{
 815  		{Role: "system", Content: client.NewTextContent("system")},
 816  		{Role: "user", Content: client.NewTextContent("first user")},
 817  		{Role: "assistant", Content: client.NewTextContent("recent reply")},
 818  	}
 819  
 820  	withSummary := reactiveSummaryInput(messages, "Earlier work happened")
 821  	if len(withSummary) != len(messages)+1 {
 822  		t.Fatalf("expected injected summary message, got %d messages", len(withSummary))
 823  	}
 824  	if got := withSummary[2].Content.Text(); got != "Previous context summary: Earlier work happened" {
 825  		t.Fatalf("unexpected injected summary message: %q", got)
 826  	}
 827  
 828  	again := reactiveSummaryInput(withSummary, "Earlier work happened")
 829  	if len(again) != len(withSummary) {
 830  		t.Fatal("summary should not be injected twice")
 831  	}
 832  }
 833  
 834  func TestAgentLoop_ReactiveCompaction_UsesEmergencyFallbackWhenSoftStillOverBudget(t *testing.T) {
 835  	var mu sync.Mutex
 836  	var calls []string
 837  	mainCalls := 0
 838  
 839  	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 840  		raw, _ := readBody(r.Body)
 841  		defer r.Body.Close()
 842  
 843  		var req struct {
 844  			ModelTier string `json:"model_tier"`
 845  			Messages  []struct {
 846  				Role    string          `json:"role"`
 847  				Content json.RawMessage `json:"content"`
 848  			} `json:"messages"`
 849  		}
 850  		json.Unmarshal(raw, &req)
 851  
 852  		mu.Lock()
 853  		defer mu.Unlock()
 854  
 855  		if req.ModelTier == "small" {
 856  			calls = append(calls, "summary")
 857  			json.NewEncoder(w).Encode(nativeResponse(
 858  				"condensed summary",
 859  				"end_turn", nil, 50, 30))
 860  			return
 861  		}
 862  
 863  		mainCalls++
 864  		if mainCalls == 1 {
 865  			calls = append(calls, "context_error")
 866  			w.WriteHeader(http.StatusBadRequest)
 867  			w.Write([]byte(`{"error":{"type":"invalid_request_error","message":"prompt is too long"}}`))
 868  			return
 869  		}
 870  
 871  		calls = append(calls, "retry_success")
 872  		json.NewEncoder(w).Encode(nativeResponse(
 873  			"Recovered after emergency fallback.",
 874  			"end_turn", nil, 500, 100))
 875  	}))
 876  	defer server.Close()
 877  
 878  	gw := client.NewGatewayClient(server.URL, "")
 879  	reg := NewToolRegistry()
 880  	reg.Register(&thinkTool{})
 881  
 882  	loop := NewAgentLoop(gw, reg, "medium", "", 10, 2000, 200, nil, nil, nil)
 883  	loop.SetContextWindow(100000)
 884  
 885  	huge := strings.Repeat("x", 450000)
 886  	history := []client.Message{
 887  		{Role: "user", Content: client.NewTextContent(huge)},
 888  		{Role: "assistant", Content: client.NewTextContent("ack")},
 889  		{Role: "user", Content: client.NewTextContent("second turn")},
 890  		{Role: "assistant", Content: client.NewTextContent("second reply")},
 891  		{Role: "user", Content: client.NewTextContent("third turn")},
 892  		{Role: "assistant", Content: client.NewTextContent("third reply")},
 893  	}
 894  
 895  	result, _, err := loop.Run(context.Background(), "trigger reactive overflow", nil, history)
 896  	if err != nil {
 897  		t.Fatalf("unexpected error: %v", err)
 898  	}
 899  	if result != "Recovered after emergency fallback." {
 900  		t.Fatalf("unexpected result: %q", result)
 901  	}
 902  
 903  	mu.Lock()
 904  	gotCalls := append([]string(nil), calls...)
 905  	mu.Unlock()
 906  
 907  	summaryCalls := 0
 908  	for _, call := range gotCalls {
 909  		if call == "summary" {
 910  			summaryCalls++
 911  		}
 912  	}
 913  	if summaryCalls != 2 {
 914  		t.Fatalf("expected soft + emergency summary calls, got %d (%v)", summaryCalls, gotCalls)
 915  	}
 916  	if len(gotCalls) != 4 || gotCalls[0] != "context_error" || gotCalls[1] != "summary" || gotCalls[2] != "summary" || gotCalls[3] != "retry_success" {
 917  		t.Fatalf("unexpected call order: %v", gotCalls)
 918  	}
 919  }
 920  
// TestAgentLoop_CompactionTriggersOnWarmCache is a regression test for the
// compaction-gate fix that sums cached tokens into the gate's input.
//
// Before the fix, lastInputTokens was assigned normalizedUsage.InputTokens —
// which Anthropic defines as *excluding* cached tokens. A long warm-cache
// session would report input_tokens of a few hundred while cache_read_tokens
// carried the real 90K+ prompt, so ShouldCompact never tripped and compaction
// never fired until the cache went cold.
//
// After the fix, totalPromptTokens(u) = input + cache_read + cache_creation,
// which reflects the real context-window consumption.
//
// This test drives the loop against a mock that always reports a small
// InputTokens but a large CacheReadTokens. Once messages grow past
// MinShapeable (9), the gate must trigger — PersistLearnings + GenerateSummary
// must both fire. If the test fails, the gate has regressed to the pre-fix
// behaviour.
func TestAgentLoop_CompactionTriggersOnWarmCache(t *testing.T) {
	memoryDir := t.TempDir()

	var mu sync.Mutex
	var calls []string // ordered log of every request the mock server received

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := readBody(r.Body)
		defer r.Body.Close()

		// Decode just enough of the request to classify it: the tier routes
		// small-model helper calls, and message text identifies which helper.
		var req struct {
			ModelTier string `json:"model_tier"`
			Messages  []struct {
				Role    string          `json:"role"`
				Content json.RawMessage `json:"content"`
			} `json:"messages"`
		}
		json.Unmarshal(raw, &req)

		mu.Lock()
		// 1-based index of this HTTP call; `calls` doubles as log and counter.
		// Note: the lock is released manually on every branch below, always
		// BEFORE encoding the response, so response writes never hold it.
		callNum := len(calls) + 1

		if req.ModelTier == "small" {
			// Sniff the prompt text to tell PersistLearnings apart from
			// GenerateSummary. Unmarshal errors are deliberately ignored:
			// non-string content leaves text empty and matches neither probe.
			isPersist := false
			isSummary := false
			for _, m := range req.Messages {
				var text string
				json.Unmarshal(m.Content, &text)
				if strings.Contains(text, "extracting durable knowledge") {
					isPersist = true
				}
				if strings.Contains(text, "Compress the following conversation") {
					isSummary = true
				}
			}
			if isPersist {
				calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum))
				mu.Unlock()
				json.NewEncoder(w).Encode(nativeResponse(
					"- Warm-cache compaction fired correctly",
					"end_turn", nil, 50, 30))
				return
			}
			if isSummary {
				calls = append(calls, fmt.Sprintf("call %d: SUMMARY", callNum))
				mu.Unlock()
				json.NewEncoder(w).Encode(nativeResponse(
					"Agent summarised cached history.", "end_turn", nil, 50, 30))
				return
			}
			calls = append(calls, fmt.Sprintf("call %d: small-other", callNum))
			mu.Unlock()
			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
			return
		}

		// Main-tier: simulate a warm cache — small InputTokens, large CacheReadTokens.
		// context_window=2000 so threshold = 1700. InputTokens alone (200) is below
		// threshold; total prompt (200 + 1800 cache_read = 2000) is above. Pre-fix
		// code reads only InputTokens and would NOT compact; post-fix reads
		// totalPromptTokens and SHOULD compact once msgCount > MinShapeable (9).
		msgCount := len(req.Messages)
		resp := client.CompletionResponse{
			Model:        "test-model",
			FinishReason: "tool_use",
			FunctionCall: nil,
			ToolCalls: []client.FunctionCall{{
				Name:      "think",
				Arguments: json.RawMessage(fmt.Sprintf(`{"thought":"step with %d msgs"}`, msgCount)),
			}},
			Usage: client.Usage{
				InputTokens:     200,
				OutputTokens:    50,
				TotalTokens:     250,
				CacheReadTokens: 1800,
			},
			RequestID: "req-test",
		}
		if msgCount >= 12 {
			// Emit end_turn so the run can terminate after compaction fires.
			resp.FinishReason = "end_turn"
			resp.ToolCalls = nil
			resp.OutputText = "Analysis complete after warm-cache compaction."
		}
		calls = append(calls, fmt.Sprintf("call %d: MAIN (msgs=%d, input=200, cache_read=1800)", callNum, msgCount))
		mu.Unlock()
		json.NewEncoder(w).Encode(resp)
	}))
	defer server.Close()

	gw := client.NewGatewayClient(server.URL, "")
	reg := NewToolRegistry()
	reg.Register(&thinkTool{})

	handler := &mockHandler{approveResult: true}

	// context_window=2000 puts the 85% compaction threshold at 1700 tokens,
	// which the mock's 2000-token warm-cache total crosses every iteration.
	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
	loop.SetContextWindow(2000)
	loop.SetMemoryDir(memoryDir)
	loop.SetHandler(handler)

	_, _, err := loop.Run(context.Background(),
		"Run through several reasoning steps so message count grows past MinShapeable.",
		nil, nil)
	if err != nil {
		t.Logf("Run error (iteration limit is acceptable): %v", err)
	}

	mu.Lock()
	defer mu.Unlock()
	t.Logf("\n=== Call sequence (%d total) ===", len(calls))
	for _, c := range calls {
		t.Logf("  %s", c)
	}

	// The gate is healthy only if BOTH compaction side effects hit the log.
	hasPersist := false
	hasSummary := false
	for _, c := range calls {
		if strings.Contains(c, "PERSIST") {
			hasPersist = true
		}
		if strings.Contains(c, "SUMMARY") {
			hasSummary = true
		}
	}

	if !hasPersist {
		t.Error("PersistLearnings must fire once warm-cache total prompt exceeds 85% — gate regressed to pre-fix behavior")
	}
	if !hasSummary {
		t.Error("GenerateSummary must fire once warm-cache total prompt exceeds 85% — gate regressed to pre-fix behavior")
	}
}
1071  
// TestAgentLoop_EmptySummaryTriggersBackoff verifies two related fixes:
//
//  1. When GenerateSummary returns a non-error empty string (e.g. LLM produced
//     <analysis> only, extractSummary filtered to ""), the compaction gate
//     treats it as a failure and increments summaryFailures.
//  2. After 3 consecutive failures, the cool-off window of 5 iterations
//     really skips 5 iterations of SUMMARY attempts — regardless of when
//     the failures happen in the run. The pre-fix `(i - summaryFailures) < 5`
//     expression only yields a full 5-iter window when failures start at
//     i=0; a middle cluster at e.g. i=4,5,6 collapsed the window to 1 iter,
//     a late cluster at i=9,10,11 produced zero backoff at all.
//
// Post-fix assertions:
//   - Total SUMMARY count is ≤ 4 across the whole run (3 initial failures
//     plus at most one post-cool-off retry before the iter cap)
//   - At least 3 SUMMARY calls fire, so the breaker actually trips
//   - Between the 3rd and 4th SUMMARY there are ≥ 5 MAIN completion calls.
//     Every iteration emits exactly one MAIN call regardless of compaction
//     gating, so MAIN count between SUMMARYs is a direct measure of
//     iterations skipped by backoff. This is the key assertion: measuring
//     call-stream index differences (e.g. "4th SUMMARY ≥ call 3rdIndex+6")
//     would silently accept a 3-iter backoff as if it were 5, because the
//     iter that retries also contributes MAIN+PERSIST calls to the stream.
func TestAgentLoop_EmptySummaryTriggersBackoff(t *testing.T) {
	memoryDir := t.TempDir()

	var mu sync.Mutex
	var calls []string // ordered log of every request the mock server received

	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		raw, _ := readBody(r.Body)
		defer r.Body.Close()

		// Decode just enough to classify the request: tier plus prompt text.
		var req struct {
			ModelTier string `json:"model_tier"`
			Messages  []struct {
				Role    string          `json:"role"`
				Content json.RawMessage `json:"content"`
			} `json:"messages"`
		}
		json.Unmarshal(raw, &req)

		mu.Lock()
		// 1-based call index; the lock is always released manually BEFORE the
		// response is encoded, on every branch below.
		callNum := len(calls) + 1

		if req.ModelTier == "small" {
			// Distinguish PersistLearnings from GenerateSummary by sniffing
			// prompt text; non-string content matches neither probe.
			isPersist := false
			isSummary := false
			for _, m := range req.Messages {
				var text string
				json.Unmarshal(m.Content, &text)
				if strings.Contains(text, "extracting durable knowledge") {
					isPersist = true
				}
				if strings.Contains(text, "Compress the following conversation") {
					isSummary = true
				}
			}
			if isPersist {
				calls = append(calls, fmt.Sprintf("call %d: PERSIST", callNum))
				mu.Unlock()
				json.NewEncoder(w).Encode(nativeResponse(
					"- simulated persist", "end_turn", nil, 50, 30))
				return
			}
			if isSummary {
				calls = append(calls, fmt.Sprintf("call %d: SUMMARY(empty)", callNum))
				mu.Unlock()
				// LLM returned <analysis> only — extractSummary strips it and returns "".
				// sumErr is nil; summary is "".
				json.NewEncoder(w).Encode(nativeResponse(
					"<analysis>scratch work, no summary block produced</analysis>",
					"end_turn", nil, 50, 30))
				return
			}
			calls = append(calls, fmt.Sprintf("call %d: small-other", callNum))
			mu.Unlock()
			json.NewEncoder(w).Encode(nativeResponse("ok", "end_turn", nil, 50, 30))
			return
		}

		// Main-tier: push messages past MinShapeable (9) and keep total prompt above
		// context_window*0.85. With context_window=2000 threshold=1700, small input
		// + large cache_read (1800) makes totalPromptTokens cross every turn.
		msgCount := len(req.Messages)
		resp := client.CompletionResponse{
			Model:        "test-model",
			FinishReason: "tool_use",
			ToolCalls: []client.FunctionCall{{
				Name:      "think",
				Arguments: json.RawMessage(fmt.Sprintf(`{"thought":"iter with %d msgs"}`, msgCount)),
			}},
			Usage: client.Usage{
				InputTokens:     200,
				OutputTokens:    50,
				TotalTokens:     250,
				CacheReadTokens: 1800, // total = 2000 > 1700 threshold
			},
			RequestID: "req-test",
		}
		if msgCount >= 30 {
			// Hard stop after 15 rounds so the test can't loop forever.
			resp.FinishReason = "end_turn"
			resp.ToolCalls = nil
			resp.OutputText = "done"
		}
		calls = append(calls, fmt.Sprintf("call %d: MAIN (msgs=%d)", callNum, msgCount))
		mu.Unlock()
		json.NewEncoder(w).Encode(resp)
	}))
	defer server.Close()

	gw := client.NewGatewayClient(server.URL, "")
	reg := NewToolRegistry()
	reg.Register(&thinkTool{})

	handler := &mockHandler{approveResult: true}

	// context_window=2000 puts the 85% threshold at 1700 tokens, which the
	// mock's warm-cache total (2000) crosses on every single iteration.
	loop := NewAgentLoop(gw, reg, "medium", "", 20, 2000, 200, nil, nil, nil)
	loop.SetContextWindow(2000)
	loop.SetMemoryDir(memoryDir)
	loop.SetHandler(handler)

	_, _, err := loop.Run(context.Background(),
		"Drive the loop past MinShapeable while reporting warm-cache tokens.",
		nil, nil)
	if err != nil {
		t.Logf("Run error (iteration cap is acceptable): %v", err)
	}

	mu.Lock()
	defer mu.Unlock()
	t.Logf("\n=== Call sequence (%d total) ===", len(calls))
	for _, c := range calls {
		t.Logf("  %s", c)
	}

	// Extract iteration numbers of SUMMARY calls. The `calls` slice records
	// every /v1/completions hit with "call N: …"; the call index is our
	// proxy for iteration ordering since MAIN + SUMMARY + PERSIST are
	// serialized per iter.
	summaryIndices := []int{}
	for idx, c := range calls {
		if strings.Contains(c, "SUMMARY") {
			summaryIndices = append(summaryIndices, idx)
		}
	}

	// Assertion 1 — empty is treated as failure, so backoff engages after 3.
	// Pre-fix: no backoff on empty → ≥8 SUMMARY in a 15-iter run.
	// Post-fix: fails on 3 then cool-off → at most 4 across the whole run
	// (3 initial failures + at most 1 retry after the 5-iter window closes
	// if the run has not yet hit the 15-iter cap).
	if len(summaryIndices) > 4 {
		t.Errorf("empty-summary backoff did not engage: saw %d SUMMARY calls (expected ≤4)\n"+
			"pre-fix behaviour resets summaryFailures when sumErr==nil && summary==\"\", "+
			"defeating the backoff circuit breaker",
			len(summaryIndices))
	}

	// Assertion 2 — the first 3 SUMMARY calls land before the run's midpoint.
	// If they straddle too wide an interval it means SUMMARY was silently
	// skipping (shouldCompact gate closed) rather than genuinely firing.
	if len(summaryIndices) < 3 {
		t.Fatalf("expected at least 3 SUMMARY calls to trip the breaker; got %d.\n"+
			"call sequence:\n  %s",
			len(summaryIndices), strings.Join(calls, "\n  "))
	}

	// Stress-adequacy soft guard — when the breaker holds to end of run
	// (len(summaryIndices) == 3), Assertion 3 below is skipped entirely via
	// its len >= 4 guard. That is a valid GREEN state only if the run was
	// long enough that a 4th SUMMARY could have fired had the cool-off
	// window been too narrow. If too few MAIN iters actually completed, the
	// test is passing vacuously — a future bump to MinShapeable() or the
	// hard-stop condition could silently hollow out Assertion 3 without any
	// real behavior regression. Count total MAIN iters and flag the gap.
	mainIterCount := 0
	for _, c := range calls {
		if strings.Contains(c, "MAIN") {
			mainIterCount++
		}
	}
	t.Logf("ran %d MAIN iterations total", mainIterCount)
	if len(summaryIndices) == 3 && mainIterCount < 12 {
		t.Errorf("test under-stressed: only %d MAIN iters completed; the breaker holding with "+
			"exactly 3 SUMMARY may be because the run ended, not because the cool-off is 5 iters. "+
			"Raise the hard-stop condition (msgCount>=30) or maxIter so the run reaches ≥ 12 MAIN "+
			"iterations — then Assertion 3 can actually measure the cool-off window.",
			mainIterCount)
	}

	// Assertion 3 — iteration-level cool-off window. Measured by counting
	// MAIN calls between the 3rd and 4th SUMMARY.
	//
	// Within one iteration the call order is PERSIST → SUMMARY → MAIN. So for
	// a correct summaryBackoffIters=5 cool-off, calls[thirdIdx+1 : fourthIdx]
	// contains:
	//   • 1 MAIN from iter F itself (same iter as the 3rd SUMMARY — MAIN
	//     fires after SUMMARY within the iter and is not gated by backoff)
	//   • 5 MAINs from iters F+1…F+5 (fully backed off — only MAIN fires)
	//   • 1 PERSIST from iter F+6 (gate re-opens, right before the 4th SUMMARY)
	// So the expected mainBetween is 6, not 5. The threshold `< 6` strictly
	// rejects any regression down to summaryBackoffIters=4 (mainBetween=5).
	//
	// A previous version of this assertion used call-stream index arithmetic
	// (`windowEnd := thirdFailureAt + 6`). That is wrong because the iter
	// which emits the 4th SUMMARY also contributes MAIN+PERSIST calls to the
	// stream, so a 3-iter backoff and a 5-iter backoff both place the 4th
	// SUMMARY at roughly the same call index, hiding the regression. Counting
	// MAIN calls is the iter-native measure.
	//
	// This is also the assertion that fails when Task 2's three-way switch is
	// applied WITHOUT the `(i - summaryFailures)` → `(i - lastSummaryFailureIter)`
	// formula fix: mid-run failures collapse the window so only 0–2 MAIN
	// calls separate the 3rd and 4th SUMMARY.
	const expectedMainBetween = 6 // 1 same-iter MAIN + summaryBackoffIters backed-off MAINs
	if len(summaryIndices) >= 4 {
		thirdIdx := summaryIndices[2]
		fourthIdx := summaryIndices[3]
		mainBetween := 0
		for _, c := range calls[thirdIdx+1 : fourthIdx] {
			if strings.Contains(c, "MAIN") {
				mainBetween++
			}
		}
		if mainBetween < expectedMainBetween {
			t.Errorf("backoff cool-off window too narrow: only %d MAIN iterations "+
				"between 3rd SUMMARY (call %d) and 4th SUMMARY (call %d); expected ≥ %d "+
				"(1 same-iter MAIN + 5 backed-off MAINs).\n"+
				"This is the signature of a broken cool-off window — the 4th retry "+
				"fired too soon.\ncall sequence:\n  %s",
				mainBetween, thirdIdx, fourthIdx, expectedMainBetween, strings.Join(calls, "\n  "))
		}
	}
	// If there is no 4th SUMMARY at all (len(summaryIndices) == 3), the
	// breaker held for the entire remaining run — that is also a valid
	// GREEN state and intentionally passes without additional checks.
}