Cradicle Explorer

/ internal / agent / loopdetect_test.go
loopdetect_test.go
   1  package agent
   2  
   3  import (
   4  	"fmt"
   5  	"strings"
   6  	"testing"
   7  )
   8  
   9  func TestLoopDetector_ConsecutiveDup_Nudge(t *testing.T) {
  10  	ld := NewLoopDetector()
  11  
  12  	// 1 call: no trigger
  13  	ld.Record("web_search", `{"q":"test"}`, false, "", "", false)
  14  	action, _ := ld.Check("web_search")
  15  	if action != LoopContinue {
  16  		t.Errorf("1 call should not trigger, got %v", action)
  17  	}
  18  
  19  	// 2nd consecutive identical call: no trigger yet (consecDupThreshold=3)
  20  	ld.Record("web_search", `{"q":"test"}`, false, "", "", false)
  21  	action, _ = ld.Check("web_search")
  22  	if action != LoopContinue {
  23  		t.Errorf("2 consecutive identical calls should not trigger (consecDupThreshold=3), got %v", action)
  24  	}
  25  
  26  	// 3rd consecutive identical call: nudge (consecDupThreshold=3)
  27  	ld.Record("web_search", `{"q":"test"}`, false, "", "", false)
  28  	action, msg := ld.Check("web_search")
  29  	if action != LoopNudge {
  30  		t.Errorf("3 consecutive identical calls should nudge, got %v", action)
  31  	}
  32  	if msg == "" {
  33  		t.Error("nudge should have a message")
  34  	}
  35  }
  36  
  37  func TestLoopDetector_ConsecutiveDup_ForceStop(t *testing.T) {
  38  	ld := NewLoopDetector()
  39  
  40  	// 4 consecutive identical calls: force stop (consecDupThreshold+1=4)
  41  	for range 4 {
  42  		ld.Record("web_search", `{"q":"test"}`, false, "", "", false)
  43  	}
  44  	action, _ := ld.Check("web_search")
  45  	if action != LoopForceStop {
  46  		t.Errorf("4 consecutive identical calls should force stop, got %v", action)
  47  	}
  48  }
  49  
  50  func TestLoopDetector_NonConsecutiveDup_NoFalsePositive(t *testing.T) {
  51  	ld := NewLoopDetector()
  52  
  53  	// read → edit → read: NOT consecutive, 2 in window < exactDupThreshold(3)
  54  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  55  	ld.Record("file_edit", `{"file":"main.go","old":"a","new":"b"}`, false, "", "", false)
  56  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  57  
  58  	action, _ := ld.Check("file_read")
  59  	if action != LoopContinue {
  60  		t.Errorf("read-edit-read should not trigger (non-consecutive), got %v", action)
  61  	}
  62  }
  63  
  64  func TestLoopDetector_WindowDup_Nudge(t *testing.T) {
  65  	ld := NewLoopDetector()
  66  
  67  	// 5 spread-out identical calls: window-based nudge (exactDupThreshold=5)
  68  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  69  	ld.Record("file_edit", `{"old":"a","new":"b"}`, false, "", "", false)
  70  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  71  	ld.Record("file_edit", `{"old":"b","new":"c"}`, false, "", "", false)
  72  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  73  	ld.Record("file_edit", `{"old":"c","new":"d"}`, false, "", "", false)
  74  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  75  	ld.Record("file_edit", `{"old":"d","new":"e"}`, false, "", "", false)
  76  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  77  
  78  	action, _ := ld.Check("file_read")
  79  	if action != LoopNudge {
  80  		t.Errorf("5 spread-out identical calls should trigger window nudge, got %v", action)
  81  	}
  82  }
  83  
  84  func TestLoopDetector_WindowDup_ForceStop(t *testing.T) {
  85  	ld := NewLoopDetector()
  86  
  87  	// 10 spread-out identical calls: window force stop (2× exactDupThreshold=10)
  88  	for range 10 {
  89  		ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
  90  		ld.Record("file_edit", `{"x":"y"}`, false, "", "", false)
  91  	}
  92  	action, _ := ld.Check("file_read")
  93  	if action != LoopForceStop {
  94  		t.Errorf("10 spread-out identical calls should force stop, got %v", action)
  95  	}
  96  }
  97  
  98  func TestLoopDetector_SameToolError_Nudge(t *testing.T) {
  99  	ld := NewLoopDetector()
 100  
 101  	// 5 errors: no trigger (threshold is 6)
 102  	for i := range 5 {
 103  		ld.Record("file_edit", fmt.Sprintf(`{"file":"f%d"}`, i), true, "permission denied", "", false)
 104  	}
 105  	action, _ := ld.Check("file_edit")
 106  	if action != LoopContinue {
 107  		t.Errorf("5 errors should not trigger, got %v", action)
 108  	}
 109  
 110  	// 6th error: nudge
 111  	ld.Record("file_edit", `{"file":"f5"}`, true, "permission denied", "", false)
 112  	action, msg := ld.Check("file_edit")
 113  	if action != LoopNudge {
 114  		t.Errorf("6 errors should trigger nudge, got %v", action)
 115  	}
 116  	if msg == "" {
 117  		t.Error("nudge should have a message")
 118  	}
 119  }
 120  
 121  func TestLoopDetector_SameToolError_ForceStop(t *testing.T) {
 122  	ld := NewLoopDetector()
 123  
 124  	// 12 errors: force stop (2× threshold of 6)
 125  	for i := range 12 {
 126  		ld.Record("file_edit", fmt.Sprintf(`{"file":"f%d"}`, i), true, "permission denied", "", false)
 127  	}
 128  	action, _ := ld.Check("file_edit")
 129  	if action != LoopForceStop {
 130  		t.Errorf("12 errors should trigger force stop, got %v", action)
 131  	}
 132  }
 133  
 134  func TestLoopDetector_NoProgress_Nudge(t *testing.T) {
 135  	ld := NewLoopDetector()
 136  
 137  	// 11 calls with different args: no trigger (threshold is 12)
 138  	// Use think (not in any tool family, not semi-repeatable) to test pure
 139  	// NoProgress detection. bash is semi-repeatable (threshold 16) so it
 140  	// wouldn't trigger at 12.
 141  	for i := range 11 {
 142  		ld.Record("think", fmt.Sprintf(`{"thought":"idea%d"}`, i), false, "", "", false)
 143  	}
 144  	action, _ := ld.Check("think")
 145  	if action != LoopContinue {
 146  		t.Errorf("11 calls should not trigger, got %v", action)
 147  	}
 148  
 149  	// 12th call: nudge
 150  	ld.Record("think", `{"thought":"idea12"}`, false, "", "", false)
 151  	action, _ = ld.Check("think")
 152  	if action != LoopNudge {
 153  		t.Errorf("12 calls should trigger nudge, got %v", action)
 154  	}
 155  }
 156  
 157  func TestLoopDetector_GUIExemptFromNoProgress(t *testing.T) {
 158  	ld := NewLoopDetector()
 159  
 160  	// 10 screenshot calls with different args: should NOT trigger NoProgress
 161  	for i := range 10 {
 162  		ld.Record("screenshot", fmt.Sprintf(`{"delay":%d}`, i), false, "", "", false)
 163  	}
 164  	action, _ := ld.Check("screenshot")
 165  	if action != LoopContinue {
 166  		t.Errorf("screenshot should be exempt from NoProgress, got %v", action)
 167  	}
 168  }
 169  
 170  func TestLoopDetector_GUIConsecutiveDupStillDetected(t *testing.T) {
 171  	ld := NewLoopDetector()
 172  
 173  	// Even GUI tools should trigger consecutive-duplicate detection
 174  	// consecDupThreshold=3 → nudge at 3 consecutive identical calls
 175  	ld.Record("screenshot", `{}`, false, "", "", false)
 176  	ld.Record("screenshot", `{}`, false, "", "", false)
 177  	action, _ := ld.Check("screenshot")
 178  	if action != LoopContinue {
 179  		t.Errorf("2 consecutive identical screenshot calls should not trigger (consecDupThreshold=3), got %v", action)
 180  	}
 181  
 182  	ld.Record("screenshot", `{}`, false, "", "", false)
 183  	action, _ = ld.Check("screenshot")
 184  	if action != LoopNudge {
 185  		t.Errorf("3 consecutive identical screenshot calls should nudge, got %v", action)
 186  	}
 187  }
 188  
 189  func TestLoopDetector_SlidingWindow(t *testing.T) {
 190  	ld := NewLoopDetector()
 191  	ld.historySize = 5 // small window for testing
 192  
 193  	// Fill window with 3 consecutive bash duplicates (triggers consecutive nudge at consecDupThreshold=3)
 194  	ld.Record("bash", `{"cmd":"ls"}`, false, "", "", false)
 195  	ld.Record("bash", `{"cmd":"ls"}`, false, "", "", false)
 196  	ld.Record("bash", `{"cmd":"ls"}`, false, "", "", false)
 197  	action, _ := ld.Check("bash")
 198  	if action != LoopNudge {
 199  		t.Error("3 consecutive exact dups should nudge")
 200  	}
 201  
 202  	// Push old records out of window with 5 different calls
 203  	for i := range 5 {
 204  		ld.Record("file_read", fmt.Sprintf(`{"file":"f%d"}`, i), false, "", "", false)
 205  	}
 206  
 207  	// bash dups should have fallen out of window
 208  	action, _ = ld.Check("bash")
 209  	if action != LoopContinue {
 210  		t.Error("old records should have fallen out of sliding window")
 211  	}
 212  }
 213  
 214  func TestLoopDetector_MixedWorkflow_NoFalsePositive(t *testing.T) {
 215  	ld := NewLoopDetector()
 216  
 217  	// Normal coding workflow: read, edit, read, edit, bash
 218  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
 219  	ld.Record("file_edit", `{"file":"main.go","old":"a","new":"b"}`, false, "", "", false)
 220  	ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
 221  	ld.Record("file_edit", `{"file":"main.go","old":"b","new":"c"}`, false, "", "", false)
 222  	ld.Record("bash", `{"cmd":"go test"}`, false, "", "", false)
 223  
 224  	for _, name := range []string{"file_read", "file_edit", "bash"} {
 225  		action, _ := ld.Check(name)
 226  		if action != LoopContinue {
 227  			t.Errorf("normal workflow should not trigger for %s, got %v", name, action)
 228  		}
 229  	}
 230  }
 231  
 232  func TestLoopDetector_DifferentArgsNoDuplicate(t *testing.T) {
 233  	ld := NewLoopDetector()
 234  
 235  	// Same tool, different args each time — should not trigger
 236  	for i := range 5 {
 237  		ld.Record("file_read", fmt.Sprintf(`{"file":"file%d.go"}`, i), false, "", "", false)
 238  	}
 239  	action, _ := ld.Check("file_read")
 240  	if action != LoopContinue {
 241  		t.Errorf("different args should not trigger, got %v", action)
 242  	}
 243  }
 244  
 245  func TestLoopDetector_ErrorsOnlyCountForSameTool(t *testing.T) {
 246  	ld := NewLoopDetector()
 247  
 248  	// Errors spread across different tools: no trigger for any single tool
 249  	ld.Record("bash", `{"cmd":"a"}`, true, "fail", "", false)
 250  	ld.Record("file_edit", `{"a":"b"}`, true, "fail", "", false)
 251  	ld.Record("grep", `{"p":"c"}`, true, "fail", "", false)
 252  	ld.Record("bash", `{"cmd":"b"}`, true, "fail", "", false)
 253  	ld.Record("file_edit", `{"a":"c"}`, true, "fail", "", false)
 254  
 255  	for _, name := range []string{"bash", "file_edit", "grep"} {
 256  		action, _ := ld.Check(name)
 257  		if action != LoopContinue {
 258  			t.Errorf("spread errors should not trigger for %s, got %v", name, action)
 259  		}
 260  	}
 261  }
 262  
 263  func TestLoopDetector_WebFamily_SameTopicNudge(t *testing.T) {
 264  	ld := NewLoopDetector()
 265  	// 5 web_search calls all normalizing to the "climate world" topic
 266  	// (only date / filler words differ) → family nudge at 5 (v2 threshold).
 267  	ld.Record("web_search", `{"query":"world climate today March 2 2026 major headlines"}`, false, "", "", false)
 268  	ld.Record("web_search", `{"query":"world climate March 2 2026 top headlines latest"}`, false, "", "", false)
 269  	ld.Record("web_search", `{"query":"world climate today March 2 2026 breaking news"}`, false, "", "", false)
 270  	ld.Record("web_search", `{"query":"world climate latest update March 2 2026"}`, false, "", "", false)
 271  	ld.Record("web_search", `{"query":"world climate top headlines current March 2 2026"}`, false, "", "", false)
 272  	action, msg := ld.Check("web_search")
 273  	if action != LoopNudge {
 274  		t.Errorf("5 same-topic web searches should nudge (FamilyNoProgress v2 threshold), got %v", action)
 275  	}
 276  	if msg == "" {
 277  		t.Error("nudge should have a message")
 278  	}
 279  }
 280  
 281  func TestLoopDetector_WebFamily_CrossToolTopicInheritance(t *testing.T) {
 282  	ld := NewLoopDetector()
 283  	// 2 web_search on same topic (only filler/date differences), then web_fetch.
 284  	// Family-level topic lookup should inherit the topic hash from web_search.
 285  	ld.Record("web_search", `{"query":"golang tutorial 2026"}`, false, "", "", false)
 286  	ld.Record("web_search", `{"query":"golang tutorial latest"}`, false, "", "", false)
 287  	ld.Record("web_fetch", `{"url":"https://go.dev/doc/tutorial"}`, false, "", "", false)
 288  
 289  	// 2 same-topic (from web_search) + 1 different (web_fetch URL) → not yet 5
 290  	action, _ := ld.Check("web_fetch")
 291  	if action != LoopContinue {
 292  		t.Errorf("2 same-topic + 1 different should continue, got %v", action)
 293  	}
 294  
 295  	// Add more same-topic searches until nudge at 5 same-topic in family (v2 threshold).
 296  	// All queries normalize to the "golang tutorial" topic (date/filler stripped).
 297  	ld.Record("web_search", `{"query":"latest golang tutorial today"}`, false, "", "", false)
 298  	ld.Record("web_search", `{"query":"golang tutorial top latest"}`, false, "", "", false)
 299  	ld.Record("web_search", `{"query":"golang tutorial current update"}`, false, "", "", false)
 300  	action, _ = ld.Check("web_search")
 301  	if action != LoopNudge {
 302  		t.Errorf("5 same-topic family calls should nudge (v2 threshold), got %v", action)
 303  	}
 304  }
 305  
 306  func TestLoopDetector_WebFamily_ResultSigDedup(t *testing.T) {
 307  	ld := NewLoopDetector()
 308  	// 5 calls returning the same domains → no new info → nudge at 5 (v2 threshold)
 309  	ld.Record("web_search", `{"query":"ai research papers"}`, false, "", "reuters.com,bbc.com", false)
 310  	ld.Record("web_search", `{"query":"ai research latest papers"}`, false, "", "reuters.com,bbc.com", false)
 311  	ld.Record("web_search", `{"query":"ai research papers review"}`, false, "", "reuters.com,bbc.com", false)
 312  	ld.Record("web_search", `{"query":"ai research 2026"}`, false, "", "reuters.com,bbc.com", false)
 313  	ld.Record("web_search", `{"query":"latest ai research papers"}`, false, "", "reuters.com,bbc.com", false)
 314  	action, _ := ld.Check("web_search")
 315  	if action != LoopNudge {
 316  		t.Errorf("5 calls with same result signature should nudge, got %v", action)
 317  	}
 318  }
 319  
 320  func TestLoopDetector_WebFamily_AlternatingSearchFetchStillNudges(t *testing.T) {
 321  	ld := NewLoopDetector()
 322  
 323  	// Mixed web workflows should still nudge when alternating tools keep
 324  	// returning the same source and no new information is being gathered.
 325  	// v2: nudge at 5 same-result-sig calls in the family.
 326  	ld.Record("web_search", `{"query":"go tutorial official"}`, false, "", "go.dev", false)
 327  	ld.Record("web_fetch", `{"url":"https://go.dev/doc/tutorial"}`, false, "", "go.dev", false)
 328  	ld.Record("web_search", `{"query":"golang tutorial latest official"}`, false, "", "go.dev", false)
 329  	ld.Record("web_fetch", `{"url":"https://go.dev/doc/effective_go"}`, false, "", "go.dev", false)
 330  	ld.Record("web_search", `{"query":"golang official tutorial guide"}`, false, "", "go.dev", false)
 331  
 332  	action, _ := ld.Check("web_search")
 333  	if action != LoopNudge {
 334  		t.Errorf("alternating web_search/web_fetch with the same result signature should nudge, got %v", action)
 335  	}
 336  }
 337  
 338  func TestLoopDetector_WebFamily_ForceStopAt7(t *testing.T) {
 339  	ld := NewLoopDetector()
 340  	// 7 web calls with same topic → force stop
 341  	for i := 0; i < 7; i++ {
 342  		ld.Record("web_search", `{"query":"climate change report"}`, false, "", "", false)
 343  	}
 344  	action, _ := ld.Check("web_search")
 345  	if action != LoopForceStop {
 346  		t.Errorf("7 same-topic web calls should force stop, got %v", action)
 347  	}
 348  }
 349  
 350  func TestLoopDetector_WebFamily_7DifferentTopicsNoForceStop(t *testing.T) {
 351  	ld := NewLoopDetector()
 352  	// 7 web family calls on DIFFERENT topics should NOT force stop
 353  	// (legitimate multi-source research)
 354  	for i := 0; i < 4; i++ {
 355  		ld.Record("web_search", fmt.Sprintf(`{"query":"topic%d search"}`, i), false, "", "", false)
 356  	}
 357  	for i := 0; i < 3; i++ {
 358  		ld.Record("web_fetch", fmt.Sprintf(`{"url":"https://example%d.com/page"}`, i), false, "", "", false)
 359  	}
 360  	action, _ := ld.Check("web_fetch")
 361  	if action == LoopForceStop {
 362  		t.Error("7 web family calls with different topics should NOT force stop")
 363  	}
 364  }
 365  
 366  func TestLoopDetector_WebFamily_DifferentTopicsUnder7(t *testing.T) {
 367  	ld := NewLoopDetector()
 368  	// 4 web calls with different topics — should NOT trigger (under 7 total, no topic match)
 369  	ld.Record("web_search", `{"query":"golang concurrency patterns"}`, false, "", "", false)
 370  	ld.Record("web_search", `{"query":"python machine learning tutorial"}`, false, "", "", false)
 371  	ld.Record("web_search", `{"query":"rust ownership explained"}`, false, "", "", false)
 372  	ld.Record("web_search", `{"query":"javascript async await"}`, false, "", "", false)
 373  	action, _ := ld.Check("web_search")
 374  	if action != LoopContinue {
 375  		t.Errorf("4 different-topic web calls should continue, got %v", action)
 376  	}
 377  }
 378  
 379  func TestLoopDetector_NonWebToolUnchanged(t *testing.T) {
 380  	ld := NewLoopDetector()
 381  	// 5 file_read calls with different args — should NOT trigger (threshold still 8)
 382  	for i := 0; i < 5; i++ {
 383  		ld.Record("file_read", fmt.Sprintf(`{"file":"file%d.go"}`, i), false, "", "", false)
 384  	}
 385  	action, _ := ld.Check("file_read")
 386  	if action != LoopContinue {
 387  		t.Errorf("5 file_read calls should not trigger (threshold 8), got %v", action)
 388  	}
 389  }
 390  
 391  // TestLoopDetector_RealWorldWebLoop replays the actual bug that prompted this fix:
 392  // many web_search calls with varied "world news" queries, then web_fetch calls.
 393  // v2 thresholds: nudge fires at 5 same-topic, force-stop fires at 12 same-topic.
 394  func TestLoopDetector_RealWorldWebLoop(t *testing.T) {
 395  	ld := NewLoopDetector()
 396  
 397  	searches := []string{
 398  		`{"query":"world news today March 2 2026"}`,
 399  		`{"query":"world news today March 2 2026 major headlines"}`,
 400  		`{"query":"world news March 2 2026 top headlines Reuters BBC Al Jazeera"}`,
 401  		`{"query":"world news today March 2 2026 top headlines Reuters AP BBC"}`,
 402  		`{"query":"world news March 2 2026 Reuters AP BBC Al Jazeera"}`,
 403  		`{"query":"world news March 2 2026 top headlines"}`,
 404  		`{"query":"world news today March 2 2026 top headlines"}`,
 405  		`{"query":"world news March 2 2026 top headlines Reuters AP BBC Al Jazeera CNN"}`,
 406  		`{"query":"world news March 2 2026 latest updates"}`,
 407  		`{"query":"world news March 2 2026 breaking"}`,
 408  		`{"query":"world news March 2 2026 Reuters AP"}`,
 409  		`{"query":"world news March 2 2026 BBC CNN Al Jazeera"}`,
 410  		`{"query":"world news March 2 2026 top stories"}`,
 411  	}
 412  
 413  	var firstNudge, firstForceStop int
 414  	for i, args := range searches {
 415  		ld.Record("web_search", args, false, "", "reuters.com,bbc.com", false)
 416  		action, _ := ld.Check("web_search")
 417  		if action == LoopNudge && firstNudge == 0 {
 418  			firstNudge = i + 1
 419  		}
 420  		if action == LoopForceStop && firstForceStop == 0 {
 421  			firstForceStop = i + 1
 422  		}
 423  	}
 424  
 425  	// v2: nudge at progressCount>=5, force-stop at progressCount>=12
 426  	if firstNudge == 0 || firstNudge > 5 {
 427  		t.Errorf("expected first nudge by call 5, got %d", firstNudge)
 428  	}
 429  	if firstForceStop == 0 || firstForceStop > 12 {
 430  		t.Errorf("expected force stop by call 12, got %d", firstForceStop)
 431  	}
 432  }
 433  
 434  // TestLoopDetector_RealWorldWebLoop_CrossTool verifies that switching from
 435  // web_search to web_fetch doesn't reset the family counter.
 436  func TestLoopDetector_ToolModeSwitch_NudgeOnGUIAfterSuccess(t *testing.T) {
 437  	ld := NewLoopDetector()
 438  
 439  	// Successful non-GUI call followed by GUI call → nudge
 440  	ld.Record("applescript", `{"script":"create event"}`, false, "", "", false)
 441  	action, _ := ld.Check("applescript")
 442  	if action != LoopContinue {
 443  		t.Errorf("single successful call should continue, got %v", action)
 444  	}
 445  
 446  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 447  	action, msg := ld.Check("screenshot")
 448  	if action != LoopNudge {
 449  		t.Errorf("GUI call after successful non-GUI should nudge, got %v", action)
 450  	}
 451  	if msg == "" {
 452  		t.Error("nudge should have a message")
 453  	}
 454  }
 455  
 456  func TestLoopDetector_ToolModeSwitch_NoNudgeAfterError(t *testing.T) {
 457  	ld := NewLoopDetector()
 458  
 459  	// Failed non-GUI call followed by GUI call → no nudge (GUI verification warranted)
 460  	ld.Record("applescript", `{"script":"create event"}`, true, "calendar not found", "", false)
 461  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 462  	action, _ := ld.Check("screenshot")
 463  	if action != LoopContinue {
 464  		t.Errorf("GUI after failed non-GUI should continue (verification warranted), got %v", action)
 465  	}
 466  }
 467  
 468  func TestLoopDetector_ToolModeSwitch_NoNudgeForGUIOnlyTask(t *testing.T) {
 469  	ld := NewLoopDetector()
 470  
 471  	// Task starts with GUI tools — no non-GUI success to trigger on
 472  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 473  	ld.Record("computer", `{"action":"click","coordinate":[100,200]}`, false, "", "", false)
 474  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 475  	action, _ := ld.Check("screenshot")
 476  	if action != LoopContinue {
 477  		t.Errorf("GUI-only task should not trigger mode switch, got %v", action)
 478  	}
 479  }
 480  
 481  func TestLoopDetector_ToolModeSwitch_NudgeOnlyOnce(t *testing.T) {
 482  	ld := NewLoopDetector()
 483  
 484  	// Successful non-GUI → GUI nudge → second GUI should NOT nudge again
 485  	ld.Record("applescript", `{"script":"create event"}`, false, "", "", false)
 486  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 487  	action, _ := ld.Check("screenshot")
 488  	if action != LoopNudge {
 489  		t.Errorf("first GUI after success should nudge, got %v", action)
 490  	}
 491  
 492  	ld.Record("computer", `{"action":"click","coordinate":[100,200]}`, false, "", "", false)
 493  	action, _ = ld.Check("computer")
 494  	if action != LoopContinue {
 495  		t.Errorf("second GUI should not re-nudge (already nudged), got %v", action)
 496  	}
 497  }
 498  
 499  func TestLoopDetector_ToolModeSwitch_ResetsOnNewNonGUI(t *testing.T) {
 500  	ld := NewLoopDetector()
 501  
 502  	// Success → GUI nudge → new GUI-adjacent success → GUI nudge again (new mode switch)
 503  	ld.Record("applescript", `{"script":"create event"}`, false, "", "", false)
 504  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 505  	action, _ := ld.Check("screenshot")
 506  	if action != LoopNudge {
 507  		t.Errorf("first mode switch should nudge, got %v", action)
 508  	}
 509  
 510  	// New GUI-adjacent success resets the detector
 511  	ld.Record("browser", `{"action":"navigate","url":"http://example.com"}`, false, "", "", false)
 512  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 513  	action, _ = ld.Check("screenshot")
 514  	if action != LoopNudge {
 515  		t.Errorf("new mode switch after reset should nudge again, got %v", action)
 516  	}
 517  }
 518  
 519  func TestLoopDetector_ToolModeSwitch_NoNudgeAfterNonGUITool(t *testing.T) {
 520  	ld := NewLoopDetector()
 521  
 522  	// Non-GUI tool (bash, file_read, etc.) success → screenshot should NOT trigger
 523  	// mode switch since these aren't GUI-adjacent tools.
 524  	ld.Record("bash", `{"command":"echo hello"}`, false, "", "", false)
 525  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 526  	action, _ := ld.Check("screenshot")
 527  	if action != LoopContinue {
 528  		t.Errorf("screenshot after bash should not trigger mode switch, got %v", action)
 529  	}
 530  }
 531  
 532  func TestLoopDetector_RealWorldWebLoop_CrossTool(t *testing.T) {
 533  	ld := NewLoopDetector()
 534  
 535  	// All queries normalize to the "climate world" topic — only filler / date
 536  	// variations so the topic hash stays stable across all calls.
 537  	// 5 searches on same topic → nudge at 5 (v2 threshold)
 538  	ld.Record("web_search", `{"query":"world climate today March 2 2026"}`, false, "", "", false)
 539  	ld.Record("web_search", `{"query":"world climate March 2 2026 latest"}`, false, "", "", false)
 540  	ld.Record("web_search", `{"query":"world climate today latest headlines"}`, false, "", "", false)
 541  	ld.Record("web_search", `{"query":"world climate top breaking news"}`, false, "", "", false)
 542  	ld.Record("web_search", `{"query":"world climate current update major"}`, false, "", "", false)
 543  
 544  	action, _ := ld.Check("web_search")
 545  	if action != LoopNudge {
 546  		t.Errorf("expected nudge after 5 same-topic searches, got %v", action)
 547  	}
 548  
 549  	// Switch to web_fetch then back — same-topic counter continues via family lookup.
 550  	// web_fetch URL is treated as its own topic, then we add more same-topic searches.
 551  	ld.Record("web_fetch", `{"url":"https://reuters.com/world/climate"}`, false, "", "", false)
 552  	ld.Record("web_search", `{"query":"world climate today"}`, false, "", "", false)
 553  	ld.Record("web_search", `{"query":"world climate latest"}`, false, "", "", false)
 554  	ld.Record("web_search", `{"query":"world climate top current"}`, false, "", "", false)
 555  
 556  	// 5 original + 3 more same-topic web_search = 8 same-topic → stronger nudge (stage 1)
 557  	action, _ = ld.Check("web_search")
 558  	if action != LoopNudge {
 559  		t.Errorf("expected nudge after 8 same-topic web calls, got %v", action)
 560  	}
 561  
 562  	// Add more same-topic calls until force stop at progressCount >= 12.
 563  	ld.Record("web_search", `{"query":"world climate breaking"}`, false, "", "", false)
 564  	ld.Record("web_search", `{"query":"world climate update"}`, false, "", "", false)
 565  	ld.Record("web_search", `{"query":"world climate news today"}`, false, "", "", false)
 566  	ld.Record("web_search", `{"query":"world climate headlines major"}`, false, "", "", false)
 567  	action, _ = ld.Check("web_search")
 568  	if action != LoopForceStop {
 569  		t.Errorf("expected force stop after 12 same-topic web calls, got %v", action)
 570  	}
 571  }
 572  
 573  func TestLoopDetector_SuccessAfterError_NudgeOnPostRecoveryGUI(t *testing.T) {
 574  	ld := NewLoopDetector()
 575  
 576  	// Tool fails, then succeeds with different args, then agent goes to GUI → nudge
 577  	ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "calendar not found", "", false)
 578  	ld.Record("applescript", `{"script":"get name of every calendar"}`, false, "", "", false)
 579  	ld.Record("applescript", `{"script":"tell calendar \"日历\""}`, false, "", "", false)
 580  
 581  	// Now agent switches to GUI to verify — should nudge
 582  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 583  	action, msg := ld.Check("screenshot")
 584  	if action != LoopNudge {
 585  		t.Errorf("GUI after recovery should nudge, got %v", action)
 586  	}
 587  	if msg == "" {
 588  		t.Error("nudge should have a message")
 589  	}
 590  }
 591  
 592  func TestLoopDetector_SuccessAfterError_NoNudgeIfNoRecovery(t *testing.T) {
 593  	ld := NewLoopDetector()
 594  
 595  	// Tool fails, no retry yet, agent takes screenshot → no nudge from this detector
 596  	// (ToolModeSwitch won't fire either since last non-GUI was an error)
 597  	ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "not found", "", false)
 598  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 599  	action, _ := ld.Check("screenshot")
 600  	if action != LoopContinue {
 601  		t.Errorf("no recovery happened, should continue, got %v", action)
 602  	}
 603  }
 604  
 605  func TestLoopDetector_SuccessAfterError_ResetsOnNewWork(t *testing.T) {
 606  	ld := NewLoopDetector()
 607  
 608  	// Recovery happens, then agent moves on to genuinely different work
 609  	ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "not found", "", false)
 610  	ld.Record("applescript", `{"script":"tell calendar \"日历\""}`, false, "", "", false)
 611  
 612  	// Agent moves to a different non-GUI tool → recovery state resets
 613  	ld.Record("bash", `{"command":"echo done"}`, false, "", "", false)
 614  	ld.Record("file_read", `{"path":"notes.md"}`, false, "", "", false)
 615  
 616  	// GUI now should NOT nudge for recovery (agent moved on)
 617  	// Note: ToolModeSwitch may nudge since file_read succeeded — that's a different detector
 618  	// We specifically check that the nudge message does NOT mention recovery
 619  	ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false)
 620  	action, msg := ld.Check("screenshot")
 621  	// It may nudge from ToolModeSwitch, but NOT from SuccessAfterError
 622  	if action == LoopNudge && strings.Contains(msg, "recovered") {
 623  		t.Errorf("recovery should have reset, but got recovery nudge: %s", msg)
 624  	}
 625  }
 626  
 627  func TestLoopDetector_SleepDetection_Nudge(t *testing.T) {
 628  	ld := NewLoopDetector()
 629  
 630  	// 1 sleep call: no trigger
 631  	ld.Record("bash", `{"command":"sleep 5"}`, false, "", "", false)
 632  	action, _ := ld.Check("bash")
 633  	if action != LoopContinue {
 634  		t.Errorf("1 sleep call should not trigger, got %v", action)
 635  	}
 636  
 637  	// 2nd sleep call: nudge
 638  	ld.Record("bash", `{"command":"sleep 5 && curl http://localhost:8080"}`, false, "", "", false)
 639  	action, msg := ld.Check("bash")
 640  	if action != LoopNudge {
 641  		t.Errorf("2 sleep calls should nudge, got %v", action)
 642  	}
 643  	if msg == "" {
 644  		t.Error("nudge should have a message")
 645  	}
 646  }
 647  
 648  func TestLoopDetector_SleepDetection_ForceStop(t *testing.T) {
 649  	ld := NewLoopDetector()
 650  
 651  	// 4 sleep calls: force stop
 652  	ld.Record("bash", `{"command":"sleep 5"}`, false, "", "", false)
 653  	ld.Record("bash", `{"command":"sleep 1 && echo done"}`, false, "", "", false)
 654  	ld.Record("bash", `{"command":"while true; do sleep 1; done"}`, false, "", "", false)
 655  	ld.Record("bash", `{"command":"sleep 10"}`, false, "", "", false)
 656  	action, _ := ld.Check("bash")
 657  	if action != LoopForceStop {
 658  		t.Errorf("4 sleep calls should force stop, got %v", action)
 659  	}
 660  }
 661  
 662  func TestLoopDetector_SleepDetection_NoFalsePositive(t *testing.T) {
 663  	ld := NewLoopDetector()
 664  
 665  	// bash commands without sleep: no trigger
 666  	ld.Record("bash", `{"command":"echo hello"}`, false, "", "", false)
 667  	ld.Record("bash", `{"command":"cat sleep.log"}`, false, "", "", false)
 668  	ld.Record("bash", `{"command":"grep sleeper main.go"}`, false, "", "", false)
 669  	ld.Record("bash", `{"command":"ls -la"}`, false, "", "", false)
 670  	action, _ := ld.Check("bash")
 671  	if action != LoopContinue {
 672  		t.Errorf("non-sleep bash commands should not trigger, got %v", action)
 673  	}
 674  }
 675  
 676  func TestLoopDetector_SleepDetection_IgnoreNonBash(t *testing.T) {
 677  	ld := NewLoopDetector()
 678  
 679  	// sleep in non-bash tool args: no trigger (different args to avoid dup detection)
 680  	ld.Record("file_read", `{"command":"sleep 5"}`, false, "", "", false)
 681  	ld.Record("grep", `{"command":"sleep 10"}`, false, "", "", false)
 682  	ld.Record("file_read", `{"command":"sleep 15"}`, false, "", "", false)
 683  	ld.Record("grep", `{"command":"sleep 20"}`, false, "", "", false)
 684  	action, _ := ld.Check("grep")
 685  	if action != LoopContinue {
 686  		t.Errorf("sleep in non-bash tool args should not trigger, got %v", action)
 687  	}
 688  }
 689  
 690  func TestLoopDetector_SearchEscalation_Nudge(t *testing.T) {
 691  	ld := NewLoopDetector()
 692  
 693  	// 6 consecutive unproductive search calls: no trigger yet (threshold is 7)
 694  	for i := 0; i < 6; i++ {
 695  		ld.Record("grep", fmt.Sprintf(`{"pattern":"term%d"}`, i), false, "", "", true)
 696  	}
 697  	action, _ := ld.Check("grep")
 698  	if action != LoopContinue {
 699  		t.Errorf("6 unproductive search calls should not trigger, got %v", action)
 700  	}
 701  
 702  	// 7th unproductive search call: nudge
 703  	ld.Record("grep", `{"pattern":"term6"}`, false, "", "", true)
 704  	action, msg := ld.Check("grep")
 705  	if action != LoopNudge {
 706  		t.Errorf("7 unproductive search calls should nudge, got %v", action)
 707  	}
 708  	if msg == "" {
 709  		t.Error("nudge should have a message")
 710  	}
 711  }
 712  
 713  func TestLoopDetector_SearchEscalation_ForceStop(t *testing.T) {
 714  	ld := NewLoopDetector()
 715  
 716  	// 12 consecutive unproductive search calls (mixed grep/glob): force stop
 717  	for i := 0; i < 12; i++ {
 718  		tool := "grep"
 719  		if i%2 == 1 {
 720  			tool = "glob"
 721  		}
 722  		ld.Record(tool, fmt.Sprintf(`{"pattern":"term%d"}`, i), false, "", "", true)
 723  	}
 724  	action, _ := ld.Check("glob")
 725  	if action != LoopForceStop {
 726  		t.Errorf("12 unproductive search calls should force stop, got %v", action)
 727  	}
 728  }
 729  
 730  func TestLoopDetector_SearchEscalation_NoFalsePositive(t *testing.T) {
 731  	ld := NewLoopDetector()
 732  
 733  	// grep interspersed with file_edit: no consecutive run builds up
 734  	ld.Record("grep", `{"pattern":"foo"}`, false, "", "", false)
 735  	ld.Record("file_edit", `{"file":"main.go","old":"a","new":"b"}`, false, "", "", false)
 736  	ld.Record("grep", `{"pattern":"bar"}`, false, "", "", false)
 737  	ld.Record("file_edit", `{"file":"main.go","old":"b","new":"c"}`, false, "", "", false)
 738  	ld.Record("grep", `{"pattern":"baz"}`, false, "", "", false)
 739  
 740  	action, _ := ld.Check("grep")
 741  	if action != LoopContinue {
 742  		t.Errorf("grep interspersed with edits should not trigger search escalation, got %v", action)
 743  	}
 744  }
 745  
 746  func TestLoopDetector_SearchEscalation_MixedSearchTools(t *testing.T) {
 747  	ld := NewLoopDetector()
 748  
 749  	// 7 unproductive mixed grep+glob calls: nudge (v2 threshold)
 750  	ld.Record("grep", `{"pattern":"foo"}`, false, "", "", true)
 751  	ld.Record("glob", `{"pattern":"**/*.go"}`, false, "", "", true)
 752  	ld.Record("grep", `{"pattern":"bar"}`, false, "", "", true)
 753  	ld.Record("glob", `{"pattern":"**/*.ts"}`, false, "", "", true)
 754  	ld.Record("grep", `{"pattern":"baz"}`, false, "", "", true)
 755  	ld.Record("glob", `{"pattern":"**/*.json"}`, false, "", "", true)
 756  	ld.Record("grep", `{"pattern":"qux"}`, false, "", "", true)
 757  
 758  	action, msg := ld.Check("grep")
 759  	if action != LoopNudge {
 760  		t.Errorf("7 unproductive mixed search calls should nudge, got %v", action)
 761  	}
 762  	if msg == "" {
 763  		t.Error("nudge should have a message")
 764  	}
 765  }
 766  
 767  func TestLoopDetector_SearchEscalation_ProductiveResets(t *testing.T) {
 768  	ld := NewLoopDetector()
 769  
 770  	// 2 unproductive, then 1 productive, then 1 more unproductive.
 771  	// Trailing unproductive streak is only 1, well below the nudge threshold of 5.
 772  	ld.Record("grep", `{"pattern":"a"}`, false, "", "", true)
 773  	ld.Record("grep", `{"pattern":"b"}`, false, "", "", true)
 774  	ld.Record("grep", `{"pattern":"c"}`, false, "", "", false) // productive — resets streak
 775  	ld.Record("grep", `{"pattern":"d"}`, false, "", "", true)
 776  
 777  	action, _ := ld.Check("grep")
 778  	if action != LoopContinue {
 779  		t.Errorf("productive search should reset streak, expected continue, got %v", action)
 780  	}
 781  }
 782  
 783  func TestLoopDetector_SearchEscalation_ProductiveSearchesDontHitNoProgress(t *testing.T) {
 784  	ld := NewLoopDetector()
 785  
 786  	// Repeated productive grep calls with different args are normal during
 787  	// repository exploration and should not trigger the generic NoProgress path.
 788  	for i := 0; i < 8; i++ {
 789  		ld.Record("grep", fmt.Sprintf(`{"pattern":"term%d"}`, i), false, "", "", false)
 790  	}
 791  
 792  	action, _ := ld.Check("grep")
 793  	if action != LoopContinue {
 794  		t.Errorf("productive search calls should not hit NoProgress, got %v", action)
 795  	}
 796  }
 797  
 798  func TestLoopDetector_BrowserFamilyNoProgress(t *testing.T) {
 799  	ld := NewLoopDetector()
 800  
 801  	// Simulate 5 browser calls with the same URL (same topic hash) but different
 802  	// extra fields to produce different ArgsHash and avoid ConsecutiveDup detector.
 803  	// v2: FamilyNoProgress nudge at progressCount >= 5.
 804  	ld.Record("browser", `{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":1}`, false, "", "", false)
 805  	ld.Record("browser", `{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":2}`, false, "", "", false)
 806  	ld.Record("browser", `{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":3}`, false, "", "", false)
 807  	ld.Record("browser", `{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":4}`, false, "", "", false)
 808  	ld.Record("browser", `{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":5}`, false, "", "", false)
 809  	action, msg := ld.Check("browser")
 810  	if action != LoopNudge {
 811  		t.Errorf("5 same-topic browser calls should nudge, got %v", action)
 812  	}
 813  	if strings.Contains(msg, "searched") || strings.Contains(msg, "query") {
 814  		t.Errorf("browser-family nudge should not use search vocabulary, got: %s", msg)
 815  	}
 816  	if !strings.Contains(msg, "UI action") {
 817  		t.Errorf("expected browser-family nudge to mention 'UI action', got: %s", msg)
 818  	}
 819  }
 820  
 821  // TestFamilyNoProgressMessage_VocabularyByFamily asserts the helper emits
 822  // family-appropriate wording at each stage. Protects against the regression
 823  // where browser callers received search-vocabulary nudges ("You've searched
 824  // the same topic…") after FamilyNoProgress was extended to cover browser_*.
 825  func TestFamilyNoProgressMessage_VocabularyByFamily(t *testing.T) {
 826  	cases := []struct {
 827  		family        string
 828  		stage         int
 829  		forbidSubstrs []string
 830  		wantSubstrs   []string
 831  	}{
 832  		{"browser", 0, []string{"searched", "query"}, []string{"UI action", "selector"}},
 833  		{"browser", 1, []string{"searched", "query"}, []string{"UI action"}},
 834  		{"browser", 2, []string{"searched", "query"}, []string{"UI action", "browser-family"}},
 835  		{"gui", 0, []string{"searched", "query"}, []string{"UI action"}},
 836  		{"search", 0, nil, []string{"searched the same topic"}},
 837  		{"web", 2, nil, []string{"web calls", "same topic"}},
 838  	}
 839  	for _, tc := range cases {
 840  		msg := familyNoProgressMessage(tc.family, 3, 4, tc.stage)
 841  		for _, forbid := range tc.forbidSubstrs {
 842  			if strings.Contains(msg, forbid) {
 843  				t.Errorf("family=%s stage=%d: message must not contain %q, got: %s", tc.family, tc.stage, forbid, msg)
 844  			}
 845  		}
 846  		for _, want := range tc.wantSubstrs {
 847  			if !strings.Contains(msg, want) {
 848  				t.Errorf("family=%s stage=%d: message must contain %q, got: %s", tc.family, tc.stage, want, msg)
 849  			}
 850  		}
 851  	}
 852  }
 853  
 854  func TestBrowserInToolFamilies(t *testing.T) {
 855  	family := toolFamily("browser")
 856  	if family != "browser" {
 857  		t.Errorf("browser family should be 'browser', got %q", family)
 858  	}
 859  }
 860  
 861  // TestLoopDetector_BrowserToolsRepeatable ensures that browser_* MCP tools
 862  // are treated as repeatable GUI tools. Before the fix, `repeatableGUITools`
 863  // was keyed on the literal string "browser", but real tool names are
 864  // "browser_navigate", "browser_snapshot", etc., so the NoProgress detector
 865  // (8+ same tool → nudge) would fire on legit multi-page browsing sessions.
 866  func TestLoopDetector_BrowserToolsRepeatable(t *testing.T) {
 867  	ld := NewLoopDetector()
 868  
 869  	// 9 browser_navigate calls to different URLs — progressCount stays at 1
 870  	// per topic, so the FamilyNoProgress detector won't fire. But before the
 871  	// fix the outer NoProgress detector (line 355) WOULD nudge at 8 because
 872  	// repeatableTools["browser_navigate"] == false. After the fix it stays Continue.
 873  	urls := []string{"a", "b", "c", "d", "e", "f", "g", "h", "i"}
 874  	for _, u := range urls {
 875  		ld.Record("browser_navigate", fmt.Sprintf(`{"url":"https://example.com/%s"}`, u), false, "", "", false)
 876  	}
 877  	action, msg := ld.Check("browser_navigate")
 878  	if action != LoopContinue {
 879  		t.Fatalf("browser_navigate x9 to different URLs should Continue (it is a repeatable GUI tool), got %v: %s", action, msg)
 880  	}
 881  }
 882  
 883  func TestLoopDetector_BrowserSnapshotInterleavedRepeatable(t *testing.T) {
 884  	ld := NewLoopDetector()
 885  	// Realistic multi-step pattern: snapshot → click → snapshot → click → ...
 886  	// Each snapshot has the same args but is separated by clicks, so it is
 887  	// not a consecutive duplicate. Over 10 steps we accumulate 5 snapshots,
 888  	// under the consecutive-dup threshold and under the no-progress threshold
 889  	// of 8 same-name calls — must stay Continue.
 890  	for i := range 5 {
 891  		ld.Record("browser_snapshot", `{}`, false, "", "", false)
 892  		ld.Record("browser_click", fmt.Sprintf(`{"ref":"e%d"}`, i), false, "", "", false)
 893  	}
 894  	action, msg := ld.Check("browser_click")
 895  	if action != LoopContinue {
 896  		t.Fatalf("interleaved browser_snapshot/browser_click should Continue, got %v: %s", action, msg)
 897  	}
 898  }
 899  
 900  // TestLoopDetector_SemiRepeatable_BashHigherThreshold verifies that bash
 901  // gets the elevated NoProgress threshold (16) instead of the generic (12),
 902  // so multi-step scripting workflows (fetch → process → install → build)
 903  // aren't killed before completing. The exact-dup, same-error, and sleep
 904  // detectors still catch real loops at their own lower thresholds.
 905  func TestLoopDetector_SemiRepeatable_BashHigherThreshold(t *testing.T) {
 906  	ld := NewLoopDetector()
 907  
 908  	// 12 distinct bash calls — would nudge with the generic threshold (12),
 909  	// but should be Continue with the semi-repeatable threshold of 16.
 910  	for i := range 12 {
 911  		ld.Record("bash", fmt.Sprintf(`{"command":"step_%d"}`, i), false, "", "", false)
 912  	}
 913  	action, _ := ld.Check("bash")
 914  	if action != LoopContinue {
 915  		t.Errorf("12 distinct bash calls should Continue (semi-repeatable threshold 16), got %v", action)
 916  	}
 917  
 918  	// 15 calls — still under 16.
 919  	for i := 12; i < 15; i++ {
 920  		ld.Record("bash", fmt.Sprintf(`{"command":"step_%d"}`, i), false, "", "", false)
 921  	}
 922  	action, _ = ld.Check("bash")
 923  	if action != LoopContinue {
 924  		t.Errorf("15 distinct bash calls should Continue, got %v", action)
 925  	}
 926  
 927  	// 16th call → nudge.
 928  	ld.Record("bash", `{"command":"step_16"}`, false, "", "", false)
 929  	action, _ = ld.Check("bash")
 930  	if action != LoopNudge {
 931  		t.Errorf("16 bash calls should nudge, got %v", action)
 932  	}
 933  }
 934  
 935  // TestLoopDetector_SemiRepeatable_NonBashUnchanged verifies that the generic
 936  // NoProgress threshold (12) still applies to non-semi-repeatable tools like
 937  // file_write, think, etc. — unchanged from the v1 bash-only relaxation.
 938  func TestLoopDetector_SemiRepeatable_NonBashUnchanged(t *testing.T) {
 939  	ld := NewLoopDetector()
 940  
 941  	for i := range 12 {
 942  		ld.Record("think", fmt.Sprintf(`{"thought":"idea_%d"}`, i), false, "", "", false)
 943  	}
 944  	action, _ := ld.Check("think")
 945  	if action != LoopNudge {
 946  		t.Errorf("12 think calls should nudge at generic threshold, got %v", action)
 947  	}
 948  }
 949  
 950  // TestLoopDetector_BrowserMultiToolFlowNoFalsePositive verifies that a
 951  // realistic mixed browser workflow does not trigger FamilyNoProgress just
 952  // because every call on the same page produces the same URL-only result
 953  // signature. Before the fix, navigate → click → click → upload on
 954  // chatgpt.com would emit a "same topic/UI action 3 times" nudge because
 955  // extractResultSignature collapses every browser-family call on that URL
 956  // to the same hash.
 957  func TestLoopDetector_BrowserMultiToolFlowNoFalsePositive(t *testing.T) {
 958  	ld := NewLoopDetector()
 959  
 960  	// All four calls return snapshots whose URL set boils down to
 961  	// https://chatgpt.com/ → identical result signatures.
 962  	sameResultSig := "https://chatgpt.com"
 963  	ld.Record("browser_navigate", `{"url":"https://chatgpt.com"}`, false, "", sameResultSig, false)
 964  	ld.Record("browser_click", `{"ref":"e120","element":"plus"}`, false, "", sameResultSig, false)
 965  	ld.Record("browser_click", `{"ref":"e513","element":"photos"}`, false, "", sameResultSig, false)
 966  	ld.Record("browser_file_upload", `{"paths":["/tmp/x.png"]}`, false, "", sameResultSig, false)
 967  
 968  	action, msg := ld.Check("browser_file_upload")
 969  	if action != LoopContinue {
 970  		t.Errorf("mixed browser workflow with unique tool names must not nudge, got %v (%q)", action, msg)
 971  	}
 972  }
 973  
 974  // TestLoopDetector_BrowserSameToolStillDetected guards the opposite case:
 975  // true repetition of the same browser tool on the same page should still
 976  // trip the detector. Ensures the same-name scoping doesn't break real-loop
 977  // detection.
 978  func TestLoopDetector_BrowserSameToolStillDetected(t *testing.T) {
 979  	ld := NewLoopDetector()
 980  	sameResultSig := "https://chatgpt.com"
 981  	// Three back-to-back identical calls hit the consecutive-duplicate
 982  	// detector at threshold 3 → nudge. A fourth would force-stop, which is
 983  	// also correct behavior but not what this test locks in.
 984  	for i := 0; i < 3; i++ {
 985  		ld.Record("browser_click", `{"ref":"e120","element":"plus"}`, false, "", sameResultSig, false)
 986  	}
 987  	action, _ := ld.Check("browser_click")
 988  	if action != LoopNudge {
 989  		t.Errorf("3 consecutive identical browser_click calls should nudge, got %v", action)
 990  	}
 991  }
 992  
 993  // TestLoopDetector_BrowserSnapshotConsecutiveDupStillForceStops preserves the
 994  // load-bearing polling guard after the repeatable-result-only relaxation:
 995  // repeated browser_snapshot calls with identical args must still be stopped by
 996  // the duplicate detectors instead of silently inheriting the raised threshold.
 997  // With consecDupThreshold=3, force-stop fires at consecDupThreshold+1=4.
 998  func TestLoopDetector_BrowserSnapshotConsecutiveDupStillForceStops(t *testing.T) {
 999  	ld := NewLoopDetector()
1000  	const pageURL = "https://example.com/app"
1001  	for range 4 {
1002  		ld.Record("browser_snapshot", `{}`, false, "", pageURL, false)
1003  	}
1004  	action, msg := ld.Check("browser_snapshot")
1005  	if action != LoopForceStop {
1006  		t.Fatalf("4 identical browser_snapshot calls must still force-stop via duplicate detection, got %v: %s", action, msg)
1007  	}
1008  }
1009  
1010  // TestIsReadMCPName locks the read-verb whitelist used to populate
1011  // the loop detector's batchTolerant set. Read-only MCP tools must match
1012  // (eligible for uniqueness-gated NoProgress relief); write-capable tools
1013  // must NOT match (stay under the count-based guard), because the
1014  // permission engine does not gate MCP calls and a write loop with
1015  // unique arguments could otherwise create many remote records.
1016  func TestIsReadMCPName(t *testing.T) {
1017  	tests := []struct {
1018  		name string
1019  		want bool
1020  	}{
1021  		// Direct read-verb prefix.
1022  		{"list_calendars", true},
1023  		{"get_events", true},
1024  		{"search_gmail_messages", true},
1025  		{"query_database", true},
1026  		{"fetch_profile", true},
1027  		{"describe_table", true},
1028  		{"find_files", true},
1029  		// Namespaced read-verbs (vendor prefix + separator + verb).
1030  		{"API-query-data-source", true},
1031  		{"google_gmail_search_messages", true},
1032  		{"notion_list_pages", true},
1033  		{"Notion_Search_Databases", true}, // case-insensitive
1034  		// Write verbs must stay OUT.
1035  		{"create_notion_page", false},
1036  		{"update_page_properties", false},
1037  		{"delete_event", false},
1038  		{"send_gmail_message", false},
1039  		{"modify_permissions", false},
1040  		{"remove_label", false},
1041  		{"insert_row", false},
1042  		{"append_content_to_page", false},
1043  		{"archive_thread", false},
1044  		// Namespaced writes must also stay out.
1045  		{"google_calendar_create_event", false},
1046  		{"notion_create_comment", false},
1047  		{"drive_upload_file", false},
1048  		// Compound-verb names: a read verb AND a write verb in the first
1049  		// three tokens must return false — the write blacklist dominates.
1050  		// This is the defensive half of the heuristic: destructive suffixes
1051  		// must not sneak through on a position-0 read-verb match.
1052  		{"lookup_and_delete_all_records", false}, // lookup + delete
1053  		{"get_or_create_item", false},            // get + create
1054  		{"find_and_remove_entry", false},         // find + remove
1055  		{"list-and-archive", false},              // list + archive
1056  		// Data-transfer / property-mutation verbs (GitHub/Linear/Notion/
1057  		// Slack MCP patterns). Each pairs a position-0 read with a
1058  		// write verb that earlier versions of writeVerbs missed.
1059  		{"get_and_add_member", false},        // get + add
1060  		{"list_and_set_properties", false},   // list + set
1061  		{"search_and_replace", false},        // search + replace
1062  		{"get_and_write_cache", false},       // get + write
1063  		{"find_and_patch_record", false},     // find + patch
1064  		{"query_and_put_result", false},      // query + put
1065  		{"list_and_clear_flags", false},      // list + clear
1066  		{"get_and_post_update", false},       // get + post
1067  		{"list_and_push_changes", false},     // list + push
1068  		{"fetch_and_publish_item", false},    // fetch + publish
1069  		{"get_and_submit_form", false},       // get + submit
1070  		{"list_and_drop_table", false},       // list + drop
1071  		{"find_and_prune_entries", false},    // find + prune
1072  		// "run"/"execute" are in writeVerbs (fail-closed on ambiguous
1073  		// action verbs). Snowflake/ClickHouse "run_query" used to be
1074  		// accepted as SELECT convention, but a Medium review finding
1075  		// pointed out that ambiguity should fall on the safe side —
1076  		// the server is free to rename to "query_database" if it wants
1077  		// NoProgress relief.
1078  		{"run_query", false},       // run is a write verb (fail closed)
1079  		{"execute_script", false},  // execute is a write verb (fail closed)
1080  		{"transform_data", false},  // no read verb
1081  		{"process_batch", false},   // no read verb
1082  		// Pathological: write name with a read-verb at position 4+ must
1083  		// NOT match (token scan stops at position 3).
1084  		{"request_write_access_and_get_token_afterwards", false},
1085  	}
1086  	for _, tt := range tests {
1087  		t.Run(tt.name, func(t *testing.T) {
1088  			if got := isReadMCPName(tt.name); got != tt.want {
1089  				t.Errorf("isReadMCPName(%q) = %v, want %v", tt.name, got, tt.want)
1090  			}
1091  		})
1092  	}
1093  }
1094  
1095  // TestLoopDetector_NoProgress_BashUniqueArgs_NoNudge covers the Task 5
1096  // benchmark pattern: ~15 bash calls during a multi-step investigation, each
1097  // with distinct argsJSON. Pre-gate, this force-stops via maxNudges escalation.
1098  // With bash in the batchTolerant set and ≥50% unique argsHashes, NoProgress
1099  // must treat this as a legitimate batch and stay Continue.
1100  func TestLoopDetector_NoProgress_BashUniqueArgs_NoNudge(t *testing.T) {
1101  	ld := NewLoopDetector()
1102  	ld.batchTolerant = map[string]bool{"bash": true}
1103  
1104  	for i := range 15 {
1105  		ld.Record("bash", fmt.Sprintf(`{"cmd":"step_%d"}`, i), false, "", "", false)
1106  		action, msg := ld.Check("bash")
1107  		if action != LoopContinue {
1108  			t.Fatalf("call %d: unique-args bash on a batch-tolerant tool should stay Continue, got %v (%s)", i+1, action, msg)
1109  		}
1110  	}
1111  }
1112  
1113  // TestLoopDetector_NoProgress_MCPUniqueArgs_NoNudge covers the Task 6
1114  // benchmark pattern: 16 MCP-tool calls each querying a distinct UUID during a
1115  // legitimate Notion database enumeration. Pre-gate, this hit the generic
1116  // NoProgress threshold at count=8. With the MCP tool registered in
1117  // batchTolerant, unique-args enumeration stays Continue.
1118  func TestLoopDetector_NoProgress_MCPUniqueArgs_NoNudge(t *testing.T) {
1119  	ld := NewLoopDetector()
1120  	ld.batchTolerant = map[string]bool{"API-query-data-source": true}
1121  
1122  	for i := range 16 {
1123  		ld.Record("API-query-data-source", fmt.Sprintf(`{"id":"uuid-%d"}`, i), false, "", "", false)
1124  		action, msg := ld.Check("API-query-data-source")
1125  		if action != LoopContinue {
1126  			t.Fatalf("call %d: unique-args MCP tool on batch-tolerant list should stay Continue, got %v (%s)", i+1, action, msg)
1127  		}
1128  	}
1129  }
1130  
1131  // TestLoopDetector_NoProgress_MCPIdenticalArgs_StillStops locks the invariant
1132  // that batch-tolerance does NOT relax the identical-args case. Regardless of
1133  // which layered detector catches it (ConsecutiveDup fires earliest at 2
1134  // consecutive identical calls; ExactDup at 3 spread out; NoProgress at 8),
1135  // the outcome must be "not Continue" — identical-args spin is always caught.
1136  func TestLoopDetector_NoProgress_MCPIdenticalArgs_StillStops(t *testing.T) {
1137  	ld := NewLoopDetector()
1138  	ld.batchTolerant = map[string]bool{"API-query-data-source": true}
1139  
1140  	for range 8 {
1141  		ld.Record("API-query-data-source", `{"id":"same-uuid"}`, false, "", "", false)
1142  	}
1143  	action, msg := ld.Check("API-query-data-source")
1144  	if action == LoopContinue {
1145  		t.Fatalf("identical-args calls must be stopped by some detector despite batch-tolerance, got Continue (%s)", msg)
1146  	}
1147  }
1148  
1149  // TestLoopDetector_NoProgress_GenericToolUniqueArgs_StillNudges_Regression
1150  // pins the core constraint of Phase 1: the uniqueness gate must NOT relax
1151  // generic NoProgress detection for tools outside the batchTolerant set.
1152  // `think` (not in batchTolerant, not semi-repeatable) called 12 times with
1153  // distinct argsJSON must still nudge — catching "spinning on thought
1154  // variations without progress" is the generic path's load-bearing role.
1155  // v2: noProgressThreshold=12, so nudge fires at call 12.
1156  func TestLoopDetector_NoProgress_GenericToolUniqueArgs_StillNudges_Regression(t *testing.T) {
1157  	ld := NewLoopDetector()
1158  	// Explicitly NOT populating batchTolerant — this test must behave the
1159  	// same whether the field is nil or empty.
1160  
1161  	for i := range 12 {
1162  		ld.Record("think", fmt.Sprintf(`{"thought":"idea%d"}`, i), false, "", "", false)
1163  	}
1164  	action, msg := ld.Check("think")
1165  	if action != LoopNudge {
1166  		t.Fatalf("12 unique-args think calls must still nudge (generic path unchanged), got %v (%s)", action, msg)
1167  	}
1168  }
1169  
1170  // TestLoopDetector_NoProgress_BashMixedArgsRatio_GateIsolated exercises the
1171  // NoProgress uniqueness gate without letting ConsecutiveDup / ExactDup fire
1172  // first. The sequence uses 8 distinct argsHashes each appearing exactly twice
1173  // (16 calls, 50% unique) interleaved so no hash runs ≥3 times in a row and
1174  // ExactDup's "same-arg 3 times in window" threshold is not tripped.
1175  //
1176  // On a batch-tolerant bash, the gate suppresses the nudge at count≥12.
1177  // Without batch-tolerance (Generic path), the same stream must nudge — this
1178  // sub-test covers the non-relaxation invariant at the threshold boundary.
1179  func TestLoopDetector_NoProgress_BashMixedArgsRatio_GateIsolated(t *testing.T) {
1180  	// Build a non-consecutive pattern to keep ConsecutiveDup (need ≥2 back-to-back)
1181  	// and ExactDup (need ≥3 of the same argsHash in the window) quiet.
1182  	// Pattern: 1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8 — each hash appears twice,
1183  	// separated by 7 others. ExactDup threshold is 3 so two appearances is
1184  	// safe; ConsecutiveDup needs adjacency so interleaving avoids it.
1185  	pattern := []int{0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}
1186  
1187  	t.Run("gated_when_batch_tolerant", func(t *testing.T) {
1188  		ld := NewLoopDetector()
1189  		ld.batchTolerant = map[string]bool{"bash": true}
1190  		for _, i := range pattern {
1191  			ld.Record("bash", fmt.Sprintf(`{"cmd":"script_%d"}`, i), false, "", "", false)
1192  		}
1193  		action, msg := ld.Check("bash")
1194  		if action != LoopContinue {
1195  			t.Fatalf("50%% unique on batch-tolerant bash should be gated (Continue), got %v (%s)", action, msg)
1196  		}
1197  	})
1198  
1199  	t.Run("not_gated_when_not_batch_tolerant", func(t *testing.T) {
1200  		ld := NewLoopDetector()
1201  		// Explicitly empty batchTolerant — same sequence, no gate.
1202  		for _, i := range pattern {
1203  			ld.Record("bash", fmt.Sprintf(`{"cmd":"script_%d"}`, i), false, "", "", false)
1204  		}
1205  		action, msg := ld.Check("bash")
1206  		if action != LoopNudge {
1207  			t.Fatalf("same sequence without batch-tolerance should nudge at count≥12, got %v (%s)", action, msg)
1208  		}
1209  	})
1210  }
1211  
1212  // TestLoopDetector_UseSkill_RepeatedNeverFiresAnyDup documents production
1213  // issue: 9 force-stops in audit log on use_skill same-args ×3, iter=3,
1214  // killing queries before they were processed. use_skill is an idempotent
1215  // metadata load (see internal/tools/skill.go) — repeating it is harmless.
1216  // After the fix, ×5 same-args should return LoopContinue from Check
1217  // (neither ConsecutiveDup nor ExactDup fires).
1218  func TestLoopDetector_UseSkill_RepeatedNeverFiresAnyDup(t *testing.T) {
1219  	ld := NewLoopDetector()
1220  	for range 5 {
1221  		ld.Record("use_skill", `{"skill_name":"kocoro"}`, false, "", "", false)
1222  	}
1223  	action, msg := ld.Check("use_skill")
1224  	if action != LoopContinue {
1225  		t.Fatalf("use_skill ×5 same-args must return LoopContinue (idempotent metadata load), got %v: %s", action, msg)
1226  	}
1227  }
1228  
1229  // TestLoopDetector_UseSkill_ExemptionScopedToSelf guards against the
1230  // dupExempt entry leaking into other tools. After 5 use_skill calls
1231  // (which would normally trip ExactDup), records 4 same-args web_search
1232  // calls — those must still force-stop. This catches the regression
1233  // where an over-broad exemption (e.g. checking against the whole
1234  // dupExemptTools map outside the name-scoped path) would suppress
1235  // legitimate signals on adjacent tools.
1236  // With consecDupThreshold=3, force-stop fires at consecCount >= 4.
1237  func TestLoopDetector_UseSkill_ExemptionScopedToSelf(t *testing.T) {
1238  	ld := NewLoopDetector()
1239  	for range 5 {
1240  		ld.Record("use_skill", `{"skill_name":"kocoro"}`, false, "", "", false)
1241  	}
1242  	for range 4 {
1243  		ld.Record("web_search", `{"q":"climate"}`, false, "", "", false)
1244  	}
1245  	action, _ := ld.Check("web_search")
1246  	if action != LoopForceStop {
1247  		t.Fatalf("web_search ×4 same args must still force-stop after use_skill exemption activity, got %v", action)
1248  	}
1249  }
1250  
1251  // TestNudgeWindow_RollsOff documents the rolling-window semantics: nudges
1252  // older than `nudgeWindow` iterations age out. A long workflow with widely
1253  // spaced harmless nudges should never trigger maxNudges escalation.
1254  func TestNudgeWindow_RollsOff(t *testing.T) {
1255  	w := newNudgeWindow(3, 5) // 3 max, 5-iter window
1256  	if w.recordAndCheck(1) {
1257  		t.Fatal("1 nudge in window should not escalate")
1258  	}
1259  	if w.recordAndCheck(2) {
1260  		t.Fatal("2 nudges in window should not escalate")
1261  	}
1262  	// iter 3-7: no nudges. By iter 8, the iter-1 and iter-2 nudges should age out (cutoff = 8 - 5 + 1 = 4).
1263  	if w.recordAndCheck(8) {
1264  		t.Fatal("3rd nudge at iter 8 (window=5) should not escalate — first two aged out")
1265  	}
1266  }
1267  
1268  func TestNudgeWindow_BurstEscalates(t *testing.T) {
1269  	w := newNudgeWindow(3, 5)
1270  	if w.recordAndCheck(1) {
1271  		t.Fatal("1st nudge should not escalate")
1272  	}
1273  	if w.recordAndCheck(2) {
1274  		t.Fatal("2nd nudge should not escalate")
1275  	}
1276  	if !w.recordAndCheck(3) {
1277  		t.Fatal("3rd nudge in 5-iter window should escalate")
1278  	}
1279  }
1280  
1281  // TestConsecutiveDup_FailFailSuccessRetry locks the invariant that a flaky
1282  // retry pattern (fail, fail, succeed) on the same args does NOT force-stop.
1283  // Real Playwright selectors race page-load timing — the model must be
1284  // allowed to retry without being killed at attempt 3. Rule 1: tail-success
1285  // after any error in the run → skip detector (model recovered).
1286  func TestConsecutiveDup_FailFailSuccessRetry(t *testing.T) {
1287  	ld := NewLoopDetector()
1288  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1289  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1290  	ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false)
1291  	action, msg := ld.Check("browser_click")
1292  	if action != LoopContinue {
1293  		t.Fatalf("fail-fail-success retry must return LoopContinue (tail recovery), got %v: %s", action, msg)
1294  	}
1295  }
1296  
1297  // TestConsecutiveDup_ThreeSuccessfulSameArgsStillStops confirms the
1298  // legitimate "spinning on identical successful results" case still
1299  // triggers. Rule 1 doesn't apply (no error in run), Rule 2 doesn't apply
1300  // (not all errors) → original strict threshold.
1301  // With consecDupThreshold=3: nudge at 3, force-stop at 4.
1302  func TestConsecutiveDup_ThreeSuccessfulSameArgsStillStops(t *testing.T) {
1303  	ld := NewLoopDetector()
1304  	for range 4 {
1305  		ld.Record("web_search", `{"q":"climate"}`, false, "", "", false)
1306  	}
1307  	action, _ := ld.Check("web_search")
1308  	if action != LoopForceStop {
1309  		t.Fatalf("4 successful identical web_search must still force-stop, got %v", action)
1310  	}
1311  }
1312  
1313  // TestConsecutiveDup_SixErrorsNudgesNotForceStop: 6 same-args fails uses
1314  // Rule 2's 2x threshold (6 nudge, 7 force-stop). At 6 errors → nudge, not force-stop.
1315  // consecDupThreshold=3 → all-errors budget = 2x = 6/7.
1316  func TestConsecutiveDup_SixErrorsNudgesNotForceStop(t *testing.T) {
1317  	ld := NewLoopDetector()
1318  	for range 6 {
1319  		ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1320  	}
1321  	action, _ := ld.Check("browser_click")
1322  	if action != LoopNudge {
1323  		t.Fatalf("6 same-args consecutive errors should nudge (error budget 6/7), got %v", action)
1324  	}
1325  }
1326  
1327  // TestConsecutiveDup_FiveAllErrorsForceStops: 7 same-args all-error hits
1328  // the 2x force-stop budget. No tail success, no recovery — real stuck loop.
1329  // consecDupThreshold=3 → all-errors budget = 2x = 6 nudge / 7 force-stop.
1330  func TestConsecutiveDup_FiveAllErrorsForceStops(t *testing.T) {
1331  	ld := NewLoopDetector()
1332  	for range 7 {
1333  		ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1334  	}
1335  	action, _ := ld.Check("browser_click")
1336  	if action != LoopForceStop {
1337  		t.Fatalf("7 same-args consecutive errors should force-stop (2x budget), got %v", action)
1338  	}
1339  }
1340  
1341  // TestExactDup_RecoveryThenSecondSpinNotSkipped locks the invariant that
1342  // `latestRecoveredAfterSameArgsErrors` requires the LATEST call to be
1343  // success. A pattern of "errors → success → more errors" should NOT be
1344  // treated as recovered when the latest call is back to error — the
1345  // detector must count the new error streak fresh.
1346  //
1347  // Regression guard: if a future refactor of latestRecoveredAfterSameArgsErrors
1348  // caches "we recovered earlier" and skips ExactDup forever, this test breaks.
1349  func TestExactDup_RecoveryThenSecondSpinNotSkipped(t *testing.T) {
1350  	ld := NewLoopDetector()
1351  	// Phase 1: 4 same-args errors
1352  	for range 4 {
1353  		ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1354  	}
1355  	// Phase 2: 1 success (the "recovery" the helper looks for)
1356  	ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false)
1357  	// Phase 3: latest is now an error again. ExactDup must count the
1358  	// 5 same-args errors (4 + this new one) at strict v2 threshold (5 nudge)
1359  	// — exactRecovered must be false because latest is an error.
1360  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1361  	action, _ := ld.Check("browser_click")
1362  	if action == LoopContinue {
1363  		t.Fatalf("post-recovery spin (4err + success + 1err) must NOT be silently skipped, got LoopContinue")
1364  	}
1365  	// Either LoopNudge or LoopForceStop is acceptable here — what we're
1366  	// proving is that recovery does NOT cache and disable the detector.
1367  }
1368  
1369  // TestExactDup_SixAllErrorsSpreadNotForceStop: 6 spread-out same-args
1370  // failures (with intervening different-tool calls) is past the old
1371  // exactDupThreshold*2=6 force-stop trigger. With all-error 2x budget,
1372  // the new threshold for all-errors is 6 nudge / 12 force-stop.
1373  // 6 errors → should nudge, not force-stop.
1374  func TestExactDup_SixAllErrorsSpreadNotForceStop(t *testing.T) {
1375  	ld := NewLoopDetector()
1376  	for i := 0; i < 6; i++ {
1377  		ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1378  		ld.Record("browser_snapshot", `{}`, false, "",
1379  			fmt.Sprintf("https://example.com/state%d", i), false)
1380  	}
1381  	action, msg := ld.Check("browser_click")
1382  	if action == LoopForceStop {
1383  		t.Fatalf("6 spread-out same-args errors should not force-stop (2x all-error budget), got: %s", msg)
1384  	}
1385  }
1386  
1387  // TestExactDup_SixAllSuccessSpreadStillForceStops: 10 spread-out same-args
1388  // successes (no errors) uses the original threshold → force-stop at 2×exactDupThreshold=10.
1389  // This is real spin, not flaky retry.
1390  func TestExactDup_SixAllSuccessSpreadStillForceStops(t *testing.T) {
1391  	ld := NewLoopDetector()
1392  	for i := 0; i < 10; i++ {
1393  		ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false)
1394  		ld.Record("file_edit",
1395  			fmt.Sprintf(`{"old":"a%d","new":"b%d"}`, i, i), false, "", "", false)
1396  	}
1397  	action, _ := ld.Check("file_read")
1398  	if action != LoopForceStop {
1399  		t.Fatalf("10 spread-out same-args successes must still force-stop (2×exactDupThreshold budget), got %v", action)
1400  	}
1401  }
1402  
1403  // TestExactDup_MixedSuccessAndErrorsUsesStrictThreshold: if ANY of the
1404  // repeats succeeded, we no longer have "all errors" — use the strict
1405  // threshold. Mixed means the tool sometimes works; continuing to call it
1406  // with identical args is spin.
1407  // Final call in sequence is an error (so tail-success recovery skip does
1408  // NOT apply — recovery requires tail=success AND errCount>0).
1409  // With exactDupThreshold=5: nudge fires at dupCount >= 5 (strict, mixed).
1410  func TestExactDup_MixedSuccessAndErrorsUsesStrictThreshold(t *testing.T) {
1411  	ld := NewLoopDetector()
1412  	// 5 same-args repeats with mixed success/error, tail=error → strict threshold → nudge at 5
1413  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1414  	ld.Record("browser_snapshot", `{}`, false, "", "sigA", false)
1415  	ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false)
1416  	ld.Record("browser_snapshot", `{}`, false, "", "sigB", false)
1417  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1418  	ld.Record("browser_snapshot", `{}`, false, "", "sigC", false)
1419  	ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false)
1420  	ld.Record("browser_snapshot", `{}`, false, "", "sigD", false)
1421  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1422  	action, _ := ld.Check("browser_click")
1423  	if action != LoopNudge {
1424  		t.Fatalf("5 mixed same-args repeats (tail=error) should nudge (strict threshold), got %v", action)
1425  	}
1426  }
1427  
1428  // TestExactDup_FailFailSuccessSpreadRetrySkipsOnRecoveredTail documents the
1429  // spread-out retry shape the comments describe for ExactDup: the model retries
1430  // the same browser_click across intervening snapshots, then succeeds. The
1431  // first success after a same-args error streak is recovery, not spin.
1432  func TestExactDup_FailFailSuccessSpreadRetrySkipsOnRecoveredTail(t *testing.T) {
1433  	ld := NewLoopDetector()
1434  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1435  	ld.Record("browser_snapshot", `{}`, false, "", "https://example.com/state1", false)
1436  	ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false)
1437  	ld.Record("browser_snapshot", `{}`, false, "", "https://example.com/state2", false)
1438  	ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false)
1439  	action, msg := ld.Check("browser_click")
1440  	if action != LoopContinue {
1441  		t.Fatalf("fail-snapshot-fail-snapshot-success must return LoopContinue (spread recovery), got %v: %s", action, msg)
1442  	}
1443  }
1444  
1445  // TestFamilyNoProgress_RepeatableVaryingArgsUnder15Silent: 14 varying-args
1446  // browser_snapshot calls on a stable URL. Pre-fix: FamilyNoProgress main
1447  // path force-stops at progressCount=7. Post-fix: repeatable + no topic
1448  // signal → force-stop-only-at-15 → silent until pathological threshold.
1449  //
1450  // Covers form-fill-equivalent workloads (7-14 same-page ops should all
1451  // continue — no intermediate nudges that might stack into Task 2's
1452  // rolling-window escalation).
1453  func TestFamilyNoProgress_RepeatableVaryingArgsUnder15Silent(t *testing.T) {
1454  	ld := NewLoopDetector()
1455  	const url = "https://app.example.com/dashboard"
1456  	for i := 0; i < 14; i++ {
1457  		args := fmt.Sprintf(`{"wait":%d}`, i)
1458  		ld.Record("browser_snapshot", args, false, "", url, false)
1459  	}
1460  	action, msg := ld.Check("browser_snapshot")
1461  	if action != LoopContinue {
1462  		t.Fatalf("14 varying-args repeatable calls on stable URL must be silent (force-stop-only-at-15), got %v: %s", action, msg)
1463  	}
1464  }
1465  
1466  // TestFamilyNoProgress_RepeatableFormFillContinues: 10 click + 10 type on a
1467  // stable URL — representative of a large form fill. Must continue silently
1468  // (no nudge — nudges feed Task 2 rolling-window escalation).
1469  func TestFamilyNoProgress_RepeatableFormFillContinues(t *testing.T) {
1470  	ld := NewLoopDetector()
1471  	const url = "https://app.example.com/settings"
1472  	for i := 0; i < 10; i++ {
1473  		ld.Record("browser_click",
1474  			fmt.Sprintf(`{"ref":"e%d"}`, i), false, "", url, false)
1475  		ld.Record("browser_type",
1476  			fmt.Sprintf(`{"ref":"e%d","text":"v%d"}`, i, i), false, "", url, false)
1477  	}
1478  	action, _ := ld.Check("browser_click")
1479  	if action != LoopContinue {
1480  		t.Fatalf("10 varying-args browser_click on stable URL (form fill) must continue, got %v", action)
1481  	}
1482  }
1483  
1484  // TestFamilyNoProgress_RepeatableResultOnly_SelfTopicOnlySilentBelow15 covers
1485  // repeatable tools whose args include a URL, so the latest topic hash matches
1486  // the current call itself but no prior calls. That is still result-only: the
1487  // strong topic signal is absent, so stable result_sig should stay silent until
1488  // the raised threshold instead of force-stopping at 7.
1489  func TestFamilyNoProgress_RepeatableResultOnly_SelfTopicOnlySilentBelow15(t *testing.T) {
1490  	ld := NewLoopDetector()
1491  	const resultSig = "https://app.example.com/search"
1492  	for i := 0; i < 7; i++ {
1493  		args := fmt.Sprintf(`{"url":"https://app.example.com/search?q=item-%d"}`, i)
1494  		ld.Record("browser_navigate", args, false, "", resultSig, false)
1495  	}
1496  	action, msg := ld.Check("browser_navigate")
1497  	if action != LoopContinue {
1498  		t.Fatalf("7 browser_navigate calls with self-only topic match and stable result_sig must stay silent below 15, got %v: %s", action, msg)
1499  	}
1500  }
1501  
1502  // TestFamilyNoProgress_RepeatableVaryingArgsExtremeForceStops: 15
1503  // varying-args snapshots on stable URL — past the raised force-stop
1504  // threshold. Real pathological polling still caught.
1505  func TestFamilyNoProgress_RepeatableVaryingArgsExtremeForceStops(t *testing.T) {
1506  	ld := NewLoopDetector()
1507  	const url = "https://app.example.com/status"
1508  	for i := 0; i < 15; i++ {
1509  		args := fmt.Sprintf(`{"wait":%d}`, i)
1510  		ld.Record("browser_snapshot", args, false, "", url, false)
1511  	}
1512  	action, _ := ld.Check("browser_snapshot")
1513  	if action != LoopForceStop {
1514  		t.Fatalf("15 varying-args same-URL snapshots must still force-stop (pathological polling), got %v", action)
1515  	}
1516  }
1517  
1518  // TestFamilyNoProgress_NonRepeatableOriginalThresholds: web_search family
1519  // must still hit force-stop at 12 same-topic calls (v2 threshold).
1520  // Raised thresholds apply uniformly; repeatable tools have a separate
1521  // result-only path with a higher threshold.
1522  func TestFamilyNoProgress_NonRepeatableOriginalThresholds(t *testing.T) {
1523  	ld := NewLoopDetector()
1524  	// All 12 queries normalize to the "change climate effects" topic
1525  	// (only filler words differ).
1526  	fillers := []string{"today", "latest", "top", "current", "major", "breaking",
1527  		"news", "update", "headlines", "recent", "today latest", "top current"}
1528  	for _, f := range fillers {
1529  		args := fmt.Sprintf(`{"q":"climate change effects %s"}`, f)
1530  		ld.Record("web_search", args, false, "", "", false)
1531  	}
1532  	action, _ := ld.Check("web_search")
1533  	if action != LoopForceStop {
1534  		t.Fatalf("12 same-topic web_search must still force-stop (v2 threshold), got %v", action)
1535  	}
1536  }