loopdetect_test.go
1 package agent 2 3 import ( 4 "fmt" 5 "strings" 6 "testing" 7 ) 8 9 func TestLoopDetector_ConsecutiveDup_Nudge(t *testing.T) { 10 ld := NewLoopDetector() 11 12 // 1 call: no trigger 13 ld.Record("web_search", `{"q":"test"}`, false, "", "", false) 14 action, _ := ld.Check("web_search") 15 if action != LoopContinue { 16 t.Errorf("1 call should not trigger, got %v", action) 17 } 18 19 // 2nd consecutive identical call: no trigger yet (consecDupThreshold=3) 20 ld.Record("web_search", `{"q":"test"}`, false, "", "", false) 21 action, _ = ld.Check("web_search") 22 if action != LoopContinue { 23 t.Errorf("2 consecutive identical calls should not trigger (consecDupThreshold=3), got %v", action) 24 } 25 26 // 3rd consecutive identical call: nudge (consecDupThreshold=3) 27 ld.Record("web_search", `{"q":"test"}`, false, "", "", false) 28 action, msg := ld.Check("web_search") 29 if action != LoopNudge { 30 t.Errorf("3 consecutive identical calls should nudge, got %v", action) 31 } 32 if msg == "" { 33 t.Error("nudge should have a message") 34 } 35 } 36 37 func TestLoopDetector_ConsecutiveDup_ForceStop(t *testing.T) { 38 ld := NewLoopDetector() 39 40 // 4 consecutive identical calls: force stop (consecDupThreshold+1=4) 41 for range 4 { 42 ld.Record("web_search", `{"q":"test"}`, false, "", "", false) 43 } 44 action, _ := ld.Check("web_search") 45 if action != LoopForceStop { 46 t.Errorf("4 consecutive identical calls should force stop, got %v", action) 47 } 48 } 49 50 func TestLoopDetector_NonConsecutiveDup_NoFalsePositive(t *testing.T) { 51 ld := NewLoopDetector() 52 53 // read → edit → read: NOT consecutive, 2 in window < exactDupThreshold(3) 54 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 55 ld.Record("file_edit", `{"file":"main.go","old":"a","new":"b"}`, false, "", "", false) 56 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 57 58 action, _ := ld.Check("file_read") 59 if action != LoopContinue { 60 t.Errorf("read-edit-read should not 
trigger (non-consecutive), got %v", action) 61 } 62 } 63 64 func TestLoopDetector_WindowDup_Nudge(t *testing.T) { 65 ld := NewLoopDetector() 66 67 // 5 spread-out identical calls: window-based nudge (exactDupThreshold=5) 68 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 69 ld.Record("file_edit", `{"old":"a","new":"b"}`, false, "", "", false) 70 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 71 ld.Record("file_edit", `{"old":"b","new":"c"}`, false, "", "", false) 72 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 73 ld.Record("file_edit", `{"old":"c","new":"d"}`, false, "", "", false) 74 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 75 ld.Record("file_edit", `{"old":"d","new":"e"}`, false, "", "", false) 76 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 77 78 action, _ := ld.Check("file_read") 79 if action != LoopNudge { 80 t.Errorf("5 spread-out identical calls should trigger window nudge, got %v", action) 81 } 82 } 83 84 func TestLoopDetector_WindowDup_ForceStop(t *testing.T) { 85 ld := NewLoopDetector() 86 87 // 10 spread-out identical calls: window force stop (2× exactDupThreshold=10) 88 for range 10 { 89 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 90 ld.Record("file_edit", `{"x":"y"}`, false, "", "", false) 91 } 92 action, _ := ld.Check("file_read") 93 if action != LoopForceStop { 94 t.Errorf("10 spread-out identical calls should force stop, got %v", action) 95 } 96 } 97 98 func TestLoopDetector_SameToolError_Nudge(t *testing.T) { 99 ld := NewLoopDetector() 100 101 // 5 errors: no trigger (threshold is 6) 102 for i := range 5 { 103 ld.Record("file_edit", fmt.Sprintf(`{"file":"f%d"}`, i), true, "permission denied", "", false) 104 } 105 action, _ := ld.Check("file_edit") 106 if action != LoopContinue { 107 t.Errorf("5 errors should not trigger, got %v", action) 108 } 109 110 // 6th error: nudge 111 ld.Record("file_edit", `{"file":"f5"}`, 
true, "permission denied", "", false) 112 action, msg := ld.Check("file_edit") 113 if action != LoopNudge { 114 t.Errorf("6 errors should trigger nudge, got %v", action) 115 } 116 if msg == "" { 117 t.Error("nudge should have a message") 118 } 119 } 120 121 func TestLoopDetector_SameToolError_ForceStop(t *testing.T) { 122 ld := NewLoopDetector() 123 124 // 12 errors: force stop (2× threshold of 6) 125 for i := range 12 { 126 ld.Record("file_edit", fmt.Sprintf(`{"file":"f%d"}`, i), true, "permission denied", "", false) 127 } 128 action, _ := ld.Check("file_edit") 129 if action != LoopForceStop { 130 t.Errorf("12 errors should trigger force stop, got %v", action) 131 } 132 } 133 134 func TestLoopDetector_NoProgress_Nudge(t *testing.T) { 135 ld := NewLoopDetector() 136 137 // 11 calls with different args: no trigger (threshold is 12) 138 // Use think (not in any tool family, not semi-repeatable) to test pure 139 // NoProgress detection. bash is semi-repeatable (threshold 16) so it 140 // wouldn't trigger at 12. 
141 for i := range 11 { 142 ld.Record("think", fmt.Sprintf(`{"thought":"idea%d"}`, i), false, "", "", false) 143 } 144 action, _ := ld.Check("think") 145 if action != LoopContinue { 146 t.Errorf("11 calls should not trigger, got %v", action) 147 } 148 149 // 12th call: nudge 150 ld.Record("think", `{"thought":"idea12"}`, false, "", "", false) 151 action, _ = ld.Check("think") 152 if action != LoopNudge { 153 t.Errorf("12 calls should trigger nudge, got %v", action) 154 } 155 } 156 157 func TestLoopDetector_GUIExemptFromNoProgress(t *testing.T) { 158 ld := NewLoopDetector() 159 160 // 10 screenshot calls with different args: should NOT trigger NoProgress 161 for i := range 10 { 162 ld.Record("screenshot", fmt.Sprintf(`{"delay":%d}`, i), false, "", "", false) 163 } 164 action, _ := ld.Check("screenshot") 165 if action != LoopContinue { 166 t.Errorf("screenshot should be exempt from NoProgress, got %v", action) 167 } 168 } 169 170 func TestLoopDetector_GUIConsecutiveDupStillDetected(t *testing.T) { 171 ld := NewLoopDetector() 172 173 // Even GUI tools should trigger consecutive-duplicate detection 174 // consecDupThreshold=3 → nudge at 3 consecutive identical calls 175 ld.Record("screenshot", `{}`, false, "", "", false) 176 ld.Record("screenshot", `{}`, false, "", "", false) 177 action, _ := ld.Check("screenshot") 178 if action != LoopContinue { 179 t.Errorf("2 consecutive identical screenshot calls should not trigger (consecDupThreshold=3), got %v", action) 180 } 181 182 ld.Record("screenshot", `{}`, false, "", "", false) 183 action, _ = ld.Check("screenshot") 184 if action != LoopNudge { 185 t.Errorf("3 consecutive identical screenshot calls should nudge, got %v", action) 186 } 187 } 188 189 func TestLoopDetector_SlidingWindow(t *testing.T) { 190 ld := NewLoopDetector() 191 ld.historySize = 5 // small window for testing 192 193 // Fill window with 3 consecutive bash duplicates (triggers consecutive nudge at consecDupThreshold=3) 194 ld.Record("bash", `{"cmd":"ls"}`, 
false, "", "", false) 195 ld.Record("bash", `{"cmd":"ls"}`, false, "", "", false) 196 ld.Record("bash", `{"cmd":"ls"}`, false, "", "", false) 197 action, _ := ld.Check("bash") 198 if action != LoopNudge { 199 t.Error("3 consecutive exact dups should nudge") 200 } 201 202 // Push old records out of window with 5 different calls 203 for i := range 5 { 204 ld.Record("file_read", fmt.Sprintf(`{"file":"f%d"}`, i), false, "", "", false) 205 } 206 207 // bash dups should have fallen out of window 208 action, _ = ld.Check("bash") 209 if action != LoopContinue { 210 t.Error("old records should have fallen out of sliding window") 211 } 212 } 213 214 func TestLoopDetector_MixedWorkflow_NoFalsePositive(t *testing.T) { 215 ld := NewLoopDetector() 216 217 // Normal coding workflow: read, edit, read, edit, bash 218 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 219 ld.Record("file_edit", `{"file":"main.go","old":"a","new":"b"}`, false, "", "", false) 220 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 221 ld.Record("file_edit", `{"file":"main.go","old":"b","new":"c"}`, false, "", "", false) 222 ld.Record("bash", `{"cmd":"go test"}`, false, "", "", false) 223 224 for _, name := range []string{"file_read", "file_edit", "bash"} { 225 action, _ := ld.Check(name) 226 if action != LoopContinue { 227 t.Errorf("normal workflow should not trigger for %s, got %v", name, action) 228 } 229 } 230 } 231 232 func TestLoopDetector_DifferentArgsNoDuplicate(t *testing.T) { 233 ld := NewLoopDetector() 234 235 // Same tool, different args each time — should not trigger 236 for i := range 5 { 237 ld.Record("file_read", fmt.Sprintf(`{"file":"file%d.go"}`, i), false, "", "", false) 238 } 239 action, _ := ld.Check("file_read") 240 if action != LoopContinue { 241 t.Errorf("different args should not trigger, got %v", action) 242 } 243 } 244 245 func TestLoopDetector_ErrorsOnlyCountForSameTool(t *testing.T) { 246 ld := NewLoopDetector() 247 248 // Errors spread across 
different tools: no trigger for any single tool 249 ld.Record("bash", `{"cmd":"a"}`, true, "fail", "", false) 250 ld.Record("file_edit", `{"a":"b"}`, true, "fail", "", false) 251 ld.Record("grep", `{"p":"c"}`, true, "fail", "", false) 252 ld.Record("bash", `{"cmd":"b"}`, true, "fail", "", false) 253 ld.Record("file_edit", `{"a":"c"}`, true, "fail", "", false) 254 255 for _, name := range []string{"bash", "file_edit", "grep"} { 256 action, _ := ld.Check(name) 257 if action != LoopContinue { 258 t.Errorf("spread errors should not trigger for %s, got %v", name, action) 259 } 260 } 261 } 262 263 func TestLoopDetector_WebFamily_SameTopicNudge(t *testing.T) { 264 ld := NewLoopDetector() 265 // 5 web_search calls all normalizing to the "climate world" topic 266 // (only date / filler words differ) → family nudge at 5 (v2 threshold). 267 ld.Record("web_search", `{"query":"world climate today March 2 2026 major headlines"}`, false, "", "", false) 268 ld.Record("web_search", `{"query":"world climate March 2 2026 top headlines latest"}`, false, "", "", false) 269 ld.Record("web_search", `{"query":"world climate today March 2 2026 breaking news"}`, false, "", "", false) 270 ld.Record("web_search", `{"query":"world climate latest update March 2 2026"}`, false, "", "", false) 271 ld.Record("web_search", `{"query":"world climate top headlines current March 2 2026"}`, false, "", "", false) 272 action, msg := ld.Check("web_search") 273 if action != LoopNudge { 274 t.Errorf("5 same-topic web searches should nudge (FamilyNoProgress v2 threshold), got %v", action) 275 } 276 if msg == "" { 277 t.Error("nudge should have a message") 278 } 279 } 280 281 func TestLoopDetector_WebFamily_CrossToolTopicInheritance(t *testing.T) { 282 ld := NewLoopDetector() 283 // 2 web_search on same topic (only filler/date differences), then web_fetch. 284 // Family-level topic lookup should inherit the topic hash from web_search. 
285 ld.Record("web_search", `{"query":"golang tutorial 2026"}`, false, "", "", false) 286 ld.Record("web_search", `{"query":"golang tutorial latest"}`, false, "", "", false) 287 ld.Record("web_fetch", `{"url":"https://go.dev/doc/tutorial"}`, false, "", "", false) 288 289 // 2 same-topic (from web_search) + 1 different (web_fetch URL) → not yet 5 290 action, _ := ld.Check("web_fetch") 291 if action != LoopContinue { 292 t.Errorf("2 same-topic + 1 different should continue, got %v", action) 293 } 294 295 // Add more same-topic searches until nudge at 5 same-topic in family (v2 threshold). 296 // All queries normalize to the "golang tutorial" topic (date/filler stripped). 297 ld.Record("web_search", `{"query":"latest golang tutorial today"}`, false, "", "", false) 298 ld.Record("web_search", `{"query":"golang tutorial top latest"}`, false, "", "", false) 299 ld.Record("web_search", `{"query":"golang tutorial current update"}`, false, "", "", false) 300 action, _ = ld.Check("web_search") 301 if action != LoopNudge { 302 t.Errorf("5 same-topic family calls should nudge (v2 threshold), got %v", action) 303 } 304 } 305 306 func TestLoopDetector_WebFamily_ResultSigDedup(t *testing.T) { 307 ld := NewLoopDetector() 308 // 5 calls returning the same domains → no new info → nudge at 5 (v2 threshold) 309 ld.Record("web_search", `{"query":"ai research papers"}`, false, "", "reuters.com,bbc.com", false) 310 ld.Record("web_search", `{"query":"ai research latest papers"}`, false, "", "reuters.com,bbc.com", false) 311 ld.Record("web_search", `{"query":"ai research papers review"}`, false, "", "reuters.com,bbc.com", false) 312 ld.Record("web_search", `{"query":"ai research 2026"}`, false, "", "reuters.com,bbc.com", false) 313 ld.Record("web_search", `{"query":"latest ai research papers"}`, false, "", "reuters.com,bbc.com", false) 314 action, _ := ld.Check("web_search") 315 if action != LoopNudge { 316 t.Errorf("5 calls with same result signature should nudge, got %v", action) 317 } 
318 } 319 320 func TestLoopDetector_WebFamily_AlternatingSearchFetchStillNudges(t *testing.T) { 321 ld := NewLoopDetector() 322 323 // Mixed web workflows should still nudge when alternating tools keep 324 // returning the same source and no new information is being gathered. 325 // v2: nudge at 5 same-result-sig calls in the family. 326 ld.Record("web_search", `{"query":"go tutorial official"}`, false, "", "go.dev", false) 327 ld.Record("web_fetch", `{"url":"https://go.dev/doc/tutorial"}`, false, "", "go.dev", false) 328 ld.Record("web_search", `{"query":"golang tutorial latest official"}`, false, "", "go.dev", false) 329 ld.Record("web_fetch", `{"url":"https://go.dev/doc/effective_go"}`, false, "", "go.dev", false) 330 ld.Record("web_search", `{"query":"golang official tutorial guide"}`, false, "", "go.dev", false) 331 332 action, _ := ld.Check("web_search") 333 if action != LoopNudge { 334 t.Errorf("alternating web_search/web_fetch with the same result signature should nudge, got %v", action) 335 } 336 } 337 338 func TestLoopDetector_WebFamily_ForceStopAt7(t *testing.T) { 339 ld := NewLoopDetector() 340 // 7 web calls with same topic → force stop 341 for i := 0; i < 7; i++ { 342 ld.Record("web_search", `{"query":"climate change report"}`, false, "", "", false) 343 } 344 action, _ := ld.Check("web_search") 345 if action != LoopForceStop { 346 t.Errorf("7 same-topic web calls should force stop, got %v", action) 347 } 348 } 349 350 func TestLoopDetector_WebFamily_7DifferentTopicsNoForceStop(t *testing.T) { 351 ld := NewLoopDetector() 352 // 7 web family calls on DIFFERENT topics should NOT force stop 353 // (legitimate multi-source research) 354 for i := 0; i < 4; i++ { 355 ld.Record("web_search", fmt.Sprintf(`{"query":"topic%d search"}`, i), false, "", "", false) 356 } 357 for i := 0; i < 3; i++ { 358 ld.Record("web_fetch", fmt.Sprintf(`{"url":"https://example%d.com/page"}`, i), false, "", "", false) 359 } 360 action, _ := ld.Check("web_fetch") 361 if action == 
LoopForceStop { 362 t.Error("7 web family calls with different topics should NOT force stop") 363 } 364 } 365 366 func TestLoopDetector_WebFamily_DifferentTopicsUnder7(t *testing.T) { 367 ld := NewLoopDetector() 368 // 4 web calls with different topics — should NOT trigger (under 7 total, no topic match) 369 ld.Record("web_search", `{"query":"golang concurrency patterns"}`, false, "", "", false) 370 ld.Record("web_search", `{"query":"python machine learning tutorial"}`, false, "", "", false) 371 ld.Record("web_search", `{"query":"rust ownership explained"}`, false, "", "", false) 372 ld.Record("web_search", `{"query":"javascript async await"}`, false, "", "", false) 373 action, _ := ld.Check("web_search") 374 if action != LoopContinue { 375 t.Errorf("4 different-topic web calls should continue, got %v", action) 376 } 377 } 378 379 func TestLoopDetector_NonWebToolUnchanged(t *testing.T) { 380 ld := NewLoopDetector() 381 // 5 file_read calls with different args — should NOT trigger (threshold still 8) 382 for i := 0; i < 5; i++ { 383 ld.Record("file_read", fmt.Sprintf(`{"file":"file%d.go"}`, i), false, "", "", false) 384 } 385 action, _ := ld.Check("file_read") 386 if action != LoopContinue { 387 t.Errorf("5 file_read calls should not trigger (threshold 8), got %v", action) 388 } 389 } 390 391 // TestLoopDetector_RealWorldWebLoop replays the actual bug that prompted this fix: 392 // many web_search calls with varied "world news" queries, then web_fetch calls. 393 // v2 thresholds: nudge fires at 5 same-topic, force-stop fires at 12 same-topic. 
394 func TestLoopDetector_RealWorldWebLoop(t *testing.T) { 395 ld := NewLoopDetector() 396 397 searches := []string{ 398 `{"query":"world news today March 2 2026"}`, 399 `{"query":"world news today March 2 2026 major headlines"}`, 400 `{"query":"world news March 2 2026 top headlines Reuters BBC Al Jazeera"}`, 401 `{"query":"world news today March 2 2026 top headlines Reuters AP BBC"}`, 402 `{"query":"world news March 2 2026 Reuters AP BBC Al Jazeera"}`, 403 `{"query":"world news March 2 2026 top headlines"}`, 404 `{"query":"world news today March 2 2026 top headlines"}`, 405 `{"query":"world news March 2 2026 top headlines Reuters AP BBC Al Jazeera CNN"}`, 406 `{"query":"world news March 2 2026 latest updates"}`, 407 `{"query":"world news March 2 2026 breaking"}`, 408 `{"query":"world news March 2 2026 Reuters AP"}`, 409 `{"query":"world news March 2 2026 BBC CNN Al Jazeera"}`, 410 `{"query":"world news March 2 2026 top stories"}`, 411 } 412 413 var firstNudge, firstForceStop int 414 for i, args := range searches { 415 ld.Record("web_search", args, false, "", "reuters.com,bbc.com", false) 416 action, _ := ld.Check("web_search") 417 if action == LoopNudge && firstNudge == 0 { 418 firstNudge = i + 1 419 } 420 if action == LoopForceStop && firstForceStop == 0 { 421 firstForceStop = i + 1 422 } 423 } 424 425 // v2: nudge at progressCount>=5, force-stop at progressCount>=12 426 if firstNudge == 0 || firstNudge > 5 { 427 t.Errorf("expected first nudge by call 5, got %d", firstNudge) 428 } 429 if firstForceStop == 0 || firstForceStop > 12 { 430 t.Errorf("expected force stop by call 12, got %d", firstForceStop) 431 } 432 } 433 434 // TestLoopDetector_RealWorldWebLoop_CrossTool verifies that switching from 435 // web_search to web_fetch doesn't reset the family counter. 
436 func TestLoopDetector_ToolModeSwitch_NudgeOnGUIAfterSuccess(t *testing.T) { 437 ld := NewLoopDetector() 438 439 // Successful non-GUI call followed by GUI call → nudge 440 ld.Record("applescript", `{"script":"create event"}`, false, "", "", false) 441 action, _ := ld.Check("applescript") 442 if action != LoopContinue { 443 t.Errorf("single successful call should continue, got %v", action) 444 } 445 446 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 447 action, msg := ld.Check("screenshot") 448 if action != LoopNudge { 449 t.Errorf("GUI call after successful non-GUI should nudge, got %v", action) 450 } 451 if msg == "" { 452 t.Error("nudge should have a message") 453 } 454 } 455 456 func TestLoopDetector_ToolModeSwitch_NoNudgeAfterError(t *testing.T) { 457 ld := NewLoopDetector() 458 459 // Failed non-GUI call followed by GUI call → no nudge (GUI verification warranted) 460 ld.Record("applescript", `{"script":"create event"}`, true, "calendar not found", "", false) 461 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 462 action, _ := ld.Check("screenshot") 463 if action != LoopContinue { 464 t.Errorf("GUI after failed non-GUI should continue (verification warranted), got %v", action) 465 } 466 } 467 468 func TestLoopDetector_ToolModeSwitch_NoNudgeForGUIOnlyTask(t *testing.T) { 469 ld := NewLoopDetector() 470 471 // Task starts with GUI tools — no non-GUI success to trigger on 472 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 473 ld.Record("computer", `{"action":"click","coordinate":[100,200]}`, false, "", "", false) 474 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 475 action, _ := ld.Check("screenshot") 476 if action != LoopContinue { 477 t.Errorf("GUI-only task should not trigger mode switch, got %v", action) 478 } 479 } 480 481 func TestLoopDetector_ToolModeSwitch_NudgeOnlyOnce(t *testing.T) { 482 ld := NewLoopDetector() 483 484 // Successful non-GUI → GUI nudge → second 
GUI should NOT nudge again 485 ld.Record("applescript", `{"script":"create event"}`, false, "", "", false) 486 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 487 action, _ := ld.Check("screenshot") 488 if action != LoopNudge { 489 t.Errorf("first GUI after success should nudge, got %v", action) 490 } 491 492 ld.Record("computer", `{"action":"click","coordinate":[100,200]}`, false, "", "", false) 493 action, _ = ld.Check("computer") 494 if action != LoopContinue { 495 t.Errorf("second GUI should not re-nudge (already nudged), got %v", action) 496 } 497 } 498 499 func TestLoopDetector_ToolModeSwitch_ResetsOnNewNonGUI(t *testing.T) { 500 ld := NewLoopDetector() 501 502 // Success → GUI nudge → new GUI-adjacent success → GUI nudge again (new mode switch) 503 ld.Record("applescript", `{"script":"create event"}`, false, "", "", false) 504 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 505 action, _ := ld.Check("screenshot") 506 if action != LoopNudge { 507 t.Errorf("first mode switch should nudge, got %v", action) 508 } 509 510 // New GUI-adjacent success resets the detector 511 ld.Record("browser", `{"action":"navigate","url":"http://example.com"}`, false, "", "", false) 512 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 513 action, _ = ld.Check("screenshot") 514 if action != LoopNudge { 515 t.Errorf("new mode switch after reset should nudge again, got %v", action) 516 } 517 } 518 519 func TestLoopDetector_ToolModeSwitch_NoNudgeAfterNonGUITool(t *testing.T) { 520 ld := NewLoopDetector() 521 522 // Non-GUI tool (bash, file_read, etc.) success → screenshot should NOT trigger 523 // mode switch since these aren't GUI-adjacent tools. 
524 ld.Record("bash", `{"command":"echo hello"}`, false, "", "", false) 525 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 526 action, _ := ld.Check("screenshot") 527 if action != LoopContinue { 528 t.Errorf("screenshot after bash should not trigger mode switch, got %v", action) 529 } 530 } 531 532 func TestLoopDetector_RealWorldWebLoop_CrossTool(t *testing.T) { 533 ld := NewLoopDetector() 534 535 // All queries normalize to the "climate world" topic — only filler / date 536 // variations so the topic hash stays stable across all calls. 537 // 5 searches on same topic → nudge at 5 (v2 threshold) 538 ld.Record("web_search", `{"query":"world climate today March 2 2026"}`, false, "", "", false) 539 ld.Record("web_search", `{"query":"world climate March 2 2026 latest"}`, false, "", "", false) 540 ld.Record("web_search", `{"query":"world climate today latest headlines"}`, false, "", "", false) 541 ld.Record("web_search", `{"query":"world climate top breaking news"}`, false, "", "", false) 542 ld.Record("web_search", `{"query":"world climate current update major"}`, false, "", "", false) 543 544 action, _ := ld.Check("web_search") 545 if action != LoopNudge { 546 t.Errorf("expected nudge after 5 same-topic searches, got %v", action) 547 } 548 549 // Switch to web_fetch then back — same-topic counter continues via family lookup. 550 // web_fetch URL is treated as its own topic, then we add more same-topic searches. 
551 ld.Record("web_fetch", `{"url":"https://reuters.com/world/climate"}`, false, "", "", false) 552 ld.Record("web_search", `{"query":"world climate today"}`, false, "", "", false) 553 ld.Record("web_search", `{"query":"world climate latest"}`, false, "", "", false) 554 ld.Record("web_search", `{"query":"world climate top current"}`, false, "", "", false) 555 556 // 5 original + 3 more same-topic web_search = 8 same-topic → stronger nudge (stage 1) 557 action, _ = ld.Check("web_search") 558 if action != LoopNudge { 559 t.Errorf("expected nudge after 8 same-topic web calls, got %v", action) 560 } 561 562 // Add more same-topic calls until force stop at progressCount >= 12. 563 ld.Record("web_search", `{"query":"world climate breaking"}`, false, "", "", false) 564 ld.Record("web_search", `{"query":"world climate update"}`, false, "", "", false) 565 ld.Record("web_search", `{"query":"world climate news today"}`, false, "", "", false) 566 ld.Record("web_search", `{"query":"world climate headlines major"}`, false, "", "", false) 567 action, _ = ld.Check("web_search") 568 if action != LoopForceStop { 569 t.Errorf("expected force stop after 12 same-topic web calls, got %v", action) 570 } 571 } 572 573 func TestLoopDetector_SuccessAfterError_NudgeOnPostRecoveryGUI(t *testing.T) { 574 ld := NewLoopDetector() 575 576 // Tool fails, then succeeds with different args, then agent goes to GUI → nudge 577 ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "calendar not found", "", false) 578 ld.Record("applescript", `{"script":"get name of every calendar"}`, false, "", "", false) 579 ld.Record("applescript", `{"script":"tell calendar \"日历\""}`, false, "", "", false) 580 581 // Now agent switches to GUI to verify — should nudge 582 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 583 action, msg := ld.Check("screenshot") 584 if action != LoopNudge { 585 t.Errorf("GUI after recovery should nudge, got %v", action) 586 } 587 if msg == "" { 
588 t.Error("nudge should have a message") 589 } 590 } 591 592 func TestLoopDetector_SuccessAfterError_NoNudgeIfNoRecovery(t *testing.T) { 593 ld := NewLoopDetector() 594 595 // Tool fails, no retry yet, agent takes screenshot → no nudge from this detector 596 // (ToolModeSwitch won't fire either since last non-GUI was an error) 597 ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "not found", "", false) 598 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 599 action, _ := ld.Check("screenshot") 600 if action != LoopContinue { 601 t.Errorf("no recovery happened, should continue, got %v", action) 602 } 603 } 604 605 func TestLoopDetector_SuccessAfterError_ResetsOnNewWork(t *testing.T) { 606 ld := NewLoopDetector() 607 608 // Recovery happens, then agent moves on to genuinely different work 609 ld.Record("applescript", `{"script":"tell calendar \"Calendar\""}`, true, "not found", "", false) 610 ld.Record("applescript", `{"script":"tell calendar \"日历\""}`, false, "", "", false) 611 612 // Agent moves to a different non-GUI tool → recovery state resets 613 ld.Record("bash", `{"command":"echo done"}`, false, "", "", false) 614 ld.Record("file_read", `{"path":"notes.md"}`, false, "", "", false) 615 616 // GUI now should NOT nudge for recovery (agent moved on) 617 // Note: ToolModeSwitch may nudge since file_read succeeded — that's a different detector 618 // We specifically check that the nudge message does NOT mention recovery 619 ld.Record("screenshot", `{"target":"screen"}`, false, "", "", false) 620 action, msg := ld.Check("screenshot") 621 // It may nudge from ToolModeSwitch, but NOT from SuccessAfterError 622 if action == LoopNudge && strings.Contains(msg, "recovered") { 623 t.Errorf("recovery should have reset, but got recovery nudge: %s", msg) 624 } 625 } 626 627 func TestLoopDetector_SleepDetection_Nudge(t *testing.T) { 628 ld := NewLoopDetector() 629 630 // 1 sleep call: no trigger 631 ld.Record("bash", 
`{"command":"sleep 5"}`, false, "", "", false) 632 action, _ := ld.Check("bash") 633 if action != LoopContinue { 634 t.Errorf("1 sleep call should not trigger, got %v", action) 635 } 636 637 // 2nd sleep call: nudge 638 ld.Record("bash", `{"command":"sleep 5 && curl http://localhost:8080"}`, false, "", "", false) 639 action, msg := ld.Check("bash") 640 if action != LoopNudge { 641 t.Errorf("2 sleep calls should nudge, got %v", action) 642 } 643 if msg == "" { 644 t.Error("nudge should have a message") 645 } 646 } 647 648 func TestLoopDetector_SleepDetection_ForceStop(t *testing.T) { 649 ld := NewLoopDetector() 650 651 // 4 sleep calls: force stop 652 ld.Record("bash", `{"command":"sleep 5"}`, false, "", "", false) 653 ld.Record("bash", `{"command":"sleep 1 && echo done"}`, false, "", "", false) 654 ld.Record("bash", `{"command":"while true; do sleep 1; done"}`, false, "", "", false) 655 ld.Record("bash", `{"command":"sleep 10"}`, false, "", "", false) 656 action, _ := ld.Check("bash") 657 if action != LoopForceStop { 658 t.Errorf("4 sleep calls should force stop, got %v", action) 659 } 660 } 661 662 func TestLoopDetector_SleepDetection_NoFalsePositive(t *testing.T) { 663 ld := NewLoopDetector() 664 665 // bash commands without sleep: no trigger 666 ld.Record("bash", `{"command":"echo hello"}`, false, "", "", false) 667 ld.Record("bash", `{"command":"cat sleep.log"}`, false, "", "", false) 668 ld.Record("bash", `{"command":"grep sleeper main.go"}`, false, "", "", false) 669 ld.Record("bash", `{"command":"ls -la"}`, false, "", "", false) 670 action, _ := ld.Check("bash") 671 if action != LoopContinue { 672 t.Errorf("non-sleep bash commands should not trigger, got %v", action) 673 } 674 } 675 676 func TestLoopDetector_SleepDetection_IgnoreNonBash(t *testing.T) { 677 ld := NewLoopDetector() 678 679 // sleep in non-bash tool args: no trigger (different args to avoid dup detection) 680 ld.Record("file_read", `{"command":"sleep 5"}`, false, "", "", false) 681 
ld.Record("grep", `{"command":"sleep 10"}`, false, "", "", false)
ld.Record("file_read", `{"command":"sleep 15"}`, false, "", "", false)
ld.Record("grep", `{"command":"sleep 20"}`, false, "", "", false)
action, _ := ld.Check("grep")
if action != LoopContinue {
	t.Errorf("sleep in non-bash tool args should not trigger, got %v", action)
}
}

// TestLoopDetector_SearchEscalation_Nudge pins the lower boundary of search
// escalation: six unproductive grep calls in a row are tolerated, and the
// seventh yields a nudge that carries a message.
func TestLoopDetector_SearchEscalation_Nudge(t *testing.T) {
	detector := NewLoopDetector()

	// Calls 1-6 are recorded as unproductive (last Record argument true) and
	// must stay below the nudge threshold of 7.
	for n := range 6 {
		args := fmt.Sprintf(`{"pattern":"term%d"}`, n)
		detector.Record("grep", args, false, "", "", true)
	}
	if got, _ := detector.Check("grep"); got != LoopContinue {
		t.Errorf("6 unproductive search calls should not trigger, got %v", got)
	}

	// Call 7 crosses the threshold.
	detector.Record("grep", `{"pattern":"term6"}`, false, "", "", true)
	got, text := detector.Check("grep")
	if got != LoopNudge {
		t.Errorf("7 unproductive search calls should nudge, got %v", got)
	}
	if text == "" {
		t.Error("nudge should have a message")
	}
}

// TestLoopDetector_SearchEscalation_ForceStop feeds twelve unproductive
// search calls, alternating grep (even index) and glob (odd index), and
// expects the detector to force-stop.
func TestLoopDetector_SearchEscalation_ForceStop(t *testing.T) {
	detector := NewLoopDetector()

	searchTools := [2]string{"grep", "glob"}
	for n := range 12 {
		args := fmt.Sprintf(`{"pattern":"term%d"}`, n)
		detector.Record(searchTools[n%2], args, false, "", "", true)
	}

	if got, _ := detector.Check("glob"); got != LoopForceStop {
		t.Errorf("12 unproductive search calls should force stop, got %v", got)
	}
}

// TestLoopDetector_SearchEscalation_NoFalsePositive alternates grep with
// file_edit so that no run of consecutive search calls can accumulate.
func TestLoopDetector_SearchEscalation_NoFalsePositive(t *testing.T) {
	detector := NewLoopDetector()

	steps := []struct{ tool, args string }{
		{"grep", `{"pattern":"foo"}`},
		{"file_edit", `{"file":"main.go","old":"a","new":"b"}`},
		{"grep", `{"pattern":"bar"}`},
		{"file_edit", `{"file":"main.go","old":"b","new":"c"}`},
		{"grep", `{"pattern":"baz"}`},
	}
	for _, step := range steps {
		detector.Record(step.tool, step.args, false, "", "", false)
	}

	if got, _ := detector.Check("grep"); got != LoopContinue {
		t.Errorf("grep interspersed with edits should not trigger search escalation, got %v", got)
	}
}

// TestLoopDetector_SearchEscalation_MixedSearchTools shows that grep and glob
// share one unproductive streak: seven mixed calls are enough for a nudge.
func TestLoopDetector_SearchEscalation_MixedSearchTools(t *testing.T) {
	detector := NewLoopDetector()

	steps := []struct{ tool, args string }{
		{"grep", `{"pattern":"foo"}`},
		{"glob", `{"pattern":"**/*.go"}`},
		{"grep", `{"pattern":"bar"}`},
		{"glob", `{"pattern":"**/*.ts"}`},
		{"grep", `{"pattern":"baz"}`},
		{"glob", `{"pattern":"**/*.json"}`},
		{"grep", `{"pattern":"qux"}`},
	}
	for _, step := range steps {
		detector.Record(step.tool, step.args, false, "", "", true)
	}

	got, text := detector.Check("grep")
	if got != LoopNudge {
		t.Errorf("7 unproductive mixed search calls should nudge, got %v", got)
	}
	if text == "" {
		t.Error("nudge should have a message")
	}
}

// TestLoopDetector_SearchEscalation_ProductiveResets verifies that a single
// productive search clears the unproductive streak.
func TestLoopDetector_SearchEscalation_ProductiveResets(t *testing.T) {
	detector := NewLoopDetector()

	unproductive := func(args string) { detector.Record("grep", args, false, "", "", true) }
	productive := func(args string) { detector.Record("grep", args, false, "", "", false) }

	// Two misses, one hit, one more miss: the trailing unproductive streak is
	// just 1, far below the nudge threshold of 7 asserted by the
	// SearchEscalation_Nudge test.
	unproductive(`{"pattern":"a"}`)
	unproductive(`{"pattern":"b"}`)
	productive(`{"pattern":"c"}`) // resets the streak
	unproductive(`{"pattern":"d"}`)

	if got, _ := detector.Check("grep"); got != LoopContinue {
		t.Errorf("productive search should reset streak, expected continue, got %v", got)
	}
}

// TestLoopDetector_SearchEscalation_ProductiveSearchesDontHitNoProgress
// records eight productive grep calls with distinct patterns — ordinary
// repository exploration — and expects the generic NoProgress path to stay
// quiet.
func TestLoopDetector_SearchEscalation_ProductiveSearchesDontHitNoProgress(t *testing.T) {
	detector := NewLoopDetector()

	for n := range 8 {
		args := fmt.Sprintf(`{"pattern":"term%d"}`, n)
		detector.Record("grep", args, false, "", "", false)
	}

	if got, _ := detector.Check("grep"); got != LoopContinue {
		t.Errorf("productive search calls should not hit NoProgress, got %v", got)
	}
}

// TestLoopDetector_BrowserFamilyNoProgress replays five navigations to the
// same URL. Only the "wait" field differs between calls, so the argument
// strings are distinct (keeping the consecutive-duplicate detector out of the
// way, per the original test's intent) while the URL stays the same. The
// resulting nudge must use browser wording, not search wording.
func TestLoopDetector_BrowserFamilyNoProgress(t *testing.T) {
	detector := NewLoopDetector()

	// v2: FamilyNoProgress nudges once five same-topic calls have been seen.
	for wait := 1; wait <= 5; wait++ {
		args := fmt.Sprintf(`{"action":"navigate","url":"https://jd.com/search?q=huawei","wait":%d}`, wait)
		detector.Record("browser", args, false, "", "", false)
	}

	got, text := detector.Check("browser")
	if got != LoopNudge {
		t.Errorf("5 same-topic browser calls should nudge, got %v", got)
	}
	usesSearchWords := strings.Contains(text, "searched") || strings.Contains(text, "query")
	if usesSearchWords {
		t.Errorf("browser-family nudge should not use search vocabulary, got: %s", text)
	}
	if !strings.Contains(text, "UI action") {
		t.Errorf("expected browser-family nudge to mention 'UI action', got: %s", text)
	}
}

// TestFamilyNoProgressMessage_VocabularyByFamily asserts the helper emits
// family-appropriate wording at each stage. Protects against the regression
// where browser callers received search-vocabulary nudges ("You've searched
// the same topic…") after FamilyNoProgress was extended to cover browser_*.
825 func TestFamilyNoProgressMessage_VocabularyByFamily(t *testing.T) { 826 cases := []struct { 827 family string 828 stage int 829 forbidSubstrs []string 830 wantSubstrs []string 831 }{ 832 {"browser", 0, []string{"searched", "query"}, []string{"UI action", "selector"}}, 833 {"browser", 1, []string{"searched", "query"}, []string{"UI action"}}, 834 {"browser", 2, []string{"searched", "query"}, []string{"UI action", "browser-family"}}, 835 {"gui", 0, []string{"searched", "query"}, []string{"UI action"}}, 836 {"search", 0, nil, []string{"searched the same topic"}}, 837 {"web", 2, nil, []string{"web calls", "same topic"}}, 838 } 839 for _, tc := range cases { 840 msg := familyNoProgressMessage(tc.family, 3, 4, tc.stage) 841 for _, forbid := range tc.forbidSubstrs { 842 if strings.Contains(msg, forbid) { 843 t.Errorf("family=%s stage=%d: message must not contain %q, got: %s", tc.family, tc.stage, forbid, msg) 844 } 845 } 846 for _, want := range tc.wantSubstrs { 847 if !strings.Contains(msg, want) { 848 t.Errorf("family=%s stage=%d: message must contain %q, got: %s", tc.family, tc.stage, want, msg) 849 } 850 } 851 } 852 } 853 854 func TestBrowserInToolFamilies(t *testing.T) { 855 family := toolFamily("browser") 856 if family != "browser" { 857 t.Errorf("browser family should be 'browser', got %q", family) 858 } 859 } 860 861 // TestLoopDetector_BrowserToolsRepeatable ensures that browser_* MCP tools 862 // are treated as repeatable GUI tools. Before the fix, `repeatableGUITools` 863 // was keyed on the literal string "browser", but real tool names are 864 // "browser_navigate", "browser_snapshot", etc., so the NoProgress detector 865 // (8+ same tool → nudge) would fire on legit multi-page browsing sessions. 866 func TestLoopDetector_BrowserToolsRepeatable(t *testing.T) { 867 ld := NewLoopDetector() 868 869 // 9 browser_navigate calls to different URLs — progressCount stays at 1 870 // per topic, so the FamilyNoProgress detector won't fire. 
But before the 871 // fix the outer NoProgress detector (line 355) WOULD nudge at 8 because 872 // repeatableTools["browser_navigate"] == false. After the fix it stays Continue. 873 urls := []string{"a", "b", "c", "d", "e", "f", "g", "h", "i"} 874 for _, u := range urls { 875 ld.Record("browser_navigate", fmt.Sprintf(`{"url":"https://example.com/%s"}`, u), false, "", "", false) 876 } 877 action, msg := ld.Check("browser_navigate") 878 if action != LoopContinue { 879 t.Fatalf("browser_navigate x9 to different URLs should Continue (it is a repeatable GUI tool), got %v: %s", action, msg) 880 } 881 } 882 883 func TestLoopDetector_BrowserSnapshotInterleavedRepeatable(t *testing.T) { 884 ld := NewLoopDetector() 885 // Realistic multi-step pattern: snapshot → click → snapshot → click → ... 886 // Each snapshot has the same args but is separated by clicks, so it is 887 // not a consecutive duplicate. Over 10 steps we accumulate 5 snapshots, 888 // under the consecutive-dup threshold and under the no-progress threshold 889 // of 8 same-name calls — must stay Continue. 890 for i := range 5 { 891 ld.Record("browser_snapshot", `{}`, false, "", "", false) 892 ld.Record("browser_click", fmt.Sprintf(`{"ref":"e%d"}`, i), false, "", "", false) 893 } 894 action, msg := ld.Check("browser_click") 895 if action != LoopContinue { 896 t.Fatalf("interleaved browser_snapshot/browser_click should Continue, got %v: %s", action, msg) 897 } 898 } 899 900 // TestLoopDetector_SemiRepeatable_BashHigherThreshold verifies that bash 901 // gets the elevated NoProgress threshold (16) instead of the generic (12), 902 // so multi-step scripting workflows (fetch → process → install → build) 903 // aren't killed before completing. The exact-dup, same-error, and sleep 904 // detectors still catch real loops at their own lower thresholds. 
905 func TestLoopDetector_SemiRepeatable_BashHigherThreshold(t *testing.T) { 906 ld := NewLoopDetector() 907 908 // 12 distinct bash calls — would nudge with the generic threshold (12), 909 // but should be Continue with the semi-repeatable threshold of 16. 910 for i := range 12 { 911 ld.Record("bash", fmt.Sprintf(`{"command":"step_%d"}`, i), false, "", "", false) 912 } 913 action, _ := ld.Check("bash") 914 if action != LoopContinue { 915 t.Errorf("12 distinct bash calls should Continue (semi-repeatable threshold 16), got %v", action) 916 } 917 918 // 15 calls — still under 16. 919 for i := 12; i < 15; i++ { 920 ld.Record("bash", fmt.Sprintf(`{"command":"step_%d"}`, i), false, "", "", false) 921 } 922 action, _ = ld.Check("bash") 923 if action != LoopContinue { 924 t.Errorf("15 distinct bash calls should Continue, got %v", action) 925 } 926 927 // 16th call → nudge. 928 ld.Record("bash", `{"command":"step_16"}`, false, "", "", false) 929 action, _ = ld.Check("bash") 930 if action != LoopNudge { 931 t.Errorf("16 bash calls should nudge, got %v", action) 932 } 933 } 934 935 // TestLoopDetector_SemiRepeatable_NonBashUnchanged verifies that the generic 936 // NoProgress threshold (12) still applies to non-semi-repeatable tools like 937 // file_write, think, etc. — unchanged from the v1 bash-only relaxation. 938 func TestLoopDetector_SemiRepeatable_NonBashUnchanged(t *testing.T) { 939 ld := NewLoopDetector() 940 941 for i := range 12 { 942 ld.Record("think", fmt.Sprintf(`{"thought":"idea_%d"}`, i), false, "", "", false) 943 } 944 action, _ := ld.Check("think") 945 if action != LoopNudge { 946 t.Errorf("12 think calls should nudge at generic threshold, got %v", action) 947 } 948 } 949 950 // TestLoopDetector_BrowserMultiToolFlowNoFalsePositive verifies that a 951 // realistic mixed browser workflow does not trigger FamilyNoProgress just 952 // because every call on the same page produces the same URL-only result 953 // signature. 
Before the fix, navigate → click → click → upload on 954 // chatgpt.com would emit a "same topic/UI action 3 times" nudge because 955 // extractResultSignature collapses every browser-family call on that URL 956 // to the same hash. 957 func TestLoopDetector_BrowserMultiToolFlowNoFalsePositive(t *testing.T) { 958 ld := NewLoopDetector() 959 960 // All four calls return snapshots whose URL set boils down to 961 // https://chatgpt.com/ → identical result signatures. 962 sameResultSig := "https://chatgpt.com" 963 ld.Record("browser_navigate", `{"url":"https://chatgpt.com"}`, false, "", sameResultSig, false) 964 ld.Record("browser_click", `{"ref":"e120","element":"plus"}`, false, "", sameResultSig, false) 965 ld.Record("browser_click", `{"ref":"e513","element":"photos"}`, false, "", sameResultSig, false) 966 ld.Record("browser_file_upload", `{"paths":["/tmp/x.png"]}`, false, "", sameResultSig, false) 967 968 action, msg := ld.Check("browser_file_upload") 969 if action != LoopContinue { 970 t.Errorf("mixed browser workflow with unique tool names must not nudge, got %v (%q)", action, msg) 971 } 972 } 973 974 // TestLoopDetector_BrowserSameToolStillDetected guards the opposite case: 975 // true repetition of the same browser tool on the same page should still 976 // trip the detector. Ensures the same-name scoping doesn't break real-loop 977 // detection. 978 func TestLoopDetector_BrowserSameToolStillDetected(t *testing.T) { 979 ld := NewLoopDetector() 980 sameResultSig := "https://chatgpt.com" 981 // Three back-to-back identical calls hit the consecutive-duplicate 982 // detector at threshold 3 → nudge. A fourth would force-stop, which is 983 // also correct behavior but not what this test locks in. 
984 for i := 0; i < 3; i++ { 985 ld.Record("browser_click", `{"ref":"e120","element":"plus"}`, false, "", sameResultSig, false) 986 } 987 action, _ := ld.Check("browser_click") 988 if action != LoopNudge { 989 t.Errorf("3 consecutive identical browser_click calls should nudge, got %v", action) 990 } 991 } 992 993 // TestLoopDetector_BrowserSnapshotConsecutiveDupStillForceStops preserves the 994 // load-bearing polling guard after the repeatable-result-only relaxation: 995 // repeated browser_snapshot calls with identical args must still be stopped by 996 // the duplicate detectors instead of silently inheriting the raised threshold. 997 // With consecDupThreshold=3, force-stop fires at consecDupThreshold+1=4. 998 func TestLoopDetector_BrowserSnapshotConsecutiveDupStillForceStops(t *testing.T) { 999 ld := NewLoopDetector() 1000 const pageURL = "https://example.com/app" 1001 for range 4 { 1002 ld.Record("browser_snapshot", `{}`, false, "", pageURL, false) 1003 } 1004 action, msg := ld.Check("browser_snapshot") 1005 if action != LoopForceStop { 1006 t.Fatalf("4 identical browser_snapshot calls must still force-stop via duplicate detection, got %v: %s", action, msg) 1007 } 1008 } 1009 1010 // TestIsReadMCPName locks the read-verb whitelist used to populate 1011 // the loop detector's batchTolerant set. Read-only MCP tools must match 1012 // (eligible for uniqueness-gated NoProgress relief); write-capable tools 1013 // must NOT match (stay under the count-based guard), because the 1014 // permission engine does not gate MCP calls and a write loop with 1015 // unique arguments could otherwise create many remote records. 1016 func TestIsReadMCPName(t *testing.T) { 1017 tests := []struct { 1018 name string 1019 want bool 1020 }{ 1021 // Direct read-verb prefix. 
1022 {"list_calendars", true}, 1023 {"get_events", true}, 1024 {"search_gmail_messages", true}, 1025 {"query_database", true}, 1026 {"fetch_profile", true}, 1027 {"describe_table", true}, 1028 {"find_files", true}, 1029 // Namespaced read-verbs (vendor prefix + separator + verb). 1030 {"API-query-data-source", true}, 1031 {"google_gmail_search_messages", true}, 1032 {"notion_list_pages", true}, 1033 {"Notion_Search_Databases", true}, // case-insensitive 1034 // Write verbs must stay OUT. 1035 {"create_notion_page", false}, 1036 {"update_page_properties", false}, 1037 {"delete_event", false}, 1038 {"send_gmail_message", false}, 1039 {"modify_permissions", false}, 1040 {"remove_label", false}, 1041 {"insert_row", false}, 1042 {"append_content_to_page", false}, 1043 {"archive_thread", false}, 1044 // Namespaced writes must also stay out. 1045 {"google_calendar_create_event", false}, 1046 {"notion_create_comment", false}, 1047 {"drive_upload_file", false}, 1048 // Compound-verb names: a read verb AND a write verb in the first 1049 // three tokens must return false — the write blacklist dominates. 1050 // This is the defensive half of the heuristic: destructive suffixes 1051 // must not sneak through on a position-0 read-verb match. 1052 {"lookup_and_delete_all_records", false}, // lookup + delete 1053 {"get_or_create_item", false}, // get + create 1054 {"find_and_remove_entry", false}, // find + remove 1055 {"list-and-archive", false}, // list + archive 1056 // Data-transfer / property-mutation verbs (GitHub/Linear/Notion/ 1057 // Slack MCP patterns). Each pairs a position-0 read with a 1058 // write verb that earlier versions of writeVerbs missed. 
1059 {"get_and_add_member", false}, // get + add 1060 {"list_and_set_properties", false}, // list + set 1061 {"search_and_replace", false}, // search + replace 1062 {"get_and_write_cache", false}, // get + write 1063 {"find_and_patch_record", false}, // find + patch 1064 {"query_and_put_result", false}, // query + put 1065 {"list_and_clear_flags", false}, // list + clear 1066 {"get_and_post_update", false}, // get + post 1067 {"list_and_push_changes", false}, // list + push 1068 {"fetch_and_publish_item", false}, // fetch + publish 1069 {"get_and_submit_form", false}, // get + submit 1070 {"list_and_drop_table", false}, // list + drop 1071 {"find_and_prune_entries", false}, // find + prune 1072 // "run"/"execute" are in writeVerbs (fail-closed on ambiguous 1073 // action verbs). Snowflake/ClickHouse "run_query" used to be 1074 // accepted as SELECT convention, but a Medium review finding 1075 // pointed out that ambiguity should fall on the safe side — 1076 // the server is free to rename to "query_database" if it wants 1077 // NoProgress relief. 1078 {"run_query", false}, // run is a write verb (fail closed) 1079 {"execute_script", false}, // execute is a write verb (fail closed) 1080 {"transform_data", false}, // no read verb 1081 {"process_batch", false}, // no read verb 1082 // Pathological: write name with a read-verb at position 4+ must 1083 // NOT match (token scan stops at position 3). 1084 {"request_write_access_and_get_token_afterwards", false}, 1085 } 1086 for _, tt := range tests { 1087 t.Run(tt.name, func(t *testing.T) { 1088 if got := isReadMCPName(tt.name); got != tt.want { 1089 t.Errorf("isReadMCPName(%q) = %v, want %v", tt.name, got, tt.want) 1090 } 1091 }) 1092 } 1093 } 1094 1095 // TestLoopDetector_NoProgress_BashUniqueArgs_NoNudge covers the Task 5 1096 // benchmark pattern: ~15 bash calls during a multi-step investigation, each 1097 // with distinct argsJSON. Pre-gate, this force-stops via maxNudges escalation. 
1098 // With bash in the batchTolerant set and ≥50% unique argsHashes, NoProgress 1099 // must treat this as a legitimate batch and stay Continue. 1100 func TestLoopDetector_NoProgress_BashUniqueArgs_NoNudge(t *testing.T) { 1101 ld := NewLoopDetector() 1102 ld.batchTolerant = map[string]bool{"bash": true} 1103 1104 for i := range 15 { 1105 ld.Record("bash", fmt.Sprintf(`{"cmd":"step_%d"}`, i), false, "", "", false) 1106 action, msg := ld.Check("bash") 1107 if action != LoopContinue { 1108 t.Fatalf("call %d: unique-args bash on a batch-tolerant tool should stay Continue, got %v (%s)", i+1, action, msg) 1109 } 1110 } 1111 } 1112 1113 // TestLoopDetector_NoProgress_MCPUniqueArgs_NoNudge covers the Task 6 1114 // benchmark pattern: 16 MCP-tool calls each querying a distinct UUID during a 1115 // legitimate Notion database enumeration. Pre-gate, this hit the generic 1116 // NoProgress threshold at count=8. With the MCP tool registered in 1117 // batchTolerant, unique-args enumeration stays Continue. 1118 func TestLoopDetector_NoProgress_MCPUniqueArgs_NoNudge(t *testing.T) { 1119 ld := NewLoopDetector() 1120 ld.batchTolerant = map[string]bool{"API-query-data-source": true} 1121 1122 for i := range 16 { 1123 ld.Record("API-query-data-source", fmt.Sprintf(`{"id":"uuid-%d"}`, i), false, "", "", false) 1124 action, msg := ld.Check("API-query-data-source") 1125 if action != LoopContinue { 1126 t.Fatalf("call %d: unique-args MCP tool on batch-tolerant list should stay Continue, got %v (%s)", i+1, action, msg) 1127 } 1128 } 1129 } 1130 1131 // TestLoopDetector_NoProgress_MCPIdenticalArgs_StillStops locks the invariant 1132 // that batch-tolerance does NOT relax the identical-args case. Regardless of 1133 // which layered detector catches it (ConsecutiveDup fires earliest at 2 1134 // consecutive identical calls; ExactDup at 3 spread out; NoProgress at 8), 1135 // the outcome must be "not Continue" — identical-args spin is always caught. 
1136 func TestLoopDetector_NoProgress_MCPIdenticalArgs_StillStops(t *testing.T) { 1137 ld := NewLoopDetector() 1138 ld.batchTolerant = map[string]bool{"API-query-data-source": true} 1139 1140 for range 8 { 1141 ld.Record("API-query-data-source", `{"id":"same-uuid"}`, false, "", "", false) 1142 } 1143 action, msg := ld.Check("API-query-data-source") 1144 if action == LoopContinue { 1145 t.Fatalf("identical-args calls must be stopped by some detector despite batch-tolerance, got Continue (%s)", msg) 1146 } 1147 } 1148 1149 // TestLoopDetector_NoProgress_GenericToolUniqueArgs_StillNudges_Regression 1150 // pins the core constraint of Phase 1: the uniqueness gate must NOT relax 1151 // generic NoProgress detection for tools outside the batchTolerant set. 1152 // `think` (not in batchTolerant, not semi-repeatable) called 12 times with 1153 // distinct argsJSON must still nudge — catching "spinning on thought 1154 // variations without progress" is the generic path's load-bearing role. 1155 // v2: noProgressThreshold=12, so nudge fires at call 12. 1156 func TestLoopDetector_NoProgress_GenericToolUniqueArgs_StillNudges_Regression(t *testing.T) { 1157 ld := NewLoopDetector() 1158 // Explicitly NOT populating batchTolerant — this test must behave the 1159 // same whether the field is nil or empty. 1160 1161 for i := range 12 { 1162 ld.Record("think", fmt.Sprintf(`{"thought":"idea%d"}`, i), false, "", "", false) 1163 } 1164 action, msg := ld.Check("think") 1165 if action != LoopNudge { 1166 t.Fatalf("12 unique-args think calls must still nudge (generic path unchanged), got %v (%s)", action, msg) 1167 } 1168 } 1169 1170 // TestLoopDetector_NoProgress_BashMixedArgsRatio_GateIsolated exercises the 1171 // NoProgress uniqueness gate without letting ConsecutiveDup / ExactDup fire 1172 // first. 
The sequence uses 8 distinct argsHashes each appearing exactly twice 1173 // (16 calls, 50% unique) interleaved so no hash runs ≥3 times in a row and 1174 // ExactDup's "same-arg 3 times in window" threshold is not tripped. 1175 // 1176 // On a batch-tolerant bash, the gate suppresses the nudge at count≥12. 1177 // Without batch-tolerance (Generic path), the same stream must nudge — this 1178 // sub-test covers the non-relaxation invariant at the threshold boundary. 1179 func TestLoopDetector_NoProgress_BashMixedArgsRatio_GateIsolated(t *testing.T) { 1180 // Build a non-consecutive pattern to keep ConsecutiveDup (need ≥2 back-to-back) 1181 // and ExactDup (need ≥3 of the same argsHash in the window) quiet. 1182 // Pattern: 1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8 — each hash appears twice, 1183 // separated by 7 others. ExactDup threshold is 3 so two appearances is 1184 // safe; ConsecutiveDup needs adjacency so interleaving avoids it. 1185 pattern := []int{0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7} 1186 1187 t.Run("gated_when_batch_tolerant", func(t *testing.T) { 1188 ld := NewLoopDetector() 1189 ld.batchTolerant = map[string]bool{"bash": true} 1190 for _, i := range pattern { 1191 ld.Record("bash", fmt.Sprintf(`{"cmd":"script_%d"}`, i), false, "", "", false) 1192 } 1193 action, msg := ld.Check("bash") 1194 if action != LoopContinue { 1195 t.Fatalf("50%% unique on batch-tolerant bash should be gated (Continue), got %v (%s)", action, msg) 1196 } 1197 }) 1198 1199 t.Run("not_gated_when_not_batch_tolerant", func(t *testing.T) { 1200 ld := NewLoopDetector() 1201 // Explicitly empty batchTolerant — same sequence, no gate. 
1202 for _, i := range pattern { 1203 ld.Record("bash", fmt.Sprintf(`{"cmd":"script_%d"}`, i), false, "", "", false) 1204 } 1205 action, msg := ld.Check("bash") 1206 if action != LoopNudge { 1207 t.Fatalf("same sequence without batch-tolerance should nudge at count≥12, got %v (%s)", action, msg) 1208 } 1209 }) 1210 } 1211 1212 // TestLoopDetector_UseSkill_RepeatedNeverFiresAnyDup documents production 1213 // issue: 9 force-stops in audit log on use_skill same-args ×3, iter=3, 1214 // killing queries before they were processed. use_skill is an idempotent 1215 // metadata load (see internal/tools/skill.go) — repeating it is harmless. 1216 // After the fix, ×5 same-args should return LoopContinue from Check 1217 // (neither ConsecutiveDup nor ExactDup fires). 1218 func TestLoopDetector_UseSkill_RepeatedNeverFiresAnyDup(t *testing.T) { 1219 ld := NewLoopDetector() 1220 for range 5 { 1221 ld.Record("use_skill", `{"skill_name":"kocoro"}`, false, "", "", false) 1222 } 1223 action, msg := ld.Check("use_skill") 1224 if action != LoopContinue { 1225 t.Fatalf("use_skill ×5 same-args must return LoopContinue (idempotent metadata load), got %v: %s", action, msg) 1226 } 1227 } 1228 1229 // TestLoopDetector_UseSkill_ExemptionScopedToSelf guards against the 1230 // dupExempt entry leaking into other tools. After 5 use_skill calls 1231 // (which would normally trip ExactDup), records 4 same-args web_search 1232 // calls — those must still force-stop. This catches the regression 1233 // where an over-broad exemption (e.g. checking against the whole 1234 // dupExemptTools map outside the name-scoped path) would suppress 1235 // legitimate signals on adjacent tools. 1236 // With consecDupThreshold=3, force-stop fires at consecCount >= 4. 
1237 func TestLoopDetector_UseSkill_ExemptionScopedToSelf(t *testing.T) { 1238 ld := NewLoopDetector() 1239 for range 5 { 1240 ld.Record("use_skill", `{"skill_name":"kocoro"}`, false, "", "", false) 1241 } 1242 for range 4 { 1243 ld.Record("web_search", `{"q":"climate"}`, false, "", "", false) 1244 } 1245 action, _ := ld.Check("web_search") 1246 if action != LoopForceStop { 1247 t.Fatalf("web_search ×4 same args must still force-stop after use_skill exemption activity, got %v", action) 1248 } 1249 } 1250 1251 // TestNudgeWindow_RollsOff documents the rolling-window semantics: nudges 1252 // older than `nudgeWindow` iterations age out. A long workflow with widely 1253 // spaced harmless nudges should never trigger maxNudges escalation. 1254 func TestNudgeWindow_RollsOff(t *testing.T) { 1255 w := newNudgeWindow(3, 5) // 3 max, 5-iter window 1256 if w.recordAndCheck(1) { 1257 t.Fatal("1 nudge in window should not escalate") 1258 } 1259 if w.recordAndCheck(2) { 1260 t.Fatal("2 nudges in window should not escalate") 1261 } 1262 // iter 3-7: no nudges. By iter 8, the iter-1 and iter-2 nudges should age out (cutoff = 8 - 5 + 1 = 4). 1263 if w.recordAndCheck(8) { 1264 t.Fatal("3rd nudge at iter 8 (window=5) should not escalate — first two aged out") 1265 } 1266 } 1267 1268 func TestNudgeWindow_BurstEscalates(t *testing.T) { 1269 w := newNudgeWindow(3, 5) 1270 if w.recordAndCheck(1) { 1271 t.Fatal("1st nudge should not escalate") 1272 } 1273 if w.recordAndCheck(2) { 1274 t.Fatal("2nd nudge should not escalate") 1275 } 1276 if !w.recordAndCheck(3) { 1277 t.Fatal("3rd nudge in 5-iter window should escalate") 1278 } 1279 } 1280 1281 // TestConsecutiveDup_FailFailSuccessRetry locks the invariant that a flaky 1282 // retry pattern (fail, fail, succeed) on the same args does NOT force-stop. 1283 // Real Playwright selectors race page-load timing — the model must be 1284 // allowed to retry without being killed at attempt 3. 
Rule 1: tail-success 1285 // after any error in the run → skip detector (model recovered). 1286 func TestConsecutiveDup_FailFailSuccessRetry(t *testing.T) { 1287 ld := NewLoopDetector() 1288 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1289 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1290 ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false) 1291 action, msg := ld.Check("browser_click") 1292 if action != LoopContinue { 1293 t.Fatalf("fail-fail-success retry must return LoopContinue (tail recovery), got %v: %s", action, msg) 1294 } 1295 } 1296 1297 // TestConsecutiveDup_ThreeSuccessfulSameArgsStillStops confirms the 1298 // legitimate "spinning on identical successful results" case still 1299 // triggers. Rule 1 doesn't apply (no error in run), Rule 2 doesn't apply 1300 // (not all errors) → original strict threshold. 1301 // With consecDupThreshold=3: nudge at 3, force-stop at 4. 1302 func TestConsecutiveDup_ThreeSuccessfulSameArgsStillStops(t *testing.T) { 1303 ld := NewLoopDetector() 1304 for range 4 { 1305 ld.Record("web_search", `{"q":"climate"}`, false, "", "", false) 1306 } 1307 action, _ := ld.Check("web_search") 1308 if action != LoopForceStop { 1309 t.Fatalf("4 successful identical web_search must still force-stop, got %v", action) 1310 } 1311 } 1312 1313 // TestConsecutiveDup_SixErrorsNudgesNotForceStop: 6 same-args fails uses 1314 // Rule 2's 2x threshold (6 nudge, 7 force-stop). At 6 errors → nudge, not force-stop. 1315 // consecDupThreshold=3 → all-errors budget = 2x = 6/7. 
1316 func TestConsecutiveDup_SixErrorsNudgesNotForceStop(t *testing.T) { 1317 ld := NewLoopDetector() 1318 for range 6 { 1319 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1320 } 1321 action, _ := ld.Check("browser_click") 1322 if action != LoopNudge { 1323 t.Fatalf("6 same-args consecutive errors should nudge (error budget 6/7), got %v", action) 1324 } 1325 } 1326 1327 // TestConsecutiveDup_FiveAllErrorsForceStops: 7 same-args all-error hits 1328 // the 2x force-stop budget. No tail success, no recovery — real stuck loop. 1329 // consecDupThreshold=3 → all-errors budget = 2x = 6 nudge / 7 force-stop. 1330 func TestConsecutiveDup_FiveAllErrorsForceStops(t *testing.T) { 1331 ld := NewLoopDetector() 1332 for range 7 { 1333 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1334 } 1335 action, _ := ld.Check("browser_click") 1336 if action != LoopForceStop { 1337 t.Fatalf("7 same-args consecutive errors should force-stop (2x budget), got %v", action) 1338 } 1339 } 1340 1341 // TestExactDup_RecoveryThenSecondSpinNotSkipped locks the invariant that 1342 // `latestRecoveredAfterSameArgsErrors` requires the LATEST call to be 1343 // success. A pattern of "errors → success → more errors" should NOT be 1344 // treated as recovered when the latest call is back to error — the 1345 // detector must count the new error streak fresh. 1346 // 1347 // Regression guard: if a future refactor of latestRecoveredAfterSameArgsErrors 1348 // caches "we recovered earlier" and skips ExactDup forever, this test breaks. 
1349 func TestExactDup_RecoveryThenSecondSpinNotSkipped(t *testing.T) { 1350 ld := NewLoopDetector() 1351 // Phase 1: 4 same-args errors 1352 for range 4 { 1353 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1354 } 1355 // Phase 2: 1 success (the "recovery" the helper looks for) 1356 ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false) 1357 // Phase 3: latest is now an error again. ExactDup must count the 1358 // 5 same-args errors (4 + this new one) at strict v2 threshold (5 nudge) 1359 // — exactRecovered must be false because latest is an error. 1360 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1361 action, _ := ld.Check("browser_click") 1362 if action == LoopContinue { 1363 t.Fatalf("post-recovery spin (4err + success + 1err) must NOT be silently skipped, got LoopContinue") 1364 } 1365 // Either LoopNudge or LoopForceStop is acceptable here — what we're 1366 // proving is that recovery does NOT cache and disable the detector. 1367 } 1368 1369 // TestExactDup_SixAllErrorsSpreadNotForceStop: 6 spread-out same-args 1370 // failures (with intervening different-tool calls) is past the old 1371 // exactDupThreshold*2=6 force-stop trigger. With all-error 2x budget, 1372 // the new threshold for all-errors is 6 nudge / 12 force-stop. 1373 // 6 errors → should nudge, not force-stop. 
1374 func TestExactDup_SixAllErrorsSpreadNotForceStop(t *testing.T) { 1375 ld := NewLoopDetector() 1376 for i := 0; i < 6; i++ { 1377 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1378 ld.Record("browser_snapshot", `{}`, false, "", 1379 fmt.Sprintf("https://example.com/state%d", i), false) 1380 } 1381 action, msg := ld.Check("browser_click") 1382 if action == LoopForceStop { 1383 t.Fatalf("6 spread-out same-args errors should not force-stop (2x all-error budget), got: %s", msg) 1384 } 1385 } 1386 1387 // TestExactDup_SixAllSuccessSpreadStillForceStops: 10 spread-out same-args 1388 // successes (no errors) uses the original threshold → force-stop at 2×exactDupThreshold=10. 1389 // This is real spin, not flaky retry. 1390 func TestExactDup_SixAllSuccessSpreadStillForceStops(t *testing.T) { 1391 ld := NewLoopDetector() 1392 for i := 0; i < 10; i++ { 1393 ld.Record("file_read", `{"file":"main.go"}`, false, "", "", false) 1394 ld.Record("file_edit", 1395 fmt.Sprintf(`{"old":"a%d","new":"b%d"}`, i, i), false, "", "", false) 1396 } 1397 action, _ := ld.Check("file_read") 1398 if action != LoopForceStop { 1399 t.Fatalf("10 spread-out same-args successes must still force-stop (2×exactDupThreshold budget), got %v", action) 1400 } 1401 } 1402 1403 // TestExactDup_MixedSuccessAndErrorsUsesStrictThreshold: if ANY of the 1404 // repeats succeeded, we no longer have "all errors" — use the strict 1405 // threshold. Mixed means the tool sometimes works; continuing to call it 1406 // with identical args is spin. 1407 // Final call in sequence is an error (so tail-success recovery skip does 1408 // NOT apply — recovery requires tail=success AND errCount>0). 1409 // With exactDupThreshold=5: nudge fires at dupCount >= 5 (strict, mixed). 
1410 func TestExactDup_MixedSuccessAndErrorsUsesStrictThreshold(t *testing.T) { 1411 ld := NewLoopDetector() 1412 // 5 same-args repeats with mixed success/error, tail=error → strict threshold → nudge at 5 1413 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1414 ld.Record("browser_snapshot", `{}`, false, "", "sigA", false) 1415 ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false) 1416 ld.Record("browser_snapshot", `{}`, false, "", "sigB", false) 1417 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1418 ld.Record("browser_snapshot", `{}`, false, "", "sigC", false) 1419 ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false) 1420 ld.Record("browser_snapshot", `{}`, false, "", "sigD", false) 1421 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1422 action, _ := ld.Check("browser_click") 1423 if action != LoopNudge { 1424 t.Fatalf("5 mixed same-args repeats (tail=error) should nudge (strict threshold), got %v", action) 1425 } 1426 } 1427 1428 // TestExactDup_FailFailSuccessSpreadRetrySkipsOnRecoveredTail documents the 1429 // spread-out retry shape the comments describe for ExactDup: the model retries 1430 // the same browser_click across intervening snapshots, then succeeds. The 1431 // first success after a same-args error streak is recovery, not spin. 
1432 func TestExactDup_FailFailSuccessSpreadRetrySkipsOnRecoveredTail(t *testing.T) { 1433 ld := NewLoopDetector() 1434 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1435 ld.Record("browser_snapshot", `{}`, false, "", "https://example.com/state1", false) 1436 ld.Record("browser_click", `{"ref":"e1"}`, true, "element not found", "", false) 1437 ld.Record("browser_snapshot", `{}`, false, "", "https://example.com/state2", false) 1438 ld.Record("browser_click", `{"ref":"e1"}`, false, "", "", false) 1439 action, msg := ld.Check("browser_click") 1440 if action != LoopContinue { 1441 t.Fatalf("fail-snapshot-fail-snapshot-success must return LoopContinue (spread recovery), got %v: %s", action, msg) 1442 } 1443 } 1444 1445 // TestFamilyNoProgress_RepeatableVaryingArgsUnder15Silent: 14 varying-args 1446 // browser_snapshot calls on a stable URL. Pre-fix: FamilyNoProgress main 1447 // path force-stops at progressCount=7. Post-fix: repeatable + no topic 1448 // signal → force-stop-only-at-15 → silent until pathological threshold. 1449 // 1450 // Covers form-fill-equivalent workloads (7-14 same-page ops should all 1451 // continue — no intermediate nudges that might stack into Task 2's 1452 // rolling-window escalation). 1453 func TestFamilyNoProgress_RepeatableVaryingArgsUnder15Silent(t *testing.T) { 1454 ld := NewLoopDetector() 1455 const url = "https://app.example.com/dashboard" 1456 for i := 0; i < 14; i++ { 1457 args := fmt.Sprintf(`{"wait":%d}`, i) 1458 ld.Record("browser_snapshot", args, false, "", url, false) 1459 } 1460 action, msg := ld.Check("browser_snapshot") 1461 if action != LoopContinue { 1462 t.Fatalf("14 varying-args repeatable calls on stable URL must be silent (force-stop-only-at-15), got %v: %s", action, msg) 1463 } 1464 } 1465 1466 // TestFamilyNoProgress_RepeatableFormFillContinues: 10 click + 10 type on a 1467 // stable URL — representative of a large form fill. 
Must continue silently 1468 // (no nudge — nudges feed Task 2 rolling-window escalation). 1469 func TestFamilyNoProgress_RepeatableFormFillContinues(t *testing.T) { 1470 ld := NewLoopDetector() 1471 const url = "https://app.example.com/settings" 1472 for i := 0; i < 10; i++ { 1473 ld.Record("browser_click", 1474 fmt.Sprintf(`{"ref":"e%d"}`, i), false, "", url, false) 1475 ld.Record("browser_type", 1476 fmt.Sprintf(`{"ref":"e%d","text":"v%d"}`, i, i), false, "", url, false) 1477 } 1478 action, _ := ld.Check("browser_click") 1479 if action != LoopContinue { 1480 t.Fatalf("10 varying-args browser_click on stable URL (form fill) must continue, got %v", action) 1481 } 1482 } 1483 1484 // TestFamilyNoProgress_RepeatableResultOnly_SelfTopicOnlySilentBelow15 covers 1485 // repeatable tools whose args include a URL, so the latest topic hash matches 1486 // the current call itself but no prior calls. That is still result-only: the 1487 // strong topic signal is absent, so stable result_sig should stay silent until 1488 // the raised threshold instead of force-stopping at 7. 1489 func TestFamilyNoProgress_RepeatableResultOnly_SelfTopicOnlySilentBelow15(t *testing.T) { 1490 ld := NewLoopDetector() 1491 const resultSig = "https://app.example.com/search" 1492 for i := 0; i < 7; i++ { 1493 args := fmt.Sprintf(`{"url":"https://app.example.com/search?q=item-%d"}`, i) 1494 ld.Record("browser_navigate", args, false, "", resultSig, false) 1495 } 1496 action, msg := ld.Check("browser_navigate") 1497 if action != LoopContinue { 1498 t.Fatalf("7 browser_navigate calls with self-only topic match and stable result_sig must stay silent below 15, got %v: %s", action, msg) 1499 } 1500 } 1501 1502 // TestFamilyNoProgress_RepeatableVaryingArgsExtremeForceStops: 15 1503 // varying-args snapshots on stable URL — past the raised force-stop 1504 // threshold. Real pathological polling still caught. 
1505 func TestFamilyNoProgress_RepeatableVaryingArgsExtremeForceStops(t *testing.T) { 1506 ld := NewLoopDetector() 1507 const url = "https://app.example.com/status" 1508 for i := 0; i < 15; i++ { 1509 args := fmt.Sprintf(`{"wait":%d}`, i) 1510 ld.Record("browser_snapshot", args, false, "", url, false) 1511 } 1512 action, _ := ld.Check("browser_snapshot") 1513 if action != LoopForceStop { 1514 t.Fatalf("15 varying-args same-URL snapshots must still force-stop (pathological polling), got %v", action) 1515 } 1516 } 1517 1518 // TestFamilyNoProgress_NonRepeatableOriginalThresholds: web_search family 1519 // must still hit force-stop at 12 same-topic calls (v2 threshold). 1520 // Raised thresholds apply uniformly; repeatable tools have a separate 1521 // result-only path with a higher threshold. 1522 func TestFamilyNoProgress_NonRepeatableOriginalThresholds(t *testing.T) { 1523 ld := NewLoopDetector() 1524 // All 12 queries normalize to the "change climate effects" topic 1525 // (only filler words differ). 1526 fillers := []string{"today", "latest", "top", "current", "major", "breaking", 1527 "news", "update", "headlines", "recent", "today latest", "top current"} 1528 for _, f := range fillers { 1529 args := fmt.Sprintf(`{"q":"climate change effects %s"}`, f) 1530 ld.Record("web_search", args, false, "", "", false) 1531 } 1532 action, _ := ld.Check("web_search") 1533 if action != LoopForceStop { 1534 t.Fatalf("12 same-topic web_search must still force-stop (v2 threshold), got %v", action) 1535 } 1536 }