normalize.go
1 package agent 2 3 import ( 4 "encoding/json" 5 "regexp" 6 "sort" 7 "strings" 8 "unicode" 9 10 "github.com/Kocoro-lab/ShanClaw/internal/client" 11 ) 12 13 type FamilySpec struct { 14 Core []string 15 Extended []string 16 } 17 18 // ToolFamilies maps tool names to their logical family for grouping 19 // related tools in loop detection (e.g., web_search + web_fetch = "web"). 20 var ToolFamilies = map[string]string{ 21 "web_search": "web", 22 "web_fetch": "web", 23 "x_search": "web", 24 "browser": "browser", 25 "accessibility": "gui", 26 "screenshot": "gui", 27 "computer": "gui", 28 "applescript": "gui", 29 "grep": "search", 30 "glob": "search", 31 } 32 33 var FamilyRegistry = map[string]FamilySpec{ 34 "browser": { 35 Core: []string{ 36 "browser_navigate", 37 "browser_snapshot", 38 "browser_click", 39 "browser_type", 40 "browser_press_key", 41 "browser_take_screenshot", 42 "browser_tabs", 43 }, 44 Extended: []string{ 45 "browser_drag", 46 "browser_select_option", 47 }, 48 }, 49 } 50 51 func toolFamily(name string) string { 52 if strings.HasPrefix(name, "browser_") { 53 return "browser" 54 } 55 return ToolFamilies[name] 56 } 57 58 // fillerWords are common query padding that don't affect semantic meaning. 59 var fillerWords = map[string]bool{ 60 "today": true, 61 "yesterday": true, 62 "latest": true, 63 "recent": true, 64 "top": true, 65 "major": true, 66 "breaking": true, 67 "headlines": true, 68 "news": true, 69 "current": true, 70 "update": true, 71 "updates": true, 72 } 73 74 // isoDatePattern matches YYYY-MM-DD dates. 75 var isoDatePattern = regexp.MustCompile(`\b\d{4}-\d{2}-\d{2}\b`) 76 77 // monthDayYearPattern matches "March 2 2026" or "March 02 2026". 78 var monthDayYearPattern = regexp.MustCompile(`(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}\s+\d{4}\b`) 79 80 // dayMonthYearPattern matches "2 March 2026" or "02 March 2026". 81 var dayMonthYearPattern = regexp.MustCompile(`(?i)\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b`) 82 83 // standaloneYearPattern matches 4-digit years (2000-2099) as standalone tokens. 84 var standaloneYearPattern = regexp.MustCompile(`\b20\d{2}\b`) 85 86 // urlPattern matches http/https URLs (domain + path, excluding query strings). 87 // Captures the full URL minus trailing punctuation and query params. 88 var urlPattern = regexp.MustCompile(`https?://[^\s"'<>\])\},]+`) 89 90 var serializedToolCallLinePattern = regexp.MustCompile(`^Tool:\s*([^,]+),\s*Args:\s*(.+)$`) 91 92 // normalizeWebQuery extracts a search query from JSON args, strips dates and 93 // filler words, sorts remaining tokens, and returns a canonical form. 94 // Two queries about the same topic with different date/filler noise will 95 // produce the same normalized string and thus the same hash. 96 func normalizeWebQuery(argsJSON string) string { 97 // Try to extract the query string from known JSON keys. 98 var raw map[string]any 99 if err := json.Unmarshal([]byte(argsJSON), &raw); err != nil { 100 return "" 101 } 102 103 query := "" 104 for _, key := range []string{"query", "q", "queries", "url", "urls"} { 105 if v, ok := raw[key]; ok { 106 switch val := v.(type) { 107 case string: 108 query = val 109 case []any: 110 // Take first element if it's a string array. 111 if len(val) > 0 { 112 if s, ok := val[0].(string); ok { 113 query = s 114 } 115 } 116 } 117 if query != "" { 118 break 119 } 120 } 121 } 122 123 if query == "" { 124 return "" 125 } 126 127 // For URL values, return as-is (no token stripping). 128 if strings.HasPrefix(query, "http://") || strings.HasPrefix(query, "https://") { 129 return query 130 } 131 132 // Strip date patterns. 133 query = isoDatePattern.ReplaceAllString(query, " ") 134 query = monthDayYearPattern.ReplaceAllString(query, " ") 135 query = dayMonthYearPattern.ReplaceAllString(query, " ") 136 query = standaloneYearPattern.ReplaceAllString(query, " ") 137 138 // Tokenize, strip punctuation, filter fillers and short tokens. 139 tokens := strings.Fields(query) 140 var cleaned []string 141 for _, tok := range tokens { 142 // Trim punctuation from edges. 143 tok = strings.TrimFunc(tok, func(r rune) bool { 144 return unicode.IsPunct(r) || unicode.IsSymbol(r) 145 }) 146 tok = strings.ToLower(tok) 147 if len(tok) < 2 { 148 continue 149 } 150 if fillerWords[tok] { 151 continue 152 } 153 cleaned = append(cleaned, tok) 154 } 155 156 sort.Strings(cleaned) 157 if len(cleaned) == 0 { 158 // All tokens were filler/dates — return a sentinel so all-filler 159 // queries match each other (prevents bypassing topic detection). 160 return "[empty]" 161 } 162 return strings.Join(cleaned, " ") 163 } 164 165 // extractResultSignature extracts unique URLs (domain+path) from text content, 166 // strips query strings and trailing punctuation, then hashes the sorted set. 167 // More granular than domain-only: reuters.com/climate ≠ reuters.com/economics. 168 func extractResultSignature(content string) string { 169 matches := urlPattern.FindAllString(content, -1) 170 if len(matches) == 0 { 171 return "" 172 } 173 174 seen := make(map[string]bool) 175 var urls []string 176 for _, u := range matches { 177 // Strip query string 178 if idx := strings.IndexByte(u, '?'); idx != -1 { 179 u = u[:idx] 180 } 181 // Strip fragment 182 if idx := strings.IndexByte(u, '#'); idx != -1 { 183 u = u[:idx] 184 } 185 // Trim trailing punctuation that leaked in 186 u = strings.TrimRight(u, ".,;:!)") 187 u = strings.ToLower(u) 188 if !seen[u] { 189 seen[u] = true 190 urls = append(urls, u) 191 } 192 } 193 194 sort.Strings(urls) 195 return strings.Join(urls, ",") 196 } 197 198 // isNonActionableSearch returns true if a search-family tool returned results 199 // that don't help the model make progress: no matches, binary-only matches, 200 // or errors. Productive searches (actual source code hits) return false. 201 func isNonActionableSearch(toolName string, result ToolResult) bool { 202 if toolFamily(toolName) != "search" { 203 return false 204 } 205 if result.IsError { 206 return true 207 } 208 content := result.Content 209 if content == "no matches found" || content == "no files matched" { 210 return true 211 } 212 // Binary-only matches (defensive — after grep binary exclusion, this is rare) 213 if strings.HasPrefix(content, "Binary file ") && !strings.Contains(content, "\n") { 214 return true 215 } 216 // Multiple binary matches with no real content 217 lines := strings.Split(strings.TrimSpace(content), "\n") 218 allBinary := len(lines) > 0 219 for _, line := range lines { 220 if !strings.HasPrefix(line, "Binary file ") { 221 allBinary = false 222 break 223 } 224 } 225 if allBinary { 226 return true 227 } 228 return false 229 } 230 231 type serializedToolCall struct { 232 Name string 233 Args string 234 } 235 236 func normalizeStructuredToolCallPreamble(text string, toolCalls []client.FunctionCall) string { 237 if len(toolCalls) == 0 || strings.TrimSpace(text) == "" { 238 return text 239 } 240 241 parsed, ok := parseSerializedToolCallPreamble(text) 242 if !ok || len(parsed) != len(toolCalls) { 243 return text 244 } 245 246 for i, tc := range toolCalls { 247 if parsed[i].Name != tc.Name { 248 return text 249 } 250 if normalizeJSON(json.RawMessage(parsed[i].Args)) != normalizeJSON(tc.Arguments) { 251 return text 252 } 253 } 254 255 return "" 256 } 257 258 func parseSerializedToolCallPreamble(text string) ([]serializedToolCall, bool) { 259 lines := strings.Split(strings.TrimSpace(text), "\n") 260 parsed := make([]serializedToolCall, 0, len(lines)) 261 262 for _, line := range lines { 263 line = strings.TrimSpace(line) 264 if line == "" { 265 continue 266 } 267 if line == "Tool calls:" || line == "Tool call:" { 268 continue 269 } 270 271 matches := serializedToolCallLinePattern.FindStringSubmatch(line) 272 if len(matches) != 3 { 273 return nil, false 274 } 275 parsed = append(parsed, serializedToolCall{ 276 Name: strings.TrimSpace(matches[1]), 277 Args: strings.TrimSpace(matches[2]), 278 }) 279 } 280 281 if len(parsed) == 0 { 282 return nil, false 283 } 284 return parsed, true 285 }