/ internal / agent / normalize.go
normalize.go
  1  package agent
  2  
  3  import (
  4  	"encoding/json"
  5  	"regexp"
  6  	"sort"
  7  	"strings"
  8  	"unicode"
  9  
 10  	"github.com/Kocoro-lab/ShanClaw/internal/client"
 11  )
 12  
// FamilySpec lists the tool names that make up one tool family, split into
// two tiers.
// NOTE(review): how callers treat Core versus Extended tools is not visible in
// this file — confirm against the code that reads FamilyRegistry.
type FamilySpec struct {
	// Core holds the family's core tool names.
	Core []string
	// Extended holds the family's additional tool names.
	Extended []string
}
 17  
// ToolFamilies maps tool names to their logical family for grouping
// related tools in loop detection (e.g., web_search + web_fetch = "web").
// Tool names beginning with "browser_" are not listed here: toolFamily
// resolves them to "browser" by prefix. A name absent from the map has no
// family (a lookup yields "").
var ToolFamilies = map[string]string{
	"web_search":    "web",
	"web_fetch":     "web",
	"x_search":      "web",
	"browser":       "browser",
	"accessibility": "gui",
	"screenshot":    "gui",
	"computer":      "gui",
	"applescript":   "gui",
	"grep":          "search",
	"glob":          "search",
}
 32  
// FamilyRegistry maps a family name to the FamilySpec enumerating that
// family's tools. Only the "browser" family is registered here.
var FamilyRegistry = map[string]FamilySpec{
	"browser": {
		Core: []string{
			"browser_navigate",
			"browser_snapshot",
			"browser_click",
			"browser_type",
			"browser_press_key",
			"browser_take_screenshot",
			"browser_tabs",
		},
		Extended: []string{
			"browser_drag",
			"browser_select_option",
		},
	},
}
 50  
 51  func toolFamily(name string) string {
 52  	if strings.HasPrefix(name, "browser_") {
 53  		return "browser"
 54  	}
 55  	return ToolFamilies[name]
 56  }
 57  
// fillerWords is the set of common query-padding words that do not affect a
// query's semantic meaning. normalizeWebQuery drops any token found here;
// keys are lowercase because tokens are lowercased before the lookup.
var fillerWords = map[string]bool{
	"today":     true,
	"yesterday": true,
	"latest":    true,
	"recent":    true,
	"top":       true,
	"major":     true,
	"breaking":  true,
	"headlines": true,
	"news":      true,
	"current":   true,
	"update":    true,
	"updates":   true,
}
 73  
 74  // isoDatePattern matches YYYY-MM-DD dates.
 75  var isoDatePattern = regexp.MustCompile(`\b\d{4}-\d{2}-\d{2}\b`)
 76  
 77  // monthDayYearPattern matches "March 2 2026" or "March 02 2026".
 78  var monthDayYearPattern = regexp.MustCompile(`(?i)\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}\s+\d{4}\b`)
 79  
 80  // dayMonthYearPattern matches "2 March 2026" or "02 March 2026".
 81  var dayMonthYearPattern = regexp.MustCompile(`(?i)\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b`)
 82  
 83  // standaloneYearPattern matches 4-digit years (2000-2099) as standalone tokens.
 84  var standaloneYearPattern = regexp.MustCompile(`\b20\d{2}\b`)
 85  
 86  // urlPattern matches http/https URLs (domain + path, excluding query strings).
 87  // Captures the full URL minus trailing punctuation and query params.
 88  var urlPattern = regexp.MustCompile(`https?://[^\s"'<>\])\},]+`)
 89  
 90  var serializedToolCallLinePattern = regexp.MustCompile(`^Tool:\s*([^,]+),\s*Args:\s*(.+)$`)
 91  
 92  // normalizeWebQuery extracts a search query from JSON args, strips dates and
 93  // filler words, sorts remaining tokens, and returns a canonical form.
 94  // Two queries about the same topic with different date/filler noise will
 95  // produce the same normalized string and thus the same hash.
 96  func normalizeWebQuery(argsJSON string) string {
 97  	// Try to extract the query string from known JSON keys.
 98  	var raw map[string]any
 99  	if err := json.Unmarshal([]byte(argsJSON), &raw); err != nil {
100  		return ""
101  	}
102  
103  	query := ""
104  	for _, key := range []string{"query", "q", "queries", "url", "urls"} {
105  		if v, ok := raw[key]; ok {
106  			switch val := v.(type) {
107  			case string:
108  				query = val
109  			case []any:
110  				// Take first element if it's a string array.
111  				if len(val) > 0 {
112  					if s, ok := val[0].(string); ok {
113  						query = s
114  					}
115  				}
116  			}
117  			if query != "" {
118  				break
119  			}
120  		}
121  	}
122  
123  	if query == "" {
124  		return ""
125  	}
126  
127  	// For URL values, return as-is (no token stripping).
128  	if strings.HasPrefix(query, "http://") || strings.HasPrefix(query, "https://") {
129  		return query
130  	}
131  
132  	// Strip date patterns.
133  	query = isoDatePattern.ReplaceAllString(query, " ")
134  	query = monthDayYearPattern.ReplaceAllString(query, " ")
135  	query = dayMonthYearPattern.ReplaceAllString(query, " ")
136  	query = standaloneYearPattern.ReplaceAllString(query, " ")
137  
138  	// Tokenize, strip punctuation, filter fillers and short tokens.
139  	tokens := strings.Fields(query)
140  	var cleaned []string
141  	for _, tok := range tokens {
142  		// Trim punctuation from edges.
143  		tok = strings.TrimFunc(tok, func(r rune) bool {
144  			return unicode.IsPunct(r) || unicode.IsSymbol(r)
145  		})
146  		tok = strings.ToLower(tok)
147  		if len(tok) < 2 {
148  			continue
149  		}
150  		if fillerWords[tok] {
151  			continue
152  		}
153  		cleaned = append(cleaned, tok)
154  	}
155  
156  	sort.Strings(cleaned)
157  	if len(cleaned) == 0 {
158  		// All tokens were filler/dates — return a sentinel so all-filler
159  		// queries match each other (prevents bypassing topic detection).
160  		return "[empty]"
161  	}
162  	return strings.Join(cleaned, " ")
163  }
164  
165  // extractResultSignature extracts unique URLs (domain+path) from text content,
166  // strips query strings and trailing punctuation, then hashes the sorted set.
167  // More granular than domain-only: reuters.com/climate ≠ reuters.com/economics.
168  func extractResultSignature(content string) string {
169  	matches := urlPattern.FindAllString(content, -1)
170  	if len(matches) == 0 {
171  		return ""
172  	}
173  
174  	seen := make(map[string]bool)
175  	var urls []string
176  	for _, u := range matches {
177  		// Strip query string
178  		if idx := strings.IndexByte(u, '?'); idx != -1 {
179  			u = u[:idx]
180  		}
181  		// Strip fragment
182  		if idx := strings.IndexByte(u, '#'); idx != -1 {
183  			u = u[:idx]
184  		}
185  		// Trim trailing punctuation that leaked in
186  		u = strings.TrimRight(u, ".,;:!)")
187  		u = strings.ToLower(u)
188  		if !seen[u] {
189  			seen[u] = true
190  			urls = append(urls, u)
191  		}
192  	}
193  
194  	sort.Strings(urls)
195  	return strings.Join(urls, ",")
196  }
197  
198  // isNonActionableSearch returns true if a search-family tool returned results
199  // that don't help the model make progress: no matches, binary-only matches,
200  // or errors. Productive searches (actual source code hits) return false.
201  func isNonActionableSearch(toolName string, result ToolResult) bool {
202  	if toolFamily(toolName) != "search" {
203  		return false
204  	}
205  	if result.IsError {
206  		return true
207  	}
208  	content := result.Content
209  	if content == "no matches found" || content == "no files matched" {
210  		return true
211  	}
212  	// Binary-only matches (defensive — after grep binary exclusion, this is rare)
213  	if strings.HasPrefix(content, "Binary file ") && !strings.Contains(content, "\n") {
214  		return true
215  	}
216  	// Multiple binary matches with no real content
217  	lines := strings.Split(strings.TrimSpace(content), "\n")
218  	allBinary := len(lines) > 0
219  	for _, line := range lines {
220  		if !strings.HasPrefix(line, "Binary file ") {
221  			allBinary = false
222  			break
223  		}
224  	}
225  	if allBinary {
226  		return true
227  	}
228  	return false
229  }
230  
// serializedToolCall is one tool call recovered from a textual
// "Tool: <name>, Args: <args>" line by parseSerializedToolCallPreamble.
type serializedToolCall struct {
	// Name is the tool name, with surrounding whitespace trimmed.
	Name string
	// Args is the argument text after "Args:", with surrounding whitespace
	// trimmed; it is kept as raw text rather than decoded.
	Args string
}
235  
236  func normalizeStructuredToolCallPreamble(text string, toolCalls []client.FunctionCall) string {
237  	if len(toolCalls) == 0 || strings.TrimSpace(text) == "" {
238  		return text
239  	}
240  
241  	parsed, ok := parseSerializedToolCallPreamble(text)
242  	if !ok || len(parsed) != len(toolCalls) {
243  		return text
244  	}
245  
246  	for i, tc := range toolCalls {
247  		if parsed[i].Name != tc.Name {
248  			return text
249  		}
250  		if normalizeJSON(json.RawMessage(parsed[i].Args)) != normalizeJSON(tc.Arguments) {
251  			return text
252  		}
253  	}
254  
255  	return ""
256  }
257  
258  func parseSerializedToolCallPreamble(text string) ([]serializedToolCall, bool) {
259  	lines := strings.Split(strings.TrimSpace(text), "\n")
260  	parsed := make([]serializedToolCall, 0, len(lines))
261  
262  	for _, line := range lines {
263  		line = strings.TrimSpace(line)
264  		if line == "" {
265  			continue
266  		}
267  		if line == "Tool calls:" || line == "Tool call:" {
268  			continue
269  		}
270  
271  		matches := serializedToolCallLinePattern.FindStringSubmatch(line)
272  		if len(matches) != 3 {
273  			return nil, false
274  		}
275  		parsed = append(parsed, serializedToolCall{
276  			Name: strings.TrimSpace(matches[1]),
277  			Args: strings.TrimSpace(matches[2]),
278  		})
279  	}
280  
281  	if len(parsed) == 0 {
282  		return nil, false
283  	}
284  	return parsed, true
285  }