browser.go
1 package tools 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "log" 8 "os" 9 "os/exec" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/chromedp/chromedp" 15 16 "github.com/Kocoro-lab/ShanClaw/internal/agent" 17 ) 18 19 // backend tracks which browser engine is active. 20 type browserBackend int 21 22 const ( 23 backendNone browserBackend = iota 24 backendPinchtab // pinchtab HTTP API 25 backendChromedp // embedded chromedp (fallback) 26 ) 27 28 type BrowserTool struct { 29 mu sync.Mutex 30 backend browserBackend 31 32 // pinchtab 33 pt *pinchtabClient 34 tabID string // active tab in pinchtab 35 36 // chromedp fallback 37 ctx context.Context 38 cancel context.CancelFunc 39 active bool 40 } 41 42 type browserArgs struct { 43 Action string `json:"action"` 44 URL string `json:"url,omitempty"` 45 Selector string `json:"selector,omitempty"` 46 Ref string `json:"ref,omitempty"` 47 Text string `json:"text,omitempty"` 48 Key string `json:"key,omitempty"` 49 Value string `json:"value,omitempty"` 50 Script string `json:"script,omitempty"` 51 Query string `json:"query,omitempty"` 52 Filter string `json:"filter,omitempty"` 53 WaitFor string `json:"waitFor,omitempty"` 54 WaitSelector string `json:"waitSelector,omitempty"` 55 BlockImages bool `json:"blockImages,omitempty"` 56 BlockAds bool `json:"blockAds,omitempty"` 57 TextMode string `json:"textMode,omitempty"` 58 MaxChars int `json:"maxChars,omitempty"` 59 Raw bool `json:"raw,omitempty"` 60 Timeout int `json:"timeout,omitempty"` 61 } 62 63 func (t *BrowserTool) Info() agent.ToolInfo { 64 return agent.ToolInfo{ 65 Name: "browser", 66 Description: "Control a headless browser with an isolated profile. " + 67 "FIRST CHOICE for any web page interaction: navigating, clicking, reading, scraping, screenshots of web content. " + 68 "Only skip this for pages requiring user login/authentication — use GUI tools for those. " + 69 "Actions: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close. 
" + 70 "Use 'read_page' (textMode 'raw' for full DOM) to inspect page structure, or 'execute_js' to query the DOM programmatically and return JSON. " + 71 "Note: snapshot/find (accessibility-tree actions) are not advertised — they only work with the legacy pinchtab backend; use Playwright MCP for equivalent functionality when available.", 72 Parameters: map[string]any{ 73 "type": "object", 74 "properties": map[string]any{ 75 "action": map[string]any{"type": "string", "description": "Action: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close"}, 76 "url": map[string]any{"type": "string", "description": "URL to navigate to (for navigate action)"}, 77 "selector": map[string]any{"type": "string", "description": "CSS selector (for click, type, read_page, scroll, wait)"}, 78 "ref": map[string]any{"type": "string", "description": "Element ref, e.g. 'e5' (for click, type, scroll — alternative to selector). Only meaningful when another tool has produced refs for the current page."}, 79 "text": map[string]any{"type": "string", "description": "Text to type (for type action)"}, 80 "key": map[string]any{"type": "string", "description": "Key to press, e.g. 'Enter' (for press action via click with key)"}, 81 "value": map[string]any{"type": "string", "description": "Value to select (for select action via click with value)"}, 82 "script": map[string]any{"type": "string", "description": "JavaScript to execute (for execute_js action). Expression context: a plain expression is evaluated and its value returned. Scripts whose first token is a top-level statement keyword (`return`, `const`, `let`, `var`, `function`, `async`, `if`, `for`, `while`, `try`) are auto-wrapped in an async IIFE on the chromedp backend so they evaluate correctly; plain expressions (including semicolon-terminated or multi-line ones) pass through unchanged."}, 83 "waitFor": map[string]any{"type": "string", "description": "Navigation wait strategy: e.g. 
'domcontentloaded', 'networkidle' (for navigate action)"}, 84 "waitSelector": map[string]any{"type": "string", "description": "CSS selector to wait for after navigation"}, 85 "blockImages": map[string]any{"type": "boolean", "description": "Disable image loading during navigation"}, 86 "blockAds": map[string]any{"type": "boolean", "description": "Enable PinchTab ad blocking during navigation"}, 87 "textMode": map[string]any{"type": "string", "description": "Text extraction mode for read_page (for example: 'readability' or 'raw')"}, 88 "maxChars": map[string]any{"type": "integer", "description": "Maximum characters for read_page output"}, 89 "raw": map[string]any{"type": "boolean", "description": "Convenience flag for read_page raw mode"}, 90 "timeout": map[string]any{"type": "integer", "description": "Timeout in seconds (default: 30)"}, 91 }, 92 }, 93 Required: []string{"action"}, 94 } 95 } 96 97 func (t *BrowserTool) RequiresApproval() bool { return true } 98 99 func (t *BrowserTool) IsReadOnlyCall(string) bool { return false } 100 101 func (t *BrowserTool) Run(ctx context.Context, argsJSON string) (agent.ToolResult, error) { 102 var args browserArgs 103 if err := json.Unmarshal([]byte(argsJSON), &args); err != nil { 104 return agent.ToolResult{Content: fmt.Sprintf("invalid arguments: %v", err), IsError: true}, nil 105 } 106 107 if args.Action == "" { 108 return agent.ToolResult{Content: "missing required parameter: action", IsError: true}, nil 109 } 110 111 timeout := 30 * time.Second 112 if args.Timeout > 0 { 113 timeout = time.Duration(args.Timeout) * time.Second 114 } 115 116 // close doesn't need a running backend 117 if args.Action == "close" { 118 return t.closeBrowser() 119 } 120 121 // Validate required params before starting a browser 122 if err := t.validateArgs(args); err != nil { 123 return agent.ToolResult{Content: err.Error(), IsError: true}, nil 124 } 125 126 // Ensure a backend is available (pinchtab preferred, chromedp fallback) 127 if err := 
t.ensureBackend(ctx); err != nil { 128 return agent.ToolResult{Content: fmt.Sprintf("failed to start browser: %v", err), IsError: true}, nil 129 } 130 131 switch args.Action { 132 case "navigate": 133 return t.navigate(ctx, args, timeout) 134 case "click": 135 return t.click(ctx, args, timeout) 136 case "type": 137 return t.typeText(ctx, args, timeout) 138 case "scroll": 139 return t.scroll(ctx, args, timeout) 140 case "screenshot": 141 return t.screenshot(ctx, args, timeout) 142 case "read_page": 143 return t.readPage(ctx, args, timeout) 144 case "execute_js": 145 return t.executeJS(ctx, args, timeout) 146 case "wait": 147 return t.waitVisible(ctx, args, timeout) 148 case "snapshot": 149 // Pinchtab-only; returns a "requires pinchtab" error on the chromedp 150 // fallback. No longer advertised in Info() so fresh calls should not 151 // arrive here — but the dispatch stays to keep pinchtab environments 152 // working (see ensureBackend's pinchtab-first preference). 153 return t.snapshotAction(ctx, args) 154 case "find": 155 return t.findAction(ctx, args) 156 default: 157 // unreachable — validateArgs catches unknown actions 158 return agent.ToolResult{Content: fmt.Sprintf("unknown action: %q", args.Action), IsError: true}, nil 159 } 160 } 161 162 // validateArgs checks required params before starting a browser. 
163 func (t *BrowserTool) validateArgs(args browserArgs) error { 164 switch args.Action { 165 case "navigate": 166 if args.URL == "" { 167 return fmt.Errorf("navigate action requires 'url' parameter") 168 } 169 case "click": 170 if args.Ref == "" && args.Selector == "" { 171 return fmt.Errorf("click action requires 'ref' or 'selector' parameter") 172 } 173 case "type": 174 if args.Ref == "" && args.Selector == "" { 175 return fmt.Errorf("type action requires 'ref' or 'selector' parameter") 176 } 177 case "wait": 178 if args.Selector == "" { 179 return fmt.Errorf("wait action requires 'selector' parameter") 180 } 181 case "execute_js": 182 if args.Script == "" { 183 return fmt.Errorf("execute_js action requires 'script' parameter") 184 } 185 case "find": 186 if args.Query == "" { 187 return fmt.Errorf("find action requires 'query' parameter") 188 } 189 case "scroll", "screenshot", "read_page", "snapshot": 190 // no required params 191 default: 192 return fmt.Errorf("unknown action: %q (valid: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close)", args.Action) 193 } 194 return nil 195 } 196 197 // ensureBackend picks pinchtab if available, else falls back to chromedp. 198 func (t *BrowserTool) ensureBackend(ctx context.Context) error { 199 t.mu.Lock() 200 defer t.mu.Unlock() 201 202 // Already have a working backend? 
203 switch t.backend { 204 case backendPinchtab: 205 if t.pt.available(ctx) { 206 return nil 207 } 208 // pinchtab died — clear stale tab ID, try to restart or fall through to chromedp 209 t.tabID = "" 210 t.backend = backendNone 211 case backendChromedp: 212 if t.ctx != nil && t.ctx.Err() == nil { 213 return nil 214 } 215 // chromedp context dead — reset 216 if t.cancel != nil { 217 t.cancel() 218 } 219 t.ctx = nil 220 t.cancel = nil 221 t.active = false 222 t.backend = backendNone 223 } 224 225 // Try pinchtab first 226 if t.pt == nil { 227 t.pt = newPinchtabClient() 228 } 229 if err := t.pt.ensure(ctx); err == nil { 230 t.backend = backendPinchtab 231 return nil 232 } 233 234 // Fall back to chromedp 235 return t.startChromedp() 236 } 237 238 func (t *BrowserTool) startChromedp() error { 239 opts := append(chromedp.DefaultExecAllocatorOptions[:], 240 chromedp.Flag("headless", false), 241 chromedp.Flag("disable-gpu", true), 242 chromedp.Flag("no-first-run", true), 243 chromedp.Flag("no-default-browser-check", true), 244 ) 245 246 allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...) 247 browserCtx, browserCancel := chromedp.NewContext(allocCtx) 248 249 if err := chromedp.Run(browserCtx); err != nil { 250 browserCancel() 251 allocCancel() 252 return fmt.Errorf("failed to start browser: %w", err) 253 } 254 255 t.ctx = browserCtx 256 t.cancel = func() { 257 browserCancel() 258 allocCancel() 259 } 260 t.active = true 261 t.backend = backendChromedp 262 return nil 263 } 264 265 func (t *BrowserTool) isPinchtab() bool { 266 return t.backend == backendPinchtab 267 } 268 269 // --- Actions --- 270 271 // formatNavigateResult builds the navigate result string with anti-bot warning and content preview. 
272 func formatNavigateResult(pageURL, title, textPreview string) string { 273 content := fmt.Sprintf("Navigated to: %s\nTitle: %s", pageURL, title) 274 275 if detectAntiBotPage(title) { 276 content += "\n\nWARNING: This page appears to be an anti-bot challenge or CAPTCHA. " + 277 "The page content is likely NOT the expected website content. " + 278 "Do NOT attempt to extract data from this page. " + 279 "Report to the user that the site blocked automated access." 280 } 281 282 preview := strings.TrimSpace(textPreview) 283 if preview != "" { 284 const maxPreviewRunes = 200 285 runes := []rune(preview) 286 if len(runes) > maxPreviewRunes { 287 preview = string(runes[:maxPreviewRunes]) + "..." 288 } 289 content += fmt.Sprintf("\nPreview: %s", preview) 290 } 291 292 return content 293 } 294 295 func (t *BrowserTool) navigate(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 296 if t.isPinchtab() { 297 ctx, cancel := context.WithTimeout(context.Background(), timeout) 298 defer cancel() 299 // Always open a new tab to isolate navigation from previous tasks 300 resp, err := t.pt.navigate(ctx, ptNavigateReq{ 301 URL: args.URL, 302 NewTab: true, 303 BlockImages: args.BlockImages, 304 BlockAds: args.BlockAds, 305 WaitFor: args.WaitFor, 306 WaitSelector: args.WaitSelector, 307 }) 308 if err != nil { 309 return agent.ToolResult{Content: fmt.Sprintf("navigate error: %v", err), IsError: true}, nil 310 } 311 if resp.TabID != "" { 312 t.tabID = resp.TabID 313 } 314 315 // Best-effort content preview — don't fail navigate if text fetch fails. 316 // Only fetch if we have a valid tab ID from this navigation response. 
317 var preview string 318 if resp.TabID != "" { 319 if textResp, err := t.pt.text(ctx, resp.TabID, "", 0, false); err == nil { 320 preview = textResp.Text 321 } 322 } 323 324 return agent.ToolResult{Content: formatNavigateResult(resp.URL, resp.Title, preview)}, nil 325 } 326 327 // chromedp 328 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 329 defer cancel() 330 331 var title, textContent string 332 err := chromedp.Run(tCtx, 333 chromedp.Navigate(args.URL), 334 chromedp.WaitReady("body", chromedp.ByQuery), 335 chromedp.Title(&title), 336 ) 337 if err != nil { 338 return agent.ToolResult{Content: fmt.Sprintf("navigate error: %v", err), IsError: true}, nil 339 } 340 341 // Best-effort content preview 342 _ = chromedp.Run(tCtx, chromedp.Evaluate( 343 `(document.querySelector("html")?.innerText || "").substring(0, 300)`, 344 &textContent, 345 )) 346 347 return agent.ToolResult{Content: formatNavigateResult(args.URL, title, textContent)}, nil 348 } 349 350 func (t *BrowserTool) click(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 351 if t.isPinchtab() { 352 ctx, cancel := context.WithTimeout(context.Background(), timeout) 353 defer cancel() 354 kind := "click" 355 if args.Key != "" { 356 kind = "press" 357 } else if args.Value != "" { 358 kind = "select" 359 } 360 req := ptActionReq{TabID: t.tabID, Kind: kind, Ref: args.Ref, Selector: args.Selector, Key: args.Key, Value: args.Value} 361 resp, err := t.pt.action(ctx, req) 362 if err != nil { 363 return agent.ToolResult{Content: fmt.Sprintf("click error: %v", err), IsError: true}, nil 364 } 365 target := args.Ref 366 if target == "" { 367 target = args.Selector 368 } 369 _ = resp 370 return agent.ToolResult{Content: fmt.Sprintf("Clicked: %s", target)}, nil 371 } 372 373 // chromedp (selector only) 374 if args.Selector == "" { 375 return agent.ToolResult{Content: "chromedp fallback requires 'selector' (refs not supported without pinchtab)", IsError: true}, nil 376 } 377 tCtx, 
cancel := context.WithTimeout(t.ctx, timeout) 378 defer cancel() 379 if err := chromedp.Run(tCtx, chromedp.Click(args.Selector)); err != nil { 380 return agent.ToolResult{Content: fmt.Sprintf("click error: %v", err), IsError: true}, nil 381 } 382 return agent.ToolResult{Content: fmt.Sprintf("Clicked: %s", args.Selector)}, nil 383 } 384 385 func (t *BrowserTool) typeText(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 386 if t.isPinchtab() { 387 ctx, cancel := context.WithTimeout(context.Background(), timeout) 388 defer cancel() 389 req := ptActionReq{TabID: t.tabID, Kind: "type", Ref: args.Ref, Selector: args.Selector, Text: args.Text} 390 _, err := t.pt.action(ctx, req) 391 if err != nil { 392 return agent.ToolResult{Content: fmt.Sprintf("type error: %v", err), IsError: true}, nil 393 } 394 target := args.Ref 395 if target == "" { 396 target = args.Selector 397 } 398 return agent.ToolResult{Content: fmt.Sprintf("Typed into: %s", target)}, nil 399 } 400 401 // chromedp 402 if args.Selector == "" { 403 return agent.ToolResult{Content: "chromedp fallback requires 'selector'", IsError: true}, nil 404 } 405 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 406 defer cancel() 407 if err := chromedp.Run(tCtx, chromedp.SendKeys(args.Selector, args.Text)); err != nil { 408 return agent.ToolResult{Content: fmt.Sprintf("type error: %v", err), IsError: true}, nil 409 } 410 return agent.ToolResult{Content: fmt.Sprintf("Typed into: %s", args.Selector)}, nil 411 } 412 413 func (t *BrowserTool) scroll(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 414 if t.isPinchtab() { 415 ctx, cancel := context.WithTimeout(context.Background(), timeout) 416 defer cancel() 417 req := ptActionReq{TabID: t.tabID, Kind: "scroll", Ref: args.Ref, Selector: args.Selector} 418 if args.Ref == "" && args.Selector == "" { 419 req.ScrollY = 800 // scroll down by default 420 } 421 _, err := t.pt.action(ctx, req) 422 if err 
!= nil { 423 return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil 424 } 425 target := args.Ref 426 if target == "" { 427 target = args.Selector 428 } 429 if target == "" { 430 target = "page" 431 } 432 return agent.ToolResult{Content: fmt.Sprintf("Scrolled: %s", target)}, nil 433 } 434 435 // chromedp 436 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 437 defer cancel() 438 439 if args.Selector != "" { 440 if err := chromedp.Run(tCtx, chromedp.ScrollIntoView(args.Selector)); err != nil { 441 return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil 442 } 443 return agent.ToolResult{Content: fmt.Sprintf("Scrolled to: %s", args.Selector)}, nil 444 } 445 446 var scrollHeight int 447 if err := chromedp.Run(tCtx, 448 chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight); document.body.scrollHeight`, &scrollHeight), 449 ); err != nil { 450 return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil 451 } 452 return agent.ToolResult{Content: fmt.Sprintf("Scrolled to bottom (height: %d)", scrollHeight)}, nil 453 } 454 455 func (t *BrowserTool) screenshot(_ context.Context, _ browserArgs, timeout time.Duration) (agent.ToolResult, error) { 456 if t.isPinchtab() { 457 ctx, cancel := context.WithTimeout(context.Background(), timeout) 458 defer cancel() 459 // Note: pinchtab v0.7.6 captures viewport only (no full-page support). 460 // For full-page, the LLM can scroll + take multiple screenshots. 
461 data, err := t.pt.screenshot(ctx, t.tabID) 462 if err != nil { 463 return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil 464 } 465 466 // Save to temp file, resize for vision loop 467 f, err := os.CreateTemp("", "browser-screenshot-*.jpg") 468 if err != nil { 469 return agent.ToolResult{Content: fmt.Sprintf("failed to create temp file: %v", err), IsError: true}, nil 470 } 471 f.Write(data) 472 f.Close() 473 474 // Best-effort resize — skip if image is too small or sips fails 475 ResizeImage(f.Name(), DefaultAPIWidth) 476 477 block, err := EncodeImage(f.Name()) 478 if err != nil { 479 return agent.ToolResult{Content: fmt.Sprintf("encode error: %v", err), IsError: true}, nil 480 } 481 return agent.ToolResult{ 482 Content: fmt.Sprintf("Screenshot saved to: %s", f.Name()), 483 Images: []agent.ImageBlock{block}, 484 }, nil 485 } 486 487 // chromedp 488 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 489 defer cancel() 490 491 var buf []byte 492 if err := chromedp.Run(tCtx, chromedp.FullScreenshot(&buf, 90)); err != nil { 493 return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil 494 } 495 496 f, err := os.CreateTemp("", "browser-screenshot-*.png") 497 if err != nil { 498 return agent.ToolResult{Content: fmt.Sprintf("failed to create temp file: %v", err), IsError: true}, nil 499 } 500 f.Write(buf) 501 f.Close() 502 503 // Best-effort resize 504 ResizeImage(f.Name(), DefaultAPIWidth) 505 506 block, err := EncodeImage(f.Name()) 507 if err != nil { 508 return agent.ToolResult{Content: fmt.Sprintf("encode error: %v", err), IsError: true}, nil 509 } 510 return agent.ToolResult{ 511 Content: fmt.Sprintf("Screenshot saved to: %s", f.Name()), 512 Images: []agent.ImageBlock{block}, 513 }, nil 514 } 515 516 // isPageContentEmpty returns true if content is empty/whitespace-only. 
517 func isPageContentEmpty(content string) bool { 518 return strings.TrimSpace(content) == "" 519 } 520 521 // antiBotTitlePatterns matches common anti-bot/CAPTCHA page titles. 522 var antiBotTitlePatterns = []string{ 523 "just a moment", 524 "verify you are human", 525 "are you a robot", 526 "robot check", 527 "access denied", 528 "attention required", 529 "security check", 530 "请验证", 531 "人机验证", 532 "安全验证", 533 "please wait while we verify", 534 "checking your browser", 535 "ddos protection", 536 "captcha", 537 "bot detection", 538 } 539 540 // detectAntiBotPage checks if a page title indicates an anti-bot/CAPTCHA challenge. 541 func detectAntiBotPage(title string) bool { 542 lower := strings.ToLower(title) 543 for _, pattern := range antiBotTitlePatterns { 544 if strings.Contains(lower, pattern) { 545 return true 546 } 547 } 548 return false 549 } 550 551 func (t *BrowserTool) readPage(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 552 if t.isPinchtab() { 553 ctx, cancel := context.WithTimeout(context.Background(), timeout) 554 defer cancel() 555 resp, err := t.pt.text(ctx, t.tabID, args.TextMode, args.MaxChars, args.Raw) 556 if err != nil { 557 return agent.ToolResult{Content: fmt.Sprintf("read_page error: %v", err), IsError: true}, nil 558 } 559 text := resp.Text 560 if isPageContentEmpty(text) { 561 return agent.ToolResult{Content: fmt.Sprintf("URL: %s\nTitle: %s\n\nread_page returned empty content — the page may not have loaded correctly or may be blocked", resp.URL, resp.Title), IsError: true}, nil 562 } 563 const maxLen = 10240 564 if len(text) > maxLen { 565 text = text[:maxLen] + "\n... 
[truncated to 10KB]" 566 } 567 return agent.ToolResult{Content: fmt.Sprintf("URL: %s\nTitle: %s\n\n%s", resp.URL, resp.Title, text)}, nil 568 } 569 570 // chromedp 571 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 572 defer cancel() 573 574 selector := "html" 575 if args.Selector != "" { 576 selector = args.Selector 577 } 578 579 var textContent string 580 err := chromedp.Run(tCtx, chromedp.Evaluate( 581 fmt.Sprintf(`document.querySelector(%q)?.innerText || ""`, selector), 582 &textContent, 583 )) 584 if err != nil { 585 // Fall back to outerHTML 586 var html string 587 if err2 := chromedp.Run(tCtx, chromedp.OuterHTML(selector, &html)); err2 != nil { 588 return agent.ToolResult{Content: fmt.Sprintf("read_page error: %v (fallback: %v)", err, err2), IsError: true}, nil 589 } 590 textContent = html 591 } 592 593 if isPageContentEmpty(textContent) { 594 return agent.ToolResult{Content: "read_page returned empty content — the page may not have loaded correctly or may be blocked", IsError: true}, nil 595 } 596 597 const maxLen = 10240 598 if len(textContent) > maxLen { 599 textContent = textContent[:maxLen] + "\n... [truncated to 10KB]" 600 } 601 return agent.ToolResult{Content: textContent}, nil 602 } 603 604 func (t *BrowserTool) executeJS(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 605 if t.isPinchtab() { 606 ctx, cancel := context.WithTimeout(context.Background(), timeout) 607 defer cancel() 608 resp, err := t.pt.evaluate(ctx, t.tabID, args.Script) 609 if err != nil { 610 return agent.ToolResult{Content: fmt.Sprintf("execute_js error: %v", err), IsError: true}, nil 611 } 612 output := fmt.Sprintf("%v", resp.Result) 613 const maxLen = 10240 614 if len(output) > maxLen { 615 output = output[:maxLen] + "\n... 
[truncated to 10KB]" 616 } 617 return agent.ToolResult{Content: output}, nil 618 } 619 620 // chromedp: Evaluate runs in expression context, so multi-statement 621 // scripts with `return`/`const`/`let` would fail with "Illegal return 622 // statement". Transparently wrap them in an async IIFE so the script 623 // author can write natural multi-statement JS. 624 script := wrapJSForEvaluate(args.Script) 625 626 tCtx, cancel := context.WithTimeout(t.ctx, timeout) 627 defer cancel() 628 629 var result any 630 if err := chromedp.Run(tCtx, chromedp.Evaluate(script, &result)); err != nil { 631 return agent.ToolResult{Content: fmt.Sprintf("execute_js error: %v", err), IsError: true}, nil 632 } 633 output := fmt.Sprintf("%v", result) 634 const maxLen = 10240 635 if len(output) > maxLen { 636 output = output[:maxLen] + "\n... [truncated to 10KB]" 637 } 638 return agent.ToolResult{Content: output}, nil 639 } 640 641 func (t *BrowserTool) waitVisible(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) { 642 if t.isPinchtab() { 643 // Use JS polling via evaluate 644 ctx, cancel := context.WithTimeout(context.Background(), timeout) 645 defer cancel() 646 script := fmt.Sprintf(` 647 await new Promise((resolve, reject) => { 648 const el = document.querySelector(%q); 649 if (el) return resolve(true); 650 const obs = new MutationObserver(() => { 651 if (document.querySelector(%q)) { obs.disconnect(); resolve(true); } 652 }); 653 obs.observe(document.body, {childList: true, subtree: true}); 654 setTimeout(() => { obs.disconnect(); reject('timeout'); }, %d); 655 }) 656 `, args.Selector, args.Selector, int(timeout.Milliseconds())) 657 _, err := t.pt.evaluate(ctx, t.tabID, script) 658 if err != nil { 659 return agent.ToolResult{Content: fmt.Sprintf("wait error: %v", err), IsError: true}, nil 660 } 661 return agent.ToolResult{Content: fmt.Sprintf("Element visible: %s", args.Selector)}, nil 662 } 663 664 // chromedp 665 tCtx, cancel := 
context.WithTimeout(t.ctx, timeout) 666 defer cancel() 667 if err := chromedp.Run(tCtx, chromedp.WaitVisible(args.Selector)); err != nil { 668 return agent.ToolResult{Content: fmt.Sprintf("wait error: %v", err), IsError: true}, nil 669 } 670 return agent.ToolResult{Content: fmt.Sprintf("Element visible: %s", args.Selector)}, nil 671 } 672 673 // --- New pinchtab-only actions --- 674 675 func (t *BrowserTool) snapshotAction(_ context.Context, args browserArgs) (agent.ToolResult, error) { 676 if !t.isPinchtab() { 677 return agent.ToolResult{ 678 Content: "snapshot action requires pinchtab (not available, using chromedp fallback). Use read_page instead.", 679 IsError: true, 680 }, nil 681 } 682 683 filter := args.Filter 684 if filter == "" { 685 filter = "interactive" 686 } 687 688 ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 689 defer cancel() 690 resp, err := t.pt.snapshot(ctx, t.tabID, filter) 691 if err != nil { 692 return agent.ToolResult{Content: fmt.Sprintf("snapshot error: %v", err), IsError: true}, nil 693 } 694 695 var sb strings.Builder 696 sb.WriteString(fmt.Sprintf("URL: %s\nTitle: %s\nElements: %d\n\n", resp.URL, resp.Title, resp.Count)) 697 698 for _, n := range resp.Nodes { 699 indent := strings.Repeat(" ", n.Depth) 700 line := fmt.Sprintf("%s[%s] %s: %s", indent, n.Ref, n.Role, n.Name) 701 if n.Value != "" { 702 line += fmt.Sprintf(" = %q", n.Value) 703 } 704 if n.Focused { 705 line += " (focused)" 706 } 707 if n.Disabled { 708 line += " (disabled)" 709 } 710 sb.WriteString(line + "\n") 711 } 712 713 content := sb.String() 714 const maxLen = 20480 // snapshot can be larger 715 if len(content) > maxLen { 716 content = content[:maxLen] + "\n... 
[truncated]" 717 } 718 719 return agent.ToolResult{Content: content}, nil 720 } 721 722 func (t *BrowserTool) findAction(_ context.Context, args browserArgs) (agent.ToolResult, error) { 723 if !t.isPinchtab() { 724 return agent.ToolResult{ 725 Content: "find action requires pinchtab (not available, using chromedp fallback). Use execute_js or read_page instead.", 726 IsError: true, 727 }, nil 728 } 729 730 ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) 731 defer cancel() 732 resp, err := t.pt.find(ctx, ptFindReq{Query: args.Query, TabID: t.tabID, TopK: 5}) 733 if err != nil { 734 // /find may not exist in older pinchtab versions — suggest snapshot instead 735 if strings.Contains(err.Error(), "404") { 736 return agent.ToolResult{ 737 Content: "find is not available in this pinchtab version. Use 'snapshot' to get element refs, then click/type by ref.", 738 IsError: true, 739 }, nil 740 } 741 return agent.ToolResult{Content: fmt.Sprintf("find error: %v", err), IsError: true}, nil 742 } 743 744 var sb strings.Builder 745 sb.WriteString(fmt.Sprintf("Best match: %s (confidence: %s, score: %.2f)\n\n", resp.BestRef, resp.Confidence, resp.Score)) 746 for _, m := range resp.Matches { 747 sb.WriteString(fmt.Sprintf(" [%s] %s: %s (score: %.2f)\n", m.Ref, m.Role, m.Name, m.Score)) 748 } 749 750 return agent.ToolResult{Content: sb.String()}, nil 751 } 752 753 // wrapJSForEvaluate rewrites a script intended for chromedp.Evaluate so that 754 // bare top-level statements (`return x`, `const x = …; return x`) evaluate 755 // without a "SyntaxError: Illegal return statement". 
chromedp.Evaluate runs 756 // in expression context, so only statement-like leading keywords trigger the 757 // wrap — plain expressions (including semicolon-terminated or multi-line ones 758 // like `JSON.stringify(x);` or `a\nb`) must pass through unchanged, because 759 // wrapping them in an IIFE without an explicit `return` would silently change 760 // the returned value to `undefined`. 761 func wrapJSForEvaluate(script string) string { 762 trimmed := strings.TrimSpace(script) 763 if trimmed == "" { 764 return script 765 } 766 // Already wrapped in a user-authored IIFE? Leave it alone — a redundant 767 // wrap is still valid JS, but this keeps behavior predictable in tests. 768 if strings.HasPrefix(trimmed, "(async") || strings.HasPrefix(trimmed, "(()") || 769 strings.HasPrefix(trimmed, "(function") { 770 return script 771 } 772 // `async` alone is ambiguous: `async () => expr` is a perfectly valid 773 // expression and wrapping it in an IIFE without a `return` would turn 774 // the arrow-function result into `undefined`. Only `async function …` 775 // (a declaration) needs wrapping, so match that two-token form and 776 // leave bare `async` out of the general keyword list. 777 if hasAsyncFunctionPrefix(trimmed) { 778 return "(async () => { " + script + " })()" 779 } 780 if !hasLeadingKeyword(trimmed, "return", "const", "let", "var", "function", "if", "for", "while", "try") { 781 return script 782 } 783 return "(async () => { " + script + " })()" 784 } 785 786 // hasAsyncFunctionPrefix reports whether s starts with "async function" 787 // (with whitespace between the tokens). The arrow-function form 788 // `async () => …` and identifier-like forms (`asyncFoo`) return false. 
789 func hasAsyncFunctionPrefix(s string) bool { 790 const prefix = "async" 791 if !strings.HasPrefix(s, prefix) { 792 return false 793 } 794 rest := s[len(prefix):] 795 if rest == "" || (rest[0] != ' ' && rest[0] != '\t') { 796 return false 797 } 798 rest = strings.TrimLeft(rest, " \t") 799 return hasLeadingKeyword(rest, "function") 800 } 801 802 // hasLeadingKeyword reports whether s starts with any of the keywords followed 803 // by whitespace, `(`, or `{` — i.e. a statement boundary rather than an 804 // identifier that happens to share a prefix (`returnValue`). 805 func hasLeadingKeyword(s string, keywords ...string) bool { 806 for _, kw := range keywords { 807 if !strings.HasPrefix(s, kw) { 808 continue 809 } 810 if len(s) == len(kw) { 811 return true 812 } 813 next := s[len(kw)] 814 if next == ' ' || next == '\t' || next == '(' || next == '{' { 815 return true 816 } 817 } 818 return false 819 } 820 821 func (t *BrowserTool) closeBrowser() (agent.ToolResult, error) { 822 t.mu.Lock() 823 defer t.mu.Unlock() 824 825 if t.backend == backendNone { 826 return agent.ToolResult{Content: "Browser is not running"}, nil 827 } 828 829 t.cleanup() 830 return agent.ToolResult{Content: "Browser closed"}, nil 831 } 832 833 // Cleanup shuts down the browser. Safe to call multiple times. 834 func (t *BrowserTool) Cleanup() { 835 t.mu.Lock() 836 defer t.mu.Unlock() 837 t.cleanup() 838 } 839 840 // cleanup must be called with mu held. 841 func (t *BrowserTool) cleanup() { 842 switch t.backend { 843 case backendPinchtab: 844 if t.pt != nil { 845 t.pt.close() 846 } 847 t.tabID = "" 848 case backendChromedp: 849 if t.cancel != nil { 850 t.cancel() 851 } 852 t.ctx = nil 853 t.cancel = nil 854 t.active = false 855 } 856 t.backend = backendNone 857 } 858 859 // CleanupOrphanedChromedp kills any Chrome processes started by chromedp from 860 // previous daemon runs that weren't properly cleaned up (e.g. force-kill, crash). 861 // Safe to call at daemon startup before registering tools. 
func CleanupOrphanedChromedp() {
	// chromedp Chrome instances use --user-data-dir pointing to a temp dir
	// matching "chromedp-runner*". Find and kill them.
	out, err := exec.Command("pgrep", "-f", "user-data-dir.*chromedp-runner").Output()
	if err != nil {
		// Any pgrep failure (including a non-zero exit) means there is
		// nothing to act on.
		return
	}

	killed := 0
	for _, pid := range strings.Fields(string(out)) {
		// Best-effort: a failed kill is skipped, but it is not counted as
		// cleaned up.
		if err := exec.Command("kill", pid).Run(); err != nil {
			continue
		}
		killed++
	}
	if killed > 0 {
		log.Printf("cleaned up %d orphaned chromedp Chrome process(es)", killed)
	}
}