/ internal / tools / browser.go
browser.go
  1  package tools
  2  
  3  import (
  4  	"context"
  5  	"encoding/json"
  6  	"fmt"
  7  	"log"
  8  	"os"
  9  	"os/exec"
 10  	"strings"
 11  	"sync"
 12  	"time"
 13  
 14  	"github.com/chromedp/chromedp"
 15  
 16  	"github.com/Kocoro-lab/ShanClaw/internal/agent"
 17  )
 18  
// browserBackend identifies which browser engine a BrowserTool is currently
// using. Its zero value is backendNone.
type browserBackend int

// The possible backends. ensureBackend tries pinchtab first and only falls
// back to chromedp when pinchtab cannot be used.
const (
	backendNone     browserBackend = iota
	backendPinchtab                // pinchtab HTTP API
	backendChromedp                // embedded chromedp (fallback)
)
 27  
// BrowserTool implements the "browser" agent tool. It drives a page through
// either a pinchtab client or an embedded chromedp session; ensureBackend
// decides which one is active and records the choice in backend.
type BrowserTool struct {
	// mu is held by ensureBackend, closeBrowser and Cleanup while the backend
	// is being selected or torn down.
	mu      sync.Mutex
	backend browserBackend

	// pinchtab
	pt    *pinchtabClient
	tabID string // active tab in pinchtab

	// chromedp fallback
	// ctx is the browser context created in startChromedp; cancel releases
	// both that context and its allocator.
	ctx    context.Context
	cancel context.CancelFunc
	// active is set when chromedp starts and cleared when it is torn down.
	active bool
}
 41  
// browserArgs is the JSON argument payload decoded by BrowserTool.Run. Action
// selects the operation; validateArgs enforces which of the remaining fields
// each action requires. Timeout is expressed in seconds.
type browserArgs struct {
	Action       string `json:"action"`
	URL          string `json:"url,omitempty"`
	Selector     string `json:"selector,omitempty"`
	Ref          string `json:"ref,omitempty"`
	Text         string `json:"text,omitempty"`
	Key          string `json:"key,omitempty"`
	Value        string `json:"value,omitempty"`
	Script       string `json:"script,omitempty"`
	Query        string `json:"query,omitempty"`
	Filter       string `json:"filter,omitempty"`
	WaitFor      string `json:"waitFor,omitempty"`
	WaitSelector string `json:"waitSelector,omitempty"`
	BlockImages  bool   `json:"blockImages,omitempty"`
	BlockAds     bool   `json:"blockAds,omitempty"`
	TextMode     string `json:"textMode,omitempty"`
	MaxChars     int    `json:"maxChars,omitempty"`
	Raw          bool   `json:"raw,omitempty"`
	Timeout      int    `json:"timeout,omitempty"`
}
 62  
// Info returns the descriptor for the "browser" tool: its name, the
// description text, and the parameter schema, with "action" as the only
// required parameter. The advertised action list omits snapshot and find,
// which Run still dispatches for pinchtab environments.
func (t *BrowserTool) Info() agent.ToolInfo {
	return agent.ToolInfo{
		Name: "browser",
		Description: "Control a headless browser with an isolated profile. " +
			"FIRST CHOICE for any web page interaction: navigating, clicking, reading, scraping, screenshots of web content. " +
			"Only skip this for pages requiring user login/authentication — use GUI tools for those. " +
			"Actions: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close. " +
			"Use 'read_page' (textMode 'raw' for full DOM) to inspect page structure, or 'execute_js' to query the DOM programmatically and return JSON. " +
			"Note: snapshot/find (accessibility-tree actions) are not advertised — they only work with the legacy pinchtab backend; use Playwright MCP for equivalent functionality when available.",
		Parameters: map[string]any{
			"type": "object",
			"properties": map[string]any{
				"action":   map[string]any{"type": "string", "description": "Action: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close"},
				"url":      map[string]any{"type": "string", "description": "URL to navigate to (for navigate action)"},
				"selector": map[string]any{"type": "string", "description": "CSS selector (for click, type, read_page, scroll, wait)"},
				"ref":      map[string]any{"type": "string", "description": "Element ref, e.g. 'e5' (for click, type, scroll — alternative to selector). Only meaningful when another tool has produced refs for the current page."},
				"text":     map[string]any{"type": "string", "description": "Text to type (for type action)"},
				"key":      map[string]any{"type": "string", "description": "Key to press, e.g. 'Enter' (for press action via click with key)"},
				"value":    map[string]any{"type": "string", "description": "Value to select (for select action via click with value)"},
				"script":   map[string]any{"type": "string", "description": "JavaScript to execute (for execute_js action). Expression context: a plain expression is evaluated and its value returned. Scripts whose first token is a top-level statement keyword (`return`, `const`, `let`, `var`, `function`, `async`, `if`, `for`, `while`, `try`) are auto-wrapped in an async IIFE on the chromedp backend so they evaluate correctly; plain expressions (including semicolon-terminated or multi-line ones) pass through unchanged."},
				"waitFor":      map[string]any{"type": "string", "description": "Navigation wait strategy: e.g. 'domcontentloaded', 'networkidle' (for navigate action)"},
				"waitSelector": map[string]any{"type": "string", "description": "CSS selector to wait for after navigation"},
				"blockImages":  map[string]any{"type": "boolean", "description": "Disable image loading during navigation"},
				"blockAds":     map[string]any{"type": "boolean", "description": "Enable PinchTab ad blocking during navigation"},
				"textMode":     map[string]any{"type": "string", "description": "Text extraction mode for read_page (for example: 'readability' or 'raw')"},
				"maxChars":     map[string]any{"type": "integer", "description": "Maximum characters for read_page output"},
				"raw":          map[string]any{"type": "boolean", "description": "Convenience flag for read_page raw mode"},
				"timeout":      map[string]any{"type": "integer", "description": "Timeout in seconds (default: 30)"},
			},
		},
		Required: []string{"action"},
	}
}
 96  
// RequiresApproval always returns true for the browser tool.
func (t *BrowserTool) RequiresApproval() bool { return true }
 98  
// IsReadOnlyCall always returns false: no browser call is reported as
// read-only, whatever the argument string contains.
func (t *BrowserTool) IsReadOnlyCall(string) bool { return false }
100  
101  func (t *BrowserTool) Run(ctx context.Context, argsJSON string) (agent.ToolResult, error) {
102  	var args browserArgs
103  	if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
104  		return agent.ToolResult{Content: fmt.Sprintf("invalid arguments: %v", err), IsError: true}, nil
105  	}
106  
107  	if args.Action == "" {
108  		return agent.ToolResult{Content: "missing required parameter: action", IsError: true}, nil
109  	}
110  
111  	timeout := 30 * time.Second
112  	if args.Timeout > 0 {
113  		timeout = time.Duration(args.Timeout) * time.Second
114  	}
115  
116  	// close doesn't need a running backend
117  	if args.Action == "close" {
118  		return t.closeBrowser()
119  	}
120  
121  	// Validate required params before starting a browser
122  	if err := t.validateArgs(args); err != nil {
123  		return agent.ToolResult{Content: err.Error(), IsError: true}, nil
124  	}
125  
126  	// Ensure a backend is available (pinchtab preferred, chromedp fallback)
127  	if err := t.ensureBackend(ctx); err != nil {
128  		return agent.ToolResult{Content: fmt.Sprintf("failed to start browser: %v", err), IsError: true}, nil
129  	}
130  
131  	switch args.Action {
132  	case "navigate":
133  		return t.navigate(ctx, args, timeout)
134  	case "click":
135  		return t.click(ctx, args, timeout)
136  	case "type":
137  		return t.typeText(ctx, args, timeout)
138  	case "scroll":
139  		return t.scroll(ctx, args, timeout)
140  	case "screenshot":
141  		return t.screenshot(ctx, args, timeout)
142  	case "read_page":
143  		return t.readPage(ctx, args, timeout)
144  	case "execute_js":
145  		return t.executeJS(ctx, args, timeout)
146  	case "wait":
147  		return t.waitVisible(ctx, args, timeout)
148  	case "snapshot":
149  		// Pinchtab-only; returns a "requires pinchtab" error on the chromedp
150  		// fallback. No longer advertised in Info() so fresh calls should not
151  		// arrive here — but the dispatch stays to keep pinchtab environments
152  		// working (see ensureBackend's pinchtab-first preference).
153  		return t.snapshotAction(ctx, args)
154  	case "find":
155  		return t.findAction(ctx, args)
156  	default:
157  		// unreachable — validateArgs catches unknown actions
158  		return agent.ToolResult{Content: fmt.Sprintf("unknown action: %q", args.Action), IsError: true}, nil
159  	}
160  }
161  
162  // validateArgs checks required params before starting a browser.
163  func (t *BrowserTool) validateArgs(args browserArgs) error {
164  	switch args.Action {
165  	case "navigate":
166  		if args.URL == "" {
167  			return fmt.Errorf("navigate action requires 'url' parameter")
168  		}
169  	case "click":
170  		if args.Ref == "" && args.Selector == "" {
171  			return fmt.Errorf("click action requires 'ref' or 'selector' parameter")
172  		}
173  	case "type":
174  		if args.Ref == "" && args.Selector == "" {
175  			return fmt.Errorf("type action requires 'ref' or 'selector' parameter")
176  		}
177  	case "wait":
178  		if args.Selector == "" {
179  			return fmt.Errorf("wait action requires 'selector' parameter")
180  		}
181  	case "execute_js":
182  		if args.Script == "" {
183  			return fmt.Errorf("execute_js action requires 'script' parameter")
184  		}
185  	case "find":
186  		if args.Query == "" {
187  			return fmt.Errorf("find action requires 'query' parameter")
188  		}
189  	case "scroll", "screenshot", "read_page", "snapshot":
190  		// no required params
191  	default:
192  		return fmt.Errorf("unknown action: %q (valid: navigate, click, type, scroll, screenshot, read_page, execute_js, wait, close)", args.Action)
193  	}
194  	return nil
195  }
196  
197  // ensureBackend picks pinchtab if available, else falls back to chromedp.
198  func (t *BrowserTool) ensureBackend(ctx context.Context) error {
199  	t.mu.Lock()
200  	defer t.mu.Unlock()
201  
202  	// Already have a working backend?
203  	switch t.backend {
204  	case backendPinchtab:
205  		if t.pt.available(ctx) {
206  			return nil
207  		}
208  		// pinchtab died — clear stale tab ID, try to restart or fall through to chromedp
209  		t.tabID = ""
210  		t.backend = backendNone
211  	case backendChromedp:
212  		if t.ctx != nil && t.ctx.Err() == nil {
213  			return nil
214  		}
215  		// chromedp context dead — reset
216  		if t.cancel != nil {
217  			t.cancel()
218  		}
219  		t.ctx = nil
220  		t.cancel = nil
221  		t.active = false
222  		t.backend = backendNone
223  	}
224  
225  	// Try pinchtab first
226  	if t.pt == nil {
227  		t.pt = newPinchtabClient()
228  	}
229  	if err := t.pt.ensure(ctx); err == nil {
230  		t.backend = backendPinchtab
231  		return nil
232  	}
233  
234  	// Fall back to chromedp
235  	return t.startChromedp()
236  }
237  
238  func (t *BrowserTool) startChromedp() error {
239  	opts := append(chromedp.DefaultExecAllocatorOptions[:],
240  		chromedp.Flag("headless", false),
241  		chromedp.Flag("disable-gpu", true),
242  		chromedp.Flag("no-first-run", true),
243  		chromedp.Flag("no-default-browser-check", true),
244  	)
245  
246  	allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)
247  	browserCtx, browserCancel := chromedp.NewContext(allocCtx)
248  
249  	if err := chromedp.Run(browserCtx); err != nil {
250  		browserCancel()
251  		allocCancel()
252  		return fmt.Errorf("failed to start browser: %w", err)
253  	}
254  
255  	t.ctx = browserCtx
256  	t.cancel = func() {
257  		browserCancel()
258  		allocCancel()
259  	}
260  	t.active = true
261  	t.backend = backendChromedp
262  	return nil
263  }
264  
// isPinchtab reports whether the active backend is pinchtab. It reads
// t.backend without taking mu.
func (t *BrowserTool) isPinchtab() bool {
	return t.backend == backendPinchtab
}
268  
269  // --- Actions ---
270  
271  // formatNavigateResult builds the navigate result string with anti-bot warning and content preview.
272  func formatNavigateResult(pageURL, title, textPreview string) string {
273  	content := fmt.Sprintf("Navigated to: %s\nTitle: %s", pageURL, title)
274  
275  	if detectAntiBotPage(title) {
276  		content += "\n\nWARNING: This page appears to be an anti-bot challenge or CAPTCHA. " +
277  			"The page content is likely NOT the expected website content. " +
278  			"Do NOT attempt to extract data from this page. " +
279  			"Report to the user that the site blocked automated access."
280  	}
281  
282  	preview := strings.TrimSpace(textPreview)
283  	if preview != "" {
284  		const maxPreviewRunes = 200
285  		runes := []rune(preview)
286  		if len(runes) > maxPreviewRunes {
287  			preview = string(runes[:maxPreviewRunes]) + "..."
288  		}
289  		content += fmt.Sprintf("\nPreview: %s", preview)
290  	}
291  
292  	return content
293  }
294  
295  func (t *BrowserTool) navigate(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
296  	if t.isPinchtab() {
297  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
298  		defer cancel()
299  		// Always open a new tab to isolate navigation from previous tasks
300  		resp, err := t.pt.navigate(ctx, ptNavigateReq{
301  			URL:          args.URL,
302  			NewTab:       true,
303  			BlockImages:  args.BlockImages,
304  			BlockAds:     args.BlockAds,
305  			WaitFor:      args.WaitFor,
306  			WaitSelector: args.WaitSelector,
307  		})
308  		if err != nil {
309  			return agent.ToolResult{Content: fmt.Sprintf("navigate error: %v", err), IsError: true}, nil
310  		}
311  		if resp.TabID != "" {
312  			t.tabID = resp.TabID
313  		}
314  
315  		// Best-effort content preview — don't fail navigate if text fetch fails.
316  		// Only fetch if we have a valid tab ID from this navigation response.
317  		var preview string
318  		if resp.TabID != "" {
319  			if textResp, err := t.pt.text(ctx, resp.TabID, "", 0, false); err == nil {
320  				preview = textResp.Text
321  			}
322  		}
323  
324  		return agent.ToolResult{Content: formatNavigateResult(resp.URL, resp.Title, preview)}, nil
325  	}
326  
327  	// chromedp
328  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
329  	defer cancel()
330  
331  	var title, textContent string
332  	err := chromedp.Run(tCtx,
333  		chromedp.Navigate(args.URL),
334  		chromedp.WaitReady("body", chromedp.ByQuery),
335  		chromedp.Title(&title),
336  	)
337  	if err != nil {
338  		return agent.ToolResult{Content: fmt.Sprintf("navigate error: %v", err), IsError: true}, nil
339  	}
340  
341  	// Best-effort content preview
342  	_ = chromedp.Run(tCtx, chromedp.Evaluate(
343  		`(document.querySelector("html")?.innerText || "").substring(0, 300)`,
344  		&textContent,
345  	))
346  
347  	return agent.ToolResult{Content: formatNavigateResult(args.URL, title, textContent)}, nil
348  }
349  
350  func (t *BrowserTool) click(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
351  	if t.isPinchtab() {
352  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
353  		defer cancel()
354  		kind := "click"
355  		if args.Key != "" {
356  			kind = "press"
357  		} else if args.Value != "" {
358  			kind = "select"
359  		}
360  		req := ptActionReq{TabID: t.tabID, Kind: kind, Ref: args.Ref, Selector: args.Selector, Key: args.Key, Value: args.Value}
361  		resp, err := t.pt.action(ctx, req)
362  		if err != nil {
363  			return agent.ToolResult{Content: fmt.Sprintf("click error: %v", err), IsError: true}, nil
364  		}
365  		target := args.Ref
366  		if target == "" {
367  			target = args.Selector
368  		}
369  		_ = resp
370  		return agent.ToolResult{Content: fmt.Sprintf("Clicked: %s", target)}, nil
371  	}
372  
373  	// chromedp (selector only)
374  	if args.Selector == "" {
375  		return agent.ToolResult{Content: "chromedp fallback requires 'selector' (refs not supported without pinchtab)", IsError: true}, nil
376  	}
377  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
378  	defer cancel()
379  	if err := chromedp.Run(tCtx, chromedp.Click(args.Selector)); err != nil {
380  		return agent.ToolResult{Content: fmt.Sprintf("click error: %v", err), IsError: true}, nil
381  	}
382  	return agent.ToolResult{Content: fmt.Sprintf("Clicked: %s", args.Selector)}, nil
383  }
384  
385  func (t *BrowserTool) typeText(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
386  	if t.isPinchtab() {
387  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
388  		defer cancel()
389  		req := ptActionReq{TabID: t.tabID, Kind: "type", Ref: args.Ref, Selector: args.Selector, Text: args.Text}
390  		_, err := t.pt.action(ctx, req)
391  		if err != nil {
392  			return agent.ToolResult{Content: fmt.Sprintf("type error: %v", err), IsError: true}, nil
393  		}
394  		target := args.Ref
395  		if target == "" {
396  			target = args.Selector
397  		}
398  		return agent.ToolResult{Content: fmt.Sprintf("Typed into: %s", target)}, nil
399  	}
400  
401  	// chromedp
402  	if args.Selector == "" {
403  		return agent.ToolResult{Content: "chromedp fallback requires 'selector'", IsError: true}, nil
404  	}
405  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
406  	defer cancel()
407  	if err := chromedp.Run(tCtx, chromedp.SendKeys(args.Selector, args.Text)); err != nil {
408  		return agent.ToolResult{Content: fmt.Sprintf("type error: %v", err), IsError: true}, nil
409  	}
410  	return agent.ToolResult{Content: fmt.Sprintf("Typed into: %s", args.Selector)}, nil
411  }
412  
413  func (t *BrowserTool) scroll(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
414  	if t.isPinchtab() {
415  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
416  		defer cancel()
417  		req := ptActionReq{TabID: t.tabID, Kind: "scroll", Ref: args.Ref, Selector: args.Selector}
418  		if args.Ref == "" && args.Selector == "" {
419  			req.ScrollY = 800 // scroll down by default
420  		}
421  		_, err := t.pt.action(ctx, req)
422  		if err != nil {
423  			return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil
424  		}
425  		target := args.Ref
426  		if target == "" {
427  			target = args.Selector
428  		}
429  		if target == "" {
430  			target = "page"
431  		}
432  		return agent.ToolResult{Content: fmt.Sprintf("Scrolled: %s", target)}, nil
433  	}
434  
435  	// chromedp
436  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
437  	defer cancel()
438  
439  	if args.Selector != "" {
440  		if err := chromedp.Run(tCtx, chromedp.ScrollIntoView(args.Selector)); err != nil {
441  			return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil
442  		}
443  		return agent.ToolResult{Content: fmt.Sprintf("Scrolled to: %s", args.Selector)}, nil
444  	}
445  
446  	var scrollHeight int
447  	if err := chromedp.Run(tCtx,
448  		chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight); document.body.scrollHeight`, &scrollHeight),
449  	); err != nil {
450  		return agent.ToolResult{Content: fmt.Sprintf("scroll error: %v", err), IsError: true}, nil
451  	}
452  	return agent.ToolResult{Content: fmt.Sprintf("Scrolled to bottom (height: %d)", scrollHeight)}, nil
453  }
454  
455  func (t *BrowserTool) screenshot(_ context.Context, _ browserArgs, timeout time.Duration) (agent.ToolResult, error) {
456  	if t.isPinchtab() {
457  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
458  		defer cancel()
459  		// Note: pinchtab v0.7.6 captures viewport only (no full-page support).
460  		// For full-page, the LLM can scroll + take multiple screenshots.
461  		data, err := t.pt.screenshot(ctx, t.tabID)
462  		if err != nil {
463  			return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil
464  		}
465  
466  		// Save to temp file, resize for vision loop
467  		f, err := os.CreateTemp("", "browser-screenshot-*.jpg")
468  		if err != nil {
469  			return agent.ToolResult{Content: fmt.Sprintf("failed to create temp file: %v", err), IsError: true}, nil
470  		}
471  		f.Write(data)
472  		f.Close()
473  
474  		// Best-effort resize — skip if image is too small or sips fails
475  		ResizeImage(f.Name(), DefaultAPIWidth)
476  
477  		block, err := EncodeImage(f.Name())
478  		if err != nil {
479  			return agent.ToolResult{Content: fmt.Sprintf("encode error: %v", err), IsError: true}, nil
480  		}
481  		return agent.ToolResult{
482  			Content: fmt.Sprintf("Screenshot saved to: %s", f.Name()),
483  			Images:  []agent.ImageBlock{block},
484  		}, nil
485  	}
486  
487  	// chromedp
488  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
489  	defer cancel()
490  
491  	var buf []byte
492  	if err := chromedp.Run(tCtx, chromedp.FullScreenshot(&buf, 90)); err != nil {
493  		return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil
494  	}
495  
496  	f, err := os.CreateTemp("", "browser-screenshot-*.png")
497  	if err != nil {
498  		return agent.ToolResult{Content: fmt.Sprintf("failed to create temp file: %v", err), IsError: true}, nil
499  	}
500  	f.Write(buf)
501  	f.Close()
502  
503  	// Best-effort resize
504  	ResizeImage(f.Name(), DefaultAPIWidth)
505  
506  	block, err := EncodeImage(f.Name())
507  	if err != nil {
508  		return agent.ToolResult{Content: fmt.Sprintf("encode error: %v", err), IsError: true}, nil
509  	}
510  	return agent.ToolResult{
511  		Content: fmt.Sprintf("Screenshot saved to: %s", f.Name()),
512  		Images:  []agent.ImageBlock{block},
513  	}, nil
514  }
515  
// isPageContentEmpty reports whether content has no characters left once
// leading and trailing whitespace is removed.
func isPageContentEmpty(content string) bool {
	remaining := strings.TrimSpace(content)
	return len(remaining) == 0
}
520  
// antiBotTitlePatterns holds the title fragments that mark a page as an
// anti-bot or CAPTCHA interstitial. Entries are compared against a
// lower-cased title, so Latin-script entries must be lower case.
var antiBotTitlePatterns = []string{
	"just a moment",
	"verify you are human",
	"are you a robot",
	"robot check",
	"access denied",
	"attention required",
	"security check",
	"请验证",
	"人机验证",
	"安全验证",
	"please wait while we verify",
	"checking your browser",
	"ddos protection",
	"captcha",
	"bot detection",
}

// detectAntiBotPage reports whether title, compared case-insensitively,
// contains any entry of antiBotTitlePatterns.
func detectAntiBotPage(title string) bool {
	needle := strings.ToLower(title)
	blocked := false
	// Stop at the first hit; later patterns cannot change the answer.
	for i := 0; !blocked && i < len(antiBotTitlePatterns); i++ {
		blocked = strings.Contains(needle, antiBotTitlePatterns[i])
	}
	return blocked
}
550  
551  func (t *BrowserTool) readPage(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
552  	if t.isPinchtab() {
553  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
554  		defer cancel()
555  		resp, err := t.pt.text(ctx, t.tabID, args.TextMode, args.MaxChars, args.Raw)
556  		if err != nil {
557  			return agent.ToolResult{Content: fmt.Sprintf("read_page error: %v", err), IsError: true}, nil
558  		}
559  		text := resp.Text
560  		if isPageContentEmpty(text) {
561  			return agent.ToolResult{Content: fmt.Sprintf("URL: %s\nTitle: %s\n\nread_page returned empty content — the page may not have loaded correctly or may be blocked", resp.URL, resp.Title), IsError: true}, nil
562  		}
563  		const maxLen = 10240
564  		if len(text) > maxLen {
565  			text = text[:maxLen] + "\n... [truncated to 10KB]"
566  		}
567  		return agent.ToolResult{Content: fmt.Sprintf("URL: %s\nTitle: %s\n\n%s", resp.URL, resp.Title, text)}, nil
568  	}
569  
570  	// chromedp
571  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
572  	defer cancel()
573  
574  	selector := "html"
575  	if args.Selector != "" {
576  		selector = args.Selector
577  	}
578  
579  	var textContent string
580  	err := chromedp.Run(tCtx, chromedp.Evaluate(
581  		fmt.Sprintf(`document.querySelector(%q)?.innerText || ""`, selector),
582  		&textContent,
583  	))
584  	if err != nil {
585  		// Fall back to outerHTML
586  		var html string
587  		if err2 := chromedp.Run(tCtx, chromedp.OuterHTML(selector, &html)); err2 != nil {
588  			return agent.ToolResult{Content: fmt.Sprintf("read_page error: %v (fallback: %v)", err, err2), IsError: true}, nil
589  		}
590  		textContent = html
591  	}
592  
593  	if isPageContentEmpty(textContent) {
594  		return agent.ToolResult{Content: "read_page returned empty content — the page may not have loaded correctly or may be blocked", IsError: true}, nil
595  	}
596  
597  	const maxLen = 10240
598  	if len(textContent) > maxLen {
599  		textContent = textContent[:maxLen] + "\n... [truncated to 10KB]"
600  	}
601  	return agent.ToolResult{Content: textContent}, nil
602  }
603  
604  func (t *BrowserTool) executeJS(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
605  	if t.isPinchtab() {
606  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
607  		defer cancel()
608  		resp, err := t.pt.evaluate(ctx, t.tabID, args.Script)
609  		if err != nil {
610  			return agent.ToolResult{Content: fmt.Sprintf("execute_js error: %v", err), IsError: true}, nil
611  		}
612  		output := fmt.Sprintf("%v", resp.Result)
613  		const maxLen = 10240
614  		if len(output) > maxLen {
615  			output = output[:maxLen] + "\n... [truncated to 10KB]"
616  		}
617  		return agent.ToolResult{Content: output}, nil
618  	}
619  
620  	// chromedp: Evaluate runs in expression context, so multi-statement
621  	// scripts with `return`/`const`/`let` would fail with "Illegal return
622  	// statement". Transparently wrap them in an async IIFE so the script
623  	// author can write natural multi-statement JS.
624  	script := wrapJSForEvaluate(args.Script)
625  
626  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
627  	defer cancel()
628  
629  	var result any
630  	if err := chromedp.Run(tCtx, chromedp.Evaluate(script, &result)); err != nil {
631  		return agent.ToolResult{Content: fmt.Sprintf("execute_js error: %v", err), IsError: true}, nil
632  	}
633  	output := fmt.Sprintf("%v", result)
634  	const maxLen = 10240
635  	if len(output) > maxLen {
636  		output = output[:maxLen] + "\n... [truncated to 10KB]"
637  	}
638  	return agent.ToolResult{Content: output}, nil
639  }
640  
641  func (t *BrowserTool) waitVisible(_ context.Context, args browserArgs, timeout time.Duration) (agent.ToolResult, error) {
642  	if t.isPinchtab() {
643  		// Use JS polling via evaluate
644  		ctx, cancel := context.WithTimeout(context.Background(), timeout)
645  		defer cancel()
646  		script := fmt.Sprintf(`
647  			await new Promise((resolve, reject) => {
648  				const el = document.querySelector(%q);
649  				if (el) return resolve(true);
650  				const obs = new MutationObserver(() => {
651  					if (document.querySelector(%q)) { obs.disconnect(); resolve(true); }
652  				});
653  				obs.observe(document.body, {childList: true, subtree: true});
654  				setTimeout(() => { obs.disconnect(); reject('timeout'); }, %d);
655  			})
656  		`, args.Selector, args.Selector, int(timeout.Milliseconds()))
657  		_, err := t.pt.evaluate(ctx, t.tabID, script)
658  		if err != nil {
659  			return agent.ToolResult{Content: fmt.Sprintf("wait error: %v", err), IsError: true}, nil
660  		}
661  		return agent.ToolResult{Content: fmt.Sprintf("Element visible: %s", args.Selector)}, nil
662  	}
663  
664  	// chromedp
665  	tCtx, cancel := context.WithTimeout(t.ctx, timeout)
666  	defer cancel()
667  	if err := chromedp.Run(tCtx, chromedp.WaitVisible(args.Selector)); err != nil {
668  		return agent.ToolResult{Content: fmt.Sprintf("wait error: %v", err), IsError: true}, nil
669  	}
670  	return agent.ToolResult{Content: fmt.Sprintf("Element visible: %s", args.Selector)}, nil
671  }
672  
673  // --- New pinchtab-only actions ---
674  
675  func (t *BrowserTool) snapshotAction(_ context.Context, args browserArgs) (agent.ToolResult, error) {
676  	if !t.isPinchtab() {
677  		return agent.ToolResult{
678  			Content: "snapshot action requires pinchtab (not available, using chromedp fallback). Use read_page instead.",
679  			IsError: true,
680  		}, nil
681  	}
682  
683  	filter := args.Filter
684  	if filter == "" {
685  		filter = "interactive"
686  	}
687  
688  	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
689  	defer cancel()
690  	resp, err := t.pt.snapshot(ctx, t.tabID, filter)
691  	if err != nil {
692  		return agent.ToolResult{Content: fmt.Sprintf("snapshot error: %v", err), IsError: true}, nil
693  	}
694  
695  	var sb strings.Builder
696  	sb.WriteString(fmt.Sprintf("URL: %s\nTitle: %s\nElements: %d\n\n", resp.URL, resp.Title, resp.Count))
697  
698  	for _, n := range resp.Nodes {
699  		indent := strings.Repeat("  ", n.Depth)
700  		line := fmt.Sprintf("%s[%s] %s: %s", indent, n.Ref, n.Role, n.Name)
701  		if n.Value != "" {
702  			line += fmt.Sprintf(" = %q", n.Value)
703  		}
704  		if n.Focused {
705  			line += " (focused)"
706  		}
707  		if n.Disabled {
708  			line += " (disabled)"
709  		}
710  		sb.WriteString(line + "\n")
711  	}
712  
713  	content := sb.String()
714  	const maxLen = 20480 // snapshot can be larger
715  	if len(content) > maxLen {
716  		content = content[:maxLen] + "\n... [truncated]"
717  	}
718  
719  	return agent.ToolResult{Content: content}, nil
720  }
721  
722  func (t *BrowserTool) findAction(_ context.Context, args browserArgs) (agent.ToolResult, error) {
723  	if !t.isPinchtab() {
724  		return agent.ToolResult{
725  			Content: "find action requires pinchtab (not available, using chromedp fallback). Use execute_js or read_page instead.",
726  			IsError: true,
727  		}, nil
728  	}
729  
730  	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
731  	defer cancel()
732  	resp, err := t.pt.find(ctx, ptFindReq{Query: args.Query, TabID: t.tabID, TopK: 5})
733  	if err != nil {
734  		// /find may not exist in older pinchtab versions — suggest snapshot instead
735  		if strings.Contains(err.Error(), "404") {
736  			return agent.ToolResult{
737  				Content: "find is not available in this pinchtab version. Use 'snapshot' to get element refs, then click/type by ref.",
738  				IsError: true,
739  			}, nil
740  		}
741  		return agent.ToolResult{Content: fmt.Sprintf("find error: %v", err), IsError: true}, nil
742  	}
743  
744  	var sb strings.Builder
745  	sb.WriteString(fmt.Sprintf("Best match: %s (confidence: %s, score: %.2f)\n\n", resp.BestRef, resp.Confidence, resp.Score))
746  	for _, m := range resp.Matches {
747  		sb.WriteString(fmt.Sprintf("  [%s] %s: %s (score: %.2f)\n", m.Ref, m.Role, m.Name, m.Score))
748  	}
749  
750  	return agent.ToolResult{Content: sb.String()}, nil
751  }
752  
753  // wrapJSForEvaluate rewrites a script intended for chromedp.Evaluate so that
754  // bare top-level statements (`return x`, `const x = …; return x`) evaluate
755  // without a "SyntaxError: Illegal return statement". chromedp.Evaluate runs
756  // in expression context, so only statement-like leading keywords trigger the
757  // wrap — plain expressions (including semicolon-terminated or multi-line ones
758  // like `JSON.stringify(x);` or `a\nb`) must pass through unchanged, because
759  // wrapping them in an IIFE without an explicit `return` would silently change
760  // the returned value to `undefined`.
761  func wrapJSForEvaluate(script string) string {
762  	trimmed := strings.TrimSpace(script)
763  	if trimmed == "" {
764  		return script
765  	}
766  	// Already wrapped in a user-authored IIFE? Leave it alone — a redundant
767  	// wrap is still valid JS, but this keeps behavior predictable in tests.
768  	if strings.HasPrefix(trimmed, "(async") || strings.HasPrefix(trimmed, "(()") ||
769  		strings.HasPrefix(trimmed, "(function") {
770  		return script
771  	}
772  	// `async` alone is ambiguous: `async () => expr` is a perfectly valid
773  	// expression and wrapping it in an IIFE without a `return` would turn
774  	// the arrow-function result into `undefined`. Only `async function …`
775  	// (a declaration) needs wrapping, so match that two-token form and
776  	// leave bare `async` out of the general keyword list.
777  	if hasAsyncFunctionPrefix(trimmed) {
778  		return "(async () => { " + script + " })()"
779  	}
780  	if !hasLeadingKeyword(trimmed, "return", "const", "let", "var", "function", "if", "for", "while", "try") {
781  		return script
782  	}
783  	return "(async () => { " + script + " })()"
784  }
785  
786  // hasAsyncFunctionPrefix reports whether s starts with "async function"
787  // (with whitespace between the tokens). The arrow-function form
788  // `async () => …` and identifier-like forms (`asyncFoo`) return false.
789  func hasAsyncFunctionPrefix(s string) bool {
790  	const prefix = "async"
791  	if !strings.HasPrefix(s, prefix) {
792  		return false
793  	}
794  	rest := s[len(prefix):]
795  	if rest == "" || (rest[0] != ' ' && rest[0] != '\t') {
796  		return false
797  	}
798  	rest = strings.TrimLeft(rest, " \t")
799  	return hasLeadingKeyword(rest, "function")
800  }
801  
// hasLeadingKeyword reports whether s starts with any of the keywords as a
// complete token: the keyword is either the whole of s or is followed by a
// byte that cannot continue a JavaScript identifier. That distinguishes
// `return x`, `return;`, `return\nx`, `if(x)` and `try{` from identifiers
// that merely share a prefix (`returnValue`, `format`).
//
// Earlier versions accepted only a space, tab, `(` or `{` after the keyword,
// so scripts such as `return;` or a keyword followed by a newline were not
// recognised as statements.
func hasLeadingKeyword(s string, keywords ...string) bool {
	// identByte reports whether c may appear inside an identifier. Non-ASCII
	// bytes are treated as identifier bytes so that a keyword followed by a
	// Unicode letter is not mistaken for a standalone token.
	identByte := func(c byte) bool {
		switch {
		case c == '_' || c == '$' || c >= 0x80:
			return true
		case '0' <= c && c <= '9', 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
			return true
		}
		return false
	}

	for _, kw := range keywords {
		if !strings.HasPrefix(s, kw) {
			continue
		}
		if len(s) == len(kw) || !identByte(s[len(kw)]) {
			return true
		}
	}
	return false
}
820  
821  func (t *BrowserTool) closeBrowser() (agent.ToolResult, error) {
822  	t.mu.Lock()
823  	defer t.mu.Unlock()
824  
825  	if t.backend == backendNone {
826  		return agent.ToolResult{Content: "Browser is not running"}, nil
827  	}
828  
829  	t.cleanup()
830  	return agent.ToolResult{Content: "Browser closed"}, nil
831  }
832  
// Cleanup shuts down the browser by running cleanup while holding mu. It is
// safe to call multiple times: once the backend is backendNone, further calls
// only reassign that value.
func (t *BrowserTool) Cleanup() {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.cleanup()
}
839  
840  // cleanup must be called with mu held.
841  func (t *BrowserTool) cleanup() {
842  	switch t.backend {
843  	case backendPinchtab:
844  		if t.pt != nil {
845  			t.pt.close()
846  		}
847  		t.tabID = ""
848  	case backendChromedp:
849  		if t.cancel != nil {
850  			t.cancel()
851  		}
852  		t.ctx = nil
853  		t.cancel = nil
854  		t.active = false
855  	}
856  	t.backend = backendNone
857  }
858  
// CleanupOrphanedChromedp kills any Chrome processes started by chromedp from
// previous daemon runs that weren't properly cleaned up (e.g. force-kill, crash).
// Safe to call at daemon startup before registering tools.
//
// Only processes that `kill` actually signalled are counted in the log line;
// previously every PID reported by pgrep was counted even when kill failed.
func CleanupOrphanedChromedp() {
	// chromedp Chrome instances use --user-data-dir pointing to a temp dir
	// matching "chromedp-runner*". Find and kill them.
	out, err := exec.Command("pgrep", "-f", "user-data-dir.*chromedp-runner").Output()
	if err != nil {
		// pgrep failed or matched nothing — either way there is nothing to do.
		return
	}

	killed := 0
	for _, pid := range strings.Fields(string(out)) {
		// A failure here usually means the process is already gone (for
		// example a helper that exited with its parent), so it is skipped
		// quietly rather than logged.
		if err := exec.Command("kill", pid).Run(); err == nil {
			killed++
		}
	}
	if killed > 0 {
		log.Printf("cleaned up %d orphaned chromedp Chrome process(es)", killed)
	}
}
876  }