/ internal / tools / computer.go
computer.go
  1  package tools
  2  
  3  import (
  4  	"context"
  5  	"encoding/json"
  6  	"fmt"
  7  	"strings"
  8  	"time"
  9  
 10  	"github.com/Kocoro-lab/ShanClaw/internal/agent"
 11  	"github.com/Kocoro-lab/ShanClaw/internal/client"
 12  )
 13  
 14  type ComputerTool struct {
 15  	client  *AXClient
 16  	screenW int
 17  	screenH int
 18  }
 19  
 20  func (t *ComputerTool) ensureScreenDims() {
 21  	if t.screenW > 0 {
 22  		return
 23  	}
 24  	w, h, err := GetScreenDimensions()
 25  	if err != nil {
 26  		t.screenW = DefaultAPIWidth
 27  		t.screenH = DefaultAPIHeight
 28  		return
 29  	}
 30  	t.screenW = w
 31  	t.screenH = h
 32  }
 33  
 34  func (t *ComputerTool) scaleXY(apiX, apiY int) (int, int) {
 35  	t.ensureScreenDims()
 36  	x, y := ScaleCoordinates(apiX, apiY, DefaultAPIWidth, DefaultAPIHeight, t.screenW, t.screenH)
 37  	return ClampCoordinates(x, y, t.screenW, t.screenH)
 38  }
 39  
 40  func (t *ComputerTool) captureAfterAction(result agent.ToolResult) agent.ToolResult {
 41  	time.Sleep(500 * time.Millisecond)
 42  	_, block, err := CaptureAndEncode(DefaultAPIWidth)
 43  	if err != nil {
 44  		return result // Non-fatal
 45  	}
 46  	result.Images = []agent.ImageBlock{block}
 47  	return result
 48  }
 49  
 50  type computerArgs struct {
 51  	Action     string `json:"action"`
 52  	X          int    `json:"x,omitempty"`
 53  	Y          int    `json:"y,omitempty"`
 54  	Text       string `json:"text,omitempty"`
 55  	Keys       string `json:"keys,omitempty"`
 56  	Button     string `json:"button,omitempty"`
 57  	Clicks     int    `json:"clicks,omitempty"`
 58  	Coordinate []int  `json:"coordinate,omitempty"` // Anthropic native: [x, y]
 59  }
 60  
 61  // normalizeArgs maps Anthropic native action names and coordinate format
 62  // to our internal format.
 63  func normalizeArgs(args *computerArgs) {
 64  	// Map Anthropic coordinate array to x, y
 65  	if len(args.Coordinate) == 2 {
 66  		args.X = args.Coordinate[0]
 67  		args.Y = args.Coordinate[1]
 68  	}
 69  
 70  	// Map Anthropic native action names to our actions
 71  	switch args.Action {
 72  	case "left_click":
 73  		args.Action = "click"
 74  		args.Button = "left"
 75  		args.Clicks = 1
 76  	case "right_click":
 77  		args.Action = "click"
 78  		args.Button = "right"
 79  		args.Clicks = 1
 80  	case "double_click":
 81  		args.Action = "click"
 82  		args.Button = "left"
 83  		args.Clicks = 2
 84  	case "middle_click":
 85  		args.Action = "click"
 86  		args.Button = "left" // fallback — no middle click support
 87  		args.Clicks = 1
 88  	case "triple_click":
 89  		args.Action = "click"
 90  		args.Button = "left"
 91  		args.Clicks = 3
 92  	case "mouse_move":
 93  		args.Action = "move"
 94  	case "key":
 95  		args.Action = "hotkey"
 96  		if args.Text != "" && args.Keys == "" {
 97  			args.Keys = args.Text // Anthropic sends key combo in "text" field
 98  		}
 99  	case "screenshot":
100  		args.Action = "screenshot"
101  	}
102  }
103  
104  func (t *ComputerTool) Info() agent.ToolInfo {
105  	return agent.ToolInfo{
106  		Name:        "computer",
107  		Description: "OS-level mouse and keyboard control for macOS. Use for coordinate-based clicks, typing text (CJK/emoji safe), and keyboard shortcuts. For clicking UI elements, prefer accessibility tool (ref-based) over coordinate clicks. Actions: click, type, hotkey, move, screenshot.",
108  		Parameters: map[string]any{
109  			"type": "object",
110  			"properties": map[string]any{
111  				"action": map[string]any{"type": "string", "description": "Action to perform: click, type, hotkey, move"},
112  				"x":      map[string]any{"type": "integer", "description": "Screen X coordinate (for click/move)"},
113  				"y":      map[string]any{"type": "integer", "description": "Screen Y coordinate (for click/move)"},
114  				"text":   map[string]any{"type": "string", "description": "Text to type (for type action)"},
115  				"keys":   map[string]any{"type": "string", "description": "Key combination like command+c, command+shift+4 (for hotkey action)"},
116  				"button": map[string]any{"type": "string", "description": "Mouse button: left (default), right (for click action)"},
117  				"clicks": map[string]any{"type": "integer", "description": "Number of clicks: 1 (default), 2 for double-click (for click action)"},
118  			},
119  		},
120  		Required: []string{"action"},
121  	}
122  }
123  
124  func (t *ComputerTool) RequiresApproval() bool { return true }
125  
126  func (t *ComputerTool) IsReadOnlyCall(string) bool { return false }
127  
128  func (t *ComputerTool) NativeToolDef() *client.NativeToolDef {
129  	return &client.NativeToolDef{
130  		Type:            "computer_20251124",
131  		Name:            "computer",
132  		DisplayWidthPx:  DefaultAPIWidth,
133  		DisplayHeightPx: DefaultAPIHeight,
134  	}
135  }
136  
137  func (t *ComputerTool) Run(ctx context.Context, argsJSON string) (agent.ToolResult, error) {
138  	var args computerArgs
139  	if err := json.Unmarshal([]byte(argsJSON), &args); err != nil {
140  		return agent.ToolResult{Content: fmt.Sprintf("invalid arguments: %v", err), IsError: true}, nil
141  	}
142  
143  	if args.Action == "" {
144  		return agent.ToolResult{Content: "missing required parameter: action", IsError: true}, nil
145  	}
146  
147  	normalizeArgs(&args)
148  
149  	switch args.Action {
150  	case "screenshot":
151  		return t.screenshot()
152  	case "click":
153  		if t.client == nil {
154  			return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil
155  		}
156  		return t.click(ctx, args)
157  	case "type":
158  		if t.client == nil {
159  			return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil
160  		}
161  		return t.typeText(ctx, args)
162  	case "hotkey":
163  		if t.client == nil {
164  			return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil
165  		}
166  		return t.hotkey(ctx, args)
167  	case "move":
168  		if t.client == nil {
169  			return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil
170  		}
171  		return t.move(ctx, args)
172  	default:
173  		return agent.ToolResult{
174  			Content: fmt.Sprintf("unknown action: %q (valid: click, type, hotkey, move, screenshot)", args.Action),
175  			IsError: true,
176  		}, nil
177  	}
178  }
179  
180  func (t *ComputerTool) screenshot() (agent.ToolResult, error) {
181  	path, block, err := CaptureAndEncode(DefaultAPIWidth)
182  	if err != nil {
183  		return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil
184  	}
185  	return agent.ToolResult{
186  		Content: fmt.Sprintf("Screenshot captured. Saved to: %s", path),
187  		Images:  []agent.ImageBlock{block},
188  	}, nil
189  }
190  
191  func (t *ComputerTool) click(ctx context.Context, args computerArgs) (agent.ToolResult, error) {
192  	x, y := t.scaleXY(args.X, args.Y)
193  	button := args.Button
194  	if button == "" {
195  		button = "left"
196  	}
197  	clicks := args.Clicks
198  	if clicks < 1 {
199  		clicks = 1
200  	}
201  
202  	rawResult, err := t.client.Call(ctx, "mouse_event", map[string]any{
203  		"type":   "click",
204  		"x":      float64(x),
205  		"y":      float64(y),
206  		"button": button,
207  		"clicks": clicks,
208  	})
209  	if err != nil {
210  		return agent.ToolResult{
211  			Content: fmt.Sprintf("click error: %v", err),
212  			IsError: true,
213  		}, nil
214  	}
215  
216  	msg := fmt.Sprintf("Clicked %s button %d time(s) at (%d, %d)", button, clicks, x, y)
217  	msg += parseActionContext(rawResult)
218  	result := agent.ToolResult{Content: msg}
219  	return t.captureAfterAction(result), nil
220  }
221  
222  func (t *ComputerTool) typeText(ctx context.Context, args computerArgs) (agent.ToolResult, error) {
223  	if args.Text == "" {
224  		return agent.ToolResult{Content: "type action requires 'text' parameter", IsError: true}, nil
225  	}
226  
227  	// ax_server handles CJK/non-ASCII via clipboard paste automatically
228  	rawResult, err := t.client.Call(ctx, "type_text", map[string]any{
229  		"value": args.Text,
230  	})
231  	if err != nil {
232  		return agent.ToolResult{
233  			Content: fmt.Sprintf("type error: %v", err),
234  			IsError: true,
235  		}, nil
236  	}
237  
238  	msg := fmt.Sprintf("Typed: %s", args.Text)
239  	msg += parseActionContext(rawResult)
240  	result := agent.ToolResult{Content: msg}
241  	return t.captureAfterAction(result), nil
242  }
243  
244  func (t *ComputerTool) hotkey(ctx context.Context, args computerArgs) (agent.ToolResult, error) {
245  	if args.Keys == "" {
246  		return agent.ToolResult{Content: "hotkey action requires 'keys' parameter", IsError: true}, nil
247  	}
248  
249  	parts := strings.Split(strings.ToLower(args.Keys), "+")
250  	if len(parts) == 0 {
251  		return agent.ToolResult{Content: fmt.Sprintf("invalid key combination: %q", args.Keys), IsError: true}, nil
252  	}
253  
254  	key := strings.TrimSpace(parts[len(parts)-1])
255  	var modifiers []string
256  	for _, part := range parts[:len(parts)-1] {
257  		modifiers = append(modifiers, strings.TrimSpace(part))
258  	}
259  
260  	rawResult, err := t.client.Call(ctx, "key_event", map[string]any{
261  		"key":       key,
262  		"modifiers": modifiers,
263  	})
264  	if err != nil {
265  		return agent.ToolResult{
266  			Content: fmt.Sprintf("hotkey error: %v", err),
267  			IsError: true,
268  		}, nil
269  	}
270  
271  	msg := fmt.Sprintf("Pressed: %s", args.Keys)
272  	msg += parseActionContext(rawResult)
273  	result := agent.ToolResult{Content: msg}
274  	return t.captureAfterAction(result), nil
275  }
276  
277  func (t *ComputerTool) move(ctx context.Context, args computerArgs) (agent.ToolResult, error) {
278  	x, y := t.scaleXY(args.X, args.Y)
279  
280  	rawResult, err := t.client.Call(ctx, "mouse_event", map[string]any{
281  		"type": "move",
282  		"x":    float64(x),
283  		"y":    float64(y),
284  	})
285  	if err != nil {
286  		return agent.ToolResult{
287  			Content: fmt.Sprintf("move error: %v", err),
288  			IsError: true,
289  		}, nil
290  	}
291  
292  	msg := fmt.Sprintf("Moved cursor to (%d, %d)", x, y)
293  	msg += parseActionContext(rawResult)
294  	result := agent.ToolResult{Content: msg}
295  	return t.captureAfterAction(result), nil
296  }
297  
298  // parseActionContext extracts the context field from an ax_server action response
299  // and formats it as a human-readable string.
300  func parseActionContext(raw json.RawMessage) string {
301  	var resp struct {
302  		Context *appContext `json:"context,omitempty"`
303  	}
304  	if err := json.Unmarshal(raw, &resp); err != nil {
305  		return ""
306  	}
307  	return formatContext(resp.Context)
308  }