computer.go
1 package tools 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "strings" 8 "time" 9 10 "github.com/Kocoro-lab/ShanClaw/internal/agent" 11 "github.com/Kocoro-lab/ShanClaw/internal/client" 12 ) 13 14 type ComputerTool struct { 15 client *AXClient 16 screenW int 17 screenH int 18 } 19 20 func (t *ComputerTool) ensureScreenDims() { 21 if t.screenW > 0 { 22 return 23 } 24 w, h, err := GetScreenDimensions() 25 if err != nil { 26 t.screenW = DefaultAPIWidth 27 t.screenH = DefaultAPIHeight 28 return 29 } 30 t.screenW = w 31 t.screenH = h 32 } 33 34 func (t *ComputerTool) scaleXY(apiX, apiY int) (int, int) { 35 t.ensureScreenDims() 36 x, y := ScaleCoordinates(apiX, apiY, DefaultAPIWidth, DefaultAPIHeight, t.screenW, t.screenH) 37 return ClampCoordinates(x, y, t.screenW, t.screenH) 38 } 39 40 func (t *ComputerTool) captureAfterAction(result agent.ToolResult) agent.ToolResult { 41 time.Sleep(500 * time.Millisecond) 42 _, block, err := CaptureAndEncode(DefaultAPIWidth) 43 if err != nil { 44 return result // Non-fatal 45 } 46 result.Images = []agent.ImageBlock{block} 47 return result 48 } 49 50 type computerArgs struct { 51 Action string `json:"action"` 52 X int `json:"x,omitempty"` 53 Y int `json:"y,omitempty"` 54 Text string `json:"text,omitempty"` 55 Keys string `json:"keys,omitempty"` 56 Button string `json:"button,omitempty"` 57 Clicks int `json:"clicks,omitempty"` 58 Coordinate []int `json:"coordinate,omitempty"` // Anthropic native: [x, y] 59 } 60 61 // normalizeArgs maps Anthropic native action names and coordinate format 62 // to our internal format. 63 func normalizeArgs(args *computerArgs) { 64 // Map Anthropic coordinate array to x, y 65 if len(args.Coordinate) == 2 { 66 args.X = args.Coordinate[0] 67 args.Y = args.Coordinate[1] 68 } 69 70 // Map Anthropic native action names to our actions 71 switch args.Action { 72 case "left_click": 73 args.Action = "click" 74 args.Button = "left" 75 args.Clicks = 1 76 case "right_click": 77 args.Action = "click" 78 args.Button = "right" 79 args.Clicks = 1 80 case "double_click": 81 args.Action = "click" 82 args.Button = "left" 83 args.Clicks = 2 84 case "middle_click": 85 args.Action = "click" 86 args.Button = "left" // fallback — no middle click support 87 args.Clicks = 1 88 case "triple_click": 89 args.Action = "click" 90 args.Button = "left" 91 args.Clicks = 3 92 case "mouse_move": 93 args.Action = "move" 94 case "key": 95 args.Action = "hotkey" 96 if args.Text != "" && args.Keys == "" { 97 args.Keys = args.Text // Anthropic sends key combo in "text" field 98 } 99 case "screenshot": 100 args.Action = "screenshot" 101 } 102 } 103 104 func (t *ComputerTool) Info() agent.ToolInfo { 105 return agent.ToolInfo{ 106 Name: "computer", 107 Description: "OS-level mouse and keyboard control for macOS. Use for coordinate-based clicks, typing text (CJK/emoji safe), and keyboard shortcuts. For clicking UI elements, prefer accessibility tool (ref-based) over coordinate clicks. Actions: click, type, hotkey, move, screenshot.", 108 Parameters: map[string]any{ 109 "type": "object", 110 "properties": map[string]any{ 111 "action": map[string]any{"type": "string", "description": "Action to perform: click, type, hotkey, move"}, 112 "x": map[string]any{"type": "integer", "description": "Screen X coordinate (for click/move)"}, 113 "y": map[string]any{"type": "integer", "description": "Screen Y coordinate (for click/move)"}, 114 "text": map[string]any{"type": "string", "description": "Text to type (for type action)"}, 115 "keys": map[string]any{"type": "string", "description": "Key combination like command+c, command+shift+4 (for hotkey action)"}, 116 "button": map[string]any{"type": "string", "description": "Mouse button: left (default), right (for click action)"}, 117 "clicks": map[string]any{"type": "integer", "description": "Number of clicks: 1 (default), 2 for double-click (for click action)"}, 118 }, 119 }, 120 Required: []string{"action"}, 121 } 122 } 123 124 func (t *ComputerTool) RequiresApproval() bool { return true } 125 126 func (t *ComputerTool) IsReadOnlyCall(string) bool { return false } 127 128 func (t *ComputerTool) NativeToolDef() *client.NativeToolDef { 129 return &client.NativeToolDef{ 130 Type: "computer_20251124", 131 Name: "computer", 132 DisplayWidthPx: DefaultAPIWidth, 133 DisplayHeightPx: DefaultAPIHeight, 134 } 135 } 136 137 func (t *ComputerTool) Run(ctx context.Context, argsJSON string) (agent.ToolResult, error) { 138 var args computerArgs 139 if err := json.Unmarshal([]byte(argsJSON), &args); err != nil { 140 return agent.ToolResult{Content: fmt.Sprintf("invalid arguments: %v", err), IsError: true}, nil 141 } 142 143 if args.Action == "" { 144 return agent.ToolResult{Content: "missing required parameter: action", IsError: true}, nil 145 } 146 147 normalizeArgs(&args) 148 149 switch args.Action { 150 case "screenshot": 151 return t.screenshot() 152 case "click": 153 if t.client == nil { 154 return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil 155 } 156 return t.click(ctx, args) 157 case "type": 158 if t.client == nil { 159 return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil 160 } 161 return t.typeText(ctx, args) 162 case "hotkey": 163 if t.client == nil { 164 return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil 165 } 166 return t.hotkey(ctx, args) 167 case "move": 168 if t.client == nil { 169 return agent.ToolResult{Content: "computer tool requires macOS with ax_server", IsError: true}, nil 170 } 171 return t.move(ctx, args) 172 default: 173 return agent.ToolResult{ 174 Content: fmt.Sprintf("unknown action: %q (valid: click, type, hotkey, move, screenshot)", args.Action), 175 IsError: true, 176 }, nil 177 } 178 } 179 180 func (t *ComputerTool) screenshot() (agent.ToolResult, error) { 181 path, block, err := CaptureAndEncode(DefaultAPIWidth) 182 if err != nil { 183 return agent.ToolResult{Content: fmt.Sprintf("screenshot error: %v", err), IsError: true}, nil 184 } 185 return agent.ToolResult{ 186 Content: fmt.Sprintf("Screenshot captured. Saved to: %s", path), 187 Images: []agent.ImageBlock{block}, 188 }, nil 189 } 190 191 func (t *ComputerTool) click(ctx context.Context, args computerArgs) (agent.ToolResult, error) { 192 x, y := t.scaleXY(args.X, args.Y) 193 button := args.Button 194 if button == "" { 195 button = "left" 196 } 197 clicks := args.Clicks 198 if clicks < 1 { 199 clicks = 1 200 } 201 202 rawResult, err := t.client.Call(ctx, "mouse_event", map[string]any{ 203 "type": "click", 204 "x": float64(x), 205 "y": float64(y), 206 "button": button, 207 "clicks": clicks, 208 }) 209 if err != nil { 210 return agent.ToolResult{ 211 Content: fmt.Sprintf("click error: %v", err), 212 IsError: true, 213 }, nil 214 } 215 216 msg := fmt.Sprintf("Clicked %s button %d time(s) at (%d, %d)", button, clicks, x, y) 217 msg += parseActionContext(rawResult) 218 result := agent.ToolResult{Content: msg} 219 return t.captureAfterAction(result), nil 220 } 221 222 func (t *ComputerTool) typeText(ctx context.Context, args computerArgs) (agent.ToolResult, error) { 223 if args.Text == "" { 224 return agent.ToolResult{Content: "type action requires 'text' parameter", IsError: true}, nil 225 } 226 227 // ax_server handles CJK/non-ASCII via clipboard paste automatically 228 rawResult, err := t.client.Call(ctx, "type_text", map[string]any{ 229 "value": args.Text, 230 }) 231 if err != nil { 232 return agent.ToolResult{ 233 Content: fmt.Sprintf("type error: %v", err), 234 IsError: true, 235 }, nil 236 } 237 238 msg := fmt.Sprintf("Typed: %s", args.Text) 239 msg += parseActionContext(rawResult) 240 result := agent.ToolResult{Content: msg} 241 return t.captureAfterAction(result), nil 242 } 243 244 func (t *ComputerTool) hotkey(ctx context.Context, args computerArgs) (agent.ToolResult, error) { 245 if args.Keys == "" { 246 return agent.ToolResult{Content: "hotkey action requires 'keys' parameter", IsError: true}, nil 247 } 248 249 parts := strings.Split(strings.ToLower(args.Keys), "+") 250 if len(parts) == 0 { 251 return agent.ToolResult{Content: fmt.Sprintf("invalid key combination: %q", args.Keys), IsError: true}, nil 252 } 253 254 key := strings.TrimSpace(parts[len(parts)-1]) 255 var modifiers []string 256 for _, part := range parts[:len(parts)-1] { 257 modifiers = append(modifiers, strings.TrimSpace(part)) 258 } 259 260 rawResult, err := t.client.Call(ctx, "key_event", map[string]any{ 261 "key": key, 262 "modifiers": modifiers, 263 }) 264 if err != nil { 265 return agent.ToolResult{ 266 Content: fmt.Sprintf("hotkey error: %v", err), 267 IsError: true, 268 }, nil 269 } 270 271 msg := fmt.Sprintf("Pressed: %s", args.Keys) 272 msg += parseActionContext(rawResult) 273 result := agent.ToolResult{Content: msg} 274 return t.captureAfterAction(result), nil 275 } 276 277 func (t *ComputerTool) move(ctx context.Context, args computerArgs) (agent.ToolResult, error) { 278 x, y := t.scaleXY(args.X, args.Y) 279 280 rawResult, err := t.client.Call(ctx, "mouse_event", map[string]any{ 281 "type": "move", 282 "x": float64(x), 283 "y": float64(y), 284 }) 285 if err != nil { 286 return agent.ToolResult{ 287 Content: fmt.Sprintf("move error: %v", err), 288 IsError: true, 289 }, nil 290 } 291 292 msg := fmt.Sprintf("Moved cursor to (%d, %d)", x, y) 293 msg += parseActionContext(rawResult) 294 result := agent.ToolResult{Content: msg} 295 return t.captureAfterAction(result), nil 296 } 297 298 // parseActionContext extracts the context field from an ax_server action response 299 // and formats it as a human-readable string. 300 func parseActionContext(raw json.RawMessage) string { 301 var resp struct { 302 Context *appContext `json:"context,omitempty"` 303 } 304 if err := json.Unmarshal(raw, &resp); err != nil { 305 return "" 306 } 307 return formatContext(resp.Context) 308 }