// scenario_agent_e2e_test.go
1 // Copyright 2026 Alibaba Group Holding Ltd. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package e2e 16 17 import ( 18 "bytes" 19 "context" 20 "encoding/json" 21 "fmt" 22 "io" 23 "net/http" 24 "os" 25 "strings" 26 "testing" 27 "time" 28 29 "github.com/alibaba/OpenSandbox/sdks/sandbox/go" 30 "github.com/stretchr/testify/require" 31 ) 32 33 func getLLMEndpoint() string { 34 if v := os.Getenv("LLM_ENDPOINT"); v != "" { 35 return v 36 } 37 domain := os.Getenv("OPENSANDBOX_TEST_DOMAIN") 38 if domain == "" { 39 return "" 40 } 41 protocol := os.Getenv("OPENSANDBOX_TEST_PROTOCOL") 42 if protocol == "" { 43 protocol = "https" 44 } 45 return fmt.Sprintf("%s://%s/v1", protocol, domain) 46 } 47 48 func getLLMModel() string { 49 if v := os.Getenv("LLM_MODEL"); v != "" { 50 return v 51 } 52 return "azure/gpt-4o-mini" 53 } 54 55 func chatCompletion(ctx context.Context, endpoint, model string, messages []map[string]string) (string, error) { 56 body, _ := json.Marshal(map[string]any{ 57 "model": model, 58 "messages": messages, 59 "max_tokens": 1024, 60 }) 61 62 req, err := http.NewRequestWithContext(ctx, "POST", endpoint+"/chat/completions", bytes.NewReader(body)) 63 if err != nil { 64 return "", err 65 } 66 req.Header.Set("Content-Type", "application/json") 67 68 resp, err := http.DefaultClient.Do(req) 69 if err != nil { 70 return "", err 71 } 72 defer resp.Body.Close() 73 74 data, _ := io.ReadAll(resp.Body) 75 if resp.StatusCode != 200 { 76 
return "", fmt.Errorf("LLM returned %d: %s", resp.StatusCode, string(data)) 77 } 78 79 var result struct { 80 Choices []struct { 81 Message struct { 82 Content string `json:"content"` 83 } `json:"message"` 84 } `json:"choices"` 85 } 86 if err := json.Unmarshal(data, &result); err != nil { 87 return "", fmt.Errorf("parse LLM response: %w", err) 88 } 89 if len(result.Choices) == 0 { 90 return "", fmt.Errorf("no choices in LLM response") 91 } 92 return result.Choices[0].Message.Content, nil 93 } 94 95 func extractCode(text string) string { 96 start := strings.Index(text, "```python") 97 if start == -1 { 98 start = strings.Index(text, "```") 99 if start == -1 { 100 return "" 101 } 102 start += 3 103 } else { 104 start += 9 105 } 106 if nl := strings.Index(text[start:], "\n"); nl != -1 { 107 start += nl + 1 108 } 109 end := strings.Index(text[start:], "```") 110 if end == -1 { 111 return text[start:] 112 } 113 return strings.TrimSpace(text[start : start+end]) 114 } 115 116 func TestScenario_SimpleAgentLoop(t *testing.T) { 117 llmEndpoint := getLLMEndpoint() 118 if llmEndpoint == "" { 119 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 120 } 121 122 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 123 defer cancel() 124 125 config := getConnectionConfig(t) 126 sb, err := opensandbox.CreateSandbox(ctx, config, opensandbox.SandboxCreateOptions{ 127 Image: getSandboxImage(), 128 }) 129 require.NoError(t, err) 130 defer sb.Kill(context.Background()) 131 t.Logf("Sandbox ready: %s", sb.ID()) 132 133 task := "Write Python code that calculates the first 10 Fibonacci numbers and prints them as a comma-separated list. Only output the code block, nothing else." 134 t.Logf("Task: %s", task) 135 136 llmResponse, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 137 {"role": "system", "content": "You are a coding assistant. Respond ONLY with a Python code block. 
No explanation."}, 138 {"role": "user", "content": task}, 139 }) 140 require.NoError(t, err) 141 t.Logf("LLM response:\n%s", llmResponse) 142 143 code := extractCode(llmResponse) 144 if code == "" { 145 code = strings.TrimSpace(llmResponse) 146 } 147 t.Logf("Extracted code:\n%s", code) 148 149 writeCmd := fmt.Sprintf("cat > /tmp/agent_task.py << 'PYEOF'\n%s\nPYEOF", code) 150 writeResult, writeErr := sb.RunCommand(ctx, writeCmd, nil) 151 require.NoError(t, writeErr) 152 if writeResult.ExitCode != nil { 153 require.Equal(t, 0, *writeResult.ExitCode, "write code to sandbox: %s", writeResult.Text()) 154 } 155 exec, err := sb.RunCommand(ctx, "python3 /tmp/agent_task.py", nil) 156 require.NoError(t, err) 157 158 output := exec.Text() 159 t.Logf("Execution output: %s", output) 160 161 if exec.ExitCode != nil { 162 require.Equal(t, 0, *exec.ExitCode, "code execution exit code") 163 } 164 165 require.Contains(t, output, "34", "expected Fibonacci output") 166 require.Contains(t, output, "8") 167 require.True(t, strings.Contains(output, "13") || strings.Contains(output, "21") || strings.Contains(output, "5"), 168 "expected mid-sequence Fibonacci digits (5/13/21), got: %q", output) 169 t.Log("Agent loop completed successfully: task → LLM → code → execute → result") 170 } 171 172 func TestScenario_CodeInterpreterAgent(t *testing.T) { 173 llmEndpoint := getLLMEndpoint() 174 if llmEndpoint == "" { 175 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 176 } 177 178 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 179 defer cancel() 180 181 config := getConnectionConfig(t) 182 ci, err := opensandbox.CreateCodeInterpreter(ctx, config, opensandbox.CodeInterpreterCreateOptions{ 183 ReadyTimeout: 60 * time.Second, 184 HealthCheckInterval: 500 * time.Millisecond, 185 }) 186 require.NoError(t, err) 187 defer ci.Kill(context.Background()) 188 t.Logf("Code interpreter ready: %s", ci.ID()) 189 190 codeCtx, err := ci.CreateContext(ctx, 
opensandbox.CreateContextRequest{Language: "python"}) 191 require.NoError(t, err) 192 t.Logf("Python context: %s", codeCtx.ID) 193 194 conversation := []map[string]string{ 195 {"role": "system", "content": "You are a data analysis assistant. When asked to analyze data, respond ONLY with a Python code block. The code will be executed in a Jupyter-like environment where variables persist between turns. Always print your results. Use only the Python standard library — do NOT import numpy, pandas, or any external packages."}, 196 } 197 198 t.Log("--- Turn 1: Create dataset ---") 199 conversation = append(conversation, map[string]string{ 200 "role": "user", "content": "Create a list called 'sales' with these monthly values: [120, 150, 90, 200, 180, 220, 160, 190, 210, 170, 230, 250]. Print the list.", 201 }) 202 203 reply1, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), conversation) 204 require.NoError(t, err) 205 code1 := extractCode(reply1) 206 if code1 == "" { 207 code1 = strings.TrimSpace(reply1) 208 } 209 t.Logf("Turn 1 code: %s", code1) 210 211 exec1, err := ci.ExecuteInContext(ctx, codeCtx.ID, "python", code1, nil) 212 require.NoError(t, err) 213 t.Logf("Turn 1 output: %s", exec1.Text()) 214 conversation = append(conversation, map[string]string{"role": "assistant", "content": reply1}) 215 216 t.Log("--- Turn 2: Analyze dataset ---") 217 conversation = append(conversation, map[string]string{ 218 "role": "user", "content": "Using the 'sales' variable from the previous step, calculate and print: the mean, the max month (1-indexed), and whether total sales exceed 2000.", 219 }) 220 221 reply2, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), conversation) 222 require.NoError(t, err) 223 code2 := extractCode(reply2) 224 if code2 == "" { 225 code2 = strings.TrimSpace(reply2) 226 } 227 t.Logf("Turn 2 code: %s", code2) 228 229 exec2, err := ci.ExecuteInContext(ctx, codeCtx.ID, "python", code2, nil) 230 require.NoError(t, err) 231 output2 := exec2.Text() 
232 t.Logf("Turn 2 output: %s", output2) 233 234 require.NotEmpty(t, output2, "turn 2 produced no output — context persistence may have failed") 235 if !strings.Contains(strings.ToLower(output2), "true") && !strings.Contains(strings.ToLower(output2), "yes") && !strings.Contains(output2, "2170") { 236 t.Logf("Warning: output may not confirm total > 2000: %q", output2) 237 } 238 239 ci.DeleteContext(ctx, codeCtx.ID) 240 t.Log("Multi-turn code interpreter agent completed successfully") 241 } 242 243 func TestScenario_SandboxToolUse(t *testing.T) { 244 llmEndpoint := getLLMEndpoint() 245 if llmEndpoint == "" { 246 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 247 } 248 249 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 250 defer cancel() 251 252 config := getConnectionConfig(t) 253 sb, err := opensandbox.CreateSandbox(ctx, config, opensandbox.SandboxCreateOptions{ 254 Image: getSandboxImage(), 255 }) 256 require.NoError(t, err) 257 defer sb.Kill(context.Background()) 258 259 reply, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 260 {"role": "system", "content": "You have access to a Linux shell. Respond ONLY with the exact shell command to run. 
No explanation, no code blocks, just the raw command."}, 261 {"role": "user", "content": "What command shows the Linux kernel version, CPU count, and total memory in one line?"}, 262 }) 263 require.NoError(t, err) 264 command := strings.TrimSpace(reply) 265 command = strings.TrimPrefix(command, "```bash\n") 266 command = strings.TrimPrefix(command, "```\n") 267 command = strings.TrimSuffix(command, "\n```") 268 command = strings.TrimPrefix(command, "```") 269 command = strings.TrimSpace(command) 270 t.Logf("LLM suggested command: %s", command) 271 272 exec, err := sb.RunCommand(ctx, command, nil) 273 require.NoError(t, err) 274 shellOutput := exec.Text() 275 t.Logf("Shell output: %s", shellOutput) 276 277 interpretation, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 278 {"role": "system", "content": "Summarize the system information in one sentence."}, 279 {"role": "user", "content": fmt.Sprintf("Shell output:\n%s", shellOutput)}, 280 }) 281 require.NoError(t, err) 282 t.Logf("LLM interpretation: %s", interpretation) 283 284 require.NotEmpty(t, interpretation, "LLM produced no interpretation") 285 t.Log("Tool-use agent completed: task → LLM → shell → LLM → answer") 286 }