// scenario_agent_e2e_test.go
1 // Copyright 2026 Alibaba Group Holding Ltd. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package e2e 16 17 import ( 18 "bytes" 19 "context" 20 "encoding/json" 21 "fmt" 22 "io" 23 "net/http" 24 "os" 25 "strings" 26 "testing" 27 "time" 28 29 "github.com/alibaba/OpenSandbox/sdks/sandbox/go" 30 "github.com/stretchr/testify/require" 31 ) 32 33 func getLLMEndpoint() string { 34 if v := os.Getenv("LLM_ENDPOINT"); v != "" { 35 return v 36 } 37 domain := os.Getenv("OPENSANDBOX_TEST_DOMAIN") 38 if domain == "" { 39 return "" 40 } 41 protocol := os.Getenv("OPENSANDBOX_TEST_PROTOCOL") 42 if protocol == "" { 43 protocol = "https" 44 } 45 return fmt.Sprintf("%s://%s/v1", protocol, domain) 46 } 47 48 func getLLMModel() string { 49 if v := os.Getenv("LLM_MODEL"); v != "" { 50 return v 51 } 52 return "azure/gpt-4o-mini" 53 } 54 55 func chatCompletion(ctx context.Context, endpoint, model string, messages []map[string]string) (string, error) { 56 body, _ := json.Marshal(map[string]any{ 57 "model": model, 58 "messages": messages, 59 "max_tokens": 1024, 60 }) 61 62 req, err := http.NewRequestWithContext(ctx, "POST", endpoint+"/chat/completions", bytes.NewReader(body)) 63 if err != nil { 64 return "", err 65 } 66 req.Header.Set("Content-Type", "application/json") 67 68 resp, err := http.DefaultClient.Do(req) 69 if err != nil { 70 return "", err 71 } 72 defer resp.Body.Close() 73 74 data, _ := io.ReadAll(resp.Body) 75 if resp.StatusCode != 200 { 76 
return "", fmt.Errorf("LLM returned %d: %s", resp.StatusCode, string(data)) 77 } 78 79 var result struct { 80 Choices []struct { 81 Message struct { 82 Content string `json:"content"` 83 } `json:"message"` 84 } `json:"choices"` 85 } 86 if err := json.Unmarshal(data, &result); err != nil { 87 return "", fmt.Errorf("parse LLM response: %w", err) 88 } 89 if len(result.Choices) == 0 { 90 return "", fmt.Errorf("no choices in LLM response") 91 } 92 return result.Choices[0].Message.Content, nil 93 } 94 95 func extractCode(text string) string { 96 start := strings.Index(text, "```python") 97 if start == -1 { 98 start = strings.Index(text, "```") 99 if start == -1 { 100 return "" 101 } 102 start += 3 103 } else { 104 start += 9 105 } 106 if nl := strings.Index(text[start:], "\n"); nl != -1 { 107 start += nl + 1 108 } 109 end := strings.Index(text[start:], "```") 110 if end == -1 { 111 return text[start:] 112 } 113 return strings.TrimSpace(text[start : start+end]) 114 } 115 116 func TestScenario_SimpleAgentLoop(t *testing.T) { 117 llmEndpoint := getLLMEndpoint() 118 if llmEndpoint == "" { 119 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 120 } 121 122 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 123 defer cancel() 124 125 config := getConnectionConfig(t) 126 sb, err := opensandbox.CreateSandbox(ctx, config, opensandbox.SandboxCreateOptions{ 127 Image: getSandboxImage(), 128 }) 129 require.NoError(t, err) 130 defer sb.Kill(context.Background()) 131 t.Logf("Sandbox ready: %s", sb.ID()) 132 133 task := "Write Python code that calculates the first 10 Fibonacci numbers and prints them as a comma-separated list. Only output the code block, nothing else." 134 t.Logf("Task: %s", task) 135 136 llmResponse, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 137 {"role": "system", "content": "You are a coding assistant. Respond ONLY with a Python code block. 
No explanation."}, 138 {"role": "user", "content": task}, 139 }) 140 require.NoError(t, err) 141 t.Logf("LLM response:\n%s", llmResponse) 142 143 code := extractCode(llmResponse) 144 if code == "" { 145 code = strings.TrimSpace(llmResponse) 146 } 147 t.Logf("Extracted code:\n%s", code) 148 149 writeCmd := fmt.Sprintf("cat > /tmp/agent_task.py << 'PYEOF'\n%s\nPYEOF", code) 150 writeResult, writeErr := sb.RunCommand(ctx, writeCmd, nil) 151 require.NoError(t, writeErr) 152 if writeResult.ExitCode != nil { 153 require.Equal(t, 0, *writeResult.ExitCode, "write code to sandbox: %s", writeResult.Text()) 154 } 155 exec, err := sb.RunCommand(ctx, "python3 /tmp/agent_task.py", nil) 156 require.NoError(t, err) 157 158 output := exec.Text() 159 t.Logf("Execution output: %s", output) 160 161 if exec.ExitCode != nil { 162 require.Equal(t, 0, *exec.ExitCode, "code execution exit code") 163 } 164 165 require.Contains(t, output, "34", "expected Fibonacci output") 166 require.Contains(t, output, "8") 167 require.True(t, strings.Contains(output, "13") || strings.Contains(output, "21") || strings.Contains(output, "5"), 168 "expected mid-sequence Fibonacci digits (5/13/21), got: %q", output) 169 t.Log("Agent loop completed successfully: task → LLM → code → execute → result") 170 } 171 172 func TestScenario_CodeInterpreterAgent(t *testing.T) { 173 llmEndpoint := getLLMEndpoint() 174 if llmEndpoint == "" { 175 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 176 } 177 178 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 179 defer cancel() 180 181 config := getConnectionConfig(t) 182 ci, err := opensandbox.CreateCodeInterpreter(ctx, config, opensandbox.CodeInterpreterCreateOptions{ 183 ReadyTimeout: 60 * time.Second, 184 HealthCheckInterval: 500 * time.Millisecond, 185 }) 186 require.NoError(t, err) 187 defer ci.Kill(context.Background()) 188 t.Logf("Code interpreter ready: %s", ci.ID()) 189 190 codeCtx, err := ci.CreateContext(ctx, 
opensandbox.CreateContextRequest{Language: "python"}) 191 require.NoError(t, err) 192 t.Logf("Python context: %s", codeCtx.ID) 193 194 conversation := []map[string]string{ 195 {"role": "system", "content": "You are a data analysis assistant. When asked to analyze data, respond ONLY with a Python code block. The code will be executed in a Jupyter-like environment where variables persist between turns. Always print your results. Use only the Python standard library — do NOT import numpy, pandas, or any external packages."}, 196 } 197 198 t.Log("--- Turn 1: Create dataset ---") 199 conversation = append(conversation, map[string]string{ 200 "role": "user", "content": "Create a list called 'sales' with these monthly values: [120, 150, 90, 200, 180, 220, 160, 190, 210, 170, 230, 250]. Print the list.", 201 }) 202 203 reply1, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), conversation) 204 require.NoError(t, err) 205 code1 := extractCode(reply1) 206 if code1 == "" { 207 code1 = strings.TrimSpace(reply1) 208 } 209 t.Logf("Turn 1 code: %s", code1) 210 211 exec1, err := ci.ExecuteInContext(ctx, codeCtx.ID, "python", code1, nil) 212 require.NoError(t, err) 213 t.Logf("Turn 1 output: %s", exec1.Text()) 214 conversation = append(conversation, map[string]string{"role": "assistant", "content": reply1}) 215 216 t.Log("--- Turn 2: Analyze dataset ---") 217 conversation = append(conversation, map[string]string{ 218 "role": "user", "content": "Using the 'sales' variable from the previous step, calculate and print: the mean, the max month (1-indexed), and whether total sales exceed 2000.", 219 }) 220 221 reply2, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), conversation) 222 require.NoError(t, err) 223 code2 := extractCode(reply2) 224 if code2 == "" { 225 code2 = strings.TrimSpace(reply2) 226 } 227 t.Logf("Turn 2 code: %s", code2) 228 229 exec2, err := ci.ExecuteInContext(ctx, codeCtx.ID, "python", code2, nil) 230 require.NoError(t, err) 231 output2 := exec2.Text() 
232 t.Logf("Turn 2 output: %s", output2) 233 234 require.NotEmpty(t, output2, "turn 2 produced no output — context persistence may have failed") 235 if !strings.Contains(strings.ToLower(output2), "true") && !strings.Contains(strings.ToLower(output2), "yes") && !strings.Contains(output2, "2170") { 236 t.Logf("Warning: output may not confirm total > 2000: %q", output2) 237 } 238 239 ci.DeleteContext(ctx, codeCtx.ID) 240 t.Log("Multi-turn code interpreter agent completed successfully") 241 } 242 243 func TestScenario_SandboxToolUse(t *testing.T) { 244 llmEndpoint := getLLMEndpoint() 245 if llmEndpoint == "" { 246 t.Skip("LLM_ENDPOINT or OPENSANDBOX_TEST_DOMAIN not set") 247 } 248 249 ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute) 250 defer cancel() 251 252 config := getConnectionConfig(t) 253 sb, err := opensandbox.CreateSandbox(ctx, config, opensandbox.SandboxCreateOptions{ 254 Image: getSandboxImage(), 255 }) 256 require.NoError(t, err) 257 defer sb.Kill(context.Background()) 258 259 reply, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 260 {"role": "system", "content": "You have access to a Linux shell. Respond ONLY with the exact shell command to run. 
No explanation, no code blocks, just the raw command."}, 261 {"role": "user", "content": "What command shows the Linux kernel version, CPU count, and total memory in one line?"}, 262 }) 263 require.NoError(t, err) 264 command := strings.TrimSpace(reply) 265 command = strings.TrimPrefix(command, "```bash\n") 266 command = strings.TrimPrefix(command, "```\n") 267 command = strings.TrimSuffix(command, "\n```") 268 command = strings.TrimPrefix(command, "```") 269 command = strings.TrimSpace(command) 270 t.Logf("LLM suggested command: %s", command) 271 272 exec, err := sb.RunCommand(ctx, command, nil) 273 require.NoError(t, err) 274 shellOutput := exec.Text() 275 t.Logf("Shell output: %s", shellOutput) 276 277 interpretation, err := chatCompletion(ctx, llmEndpoint, getLLMModel(), []map[string]string{ 278 {"role": "system", "content": "Summarize the system information in one sentence."}, 279 {"role": "user", "content": fmt.Sprintf("Shell output:\n%s", shellOutput)}, 280 }) 281 require.NoError(t, err) 282 t.Logf("LLM interpretation: %s", interpretation) 283 284 require.NotEmpty(t, interpretation, "LLM produced no interpretation") 285 t.Log("Tool-use agent completed: task → LLM → shell → LLM → answer") 286 }