Cradicle Explorer

/ examples / terminal_bench / test_agent_comparison.py
test_agent_comparison.py
  1  #!/usr/bin/env python3
  2  """
  3  Agent Comparison Test - Run 5 tasks with both wrapper and direct agent
  4  
  5  This test compares:
  6  1. Direct Agent approach (praisonai_external_agent pattern)
  7  2. CLI Wrapper approach (praisonai_wrapper_agent pattern)
  8  
  9  Usage:
 10      python test_agent_comparison.py
 11  """
 12  import sys
 13  import os
 14  import time
 15  import asyncio
 16  from datetime import datetime
 17  
# Prepend <directory of this file>/../../src/praisonai-agents to sys.path so the
# ``praisonaiagents`` imports done later in this file can resolve against it.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src', 'praisonai-agents'))

# 5 Test Tasks (simple to moderately complex)
#
# Each entry is a dict with three keys:
#   "name"               - short identifier; shown in progress output and copied
#                          into each result record as "task".
#   "instruction"        - the natural-language prompt handed to the agent
#                          (direct runner) or to the CLI (wrapper runner).
#   "expected_artifacts" - human-readable list of what the task should produce.
#                          No code visible in this file reads it.
TEST_TASKS = [
    {
        "name": "list_directory",
        "instruction": "List all files in the current directory and count how many Python files exist",
        "expected_artifacts": ["file count", ".py count"]
    },
    {
        "name": "create_structure",
        "instruction": "Create a directory called 'test_workspace', create a file inside it named 'readme.txt' with content 'Hello World', then verify it exists",
        "expected_artifacts": ["test_workspace/", "readme.txt"]
    },
    {
        "name": "file_operations",
        "instruction": "Create a file called 'numbers.txt' with numbers 1-10 each on a new line, then display the first 3 lines",
        "expected_artifacts": ["numbers.txt"]
    },
    {
        "name": "system_info",
        "instruction": "Show the current working directory, current user, and Python version",
        "expected_artifacts": ["pwd output", "whoami output"]
    },
    {
        "name": "text_processing",
        "instruction": "Create a file 'words.txt' with 5 words (apple, banana, cherry, date, elderberry), then count the total characters in the file",
        "expected_artifacts": ["words.txt", "character count"]
    }
]
 49  
 50  
 51  class DirectAgentRunner:
 52      """Runs tasks using direct Agent class approach."""
 53      
 54      def __init__(self):
 55          from praisonaiagents import Agent
 56          from praisonaiagents.tools import execute_command
 57          from praisonaiagents.approval import get_approval_registry, AutoApproveBackend
 58          
 59          self.Agent = Agent
 60          self.execute_command = execute_command
 61          self.registry = get_approval_registry()
 62          self.registry.set_backend(AutoApproveBackend())
 63          self.results = []
 64      
 65      def run_task(self, task: dict) -> dict:
 66          """Run a single task and return metrics."""
 67          print(f"\n🎯 Direct Agent - Running: {task['name']}")
 68          print(f"   Instruction: {task['instruction'][:60]}...")
 69          
 70          start_time = time.time()
 71          
 72          try:
 73              agent = self.Agent(
 74                  name=f"direct-{task['name']}",
 75                  instructions="You are a terminal task agent. Use execute_command tool to complete tasks.",
 76                  tools=[self.execute_command],
 77                  llm="gpt-4o-mini"
 78              )
 79              
 80              result = agent.start(task['instruction'])
 81              elapsed = time.time() - start_time
 82              
 83              # Get cost if available
 84              cost = getattr(agent, 'total_cost', 0) or 0
 85              
 86              print(f"   ✅ Completed in {elapsed:.2f}s")
 87              print(f"   💰 Cost: ${cost:.6f}")
 88              
 89              return {
 90                  "task": task['name'],
 91                  "success": True,
 92                  "time": elapsed,
 93                  "cost": cost,
 94                  "result_preview": str(result)[:100] if result else "No output",
 95                  "error": None
 96              }
 97              
 98          except Exception as e:
 99              elapsed = time.time() - start_time
100              print(f"   ❌ Failed in {elapsed:.2f}s: {e}")
101              return {
102                  "task": task['name'],
103                  "success": False,
104                  "time": elapsed,
105                  "cost": 0,
106                  "result_preview": None,
107                  "error": str(e)
108              }
109  
110  
class WrapperAgentRunner:
    """Run tasks by invoking the ``praisonai`` CLI in a subprocess."""

    def __init__(self, model: str = "gpt-4o-mini", timeout: float = 300,
                 executable: str = "praisonai"):
        """Store the CLI settings used for every task.

        Args:
            model: value passed to the CLI's ``--model`` option.
            timeout: per-task limit in seconds (default 300, i.e. 5 minutes).
            executable: name or path of the CLI program to launch.
        """
        self.model = model
        self.timeout = timeout
        self.executable = executable
        self.results = []

    @staticmethod
    def _failure(task: dict, elapsed: float, error: str) -> dict:
        """Build the result record shared by every failure path."""
        return {
            "task": task['name'],
            "success": False,
            "time": elapsed,
            "cost": 0,
            "result_preview": None,
            "error": error,
        }

    def run_task(self, task: dict) -> dict:
        """Run a single task through the CLI and return its metrics.

        Args:
            task: dict with at least ``"name"`` and ``"instruction"`` keys.

        Returns:
            dict with keys ``task``, ``success``, ``time`` (seconds), ``cost``
            (always 0 here), ``result_preview`` (first 100 chars of stdout,
            ``"No output"`` if stdout is empty, ``None`` on failure) and
            ``error`` (``None`` on success).
        """
        # Bound before the try block so the except clauses below can always
        # resolve subprocess.TimeoutExpired.
        import subprocess

        print(f"\n🎯 Wrapper Agent - Running: {task['name']}")
        print(f"   Instruction: {task['instruction'][:60]}...")

        # Format: praisonai "TASK" --model MODEL
        # Passed as an argv list, so the instruction needs no shell quoting.
        cmd = [self.executable, task['instruction'], "--model", self.model]

        # perf_counter is monotonic, so the interval is immune to clock changes.
        started = time.perf_counter()

        try:
            completed = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.timeout,
                # NOTE(review): LOGLEVEL=WARNING presumably quiets the CLI's
                # logging -- confirm the CLI honours this variable.
                env={**os.environ, "LOGLEVEL": "WARNING"},
            )

        except subprocess.TimeoutExpired:
            elapsed = time.perf_counter() - started
            print(f"   ⏱️  Timeout after {elapsed:.2f}s")
            return self._failure(task, elapsed, f"Timeout after {self.timeout}s")

        except FileNotFoundError:
            elapsed = time.perf_counter() - started
            print(f"   ⚠️ {self.executable} CLI not found")
            return self._failure(
                task,
                elapsed,
                f"{self.executable} CLI not installed (pip install praisonai)",
            )

        except Exception as e:
            # Deliberate catch-all boundary: one failing task must not abort
            # the whole comparison run.
            elapsed = time.perf_counter() - started
            print(f"   ❌ Failed in {elapsed:.2f}s: {e}")
            return self._failure(task, elapsed, str(e))

        elapsed = time.perf_counter() - started

        if completed.returncode != 0:
            print(f"   ❌ Failed with exit code {completed.returncode} in {elapsed:.2f}s")
            return self._failure(
                task,
                elapsed,
                f"Exit code {completed.returncode}: {completed.stderr[:200]}",
            )

        print(f"   ✅ Completed in {elapsed:.2f}s")
        return {
            "task": task['name'],
            "success": True,
            "time": elapsed,
            "cost": 0,  # Cost tracking not available in CLI mode
            "result_preview": completed.stdout[:100] if completed.stdout else "No output",
            "error": None,
        }
203  
204  
def compare_results(direct_results: list, wrapper_results: list):
    """Print a side-by-side comparison of the two runners' results.

    Success rates and per-task averages are derived from the actual number
    of results rather than assuming exactly five tasks.

    Args:
        direct_results: result records from the direct-agent runner. Each
            record is a dict with ``task``, ``success``, ``time`` and
            ``cost`` keys.
        wrapper_results: result records from the CLI-wrapper runner, same
            shape (``cost`` is not read for this list). The detailed table
            pairs records by position.

    Returns:
        None. All output goes to stdout.
    """

    def _ratio(numerator, denominator):
        # Empty result lists must not raise ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    print("\n" + "=" * 70)
    print("📊 COMPARISON RESULTS")
    print("=" * 70)

    # Calculate metrics
    direct_count = len(direct_results)
    wrapper_count = len(wrapper_results)

    direct_success = sum(1 for r in direct_results if r['success'])
    wrapper_success = sum(1 for r in wrapper_results if r['success'])

    direct_total_time = sum(r['time'] for r in direct_results)
    wrapper_total_time = sum(r['time'] for r in wrapper_results)

    direct_total_cost = sum(r['cost'] for r in direct_results)

    direct_pct = 100 * _ratio(direct_success, direct_count)
    wrapper_pct = 100 * _ratio(wrapper_success, wrapper_count)

    print("\n✅ Success Rate:")
    print(f"   Direct Agent:  {direct_success}/{direct_count} ({direct_pct:.0f}%)")
    print(f"   Wrapper Agent: {wrapper_success}/{wrapper_count} ({wrapper_pct:.0f}%)")

    print("\n⏱️  Total Time:")
    print(f"   Direct Agent:  {direct_total_time:.2f}s (avg: {_ratio(direct_total_time, direct_count):.2f}s per task)")
    print(f"   Wrapper Agent: {wrapper_total_time:.2f}s (avg: {_ratio(wrapper_total_time, wrapper_count):.2f}s per task)")

    print("\n💰 Total Cost (Direct Agent only):")
    print(f"   ${direct_total_cost:.6f} (avg: ${_ratio(direct_total_cost, direct_count):.6f} per task)")

    print("\n📋 Detailed Results:")
    print(f"   {'Task':<20} {'Direct':<12} {'Wrapper':<12} {'Winner':<10}")
    print(f"   {'-'*56}")

    for d, w in zip(direct_results, wrapper_results):
        d_status = "✅" if d['success'] else "❌"
        w_status = "✅" if w['success'] else "❌"

        if d['success'] and w['success']:
            # Both succeeded: the faster one wins (a tie goes to Wrapper).
            winner = "Direct" if d['time'] < w['time'] else "Wrapper"
        elif d['success']:
            winner = "Direct"
        elif w['success']:
            winner = "Wrapper"
        else:
            winner = "None"

        print(f"   {d['task']:<20} {d_status} {d['time']:>6.2f}s  {w_status} {w['time']:>6.2f}s  {winner}")

    # Overall winner: success count first; total time only breaks the tie
    # when every task succeeded on both sides.
    all_passed = (
        direct_count > 0
        and direct_success == direct_count
        and wrapper_success == wrapper_count
    )

    print("\n🏆 OVERALL WINNER:")
    if direct_success > wrapper_success:
        print("   🥇 Direct Agent (higher success rate)")
    elif wrapper_success > direct_success:
        print("   🥇 Wrapper Agent (higher success rate)")
    elif all_passed:
        if direct_total_time < wrapper_total_time:
            print("   🥇 Direct Agent (faster, same success rate)")
        else:
            print("   🥇 Wrapper Agent (faster, same success rate)")
    else:
        print("   ⚠️  Mixed results - both had failures")

    print("\n" + "=" * 70)
265  
266  
def _remove_test_artifacts():
    """Best-effort removal of files/directories the test tasks may create in the cwd."""
    # Function-scoped import keeps this helper self-contained.
    import shutil

    for name in ("test_workspace", "numbers.txt", "words.txt", "readme.txt"):
        if os.path.isdir(name) and not os.path.islink(name):
            # Real directory: remove recursively, ignoring errors.
            shutil.rmtree(name, ignore_errors=True)
            continue
        try:
            # Regular file or symlink.
            os.remove(name)
        except OSError:
            # Missing or not removable: cleanup is best-effort, never fatal.
            pass


def main():
    """Run every task in ``TEST_TASKS`` with both runners and print a comparison.

    Artifacts left in the current working directory are removed before the
    first run, between the two runs, and after the comparison, using the
    standard library instead of shelling out to ``rm -rf``.

    Returns:
        tuple ``(direct_results, wrapper_results)``: two lists of result
        records, one record per task, in ``TEST_TASKS`` order.
    """
    print("=" * 70)
    # Task count comes from TEST_TASKS so the banner cannot drift from the data.
    print(f"🔬 AGENT COMPARISON TEST - {len(TEST_TASKS)} Tasks")
    print("=" * 70)
    print(f"\nTesting {len(TEST_TASKS)} tasks with both approaches...")
    print("Model: gpt-4o-mini")
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Clean up any previous test artifacts
    print("\n🧹 Cleaning up previous test artifacts...")
    _remove_test_artifacts()

    # Run with Direct Agent
    print("\n" + "=" * 70)
    print("🤖 RUNNING WITH DIRECT AGENT (Agent class)")
    print("=" * 70)

    direct_runner = DirectAgentRunner()
    direct_results = [direct_runner.run_task(task) for task in TEST_TASKS]

    # Clean up between runs so the wrapper starts from the same state
    print("\n🧹 Cleaning up before wrapper test...")
    _remove_test_artifacts()

    # Run with Wrapper Agent
    print("\n" + "=" * 70)
    print("📦 RUNNING WITH WRAPPER AGENT (CLI approach)")
    print("=" * 70)

    wrapper_runner = WrapperAgentRunner()
    wrapper_results = [wrapper_runner.run_task(task) for task in TEST_TASKS]

    # Compare results
    compare_results(direct_results, wrapper_results)

    # Final cleanup
    print("\n🧹 Final cleanup...")
    _remove_test_artifacts()

    print(f"\n✅ Comparison complete! End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    return direct_results, wrapper_results
323  
324  
if __name__ == "__main__":
    # Script entry point. Exit codes: 0 when at least one task succeeded with
    # either approach, 1 when nothing succeeded or an unexpected error
    # occurred, 130 when interrupted from the keyboard.
    try:
        direct_results, wrapper_results = main()

        # Collect the successful records from each approach.
        direct_passed = [r for r in direct_results if r['success']]
        wrapper_passed = [r for r in wrapper_results if r['success']]

        if direct_passed or wrapper_passed:
            print("\n✅ At least one approach succeeded on some tasks")
            sys.exit(0)

        # Neither approach completed a single task.
        print("\n❌ Both approaches failed completely!")
        sys.exit(1)

    except KeyboardInterrupt:
        print("\n\n⚠️  Test interrupted by user")
        sys.exit(130)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass straight through this handler.
        print(f"\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)