test_agent_comparison.py
#!/usr/bin/env python3
"""
Agent Comparison Test - Run 5 tasks with both wrapper and direct agent

This test compares:
1. Direct Agent approach (praisonai_external_agent pattern)
2. CLI Wrapper approach (praisonai_wrapper_agent pattern)

Usage:
    python test_agent_comparison.py
"""
import sys
import os
import shutil
import subprocess
import time
import asyncio
from datetime import datetime

# Add paths
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src', 'praisonai-agents'))

# Model used by both approaches unless a runner is constructed with another one.
DEFAULT_MODEL = "gpt-4o-mini"

# Upper bound, in seconds, for a single `praisonai` CLI invocation.
CLI_TIMEOUT_SECONDS = 300

# Paths (relative to the current working directory) that the tasks below ask
# the agents to create; they are removed before, between and after the runs.
TEST_ARTIFACTS = ("test_workspace", "numbers.txt", "words.txt", "readme.txt")

# 5 Test Tasks (simple to moderately complex)
TEST_TASKS = [
    {
        "name": "list_directory",
        "instruction": "List all files in the current directory and count how many Python files exist",
        "expected_artifacts": ["file count", ".py count"],
    },
    {
        "name": "create_structure",
        "instruction": "Create a directory called 'test_workspace', create a file inside it named 'readme.txt' with content 'Hello World', then verify it exists",
        "expected_artifacts": ["test_workspace/", "readme.txt"],
    },
    {
        "name": "file_operations",
        "instruction": "Create a file called 'numbers.txt' with numbers 1-10 each on a new line, then display the first 3 lines",
        "expected_artifacts": ["numbers.txt"],
    },
    {
        "name": "system_info",
        "instruction": "Show the current working directory, current user, and Python version",
        "expected_artifacts": ["pwd output", "whoami output"],
    },
    {
        "name": "text_processing",
        "instruction": "Create a file 'words.txt' with 5 words (apple, banana, cherry, date, elderberry), then count the total characters in the file",
        "expected_artifacts": ["words.txt", "character count"],
    },
]


def _make_result(task_name, success, elapsed, cost=0, result_preview=None, error=None):
    """Build the per-task result record shared by both runners."""
    return {
        "task": task_name,
        "success": success,
        "time": elapsed,
        "cost": cost,
        "result_preview": result_preview,
        "error": error,
    }


def cleanup_artifacts(paths=TEST_ARTIFACTS):
    """Best-effort removal of files/directories left behind by the tasks.

    Paths that do not exist or cannot be removed are skipped silently, which
    matches the `rm -rf ... 2>/dev/null; true` shell command this replaces
    while also working on platforms without a POSIX shell.

    Args:
        paths: Iterable of file or directory paths to remove.
    """
    for path in paths:
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path, ignore_errors=True)
            continue
        try:
            # Regular files and symlinks (including symlinks to directories).
            os.remove(path)
        except OSError:
            # Deliberately ignored: cleanup must never abort the comparison.
            pass


class DirectAgentRunner:
    """Runs tasks using direct Agent class approach."""

    def __init__(self, model=DEFAULT_MODEL):
        """Import the agent library and switch its approval registry to auto-approve.

        The `praisonaiagents` imports are local so that importing this module
        does not require the package to be installed.

        Args:
            model: Value passed as `llm` to every Agent created by this runner.
        """
        from praisonaiagents import Agent
        from praisonaiagents.tools import execute_command
        from praisonaiagents.approval import get_approval_registry, AutoApproveBackend

        self.Agent = Agent
        self.execute_command = execute_command
        self.registry = get_approval_registry()
        # NOTE(review): presumably lets tool calls run without an interactive
        # approval prompt - confirm against the praisonaiagents approval docs.
        self.registry.set_backend(AutoApproveBackend())
        self.model = model
        # Every record returned by run_task is also kept here, in call order.
        self.results = []

    def run_task(self, task: dict) -> dict:
        """Run a single task and return metrics.

        Args:
            task: Mapping with at least the keys 'name' and 'instruction'.

        Returns:
            A result record with the keys 'task', 'success', 'time', 'cost',
            'result_preview' and 'error'. Any exception raised while creating
            or running the agent is reported as a failed record, not re-raised.
        """
        print(f"\n🎯 Direct Agent - Running: {task['name']}")
        print(f" Instruction: {task['instruction'][:60]}...")

        start_time = time.perf_counter()

        try:
            agent = self.Agent(
                name=f"direct-{task['name']}",
                instructions="You are a terminal task agent. Use execute_command tool to complete tasks.",
                tools=[self.execute_command],
                llm=self.model,
            )

            result = agent.start(task['instruction'])
            elapsed = time.perf_counter() - start_time

            # Get cost if available; a missing or None attribute counts as 0.
            cost = getattr(agent, 'total_cost', 0) or 0

            print(f" ✅ Completed in {elapsed:.2f}s")
            print(f" 💰 Cost: ${cost:.6f}")

            outcome = _make_result(
                task['name'],
                True,
                elapsed,
                cost=cost,
                result_preview=str(result)[:100] if result else "No output",
            )
        except Exception as e:
            elapsed = time.perf_counter() - start_time
            print(f" ❌ Failed in {elapsed:.2f}s: {e}")
            outcome = _make_result(task['name'], False, elapsed, error=str(e))

        self.results.append(outcome)
        return outcome


class WrapperAgentRunner:
    """Runs tasks using CLI wrapper approach via subprocess."""

    def __init__(self, model=DEFAULT_MODEL, timeout=CLI_TIMEOUT_SECONDS):
        """Store the CLI settings used for every task.

        Args:
            model: Value passed to the CLI's `--model` option.
            timeout: Seconds to wait for one CLI invocation before giving up.
        """
        self.model = model
        self.timeout = timeout
        # Every record returned by run_task is also kept here, in call order.
        self.results = []

    def run_task(self, task: dict) -> dict:
        """Run a single task using praisonai CLI via subprocess.

        Args:
            task: Mapping with at least the keys 'name' and 'instruction'.

        Returns:
            A result record with the keys 'task', 'success', 'time', 'cost',
            'result_preview' and 'error'. A non-zero exit code, a timeout, a
            missing executable or any other exception yields a failed record.
        """
        print(f"\n🎯 Wrapper Agent - Running: {task['name']}")
        print(f" Instruction: {task['instruction'][:60]}...")

        # Build the praisonai CLI command
        # Format: praisonai "TASK" --model MODEL
        # An argument list (no shell) keeps the instruction text from being
        # interpreted by a shell.
        cmd = [
            "praisonai",
            task['instruction'],
            "--model", self.model,
        ]

        start_time = time.perf_counter()

        try:
            completed = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.timeout,
                env={**os.environ, "LOGLEVEL": "WARNING"},
            )
        except subprocess.TimeoutExpired:
            elapsed = time.perf_counter() - start_time
            print(f" ⏱️ Timeout after {elapsed:.2f}s")
            outcome = _make_result(
                task['name'], False, elapsed,
                error=f"Timeout after {self.timeout}s",
            )
        except FileNotFoundError:
            elapsed = time.perf_counter() - start_time
            print(" ⚠️ praisonai CLI not found")
            outcome = _make_result(
                task['name'], False, elapsed,
                error="praisonai CLI not installed (pip install praisonai)",
            )
        except Exception as e:
            elapsed = time.perf_counter() - start_time
            print(f" ❌ Failed in {elapsed:.2f}s: {e}")
            outcome = _make_result(task['name'], False, elapsed, error=str(e))
        else:
            elapsed = time.perf_counter() - start_time
            if completed.returncode == 0:
                print(f" ✅ Completed in {elapsed:.2f}s")
                outcome = _make_result(
                    task['name'], True, elapsed,
                    cost=0,  # Cost tracking not available in CLI mode
                    result_preview=completed.stdout[:100] if completed.stdout else "No output",
                )
            else:
                print(f" ❌ Failed with exit code {completed.returncode} in {elapsed:.2f}s")
                outcome = _make_result(
                    task['name'], False, elapsed,
                    error=f"Exit code {completed.returncode}: {completed.stderr[:200]}",
                )

        self.results.append(outcome)
        return outcome


def summarize_results(results: list) -> dict:
    """Aggregate a list of result records into totals and averages.

    Args:
        results: Result records as returned by a runner's `run_task`.

    Returns:
        A dict with 'total', 'succeeded', 'success_pct', 'total_time',
        'avg_time', 'total_cost' and 'avg_cost'. Percentages and averages are
        0.0 for an empty list instead of raising ZeroDivisionError.
    """
    total = len(results)
    succeeded = sum(1 for r in results if r['success'])
    total_time = sum(r['time'] for r in results)
    total_cost = sum(r['cost'] for r in results)
    return {
        "total": total,
        "succeeded": succeeded,
        "success_pct": 100 * succeeded / total if total else 0.0,
        "total_time": total_time,
        "avg_time": total_time / total if total else 0.0,
        "total_cost": total_cost,
        "avg_cost": total_cost / total if total else 0.0,
    }


def task_winner(direct: dict, wrapper: dict) -> str:
    """Return 'Direct', 'Wrapper' or 'None' for one pair of result records.

    When both succeeded the strictly faster one wins; a tie goes to 'Wrapper'.
    """
    if direct['success'] and wrapper['success']:
        return "Direct" if direct['time'] < wrapper['time'] else "Wrapper"
    if direct['success']:
        return "Direct"
    if wrapper['success']:
        return "Wrapper"
    return "None"


def compare_results(direct_results: list, wrapper_results: list):
    """Compare results from both approaches.

    Prints success rates, timings, the direct agent's cost, a per-task table
    (pairs are matched by position) and an overall winner. Counts and averages
    are derived from the lengths of the lists that are passed in.

    Args:
        direct_results: Records produced by DirectAgentRunner.run_task.
        wrapper_results: Records produced by WrapperAgentRunner.run_task.
    """
    direct = summarize_results(direct_results)
    wrapper = summarize_results(wrapper_results)

    print("\n" + "=" * 70)
    print("📊 COMPARISON RESULTS")
    print("=" * 70)

    print("\n✅ Success Rate:")
    print(f" Direct Agent: {direct['succeeded']}/{direct['total']} ({direct['success_pct']:.0f}%)")
    print(f" Wrapper Agent: {wrapper['succeeded']}/{wrapper['total']} ({wrapper['success_pct']:.0f}%)")

    print("\n⏱️ Total Time:")
    print(f" Direct Agent: {direct['total_time']:.2f}s (avg: {direct['avg_time']:.2f}s per task)")
    print(f" Wrapper Agent: {wrapper['total_time']:.2f}s (avg: {wrapper['avg_time']:.2f}s per task)")

    print("\n💰 Total Cost (Direct Agent only):")
    print(f" ${direct['total_cost']:.6f} (avg: ${direct['avg_cost']:.6f} per task)")

    print("\n📋 Detailed Results:")
    print(f" {'Task':<20} {'Direct':<12} {'Wrapper':<12} {'Winner':<10}")
    print(f" {'-'*56}")

    for d, w in zip(direct_results, wrapper_results):
        d_status = "✅" if d['success'] else "❌"
        w_status = "✅" if w['success'] else "❌"
        winner = task_winner(d, w)
        print(f" {d['task']:<20} {d_status} {d['time']:>6.2f}s {w_status} {w['time']:>6.2f}s {winner}")

    # Overall winner
    print("\n🏆 OVERALL WINNER:")
    all_passed = (
        direct['succeeded'] == direct['total']
        and wrapper['succeeded'] == wrapper['total']
    )
    if not direct['total'] and not wrapper['total']:
        print(" ⚠️ No results to compare")
    elif direct['succeeded'] > wrapper['succeeded']:
        print(" 🥇 Direct Agent (higher success rate)")
    elif wrapper['succeeded'] > direct['succeeded']:
        print(" 🥇 Wrapper Agent (higher success rate)")
    elif all_passed:
        if direct['total_time'] < wrapper['total_time']:
            print(" 🥇 Direct Agent (faster, same success rate)")
        else:
            print(" 🥇 Wrapper Agent (faster, same success rate)")
    else:
        print(" ⚠️ Mixed results - both had failures")

    print("\n" + "=" * 70)


def main():
    """Run every task with both approaches, print the comparison, return results.

    Returns:
        A `(direct_results, wrapper_results)` tuple of result-record lists.
    """
    print("=" * 70)
    print(f"🔬 AGENT COMPARISON TEST - {len(TEST_TASKS)} Tasks")
    print("=" * 70)
    print(f"\nTesting {len(TEST_TASKS)} tasks with both approaches...")
    print(f"Model: {DEFAULT_MODEL}")
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Clean up any previous test artifacts
    print("\n🧹 Cleaning up previous test artifacts...")
    cleanup_artifacts()

    # Run with Direct Agent
    print("\n" + "=" * 70)
    print("🤖 RUNNING WITH DIRECT AGENT (Agent class)")
    print("=" * 70)

    direct_runner = DirectAgentRunner()
    direct_results = [direct_runner.run_task(task) for task in TEST_TASKS]

    # Clean up between runs so the wrapper starts from the same state
    print("\n🧹 Cleaning up before wrapper test...")
    cleanup_artifacts()

    # Run with Wrapper Agent
    print("\n" + "=" * 70)
    print("📦 RUNNING WITH WRAPPER AGENT (CLI approach)")
    print("=" * 70)

    wrapper_runner = WrapperAgentRunner()
    wrapper_results = [wrapper_runner.run_task(task) for task in TEST_TASKS]

    # Compare results
    compare_results(direct_results, wrapper_results)

    # Final cleanup
    print("\n🧹 Final cleanup...")
    cleanup_artifacts()

    print(f"\n✅ Comparison complete! End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    return direct_results, wrapper_results


if __name__ == "__main__":
    try:
        direct_results, wrapper_results = main()

        # Exit with error if both failed completely
        direct_success = sum(1 for r in direct_results if r['success'])
        wrapper_success = sum(1 for r in wrapper_results if r['success'])

        if direct_success == 0 and wrapper_success == 0:
            print("\n❌ Both approaches failed completely!")
            sys.exit(1)
        else:
            print("\n✅ At least one approach succeeded on some tasks")
            sys.exit(0)

    except KeyboardInterrupt:
        print("\n\n⚠️ Test interrupted by user")
        sys.exit(130)
    except Exception as e:
        print(f"\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)