/ examples / terminal_bench / multi_agent_example.py
multi_agent_example.py
  1  """
  2  Multi-Agent PraisonAI Integration for Terminal-Bench 2.0
  3  
This example shows how to use PraisonAI's Agent and AgentTeam
capabilities with Terminal-Bench 2.0 for complex multi-step tasks.
  6  
  7  The multi-agent approach is useful for Terminal-Bench tasks that benefit from:
  8  - Task planning and decomposition
  9  - Specialized roles (planner, executor, verifier)
 10  - Error correction and refinement
 11  - Complex workflows with multiple phases
 12  
 13  Usage:
 14      harbor run -d terminal-bench/terminal-bench-2 \
 15          --agent-import-path examples.terminal_bench.multi_agent_example:MultiAgentPraisonAI \
 16          --model openai/gpt-4o
 17  """
 18  
 19  import asyncio
 20  from typing import Any, Dict, Optional
 21  
 22  try:
 23      from harbor.agents.base import BaseAgent
 24      from harbor.environments.base import BaseEnvironment
 25      from harbor.models.agent.context import AgentContext
 26  except ImportError as e:
 27      raise ImportError(f"Harbor framework required: {e}") from e
 28  
 29  try:
 30      from praisonaiagents import Agent, AgentTeam, Task
 31      from praisonaiagents.approval import get_approval_registry, AutoApproveBackend
 32  except ImportError as e:
 33      raise ImportError(f"PraisonAI required: {e}") from e
 34  
 35  
 36  class MultiAgentPraisonAI(BaseAgent):
 37      """Multi-agent PraisonAI system for complex Terminal-Bench tasks."""
 38      
 39      @staticmethod
 40      def name() -> str:
 41          return "praisonai-multi"
 42  
 43      def version(self) -> str | None:
 44          try:
 45              import praisonaiagents
 46              return getattr(praisonaiagents, "__version__", None)
 47          except ImportError:
 48              return None
 49  
 50      async def setup(self, environment: BaseEnvironment) -> None:
 51          """Setup phase - no container setup needed for external agent."""
 52          pass
 53  
 54      async def run(
 55          self,
 56          instruction: str,
 57          environment: BaseEnvironment,
 58          context: AgentContext,
 59      ) -> None:
 60          """
 61          Run multi-agent PraisonAI team on the Terminal-Bench task.
 62          
 63          Uses a three-agent approach:
 64          1. Planner: Analyzes task and creates execution plan
 65          2. Executor: Implements the plan using shell commands  
 66          3. Verifier: Tests and validates the solution
 67          """
 68          
 69          # Set auto-approval for container safety
 70          registry = get_approval_registry()
 71          original_backend = registry.get_backend()
 72          registry.set_backend(AutoApproveBackend(), agent_name="multi-agent-planner")
 73          
 74          try:
 75              # Create bash tool that wraps Harbor environment
 76              async def bash_tool(command: str) -> str:
 77                  """Execute bash command in Harbor environment."""
 78                  if not command.strip():
 79                      return "Error: Empty command"
 80                      
 81                  try:
 82                      result = await environment.exec(command=command, timeout_sec=30)
 83                      output_parts = []
 84                      if result.stdout:
 85                          output_parts.append(result.stdout.strip())
 86                      if result.stderr:
 87                          output_parts.append(f"[stderr]: {result.stderr.strip()}")
 88                      if result.return_code != 0:
 89                          output_parts.append(f"[exit_code]: {result.return_code}")
 90                      return "\n".join(output_parts) if output_parts else "(no output)"
 91                  except Exception as e:
 92                      return f"Error: {str(e)}"
 93  
 94              # Create specialized agents
 95              planner = Agent(
 96                  name="planner",
 97                  instructions=(
 98                      "You are an expert task planner for terminal/coding tasks. "
 99                      "Analyze the given instruction and create a detailed, step-by-step plan. "
100                      "Break down complex tasks into smaller, manageable steps. "
101                      "Consider potential issues and edge cases. "
102                      "Output your plan as a numbered list of specific actions."
103                  ),
104                  llm=self.model_name or "openai/gpt-4o",
105              )
106              
107              executor = Agent(
108                  name="executor", 
109                  instructions=(
110                      "You are an expert terminal executor. "
111                      "Follow the provided plan step-by-step using bash commands. "
112                      "Use the bash_tool to execute commands safely. "
113                      "Be precise and check each step before proceeding. "
114                      "If a step fails, try alternative approaches."
115                  ),
116                  tools=[bash_tool],
117                  llm=self.model_name or "openai/gpt-4o", 
118              )
119              
120              verifier = Agent(
121                  name="verifier",
122                  instructions=(
123                      "You are a solution verifier and tester. "
124                      "Test the completed solution to ensure it works correctly. "
125                      "Run appropriate tests and checks using bash commands. "
126                      "Report whether the solution meets the requirements. "
127                      "If issues are found, suggest specific fixes."
128                  ),
129                  tools=[bash_tool],
130                  llm=self.model_name or "openai/gpt-4o",
131              )
132  
133              print(f"🚀 Multi-Agent PraisonAI starting: {instruction[:100]}...")
134              
135              # Phase 1: Planning
136              print("📋 Phase 1: Task Planning")
137              plan = await planner.astart(f"Create a detailed plan for: {instruction}")
138              print(f"Plan created: {len(plan.split('.')) if plan else 0} steps")
139              
140              # Phase 2: Execution
141              print("⚡ Phase 2: Task Execution") 
142              execution_prompt = f"Execute this plan step by step:\n\nOriginal task: {instruction}\n\nPlan:\n{plan}"
143              execution_result = await executor.astart(execution_prompt)
144              print("Execution completed")
145              
146              # Phase 3: Verification
147              print("✅ Phase 3: Solution Verification")
148              verification_prompt = f"Verify this solution works correctly:\n\nOriginal task: {instruction}\n\nSolution: {execution_result}\n\nRun tests to confirm it works."
149              verification_result = await verifier.astart(verification_prompt)
150              print("Verification completed")
151              
152              # Combine results
153              final_result = {
154                  "plan": plan,
155                  "execution": execution_result,
156                  "verification": verification_result,
157              }
158              
159              print("✅ Multi-Agent PraisonAI completed task")
160              
161              # Populate context
162              self._populate_context([planner, executor, verifier], context, final_result)
163              
164          except Exception as e:
165              print(f"❌ Multi-Agent PraisonAI failed: {str(e)}")
166              context.metadata = {"error": str(e)}
167              raise
168          finally:
169              # Restore original approval backend to avoid global state pollution
170              if original_backend:
171                  registry.set_backend(original_backend)
172              else:
173                  registry.remove_backend(agent_name="multi-agent-planner")
174  
175      def _populate_context(self, agents: list, context: AgentContext, result: Dict[str, Any]) -> None:
176          """Populate Harbor context with multi-agent metrics."""
177          try:
178              # Aggregate token usage from all agents
179              total_input_tokens = 0
180              total_output_tokens = 0
181              total_cost = 0.0
182              
183              for agent in agents:
184                  # Use agent's actual metrics properties
185                  total_input_tokens += getattr(agent, '_total_tokens_in', 0)
186                  total_output_tokens += getattr(agent, '_total_tokens_out', 0)
187                  total_cost += agent.total_cost or 0.0
188              
189              context.n_input_tokens = total_input_tokens if total_input_tokens > 0 else None
190              context.n_output_tokens = total_output_tokens if total_output_tokens > 0 else None
191              context.cost_usd = total_cost if total_cost > 0 else None
192              
193              context.metadata = {
194                  "framework": "praisonai-multi",
195                  "agent_type": "multi-agent-team", 
196                  "agents": [agent.name for agent in agents],
197                  "model": self.model_name or "openai/gpt-4o",
198                  "phases": ["planning", "execution", "verification"],
199                  "tools_used": ["bash_tool"],
200                  "result_summary": str(result.get("verification", ""))[:200],
201                  "version": self.version(),
202              }
203              
204          except Exception as e:
205              context.metadata = {"context_error": str(e)}
206  
207  
208  # Alternative implementation using AgentTeam (more structured)
class AgentTeamPraisonAI(BaseAgent):
    """AgentTeam-based implementation for Terminal-Bench tasks.

    Builds a planner and an executor agent, gives each one task, and runs
    both through an ``AgentTeam`` created with ``process="sequential"``.
    """

    # Model passed to every agent when ``self.model_name`` is unset or empty.
    _DEFAULT_MODEL = "openai/gpt-4o"
    # Per-command timeout, in seconds, passed to ``environment.exec``.
    _BASH_TIMEOUT_SEC = 30
    # Names given to the agents; also used as approval-registry keys.
    _AGENT_NAMES = ("planner", "executor")

    @staticmethod
    def name() -> str:
        """Return the name string of this agent implementation."""
        return "praisonai-team"

    def version(self) -> str | None:
        """Return ``praisonaiagents.__version__``, or None if unavailable."""
        try:
            import praisonaiagents
        except ImportError:
            return None
        return getattr(praisonaiagents, "__version__", None)

    async def setup(self, environment: BaseEnvironment) -> None:
        """Setup phase - nothing to prepare."""

    async def run(
        self,
        instruction: str,
        environment: BaseEnvironment,
        context: AgentContext,
    ) -> None:
        """Run structured AgentTeam workflow.

        Args:
            instruction: Task description embedded in both task descriptions
                and passed to ``team.astart``.
            environment: Harbor environment whose ``exec`` runs the commands.
            context: Harbor context that receives usage numbers and metadata.

        Raises:
            Exception: Any error from the team run is re-raised after
                ``context.metadata`` has been set to ``{"error": ...}``.
        """
        registry = get_approval_registry()
        # Names whose auto-approve backend was installed by this call, so the
        # cleanup removes exactly what was added and nothing else.
        registered = []
        timeout_sec = self._BASH_TIMEOUT_SEC

        try:
            # Auto-approve tool calls for the agents created below.
            # NOTE(review): assumes the registry resolves per-agent backends
            # by Agent.name - confirm against the praisonaiagents version in use.
            for agent_name in self._AGENT_NAMES:
                registry.set_backend(AutoApproveBackend(), agent_name=agent_name)
                registered.append(agent_name)

            # Create bash tool
            async def bash_tool(command: str) -> str:
                """Execute bash command in Harbor environment."""
                if not command.strip():
                    return "Error: Empty command"

                try:
                    result = await environment.exec(command=command, timeout_sec=timeout_sec)
                except Exception as e:
                    # Report the failure as tool output instead of aborting
                    # the whole team run.
                    return f"Error: {str(e)}"

                output_parts = []
                if result.stdout:
                    output_parts.append(result.stdout.strip())
                if result.stderr:
                    output_parts.append(f"[stderr]: {result.stderr.strip()}")
                if result.return_code != 0:
                    output_parts.append(f"[exit_code]: {result.return_code}")
                return "\n".join(output_parts) if output_parts else "(no output)"

            model = self.model_name or self._DEFAULT_MODEL
            planner_name, executor_name = self._AGENT_NAMES

            # Create agents
            planner = Agent(
                name=planner_name,
                instructions="Create detailed execution plans for terminal tasks",
                llm=model,
            )

            executor = Agent(
                name=executor_name,
                instructions="Execute terminal commands based on plans",
                tools=[bash_tool],
                llm=model,
            )

            # Create tasks
            plan_task = Task(
                name="plan",
                description=f"Create plan for: {instruction}",
                agent=planner,
            )

            # NOTE(review): confirm that Task accepts ``dependencies=`` in the
            # installed praisonaiagents version - TODO verify.
            execute_task = Task(
                name="execute",
                description=f"Execute plan for: {instruction}",
                agent=executor,
                dependencies=[plan_task],  # Execute after planning
            )

            # Create and run team
            team = AgentTeam(
                agents=[planner, executor],
                tasks=[plan_task, execute_task],
                process="sequential",
            )

            print(f"🚀 AgentTeam starting: {instruction[:100]}...")
            result = await team.astart(instruction)
            print("✅ AgentTeam completed")

            # Report aggregated usage; missing or None counters count as
            # zero, and zero totals are stored as None.
            input_tokens = sum(
                getattr(agent, "_total_tokens_in", 0) or 0 for agent in (planner, executor)
            )
            output_tokens = sum(
                getattr(agent, "_total_tokens_out", 0) or 0 for agent in (planner, executor)
            )
            cost = sum(
                (getattr(agent, "total_cost", None) or 0.0 for agent in (planner, executor)),
                0.0,
            )
            context.n_input_tokens = input_tokens if input_tokens > 0 else None
            context.n_output_tokens = output_tokens if output_tokens > 0 else None
            context.cost_usd = cost if cost > 0 else None

            # Populate context
            context.metadata = {
                "framework": "praisonai-team",
                "workflow": "sequential",
                "agents_count": len(team.agents),
                "tasks_count": len(team.tasks),
                "result": str(result)[:200] if result else None,
            }

        except Exception as e:
            print(f"❌ AgentTeam failed: {str(e)}")
            context.metadata = {"error": str(e)}
            raise
        finally:
            # Remove only the per-agent entries added above. The global
            # backend was never modified, so it must not be rewritten here.
            for agent_name in registered:
                registry.remove_backend(agent_name=agent_name)
305  
306  
if __name__ == "__main__":
    # Running the module directly only prints a usage banner; the agents
    # themselves are started through the `harbor run` commands shown below.
    print(
        "\n".join(
            (
                "Multi-Agent PraisonAI for Terminal-Bench 2.0",
                "=" * 50,
                "",
                "Available agent implementations:",
                "1. MultiAgentPraisonAI - Custom multi-agent workflow",
                "2. AgentTeamPraisonAI - Structured AgentTeam workflow",
                "",
                "Usage examples:",
                "",
                "# Multi-agent custom workflow",
                "harbor run -d terminal-bench/terminal-bench-2 \\",
                "  --agent-import-path examples.terminal_bench.multi_agent_example:MultiAgentPraisonAI \\",
                "  --model openai/gpt-4o",
                "",
                "# AgentTeam structured workflow",
                "harbor run -d terminal-bench/terminal-bench-2 \\",
                "  --agent-import-path examples.terminal_bench.multi_agent_example:AgentTeamPraisonAI \\",
                "  --model openai/gpt-4o",
                "",
                "Benefits of multi-agent approach:",
                "- Task decomposition and planning",
                "- Specialized roles and expertise",
                "- Error detection and correction",
                "- Higher success rates on complex tasks",
            )
        )
    )