# agentic_eval.py
"""ARGUS-AI agentic workflow evaluation example.

Demonstrates ASF, ERR, and CPCS metrics for autonomous agent monitoring:
one sample workflow record is built, evaluated, and the resulting scores,
per-metric details, and alerts are printed to stdout.

Author: Anil Prasad | Ambharii Labs
"""

import argus_ai
from argus_ai.types import AgenticEvalRequest

# Client configured with the "agentic" profile.
argus = argus_ai.init(profile="agentic")

# One simulated workflow execution. Each tool call is listed compactly as
# (tool, status, latency_ms) and expanded into the dict form the request takes.
workflow = AgenticEvalRequest(
    prompt="Research competitor pricing and generate a comparison report",
    response="Generated comprehensive pricing comparison across 5 competitors.",
    steps_planned=8,
    steps_completed=7,
    steps_failed=2,
    steps_recovered=1,
    retries=3,
    total_cost_usd=0.45,
    tool_calls=[
        {"tool": tool, "status": status, "latency_ms": latency}
        for tool, status, latency in (
            ("web_search", "success", 800),
            ("web_search", "success", 650),
            ("web_scrape", "failed", 5000),
            ("web_scrape", "success", 1200),
            ("data_extract", "success", 300),
            ("llm_analyze", "success", 2100),
            ("llm_analyze", "failed", 30000),
            ("report_gen", "success", 1800),
        )
    ],
    model_name="claude-sonnet-4",
    latency_ms=42000.0,
    metadata={"workflow_type": "competitive_analysis"},
)

# Full evaluation (G-ARVIS + agentic metrics); the call yields a pair.
result, agentic_metrics = argus.evaluate_agentic(workflow)

# G-ARVIS section: display label paired with the result attribute it reads.
print("=== G-ARVIS Scores ===")
for label, attr in (
    ("Composite", "garvis_composite"),
    ("Groundedness", "groundedness"),
    ("Accuracy", "accuracy"),
    ("Reliability", "reliability"),
    ("Variance", "variance"),
    ("Inference Cost", "inference_cost"),
    ("Safety", "safety"),
):
    print(f" {label}: {getattr(result, attr):.3f}")

# Agentic section: each metric's headline score, then its detail entries.
print("\n=== Agentic Metrics ===")
for entry in agentic_metrics:
    print(f" {entry.name}: {entry.score:.3f}")
    for key, value in entry.details.items():
        print(f" {key}: {value}")

# Verdict, followed by the alert list when it is non-empty.
print(f"\nOverall Passing: {result.passing}")
alerts = result.alerts
if alerts:
    print(f"\nAlerts ({len(alerts)}):")
    for message in alerts:
        print(f" {message}")