"""
Criteria Evaluation Example

This example demonstrates how to evaluate agent outputs
against custom criteria using LLM-as-judge.
"""

import os

from praisonaiagents import Agent
from praisonaiagents.eval import CriteriaEvaluator

# The LLM-as-judge calls need an OpenAI key; without one the agent-backed
# sections are skipped so the example still runs end to end.
has_api_key = os.getenv("OPENAI_API_KEY") is not None

if has_api_key:
    print("--- Testing Criteria Evaluation with Agent ---")
    # Create a customer service agent whose responses will be judged.
    agent = Agent(
        instructions="You are a friendly customer service agent. Be helpful and empathetic."
    )

    # Numeric scoring: the judge assigns a score per iteration and the
    # evaluation passes when the score clears the threshold.
    evaluator = CriteriaEvaluator(
        criteria="Response is helpful, empathetic, and provides a clear solution",
        agent=agent,
        input_text="I'm frustrated because my order hasn't arrived yet.",
        scoring_type="numeric",  # Score 1-10
        threshold=7.0,  # Pass if score >= 7
        num_iterations=2,
        output="verbose",
    )

    # Run evaluation
    result = evaluator.run(print_summary=True)

    print("\nNumeric Scoring Results:")
    print(f" Average Score: {result.avg_score}/10")
    print(f" Pass Rate: {result.pass_rate:.1%}")
    print(f" Passed: {result.passed}")

    # Binary scoring example (pass/fail)
    print("\n--- Testing Binary Scoring ---")
    binary_evaluator = CriteriaEvaluator(
        criteria="Response does not contain any offensive language",
        agent=agent,
        input_text="Tell me a joke",
        scoring_type="binary",  # Pass or Fail
        output="verbose",
    )

    # The summary is printed by run() itself; the return value is not used here.
    binary_evaluator.run(print_summary=True)

    # With on_fail callback: invoked when an evaluation falls below threshold.
    print("\n--- Testing Failure Callback ---")

    def handle_failure(score) -> None:
        """Report a below-threshold evaluation score and its reasoning."""
        print(f"ALERT: Evaluation failed with score {score.score}")
        print(f"Reasoning: {score.reasoning}")

    callback_evaluator = CriteriaEvaluator(
        criteria="Response is professional and helpful",
        agent=agent,
        input_text="Help me",
        on_fail=handle_failure,
        threshold=8.0,
    )

    callback_evaluator.run()
else:
    print("⚠️ No OPENAI_API_KEY found. Skipping agent-based criteria evaluation...")

# Test pre-generated output evaluation (doesn't need agent) — this re-uses the
# judge from `evaluator`, so it still requires an API key to run.
print("\n--- Testing Pre-generated Output Evaluation ---")
if has_api_key:
    output = "I understand your frustration. Let me check on your order right away."
    try:
        result2 = evaluator.evaluate_output(output, print_summary=True)
        print(f"Pre-generated output score: {result2.avg_score}/10")
    except Exception as e:
        # Best-effort demo: report the failure instead of crashing the example.
        print(f"Pre-generated evaluation failed: {e}")
else:
    print("⚠️ Skipping pre-generated evaluation (no API key)")
    print("To run full evaluation, set OPENAI_API_KEY environment variable")