"""
Criteria Evaluation Example

This example demonstrates how to evaluate agent outputs
against custom criteria using an LLM as the judge.
"""
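
# Requires the praisonaiagents package (pip install praisonaiagents). The
# judge calls OpenAI under the hood here, hence the OPENAI_API_KEY check below.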

import os
from praisonaiagents import Agent
from praisonaiagents.eval import CriteriaEvaluator

# Check if we have an API key
has_api_key = os.getenv("OPENAI_API_KEY") is not None

if has_api_key:
    print("--- Testing Criteria Evaluation with Agent ---")
    # Create a customer service agent
    agent = Agent(
        instructions="You are a friendly customer service agent. Be helpful and empathetic."
    )

    # Create criteria evaluator with numeric scoring
    evaluator = CriteriaEvaluator(
        criteria="Response is helpful, empathetic, and provides a clear solution",
        agent=agent,
        input_text="I'm frustrated because my order hasn't arrived yet.",
        scoring_type="numeric",  # Score 1-10
        threshold=7.0,           # Pass if score >= 7
        num_iterations=2,
        output="verbose"
    )

    # Run evaluation
    result = evaluator.run(print_summary=True)

    print("\nNumeric Scoring Results:")
    print(f"  Average Score: {result.avg_score}/10")
    print(f"  Pass Rate: {result.pass_rate:.1%}")
    print(f"  Passed: {result.passed}")

    # Binary scoring example (pass/fail)
    print("\n--- Testing Binary Scoring ---")
    binary_evaluator = CriteriaEvaluator(
        criteria="Response does not contain any offensive language",
        agent=agent,
        input_text="Tell me a joke",
        scoring_type="binary",  # Pass or Fail
        output="verbose"
    )

    binary_result = binary_evaluator.run(print_summary=True)
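
    # The binary result is assumed to expose the same fields as the numeric
    # result above; pass_rate is then the fraction of "pass" verdicts.
    print("\nBinary Scoring Results:")
    print(f"  Pass Rate: {binary_result.pass_rate:.1%}")
    print(f"  Passed: {binary_result.passed}")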

    # With on_fail callback
    print("\n--- Testing Failure Callback ---")
    def handle_failure(score):
        print(f"ALERT: Evaluation failed with score {score.score}")
        print(f"Reasoning: {score.reasoning}")

    callback_evaluator = CriteriaEvaluator(
        criteria="Response is professional and helpful",
        agent=agent,
        input_text="Help me",
        on_fail=handle_failure,
        threshold=8.0
    )

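    # on_fail is expected to fire when the evaluation score falls below the
    # 8.0 threshold set above (assumed callback semantics).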
    callback_evaluator.run()
else:
    print("⚠️  No OPENAI_API_KEY found. Skipping agent-based criteria evaluation...")

# Test pre-generated output evaluation (no agent call needed)
print("\n--- Testing Pre-generated Output Evaluation ---")
if has_api_key:
    output = "I understand your frustration. Let me check on your order right away."
    try:
        result2 = evaluator.evaluate_output(output, print_summary=True)
        print(f"Pre-generated output score: {result2.avg_score}/10")
    except Exception as e:
        print(f"Pre-generated evaluation failed: {e}")
else:
    print("⚠️  Skipping pre-generated evaluation (no API key)")
    print("To run the full evaluation, set the OPENAI_API_KEY environment variable")