# rag_evaluation.py
"""
RAG Evaluation Example

This example demonstrates how to evaluate RAG quality by testing
agent responses against expected answers.

Usage:
    python rag_evaluation.py
"""

import sys

from praisonaiagents import Agent


# Sample knowledge for evaluation
SAMPLE_KNOWLEDGE = """
Product Overview:
CloudManager Pro is an enterprise cloud management platform.
Its main purpose is to help organizations monitor, optimize, and secure their multi-cloud infrastructure.

Key Features:
- Real-time monitoring across AWS, Azure, and GCP
- Cost optimization with automated recommendations
- Security compliance checking
- Automated scaling and resource management

Getting Started:
1. Sign up at cloudmanager.example.com
2. Connect your cloud accounts
3. Configure monitoring dashboards
4. Set up alerts and policies
"""


def create_test_queries() -> list:
    """Create sample test queries for evaluation.

    Each entry is a dict with:
        query: the question to send to the agent.
        expected_contains: terms the answer should mention (matched
            case-insensitively by ``evaluate_rag_agent``).
    """
    return [
        {
            "query": "What is the main purpose of CloudManager Pro?",
            "expected_contains": ["monitor", "cloud", "infrastructure"],
        },
        {
            "query": "What are the key features?",
            "expected_contains": ["monitoring", "cost", "security"],
        },
        {
            "query": "How do I get started?",
            "expected_contains": ["sign up", "connect", "configure"],
        },
    ]


def evaluate_rag_agent(agent: Agent, test_queries: list) -> dict:
    """
    Evaluate RAG agent performance on test queries.

    A query passes when at least 50% of its ``expected_contains`` terms
    appear (case-insensitively) in the agent's answer; a query with no
    expected terms passes trivially.

    Returns:
        dict with keys ``total``, ``passed``, ``accuracy`` (0.0 for an
        empty query set) and ``details`` (per-query breakdown).
    """
    results = {
        "total": len(test_queries),
        "passed": 0,
        "details": [],
    }

    for test in test_queries:
        query = test["query"]
        expected_terms = test.get("expected_contains", [])

        # Get agent response. Coerce defensively: depending on agent
        # configuration, chat() may return None or a non-string object,
        # which would crash .lower() below.
        response = agent.chat(query)
        response_text = "" if response is None else str(response)
        response_lower = response_text.lower()

        # Check if response contains expected terms
        terms_found = sum(1 for term in expected_terms if term.lower() in response_lower)
        pass_rate = terms_found / len(expected_terms) if expected_terms else 1.0
        passed = pass_rate >= 0.5  # At least 50% of terms found

        if passed:
            results["passed"] += 1

        results["details"].append({
            "query": query,
            "passed": passed,
            "terms_found": terms_found,
            "total_terms": len(expected_terms),
            "pass_rate": pass_rate,
            "answer_preview": response_text[:150],
        })

    # Guard against ZeroDivisionError when called with no queries.
    results["accuracy"] = (
        results["passed"] / results["total"] if results["total"] else 0.0
    )
    return results


def print_results(results: dict):
    """Print evaluation results in a formatted way."""
    print("\n" + "=" * 60)
    print("RAG Evaluation Results")
    print("=" * 60)

    print(f"\nTotal Queries: {results['total']}")
    print(f"Passed: {results['passed']}/{results['total']}")
    print(f"Accuracy: {results['accuracy']:.1%}")

    print("\n" + "-" * 60)
    print("Detailed Results:")
    print("-" * 60)

    for i, detail in enumerate(results["details"], 1):
        status = "✅ PASS" if detail["passed"] else "❌ FAIL"
        print(f"\n[{i}] {status}: {detail['query']}")
        print(f"    Terms Found: {detail['terms_found']}/{detail['total_terms']}")
        print(f"    Answer: {detail['answer_preview']}...")


def main():
    """Run the end-to-end evaluation; return 0 on pass, 1 on failure."""
    print("=" * 60)
    print("RAG Evaluation Example")
    print("=" * 60)

    # Create agent with knowledge
    agent = Agent(
        name="Product Expert",
        instructions=f"""You are a product expert who answers questions accurately.
Use only the following knowledge to answer questions.

KNOWLEDGE:
{SAMPLE_KNOWLEDGE}""",
        output="silent"
    )

    # Get test queries
    test_queries = create_test_queries()

    # Run evaluation
    print(f"\nEvaluating {len(test_queries)} queries...")
    results = evaluate_rag_agent(agent, test_queries)

    # Print results
    print_results(results)

    # Return status (process exit code: nonzero signals failure to CI)
    if results["accuracy"] < 0.8:
        print("\n⚠️ WARNING: Accuracy below 80% threshold!")
        return 1

    print("\n✅ Evaluation PASSED!")
    return 0


if __name__ == "__main__":
    # sys.exit is preferred over the site-injected exit() builtin, which
    # is not guaranteed to exist when run with -S or frozen.
    sys.exit(main())