# rag_evaluation.py
"""
RAG Evaluation Example

This example demonstrates how to evaluate RAG quality by testing
agent responses against expected answers.

Usage:
    python rag_evaluation.py
"""

import sys

from praisonaiagents import Agent


# Sample knowledge for evaluation
SAMPLE_KNOWLEDGE = """
Product Overview:
CloudManager Pro is an enterprise cloud management platform.
Its main purpose is to help organizations monitor, optimize, and secure their multi-cloud infrastructure.

Key Features:
- Real-time monitoring across AWS, Azure, and GCP
- Cost optimization with automated recommendations
- Security compliance checking
- Automated scaling and resource management

Getting Started:
1. Sign up at cloudmanager.example.com
2. Connect your cloud accounts
3. Configure monitoring dashboards
4. Set up alerts and policies
"""


def create_test_queries() -> list:
    """Create sample test queries for evaluation.

    Each entry is a dict with:
        query: the question to send to the agent.
        expected_contains: terms the answer should mention (matched
            case-insensitively by ``evaluate_rag_agent``).
    """
    return [
        {
            "query": "What is the main purpose of CloudManager Pro?",
            "expected_contains": ["monitor", "cloud", "infrastructure"],
        },
        {
            "query": "What are the key features?",
            "expected_contains": ["monitoring", "cost", "security"],
        },
        {
            "query": "How do I get started?",
            "expected_contains": ["sign up", "connect", "configure"],
        },
    ]


def evaluate_rag_agent(agent: Agent, test_queries: list) -> dict:
    """
    Evaluate RAG agent performance on test queries.

    A query passes when at least 50% of its ``expected_contains`` terms
    appear (case-insensitively) in the agent's answer; a query with no
    expected terms passes trivially.

    Returns:
        dict with keys ``total``, ``passed``, ``accuracy`` (0.0 for an
        empty query set) and ``details`` (per-query breakdown).
    """
    results = {
        "total": len(test_queries),
        "passed": 0,
        "details": [],
    }

    for test in test_queries:
        query = test["query"]
        expected_terms = test.get("expected_contains", [])

        # Get agent response. Coerce defensively: depending on agent
        # configuration, chat() may return None or a non-string object,
        # which would crash .lower() below.
        response = agent.chat(query)
        response_text = "" if response is None else str(response)
        response_lower = response_text.lower()

        # Check if response contains expected terms
        terms_found = sum(1 for term in expected_terms if term.lower() in response_lower)
        pass_rate = terms_found / len(expected_terms) if expected_terms else 1.0
        passed = pass_rate >= 0.5  # At least 50% of terms found

        if passed:
            results["passed"] += 1

        results["details"].append({
            "query": query,
            "passed": passed,
            "terms_found": terms_found,
            "total_terms": len(expected_terms),
            "pass_rate": pass_rate,
            "answer_preview": response_text[:150],
        })

    # Guard against ZeroDivisionError when called with no queries.
    results["accuracy"] = (
        results["passed"] / results["total"] if results["total"] else 0.0
    )
    return results


def print_results(results: dict):
    """Print evaluation results in a formatted way."""
    print("\n" + "=" * 60)
    print("RAG Evaluation Results")
    print("=" * 60)

    print(f"\nTotal Queries: {results['total']}")
    print(f"Passed: {results['passed']}/{results['total']}")
    print(f"Accuracy: {results['accuracy']:.1%}")

    print("\n" + "-" * 60)
    print("Detailed Results:")
    print("-" * 60)

    for i, detail in enumerate(results["details"], 1):
        status = "✅ PASS" if detail["passed"] else "❌ FAIL"
        print(f"\n[{i}] {status}: {detail['query']}")
        print(f"    Terms Found: {detail['terms_found']}/{detail['total_terms']}")
        print(f"    Answer: {detail['answer_preview']}...")


def main():
    """Run the end-to-end evaluation; return 0 on pass, 1 on failure."""
    print("=" * 60)
    print("RAG Evaluation Example")
    print("=" * 60)

    # Create agent with knowledge
    agent = Agent(
        name="Product Expert",
        instructions=f"""You are a product expert who answers questions accurately.
Use only the following knowledge to answer questions.

KNOWLEDGE:
{SAMPLE_KNOWLEDGE}""",
        output="silent"
    )

    # Get test queries
    test_queries = create_test_queries()

    # Run evaluation
    print(f"\nEvaluating {len(test_queries)} queries...")
    results = evaluate_rag_agent(agent, test_queries)

    # Print results
    print_results(results)

    # Return status (process exit code: nonzero signals failure to CI)
    if results["accuracy"] < 0.8:
        print("\n⚠️ WARNING: Accuracy below 80% threshold!")
        return 1

    print("\n✅ Evaluation PASSED!")
    return 0


if __name__ == "__main__":
    # sys.exit is preferred over the site-injected exit() builtin, which
    # is not guaranteed to exist when run with -S or frozen.
    sys.exit(main())