# test_context_relevance_evaluator.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import math
import os

import pytest

from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.dataclasses.chat_message import ChatMessage
from haystack.utils.auth import Secret


class TestContextRelevanceEvaluator:
    """Unit and integration tests for the ContextRelevanceEvaluator component."""

    def test_init_default(self, monkeypatch):
        """Default construction wires up the expected prompt instructions, I/O schema,
        few-shot examples, and an OpenAI chat generator configured for JSON-object output."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        assert component.instructions == (
            "Please extract only sentences from the provided context which are absolutely relevant and "
            "required to answer the following question. If no relevant sentences are found, or if you "
            "believe the question cannot be answered from the given context, return an empty list, example: []"
        )
        assert component.inputs == [("questions", list[str]), ("contexts", list[list[str]])]
        assert component.outputs == ["relevant_statements"]
        assert component.examples == [
            {
                "inputs": {
                    "questions": "What is the capital of Germany?",
                    "contexts": ["Berlin is the capital of Germany. Berlin and was founded in 1244."],
                },
                "outputs": {"relevant_statements": ["Berlin is the capital of Germany."]},
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": [
                        "Berlin is the capital of Germany and was founded in 1244.",
                        "Europe is a continent with 44 countries.",
                        "Madrid is the capital of Spain.",
                    ],
                },
                "outputs": {"relevant_statements": []},
            },
            {
                "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
                "outputs": {"relevant_statements": ["Rome is the capital of Italy."]},
            },
        ]

        # The evaluator builds its own OpenAI generator when none is passed in.
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}

    def test_init_fail_wo_openai_api_key(self, monkeypatch):
        """Construction without an API key in the environment raises a ValueError."""
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
            ContextRelevanceEvaluator()

    def test_init_with_parameters(self, monkeypatch):
        """Custom few-shot examples passed at init replace the built-in defaults."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator(
            examples=[
                {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
            ]
        )

        assert component.examples == [
            {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
            {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
        ]

        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}

    def test_init_with_chat_generator(self, monkeypatch):
        """A user-supplied chat generator is used as-is (same object, not a copy)."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})
        component = ContextRelevanceEvaluator(chat_generator=chat_generator)

        assert component._chat_generator is chat_generator

    def test_to_dict_with_parameters(self, monkeypatch):
        """to_dict serializes the nested chat generator and all non-default init parameters."""
        monkeypatch.setenv("ENV_VAR", "test-api-key")
        chat_generator = OpenAIChatGenerator(
            generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42},
            api_key=Secret.from_env_var("ENV_VAR"),
        )

        component = ContextRelevanceEvaluator(
            chat_generator=chat_generator,
            examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
            raise_on_failure=False,
            progress_bar=False,
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
                "progress_bar": False,
                "raise_on_failure": False,
            },
        }

    def test_from_dict(self, monkeypatch):
        """from_dict rebuilds the evaluator, including the nested chat generator and examples."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})

        data = {
            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
            },
        }

        component = ContextRelevanceEvaluator.from_dict(data)
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}
        assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

    def test_pipeline_serde(self, monkeypatch):
        """The evaluator round-trips through Pipeline dumps/loads without loss."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

        component = ContextRelevanceEvaluator()
        pipeline = Pipeline()
        pipeline.add_component("evaluator", component)

        serialized_pipeline = pipeline.dumps()
        deserialized_pipeline = Pipeline.loads(serialized_pipeline)
        assert deserialized_pipeline == pipeline

    def test_run_calculates_mean_score(self, monkeypatch):
        """run() averages the per-question scores into the top-level 'score'."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        # Stub out the LLM call: the prompt containing "Football" scores 1,
        # anything else scores 0. `self` is needed because the function is
        # patched onto the class as an unbound method.
        def chat_generator_run(self, *args, **kwargs):
            if "Football" in kwargs["messages"][0].text:
                return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["a", "b"], "score": 1}')]}
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": [], "score": 0}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [
                # NOTE(review): this sentence reads as truncated ("Python is design
                # philosophy emphasizes ...") — harmless here since the stub only keys
                # on "Football", but worth confirming against upstream.
                "Python is design philosophy emphasizes code readability, and its language constructs aim to help "
                "programmers write clear, logical code for both small and large-scale software projects."
            ],
        ]
        results = component.run(questions=questions, contexts=contexts)

        # Mean of [1, 0] -> 0.5; per-question details are preserved in "results".
        assert results == {
            "results": [{"score": 1, "relevant_statements": ["a", "b"]}, {"score": 0, "relevant_statements": []}],
            "score": 0.5,
            "meta": None,
            "individual_scores": [1, 0],
        }

    def test_run_no_statements_extracted(self, monkeypatch):
        """An empty context list for a question still yields a well-formed zero-score result."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        # Same discriminating stub as in test_run_calculates_mean_score.
        def chat_generator_run(self, *args, **kwargs):
            if "Football" in kwargs["messages"][0].text:
                return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["a", "b"], "score": 1}')]}
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": [], "score": 0}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [],  # no context provided for the second question
        ]
        results = component.run(questions=questions, contexts=contexts)
        assert results == {
            "results": [{"score": 1, "relevant_statements": ["a", "b"]}, {"score": 0, "relevant_statements": []}],
            "score": 0.5,
            "meta": None,
            "individual_scores": [1, 0],
        }

    def test_run_missing_parameters(self, monkeypatch):
        """Calling run() without the required inputs raises a descriptive ValueError."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()
        with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
            component.run()

    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
        """With raise_on_failure=False, a failed LLM call yields NaN for that item
        (and for the aggregate score) instead of propagating the exception."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator(raise_on_failure=False)

        # Simulate an API failure for the "Python" question only.
        def chat_generator_run(self, *args, **kwargs):
            if "Python" in kwargs["messages"][0].text:
                raise Exception("OpenAI API request failed.")
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["c", "d"], "score": 1}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [
                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
                "programmers write clear, logical code for both small and large-scale software projects."
            ],
        ]
        results = component.run(questions=questions, contexts=contexts)

        # NaN != NaN, so the failed item must be checked with math.isnan rather than ==.
        assert math.isnan(results["score"])
        assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
        assert results["results"][1]["relevant_statements"] == []
        assert math.isnan(results["results"][1]["score"])

    @pytest.mark.skipif(
        not os.environ.get("OPENAI_API_KEY", None),
        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
    )
    @pytest.mark.integration
    def test_live_run(self):
        """Integration test against the real OpenAI API: checks the result shape
        (results with score/relevant_statements) and token-usage metadata."""
        questions = ["Who created the Python language?"]
        contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]

        evaluator = ContextRelevanceEvaluator(chat_generator=OpenAIChatGenerator(model="gpt-4.1-nano"))
        result = evaluator.run(questions=questions, contexts=contexts)

        required_fields = {"results"}
        assert all(field in result for field in required_fields)
        nested_required_fields = {"score", "relevant_statements"}
        assert all(field in result["results"][0] for field in nested_required_fields)

        assert "meta" in result
        assert "prompt_tokens" in result["meta"][0]["usage"]
        assert "completion_tokens" in result["meta"][0]["usage"]
        assert "total_tokens" in result["meta"][0]["usage"]