# test/components/evaluators/test_context_relevance_evaluator.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import math
  6  import os
  7  
  8  import pytest
  9  
 10  from haystack import Pipeline
 11  from haystack.components.evaluators import ContextRelevanceEvaluator
 12  from haystack.components.generators.chat.openai import OpenAIChatGenerator
 13  from haystack.dataclasses.chat_message import ChatMessage
 14  from haystack.utils.auth import Secret
 15  
 16  
class TestContextRelevanceEvaluator:
    """Unit tests for ``ContextRelevanceEvaluator``.

    No real OpenAI calls are made: unit tests set a fake ``OPENAI_API_KEY`` and, where a
    model response is needed, monkeypatch ``OpenAIChatGenerator.run`` inside the
    ``llm_evaluator`` module. Only ``test_live_run`` (marked ``integration`` and skipped
    without a real key) talks to the API.
    """

    def test_init_default(self, monkeypatch):
        """Default construction exposes the expected instructions, input/output spec,
        few-shot examples, and an OpenAI chat generator configured for JSON output."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        assert component.instructions == (
            "Please extract only sentences from the provided context which are absolutely relevant and "
            "required to answer the following question. If no relevant sentences are found, or if you "
            "believe the question cannot be answered from the given context, return an empty list, example: []"
        )
        assert component.inputs == [("questions", list[str]), ("contexts", list[list[str]])]
        assert component.outputs == ["relevant_statements"]
        # Built-in few-shot examples shipped with the evaluator's default prompt.
        assert component.examples == [
            {
                "inputs": {
                    "questions": "What is the capital of Germany?",
                    "contexts": ["Berlin is the capital of Germany. Berlin and was founded in 1244."],
                },
                "outputs": {"relevant_statements": ["Berlin is the capital of Germany."]},
            },
            {
                "inputs": {
                    "questions": "What is the capital of France?",
                    "contexts": [
                        "Berlin is the capital of Germany and was founded in 1244.",
                        "Europe is a continent with 44 countries.",
                        "Madrid is the capital of Spain.",
                    ],
                },
                "outputs": {"relevant_statements": []},
            },
            {
                "inputs": {"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."]},
                "outputs": {"relevant_statements": ["Rome is the capital of Italy."]},
            },
        ]

        # The default generator must request structured JSON with a fixed seed for determinism.
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}

    def test_init_fail_wo_openai_api_key(self, monkeypatch):
        """Constructing without any OpenAI API key in the environment raises ValueError."""
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
            ContextRelevanceEvaluator()

    def test_init_with_parameters(self, monkeypatch):
        """Custom ``examples`` passed at init replace the built-in few-shot examples."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator(
            examples=[
                {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
                {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
            ]
        )

        assert component.examples == [
            {"inputs": {"questions": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
            {"inputs": {"questions": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
        ]

        # Generator defaults are unaffected by overriding examples.
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}

    def test_init_with_chat_generator(self, monkeypatch):
        """A user-supplied chat generator is used as-is (same object, no copy)."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})
        component = ContextRelevanceEvaluator(chat_generator=chat_generator)

        assert component._chat_generator is chat_generator

    def test_to_dict_with_parameters(self, monkeypatch):
        """``to_dict`` serializes the nested generator and all init parameters."""
        # Use a custom env var to verify the Secret round-trips through serialization.
        monkeypatch.setenv("ENV_VAR", "test-api-key")
        chat_generator = OpenAIChatGenerator(
            generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42},
            api_key=Secret.from_env_var("ENV_VAR"),
        )

        component = ContextRelevanceEvaluator(
            chat_generator=chat_generator,
            examples=[{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
            raise_on_failure=False,
            progress_bar=False,
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
                "progress_bar": False,
                "raise_on_failure": False,
            },
        }

    def test_from_dict(self, monkeypatch):
        """``from_dict`` reconstructs the generator (with kwargs) and the custom examples."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})

        data = {
            "type": "haystack.components.evaluators.context_relevance.ContextRelevanceEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "examples": [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}],
            },
        }

        component = ContextRelevanceEvaluator.from_dict(data)
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}
        assert component.examples == [{"inputs": {"questions": "What is football?"}, "outputs": {"score": 0}}]

    def test_pipeline_serde(self, monkeypatch):
        """The evaluator survives a full pipeline dumps/loads round trip unchanged."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")

        component = ContextRelevanceEvaluator()
        pipeline = Pipeline()
        pipeline.add_component("evaluator", component)

        serialized_pipeline = pipeline.dumps()
        deserialized_pipeline = Pipeline.loads(serialized_pipeline)
        assert deserialized_pipeline == pipeline

    def test_run_calculates_mean_score(self, monkeypatch):
        """``run`` averages per-question scores into an overall ``score``."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        # Fake the LLM: route on the prompt text so the "Football" question scores 1
        # and every other question scores 0.
        def chat_generator_run(self, *args, **kwargs):
            if "Football" in kwargs["messages"][0].text:
                return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["a", "b"], "score": 1}')]}
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": [], "score": 0}')]}

        # Patch where the generator is used (llm_evaluator module), not where it is defined.
        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [
                "Python is design philosophy emphasizes code readability, and its language constructs aim to help "
                "programmers write clear, logical code for both small and large-scale software projects."
            ],
        ]
        results = component.run(questions=questions, contexts=contexts)

        # Mean of individual scores [1, 0] -> 0.5.
        assert results == {
            "results": [{"score": 1, "relevant_statements": ["a", "b"]}, {"score": 0, "relevant_statements": []}],
            "score": 0.5,
            "meta": None,
            "individual_scores": [1, 0],
        }

    def test_run_no_statements_extracted(self, monkeypatch):
        """An empty context list for a question yields score 0 and no relevant statements."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()

        # Same routing fake as above: "Football" prompts score 1, everything else 0.
        def chat_generator_run(self, *args, **kwargs):
            if "Football" in kwargs["messages"][0].text:
                return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["a", "b"], "score": 1}')]}
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": [], "score": 0}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [],  # second question deliberately has no context at all
        ]
        results = component.run(questions=questions, contexts=contexts)
        assert results == {
            "results": [{"score": 1, "relevant_statements": ["a", "b"]}, {"score": 0, "relevant_statements": []}],
            "score": 0.5,
            "meta": None,
            "individual_scores": [1, 0],
        }

    def test_run_missing_parameters(self, monkeypatch):
        """Calling ``run`` without the declared inputs raises a descriptive ValueError."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator()
        with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
            component.run()

    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
        """With ``raise_on_failure=False`` a failed LLM call yields NaN instead of raising,
        and the aggregate score becomes NaN as well."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = ContextRelevanceEvaluator(raise_on_failure=False)

        # Fake the LLM: the "Python" question simulates an API failure; others succeed.
        def chat_generator_run(self, *args, **kwargs):
            if "Python" in kwargs["messages"][0].text:
                raise Exception("OpenAI API request failed.")
            return {"replies": [ChatMessage.from_assistant('{"relevant_statements": ["c", "d"], "score": 1}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
        contexts = [
            [
                "The popularity of sports can be measured in various ways, including TV viewership, social media "
                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
                "Messi, drawing a followership of more than 4 billion people."
            ],
            [
                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
                "programmers write clear, logical code for both small and large-scale software projects."
            ],
        ]
        results = component.run(questions=questions, contexts=contexts)

        # NaN != NaN, so the failing entries must be checked with math.isnan.
        assert math.isnan(results["score"])
        assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
        assert results["results"][1]["relevant_statements"] == []
        assert math.isnan(results["results"][1]["score"])

    @pytest.mark.skipif(
        not os.environ.get("OPENAI_API_KEY", None),
        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
    )
    @pytest.mark.integration
    def test_live_run(self):
        """End-to-end smoke test against the real OpenAI API: checks result shape and
        token-usage metadata, not specific scores."""
        questions = ["Who created the Python language?"]
        contexts = [["Python, created by Guido van Rossum, is a high-level general-purpose programming language."]]

        evaluator = ContextRelevanceEvaluator(chat_generator=OpenAIChatGenerator(model="gpt-4.1-nano"))
        result = evaluator.run(questions=questions, contexts=contexts)

        required_fields = {"results"}
        assert all(field in result for field in required_fields)
        nested_required_fields = {"score", "relevant_statements"}
        assert all(field in result["results"][0] for field in nested_required_fields)

        # Live runs must surface OpenAI usage accounting in the metadata.
        assert "meta" in result
        assert "prompt_tokens" in result["meta"][0]["usage"]
        assert "completion_tokens" in result["meta"][0]["usage"]
        assert "total_tokens" in result["meta"][0]["usage"]