# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0


import pytest

from haystack import Pipeline
from haystack.components.evaluators import LLMEvaluator
from haystack.components.generators.chat.openai import OpenAIChatGenerator
from haystack.dataclasses.chat_message import ChatMessage


class TestLLMEvaluator:
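    """Unit tests for LLMEvaluator: initialization, validation, serialization, and run behavior."""
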
    def test_init_default(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )
        assert component.instructions == "test-instruction"
        assert component.inputs == [("predicted_answers", list[str])]
        assert component.outputs == ["score"]
        assert component.examples == [
            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
        ]

        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}

    def test_init_fail_wo_openai_api_key(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="None of the .* environment variables are set"):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )

    def test_init_with_chat_generator(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"custom_key": "custom_value"})
        component = LLMEvaluator(
            instructions="test-instruction",
            chat_generator=chat_generator,
            inputs=[("predicted_answers", list[str])],
            outputs=["custom_score"],
            examples=[
                {"inputs": {"predicted_answers": "answer 1"}, "outputs": {"custom_score": 1}},
                {"inputs": {"predicted_answers": "answer 2"}, "outputs": {"custom_score": 0}},
            ],
        )

        assert component._chat_generator is chat_generator

    def test_init_with_invalid_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        # Invalid inputs
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs={("predicted_answers", list[str])},
                outputs=["score"],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[(list[str], "predicted_answers")],
                outputs=["score"],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[list[str]],
                outputs=["score"],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", str)],
                outputs=["score"],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )

        # Invalid outputs
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs="score",
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=[["score"]],
                examples=[
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            )

        # Invalid examples
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples={
                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                    "outputs": {"custom_score": 1},
                },
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples=[
                    [
                        {
                            "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                            "outputs": {"custom_score": 1},
                        }
                    ]
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples=[
                    {
                        "wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                        "outputs": {"custom_score": 1},
                    }
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples=[
                    {
                        "inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}],
                        "outputs": [{"custom_score": 1}],
                    }
                ],
            )
        with pytest.raises(ValueError):
            LLMEvaluator(
                instructions="test-instruction",
                inputs=[("predicted_answers", list[str])],
                outputs=["score"],
                examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}],
            )

    def test_to_dict_default(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})

        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "instructions": "test-instruction",
                "inputs": [["predicted_answers", "list[str]"]],
                "outputs": ["score"],
                "progress_bar": True,
                "examples": [
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            },
        }

    def test_to_dict_with_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})

        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["custom_score"],
            examples=[
                {
                    "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                    "outputs": {"custom_score": 1},
                },
                {
                    "inputs": {"predicted_answers": "Football is the most popular sport."},
                    "outputs": {"custom_score": 0},
                },
            ],
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "instructions": "test-instruction",
                "inputs": [["predicted_answers", "list[str]"]],
                "outputs": ["custom_score"],
                "progress_bar": True,
                "examples": [
                    {
                        "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
                        "outputs": {"custom_score": 1},
                    },
                    {
                        "inputs": {"predicted_answers": "Football is the most popular sport."},
                        "outputs": {"custom_score": 0},
                    },
                ],
            },
        }

    def test_from_dict(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42})

        data = {
            "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator",
            "init_parameters": {
                "chat_generator": chat_generator.to_dict(),
                "instructions": "test-instruction",
                "inputs": [["predicted_answers", "list[str]"]],
                "outputs": ["score"],
                "examples": [
                    {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
                ],
            },
        }

        component = LLMEvaluator.from_dict(data)
        assert isinstance(component._chat_generator, OpenAIChatGenerator)
        assert component._chat_generator.client.api_key == "test-api-key"
        assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42}
        assert component.instructions == "test-instruction"
        assert component.inputs == [("predicted_answers", list[str])]
        assert component.outputs == ["score"]
        assert component.examples == [
            {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
        ]

    def test_pipeline_serde(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        pipeline = Pipeline()
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )
        pipeline.add_component("evaluator", component)
        serialized_pipeline = pipeline.dumps()
        deserialized_pipeline = Pipeline.loads(serialized_pipeline)
        assert deserialized_pipeline == pipeline

    def test_run_with_different_lengths(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )

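        # Stub the chat generator's run method so the test never issues a real OpenAI request.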
        def chat_generator_run(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant('{"score": 0.5}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        with pytest.raises(ValueError):
            component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]])

        with pytest.raises(ValueError):
            component.run(
                questions=["What is the capital of Germany?", "What is the capital of France?"],
                predicted_answers=[["Berlin"]],
            )

    def test_run_returns_parsed_result(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )

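        # Stub the chat generator to return a fixed, valid JSON reply.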
        def chat_generator_run(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant('{"score": 0.5}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        results = component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"]])
        assert results == {"results": [{"score": 0.5}], "meta": None}

    def test_prepare_template(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
            ],
        )
        template = component.prepare_template()
        assert (
            template
            == "Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:"
            '\n["score"]\nConsider the instructions and the examples below to determine those values.\n\n'
            'Examples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:'
            '\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:'
            '\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n'
        )

    def test_invalid_input_parameters(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )
        # None of the expected parameters are received
        with pytest.raises(ValueError):
            component.validate_input_parameters(
                expected={"predicted_answers": list[str]}, received={"questions": list[str]}
            )

        # Only one but not all the expected parameters are received
        with pytest.raises(ValueError):
            component.validate_input_parameters(
                expected={"predicted_answers": list[str], "questions": list[str]}, received={"questions": list[str]}
            )

        # Received inputs are not lists
        with pytest.raises(ValueError):
            component.validate_input_parameters(expected={"questions": list[str]}, received={"questions": str})

    def test_invalid_outputs(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
        )

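        # Stub the chat generator to reply with valid JSON that contains only a "score" key.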
        def chat_generator_run(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant('{"score": 1.0}')]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        # Test missing key "another_expected_output"
        component.outputs = ["score", "another_expected_output"]
        with pytest.raises(ValueError, match="Missing expected keys"):
            component.run(predicted_answers=["answer"])

        # Test wrong key
        def chat_generator_run_wrong_key(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant('{"wrong_name": 1.0}')]}

        monkeypatch.setattr(
            "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run_wrong_key
        )

        component.outputs = ["score"]
        with pytest.raises(ValueError, match="Missing expected keys"):
            component.run(predicted_answers=["answer"])

    def test_output_invalid_json_raise_on_failure_false(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
            raise_on_failure=False,
        )

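        # Stub the chat generator to reply with a string that cannot be parsed as JSON.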
        def chat_generator_run(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant("some_invalid_json_output")]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        result = component.run(predicted_answers=["answer"])
        assert result["results"] == [None]

    def test_output_invalid_json_raise_on_failure_true(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
        component = LLMEvaluator(
            instructions="test-instruction",
            inputs=[("predicted_answers", list[str])],
            outputs=["score"],
            examples=[
                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
            ],
            raise_on_failure=True,
        )

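        # Stub the chat generator to reply with a string that cannot be parsed as JSON.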
        def chat_generator_run(self, *args, **kwargs):
            return {"replies": [ChatMessage.from_assistant("some_invalid_json_output")]}

        monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run)

        with pytest.raises(ValueError):  # json.JSONDecodeError is a subclass of ValueError
            component.run(predicted_answers=["answer"])