test_llm_evaluator.py
1 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> 2 # 3 # SPDX-License-Identifier: Apache-2.0 4 5 6 import pytest 7 8 from haystack import Pipeline 9 from haystack.components.evaluators import LLMEvaluator 10 from haystack.components.generators.chat.openai import OpenAIChatGenerator 11 from haystack.dataclasses.chat_message import ChatMessage 12 13 14 class TestLLMEvaluator: 15 def test_init_default(self, monkeypatch): 16 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 17 component = LLMEvaluator( 18 instructions="test-instruction", 19 inputs=[("predicted_answers", list[str])], 20 outputs=["score"], 21 examples=[ 22 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 23 ], 24 ) 25 assert component.instructions == "test-instruction" 26 assert component.inputs == [("predicted_answers", list[str])] 27 assert component.outputs == ["score"] 28 assert component.examples == [ 29 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 30 ] 31 32 assert isinstance(component._chat_generator, OpenAIChatGenerator) 33 assert component._chat_generator.client.api_key == "test-api-key" 34 assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42} 35 36 def test_init_fail_wo_openai_api_key(self, monkeypatch): 37 monkeypatch.delenv("OPENAI_API_KEY", raising=False) 38 with pytest.raises(ValueError, match="None of the .* environment variables are set"): 39 LLMEvaluator( 40 instructions="test-instruction", 41 inputs=[("predicted_answers", list[str])], 42 outputs=["score"], 43 examples=[ 44 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 45 ], 46 ) 47 48 def test_init_with_chat_generator(self, monkeypatch): 49 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 50 chat_generator = OpenAIChatGenerator(generation_kwargs={"custom_key": "custom_value"}) 51 component = LLMEvaluator( 52 instructions="test-instruction", 53 chat_generator=chat_generator, 54 inputs=[("predicted_answers", list[str])], 55 outputs=["custom_score"], 56 examples=[ 57 {"inputs": {"predicted_answers": "answer 1"}, "outputs": {"custom_score": 1}}, 58 {"inputs": {"predicted_answers": "answer 2"}, "outputs": {"custom_score": 0}}, 59 ], 60 ) 61 62 assert component._chat_generator is chat_generator 63 64 def test_init_with_invalid_parameters(self, monkeypatch): 65 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 66 # Invalid inputs 67 with pytest.raises(ValueError): 68 LLMEvaluator( 69 instructions="test-instruction", 70 inputs={("predicted_answers", list[str])}, 71 outputs=["score"], 72 examples=[ 73 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 74 ], 75 ) 76 with pytest.raises(ValueError): 77 LLMEvaluator( 78 instructions="test-instruction", 79 inputs=[(list[str], "predicted_answers")], 80 outputs=["score"], 81 examples=[ 82 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 83 ], 84 ) 85 with pytest.raises(ValueError): 86 LLMEvaluator( 87 instructions="test-instruction", 88 inputs=[list[str]], 89 outputs=["score"], 90 examples=[ 91 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 92 ], 93 ) 94 with pytest.raises(ValueError): 95 LLMEvaluator( 96 instructions="test-instruction", 97 inputs={("predicted_answers", str)}, 98 outputs=["score"], 99 examples=[ 100 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 101 ], 102 ) 103 104 # Invalid outputs 105 with pytest.raises(ValueError): 106 LLMEvaluator( 107 instructions="test-instruction", 108 inputs=[("predicted_answers", list[str])], 109 outputs="score", 110 examples=[ 111 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 112 ], 113 ) 114 with pytest.raises(ValueError): 115 LLMEvaluator( 116 instructions="test-instruction", 117 inputs=[("predicted_answers", list[str])], 118 outputs=[["score"]], 119 examples=[ 120 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 121 ], 122 ) 123 124 # Invalid examples 125 with pytest.raises(ValueError): 126 LLMEvaluator( 127 instructions="test-instruction", 128 inputs=[("predicted_answers", list[str])], 129 outputs=["score"], 130 examples={ 131 "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, 132 "outputs": {"custom_score": 1}, 133 }, 134 ) 135 with pytest.raises(ValueError): 136 LLMEvaluator( 137 instructions="test-instruction", 138 inputs=[("predicted_answers", list[str])], 139 outputs=["score"], 140 examples=[ 141 [ 142 { 143 "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, 144 "outputs": {"custom_score": 1}, 145 } 146 ] 147 ], 148 ) 149 with pytest.raises(ValueError): 150 LLMEvaluator( 151 instructions="test-instruction", 152 inputs=[("predicted_answers", list[str])], 153 outputs=["score"], 154 examples=[ 155 { 156 "wrong_key": {"predicted_answers": "Damn, this is straight outta hell!!!"}, 157 "outputs": {"custom_score": 1}, 158 } 159 ], 160 ) 161 with pytest.raises(ValueError): 162 LLMEvaluator( 163 instructions="test-instruction", 164 inputs=[("predicted_answers", list[str])], 165 outputs=["score"], 166 examples=[ 167 { 168 "inputs": [{"predicted_answers": "Damn, this is straight outta hell!!!"}], 169 "outputs": [{"custom_score": 1}], 170 } 171 ], 172 ) 173 with pytest.raises(ValueError): 174 LLMEvaluator( 175 instructions="test-instruction", 176 inputs=[("predicted_answers", list[str])], 177 outputs=["score"], 178 examples=[{"inputs": {1: "Damn, this is straight outta hell!!!"}, "outputs": {2: 1}}], 179 ) 180 181 def test_to_dict_default(self, monkeypatch): 182 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 183 chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42}) 184 185 component = LLMEvaluator( 186 instructions="test-instruction", 187 inputs=[("predicted_answers", list[str])], 188 outputs=["score"], 189 examples=[ 190 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 191 ], 192 ) 193 data = component.to_dict() 194 assert data == { 195 "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", 196 "init_parameters": { 197 "chat_generator": chat_generator.to_dict(), 198 "instructions": "test-instruction", 199 "inputs": [["predicted_answers", "list[str]"]], 200 "outputs": ["score"], 201 "progress_bar": True, 202 "examples": [ 203 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 204 ], 205 }, 206 } 207 208 def test_to_dict_with_parameters(self, monkeypatch): 209 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 210 chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42}) 211 212 component = LLMEvaluator( 213 instructions="test-instruction", 214 inputs=[("predicted_answers", list[str])], 215 outputs=["custom_score"], 216 examples=[ 217 { 218 "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, 219 "outputs": {"custom_score": 1}, 220 }, 221 { 222 "inputs": {"predicted_answers": "Football is the most popular sport."}, 223 "outputs": {"custom_score": 0}, 224 }, 225 ], 226 ) 227 data = component.to_dict() 228 assert data == { 229 "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", 230 "init_parameters": { 231 "chat_generator": chat_generator.to_dict(), 232 "instructions": "test-instruction", 233 "inputs": [["predicted_answers", "list[str]"]], 234 "outputs": ["custom_score"], 235 "progress_bar": True, 236 "examples": [ 237 { 238 "inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, 239 "outputs": {"custom_score": 1}, 240 }, 241 { 242 "inputs": {"predicted_answers": "Football is the most popular sport."}, 243 "outputs": {"custom_score": 0}, 244 }, 245 ], 246 }, 247 } 248 249 def test_from_dict(self, monkeypatch): 250 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 251 chat_generator = OpenAIChatGenerator(generation_kwargs={"response_format": {"type": "json_object"}, "seed": 42}) 252 253 data = { 254 "type": "haystack.components.evaluators.llm_evaluator.LLMEvaluator", 255 "init_parameters": { 256 "chat_generator": chat_generator.to_dict(), 257 "instructions": "test-instruction", 258 "inputs": [["predicted_answers", "list[str]"]], 259 "outputs": ["score"], 260 "examples": [ 261 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 262 ], 263 }, 264 } 265 266 component = LLMEvaluator.from_dict(data) 267 assert isinstance(component._chat_generator, OpenAIChatGenerator) 268 assert component._chat_generator.client.api_key == "test-api-key" 269 assert component._chat_generator.generation_kwargs == {"response_format": {"type": "json_object"}, "seed": 42} 270 assert component.instructions == "test-instruction" 271 assert component.inputs == [("predicted_answers", list[str])] 272 assert component.outputs == ["score"] 273 assert component.examples == [ 274 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 275 ] 276 277 def test_pipeline_serde(self, monkeypatch): 278 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 279 pipeline = Pipeline() 280 component = LLMEvaluator( 281 instructions="test-instruction", 282 inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])], 283 outputs=["score"], 284 examples=[ 285 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 286 ], 287 ) 288 pipeline.add_component("evaluator", component) 289 serialized_pipeline = pipeline.dumps() 290 deserialized_pipeline = Pipeline.loads(serialized_pipeline) 291 assert deserialized_pipeline == pipeline 292 293 def test_run_with_different_lengths(self, monkeypatch): 294 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 295 component = LLMEvaluator( 296 instructions="test-instruction", 297 inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])], 298 outputs=["score"], 299 examples=[ 300 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 301 ], 302 ) 303 304 def chat_generator_run(self, *args, **kwargs): 305 return {"replies": [ChatMessage.from_assistant('{"score": 0.5}')]} 306 307 monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) 308 309 with pytest.raises(ValueError): 310 component.run(questions=["What is the capital of Germany?"], predicted_answers=[["Berlin"], ["Paris"]]) 311 312 with pytest.raises(ValueError): 313 component.run( 314 questions=["What is the capital of Germany?", "What is the capital of France?"], 315 predicted_answers=[["Berlin"]], 316 ) 317 318 def test_run_returns_parsed_result(self, monkeypatch): 319 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 320 component = LLMEvaluator( 321 instructions="test-instruction", 322 inputs=[("questions", list[str]), ("predicted_answers", list[list[str]])], 323 outputs=["score"], 324 examples=[ 325 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 326 ], 327 ) 328 329 def chat_generator_run(self, *args, **kwargs): 330 return {"replies": [ChatMessage.from_assistant('{"score": 0.5}')]} 331 332 monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) 333 334 results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"]) 335 assert results == {"results": [{"score": 0.5}], "meta": None} 336 337 def test_prepare_template(self, monkeypatch): 338 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 339 component = LLMEvaluator( 340 instructions="test-instruction", 341 inputs=[("predicted_answers", list[str])], 342 outputs=["score"], 343 examples=[ 344 {"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}}, 345 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}, 346 ], 347 ) 348 template = component.prepare_template() 349 assert ( 350 template 351 == "Instructions:\ntest-instruction\n\nGenerate the response in JSON format with the following keys:" 352 '\n["score"]\nConsider the instructions and the examples below to determine those values.\n\n' 353 'Examples:\nInputs:\n{"predicted_answers": "Damn, this is straight outta hell!!!"}\nOutputs:' 354 '\n{"score": 1}\nInputs:\n{"predicted_answers": "Football is the most popular sport."}\nOutputs:' 355 '\n{"score": 0}\n\nInputs:\n{"predicted_answers": {{ predicted_answers }}}\nOutputs:\n' 356 ) 357 358 def test_invalid_input_parameters(self, monkeypatch): 359 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 360 component = LLMEvaluator( 361 instructions="test-instruction", 362 inputs=[("predicted_answers", list[str])], 363 outputs=["score"], 364 examples=[ 365 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 366 ], 367 ) 368 # None of the expected parameters are received 369 with pytest.raises(ValueError): 370 component.validate_input_parameters( 371 expected={"predicted_answers": list[str]}, received={"questions": list[str]} 372 ) 373 374 # Only one but not all the expected parameters are received 375 with pytest.raises(ValueError): 376 component.validate_input_parameters( 377 expected={"predicted_answers": list[str], "questions": list[str]}, received={"questions": list[str]} 378 ) 379 380 # Received inputs are not lists 381 with pytest.raises(ValueError): 382 component.validate_input_parameters(expected={"questions": list[str]}, received={"questions": str}) 383 384 def test_invalid_outputs(self, monkeypatch): 385 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 386 component = LLMEvaluator( 387 instructions="test-instruction", 388 inputs=[("predicted_answers", list[str])], 389 outputs=["score"], 390 examples=[ 391 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 392 ], 393 ) 394 395 def chat_generator_run(self, *args, **kwargs): 396 return {"replies": [ChatMessage.from_assistant('{"score": 1.0}')]} 397 398 monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) 399 400 # Test missing key "another_expected_output" 401 component.outputs = ["score", "another_expected_output"] 402 with pytest.raises(ValueError, match="Missing expected keys"): 403 component.run(predicted_answers=["answer"]) 404 405 # Test wrong key 406 def chat_generator_run_wrong_key(self, *args, **kwargs): 407 return {"replies": [ChatMessage.from_assistant('{"wrong_name": 1.0}')]} 408 409 monkeypatch.setattr( 410 "haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run_wrong_key 411 ) 412 413 component.outputs = ["score"] 414 with pytest.raises(ValueError, match="Missing expected keys"): 415 component.run(predicted_answers=["answer"]) 416 417 def test_output_invalid_json_raise_on_failure_false(self, monkeypatch): 418 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 419 component = LLMEvaluator( 420 instructions="test-instruction", 421 inputs=[("predicted_answers", list[str])], 422 outputs=["score"], 423 examples=[ 424 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 425 ], 426 raise_on_failure=False, 427 ) 428 429 def chat_generator_run(self, *args, **kwargs): 430 return {"replies": [ChatMessage.from_assistant("some_invalid_json_output")]} 431 432 monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) 433 434 result = component.run(predicted_answers=["answer"]) 435 assert result["results"] == [None] 436 437 def test_output_invalid_json_raise_on_failure_true(self, monkeypatch): 438 monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") 439 component = LLMEvaluator( 440 instructions="test-instruction", 441 inputs=[("predicted_answers", list[str])], 442 outputs=["score"], 443 examples=[ 444 {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}} 445 ], 446 raise_on_failure=True, 447 ) 448 449 def chat_generator_run(self, *args, **kwargs): 450 return {"replies": [ChatMessage.from_assistant("some_invalid_json_output")]} 451 452 monkeypatch.setattr("haystack.components.evaluators.llm_evaluator.OpenAIChatGenerator.run", chat_generator_run) 453 454 with pytest.raises( 455 ValueError 456 ): # json_utils/LLMEvaluator might raise JSONDecodeError which inherits from ValueError or wrapped 457 component.run(predicted_answers=["answer"])