test_scorers.py
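"""Tests for the ``mlflow scorers`` CLI command group: listing registered and
builtin scorers, and registering LLM judge scorers via ``register-llm-judge``."""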
import json
from typing import Any
from unittest.mock import patch

import pytest
from click.testing import CliRunner

import mlflow
from mlflow.cli.scorers import commands
from mlflow.exceptions import MlflowException
from mlflow.genai.scorers import get_all_scorers, list_scorers, scorer
from mlflow.utils.string_utils import _create_table
from mlflow.utils.time import get_current_time_millis


@pytest.fixture
def mock_databricks_environment():
    with patch("mlflow.genai.scorers.base.is_databricks_uri", return_value=True):
        yield


@pytest.fixture
def runner():
    return CliRunner(catch_exceptions=False)


@pytest.fixture
def experiment():
    """Create a test experiment."""
    experiment_id = mlflow.create_experiment(
        f"test_scorers_cli_{get_current_time_millis()}"
    )
    yield experiment_id
    mlflow.delete_experiment(experiment_id)


@pytest.fixture
def correctness_scorer():
    """Create a correctness scorer."""

    @scorer
    def _correctness_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _correctness_scorer


@pytest.fixture
def safety_scorer():
    """Create a safety scorer."""

    @scorer
    def _safety_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _safety_scorer


@pytest.fixture
def relevance_scorer():
    """Create a relevance scorer."""

    @scorer
    def _relevance_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _relevance_scorer


@pytest.fixture
def generic_scorer():
    """Create a generic test scorer."""

    @scorer
    def _generic_scorer(outputs) -> bool:
        return True

    return _generic_scorer


def test_commands_group_exists():
    assert commands.name == "scorers"
    assert commands.help is not None


def test_list_command_params():
    list_cmd = next((cmd for cmd in commands.commands.values() if cmd.name == "list"), None)
    assert list_cmd is not None
    param_names = {p.name for p in list_cmd.params}
    assert param_names == {"experiment_id", "builtin", "output"}


def test_list_scorers_table_output(
    runner: CliRunner,
    experiment: str,
    correctness_scorer: Any,
    safety_scorer: Any,
    relevance_scorer: Any,
    mock_databricks_environment: Any,
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")
    safety_scorer.register(experiment_id=experiment, name="Safety")
    relevance_scorer.register(experiment_id=experiment, name="RelevanceToQuery")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment])

    assert result.exit_code == 0

    # Construct expected table output (scorers are returned in alphabetical order)
    # Note: click.echo() adds a trailing newline
    expected_table = (
        _create_table(
            [["Correctness", ""], ["RelevanceToQuery", ""], ["Safety", ""]],
            headers=["Scorer Name", "Description"],
        )
        + "\n"
    )
    assert result.output == expected_table


def test_list_scorers_json_output(
    runner: CliRunner,
    experiment: str,
    correctness_scorer: Any,
    safety_scorer: Any,
    relevance_scorer: Any,
    mock_databricks_environment: Any,
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")
    safety_scorer.register(experiment_id=experiment, name="Safety")
    relevance_scorer.register(experiment_id=experiment, name="RelevanceToQuery")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "json"])

    assert result.exit_code == 0
    output_json = json.loads(result.output)
    expected_scorers = [
        {"name": "Correctness", "description": None},
        {"name": "RelevanceToQuery", "description": None},
        {"name": "Safety", "description": None},
    ]
    assert output_json["scorers"] == expected_scorers


@pytest.mark.parametrize(
    ("output_format", "expected_output"),
    [
        ("table", ""),
        ("json", {"scorers": []}),
    ],
)
def test_list_scorers_empty_experiment(
    runner: CliRunner, experiment: str, output_format: str, expected_output: Any
):
    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == expected_output
    else:
        # Empty table produces minimal output
        assert result.output.strip() == expected_output


def test_list_scorers_with_experiment_id_env_var(
    runner: CliRunner, experiment: str, correctness_scorer: Any, mock_databricks_environment: Any
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")

    result = runner.invoke(commands, ["list"], env={"MLFLOW_EXPERIMENT_ID": experiment})

    assert result.exit_code == 0
    assert "Correctness" in result.output


def test_list_scorers_missing_experiment_id(runner: CliRunner):
    result = runner.invoke(commands, ["list"])

    assert result.exit_code != 0
    assert "experiment-id" in result.output.lower() or "experiment_id" in result.output.lower()


def test_list_scorers_invalid_output_format(runner: CliRunner, experiment: str):
    result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "invalid"])

    assert result.exit_code != 0
    assert "invalid" in result.output.lower() or "choice" in result.output.lower()


def test_list_scorers_special_characters_in_names(
    runner: CliRunner, experiment: str, generic_scorer: Any, mock_databricks_environment: Any
):
    generic_scorer.register(experiment_id=experiment, name="Scorer With Spaces")
    generic_scorer.register(experiment_id=experiment, name="Scorer.With.Dots")
    generic_scorer.register(experiment_id=experiment, name="Scorer-With-Dashes")
    generic_scorer.register(experiment_id=experiment, name="Scorer_With_Underscores")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment])

    assert result.exit_code == 0
    assert "Scorer With Spaces" in result.output
    assert "Scorer.With.Dots" in result.output
    assert "Scorer-With-Dashes" in result.output
    assert "Scorer_With_Underscores" in result.output


@pytest.mark.parametrize(
    "output_format",
    ["table", "json"],
)
def test_list_scorers_single_scorer(
    runner: CliRunner,
    experiment: str,
    generic_scorer: Any,
    output_format: str,
    mock_databricks_environment: Any,
):
    generic_scorer.register(experiment_id=experiment, name="OnlyScorer")

    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == {"scorers": [{"name": "OnlyScorer", "description": None}]}
    else:
        assert "OnlyScorer" in result.output


@pytest.mark.parametrize(
    "output_format",
    ["table", "json"],
)
def test_list_scorers_long_names(
    runner: CliRunner,
    experiment: str,
    generic_scorer: Any,
    output_format: str,
    mock_databricks_environment: Any,
):
    long_name = "VeryLongScorerNameThatShouldNotBeTruncatedEvenIfItIsReallyReallyLong"
    generic_scorer.register(experiment_id=experiment, name=long_name)

    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == {"scorers": [{"name": long_name, "description": None}]}
    else:
        # Full name should be present
        assert long_name in result.output


def test_list_scorers_with_descriptions(runner: CliRunner, experiment: str):
    from mlflow.genai.judges import make_judge

    judge1 = make_judge(
        name="quality_judge",
        instructions="Evaluate {{ outputs }}",
        description="Evaluates response quality",
        feedback_value_type=str,
    )
    judge1.register(experiment_id=experiment)

    judge2 = make_judge(
        name="safety_judge",
        instructions="Check {{ outputs }}",
        description="Checks for safety issues",
        feedback_value_type=str,
    )
    judge2.register(experiment_id=experiment)

    judge3 = make_judge(
        name="no_desc_judge",
        instructions="Evaluate {{ outputs }}",
        feedback_value_type=str,
    )
    judge3.register(experiment_id=experiment)

    result_json = runner.invoke(
        commands, ["list", "--experiment-id", experiment, "--output", "json"]
    )
    assert result_json.exit_code == 0
    output_json = json.loads(result_json.output)

    assert len(output_json["scorers"]) == 3
    scorers_by_name = {s["name"]: s for s in output_json["scorers"]}

    assert scorers_by_name["no_desc_judge"]["description"] is None
    assert scorers_by_name["quality_judge"]["description"] == "Evaluates response quality"
    assert scorers_by_name["safety_judge"]["description"] == "Checks for safety issues"

    result_table = runner.invoke(commands, ["list", "--experiment-id", experiment])
    assert result_table.exit_code == 0
    assert "Evaluates response quality" in result_table.output
    assert "Checks for safety issues" in result_table.output


def test_create_judge_basic(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "test_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered judge scorer 'test_judge'" in result.output
    assert experiment in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "test_judge" in scorer_names


def test_create_judge_with_model(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "custom_model_judge",
            "--instructions",
            "Check {{ inputs }} and {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered with correct model
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "custom_model_judge" in scorer_names

    # Get the judge and verify it uses the specified model
    judge = next(s for s in scorers if s.name == "custom_model_judge")
    assert judge.model == "openai:/gpt-4"


def test_create_judge_short_options(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "-n",
            "short_options_judge",
            "-i",
            "Evaluate {{ outputs }}",
            "-x",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "short_options_judge" in scorer_names


def test_create_judge_with_env_var(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "env_var_judge",
            "--instructions",
            "Check {{ outputs }}",
        ],
        env={"MLFLOW_EXPERIMENT_ID": experiment},
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "env_var_judge" in scorer_names


@pytest.mark.parametrize(
    ("args", "missing_param"),
    [
        (["--instructions", "test", "--experiment-id", "123"], "name"),
        (["--name", "test", "--experiment-id", "123"], "instructions"),
        (["--name", "test", "--instructions", "test"], "experiment-id"),
    ],
)
def test_create_judge_missing_required_params(
    runner: CliRunner, args: list[str], missing_param: str
):
    result = runner.invoke(commands, ["register-llm-judge"] + args)

    assert result.exit_code != 0
    # Click typically shows "Missing option" for required parameters
    assert "missing" in result.output.lower() or "required" in result.output.lower()


def test_create_judge_invalid_prompt(runner: CliRunner, experiment: str):
    # Should raise MlflowException because make_judge validates that instructions
    # contain at least one variable
    with pytest.raises(MlflowException, match="[Tt]emplate.*variable"):
        runner.invoke(
            commands,
            [
                "register-llm-judge",
                "--name",
                "invalid_judge",
                "--instructions",
                "This has no template variables",
                "--experiment-id",
                experiment,
            ],
        )


def test_create_judge_special_characters_in_name(runner: CliRunner, experiment: str):
    # Verify experiment has no judges initially
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 0

    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "judge-with_special.chars",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify experiment has exactly one judge
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "judge-with_special.chars"


def test_create_judge_duplicate_registration(runner: CliRunner, experiment: str):
    # Create a judge
    result1 = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "duplicate_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )
    assert result1.exit_code == 0

    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "duplicate_judge"

    # Register the same judge again with same name - should succeed (replaces the old one)
    result2 = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "duplicate_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )
    assert result2.exit_code == 0

    # Verify there is still only one judge (the new one replaced the old one)
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "duplicate_judge"


def test_create_judge_with_description(runner: CliRunner, experiment: str):
    description = "Evaluates response quality and relevance"
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "judge_with_desc",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--description",
            description,
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    judge = scorers[0]
    assert judge.name == "judge_with_desc"
    assert judge.description == description


def test_create_judge_with_description_short_flag(runner: CliRunner, experiment: str):
    description = "Checks for PII in outputs"
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "-n",
            "pii_judge",
            "-i",
            "Check {{ outputs }}",
            "-d",
            description,
            "-x",
            experiment,
        ],
    )

    assert result.exit_code == 0

    scorers = list_scorers(experiment_id=experiment)
    judge = next(s for s in scorers if s.name == "pii_judge")
    assert judge.description == description


@pytest.mark.parametrize("output_format", ["table", "json"])
def test_list_builtin_scorers_output_formats(runner, output_format):
    args = ["list", "--builtin"]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        data = json.loads(result.output)
        assert "scorers" in data
        assert isinstance(data["scorers"], list)
        assert len(data["scorers"]) > 0

        # Verify each scorer has required fields
        for scorer_item in data["scorers"]:
            assert "name" in scorer_item
            assert "description" in scorer_item

        # Verify some builtin scorer names appear
        scorer_names = [s["name"] for s in data["scorers"]]
        assert "correctness" in scorer_names
        assert "relevance_to_query" in scorer_names
        assert "completeness" in scorer_names
    else:
        # Verify table headers
        assert "Scorer Name" in result.output
        assert "Description" in result.output

        # Verify some builtin scorer names appear
        assert "correctness" in result.output
        assert "relevance_to_query" in result.output
        assert "completeness" in result.output


def test_list_builtin_scorers_short_flag(runner):
    result = runner.invoke(commands, ["list", "-b"])
    assert result.exit_code == 0
    assert "Scorer Name" in result.output


def test_list_builtin_scorers_shows_all_available_scorers(runner):
    result = runner.invoke(commands, ["list", "--builtin", "--output", "json"])
    assert result.exit_code == 0

    expected_scorers = get_all_scorers()
    expected_names = {scorer.name for scorer in expected_scorers}

    data = json.loads(result.output)
    actual_names = {s["name"] for s in data["scorers"]}

    assert actual_names == expected_names


def test_list_scorers_mutually_exclusive_flags(runner, experiment):
    result = runner.invoke(commands, ["list", "--builtin", "--experiment-id", experiment])
    assert result.exit_code != 0
    assert "Cannot specify both --builtin and --experiment-id" in result.output


def test_list_scorers_requires_one_flag(runner):
    result = runner.invoke(commands, ["list"])
    assert result.exit_code != 0
    assert "Must specify either --builtin or --experiment-id" in result.output


def test_list_scorers_env_var_still_works(runner, experiment, monkeypatch):
    monkeypatch.setenv("MLFLOW_EXPERIMENT_ID", experiment)
    result = runner.invoke(commands, ["list"])
    assert result.exit_code == 0


def test_create_judge_with_base_url(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "proxy_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--base-url",
            "http://my-proxy:8080/v1",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # base_url is not persisted, so the registered judge won't have it
    scorers = list_scorers(experiment_id=experiment)
    assert any(s.name == "proxy_judge" for s in scorers)


def test_create_judge_with_extra_headers(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--extra-headers",
            '{"X-Api-Key": "secret", "X-Org": "my-org"}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    scorers = list_scorers(experiment_id=experiment)
    assert any(s.name == "headers_judge" for s in scorers)


def test_create_judge_with_base_url_and_extra_headers(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "full_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--base-url",
            "http://proxy:9090",
            "--extra-headers",
            '{"Authorization": "Bearer token"}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output


def test_create_judge_invalid_extra_headers_json(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "bad_json_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            "not valid json",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "Invalid JSON" in result.output


def test_create_judge_extra_headers_not_dict(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "array_headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            '["not", "a", "dict"]',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "Expected a JSON object" in result.output


def test_create_judge_extra_headers_non_string_values(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "non_string_headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            '{"Authorization": 123}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "must all be strings" in result.output