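"""Tests for the MLflow scorers CLI commands (mlflow.cli.scorers)."""
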
import json
from typing import Any
from unittest.mock import patch

import pytest
from click.testing import CliRunner

import mlflow
from mlflow.cli.scorers import commands
from mlflow.exceptions import MlflowException
from mlflow.genai.scorers import get_all_scorers, list_scorers, scorer
from mlflow.utils.string_utils import _create_table
from mlflow.utils.time import get_current_time_millis


@pytest.fixture
def mock_databricks_environment():
    """Patch is_databricks_uri so scorer registration treats the test URI as Databricks."""
    with patch("mlflow.genai.scorers.base.is_databricks_uri", return_value=True):
        yield


@pytest.fixture
def runner():
    return CliRunner(catch_exceptions=False)


@pytest.fixture
def experiment():
    """Create a test experiment."""
    experiment_id = mlflow.create_experiment(f"test_scorers_cli_{get_current_time_millis()}")
    yield experiment_id
    mlflow.delete_experiment(experiment_id)


@pytest.fixture
def correctness_scorer():
    """Create a correctness scorer."""

    @scorer
    def _correctness_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _correctness_scorer


@pytest.fixture
def safety_scorer():
    """Create a safety scorer."""

    @scorer
    def _safety_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _safety_scorer


@pytest.fixture
def relevance_scorer():
    """Create a relevance scorer."""

    @scorer
    def _relevance_scorer(outputs) -> bool:
        return len(outputs) > 0

    return _relevance_scorer


@pytest.fixture
def generic_scorer():
    """Create a generic test scorer."""

    @scorer
    def _generic_scorer(outputs) -> bool:
        return True

    return _generic_scorer


def test_commands_group_exists():
    assert commands.name == "scorers"
    assert commands.help is not None


def test_list_command_params():
    list_cmd = next((cmd for cmd in commands.commands.values() if cmd.name == "list"), None)
    assert list_cmd is not None
    param_names = {p.name for p in list_cmd.params}
    assert param_names == {"experiment_id", "builtin", "output"}


def test_list_scorers_table_output(
    runner: CliRunner,
    experiment: str,
    correctness_scorer: Any,
    safety_scorer: Any,
    relevance_scorer: Any,
    mock_databricks_environment: Any,
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")
    safety_scorer.register(experiment_id=experiment, name="Safety")
    relevance_scorer.register(experiment_id=experiment, name="RelevanceToQuery")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment])

    assert result.exit_code == 0

    # Construct expected table output (scorers are returned in alphabetical order)
    # Note: click.echo() adds a trailing newline
    expected_table = (
        _create_table(
            [["Correctness", ""], ["RelevanceToQuery", ""], ["Safety", ""]],
            headers=["Scorer Name", "Description"],
        )
        + "\n"
    )
    assert result.output == expected_table


def test_list_scorers_json_output(
    runner: CliRunner,
    experiment: str,
    correctness_scorer: Any,
    safety_scorer: Any,
    relevance_scorer: Any,
    mock_databricks_environment: Any,
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")
    safety_scorer.register(experiment_id=experiment, name="Safety")
    relevance_scorer.register(experiment_id=experiment, name="RelevanceToQuery")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "json"])

    assert result.exit_code == 0
    output_json = json.loads(result.output)
    expected_scorers = [
        {"name": "Correctness", "description": None},
        {"name": "RelevanceToQuery", "description": None},
        {"name": "Safety", "description": None},
    ]
    assert output_json["scorers"] == expected_scorers


@pytest.mark.parametrize(
    ("output_format", "expected_output"),
    [
        ("table", ""),
        ("json", {"scorers": []}),
    ],
)
def test_list_scorers_empty_experiment(
    runner: CliRunner, experiment: str, output_format: str, expected_output: Any
):
    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == expected_output
    else:
        # Empty table produces minimal output
        assert result.output.strip() == expected_output


def test_list_scorers_with_experiment_id_env_var(
    runner: CliRunner, experiment: str, correctness_scorer: Any, mock_databricks_environment: Any
):
    correctness_scorer.register(experiment_id=experiment, name="Correctness")

    result = runner.invoke(commands, ["list"], env={"MLFLOW_EXPERIMENT_ID": experiment})

    assert result.exit_code == 0
    assert "Correctness" in result.output


def test_list_scorers_missing_experiment_id(runner: CliRunner):
    result = runner.invoke(commands, ["list"])

    assert result.exit_code != 0
    assert "experiment-id" in result.output.lower() or "experiment_id" in result.output.lower()


def test_list_scorers_invalid_output_format(runner: CliRunner, experiment: str):
    result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "invalid"])

    assert result.exit_code != 0
    assert "invalid" in result.output.lower() or "choice" in result.output.lower()


def test_list_scorers_special_characters_in_names(
    runner: CliRunner, experiment: str, generic_scorer: Any, mock_databricks_environment: Any
):
    generic_scorer.register(experiment_id=experiment, name="Scorer With Spaces")
    generic_scorer.register(experiment_id=experiment, name="Scorer.With.Dots")
    generic_scorer.register(experiment_id=experiment, name="Scorer-With-Dashes")
    generic_scorer.register(experiment_id=experiment, name="Scorer_With_Underscores")

    result = runner.invoke(commands, ["list", "--experiment-id", experiment])

    assert result.exit_code == 0
    assert "Scorer With Spaces" in result.output
    assert "Scorer.With.Dots" in result.output
    assert "Scorer-With-Dashes" in result.output
    assert "Scorer_With_Underscores" in result.output


@pytest.mark.parametrize(
    "output_format",
    ["table", "json"],
)
def test_list_scorers_single_scorer(
    runner: CliRunner,
    experiment: str,
    generic_scorer: Any,
    output_format: str,
    mock_databricks_environment: Any,
):
    generic_scorer.register(experiment_id=experiment, name="OnlyScorer")

    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == {"scorers": [{"name": "OnlyScorer", "description": None}]}
    else:
        assert "OnlyScorer" in result.output


@pytest.mark.parametrize(
    "output_format",
    ["table", "json"],
)
def test_list_scorers_long_names(
    runner: CliRunner,
    experiment: str,
    generic_scorer: Any,
    output_format: str,
    mock_databricks_environment: Any,
):
    long_name = "VeryLongScorerNameThatShouldNotBeTruncatedEvenIfItIsReallyReallyLong"
    generic_scorer.register(experiment_id=experiment, name=long_name)

    args = ["list", "--experiment-id", experiment]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        output_json = json.loads(result.output)
        assert output_json == {"scorers": [{"name": long_name, "description": None}]}
    else:
        # Full name should be present
        assert long_name in result.output


def test_list_scorers_with_descriptions(runner: CliRunner, experiment: str):
    from mlflow.genai.judges import make_judge

    judge1 = make_judge(
        name="quality_judge",
        instructions="Evaluate {{ outputs }}",
        description="Evaluates response quality",
        feedback_value_type=str,
    )
    judge1.register(experiment_id=experiment)

    judge2 = make_judge(
        name="safety_judge",
        instructions="Check {{ outputs }}",
        description="Checks for safety issues",
        feedback_value_type=str,
    )
    judge2.register(experiment_id=experiment)

    judge3 = make_judge(
        name="no_desc_judge",
        instructions="Evaluate {{ outputs }}",
        feedback_value_type=str,
    )
    judge3.register(experiment_id=experiment)

    result_json = runner.invoke(
        commands, ["list", "--experiment-id", experiment, "--output", "json"]
    )
    assert result_json.exit_code == 0
    output_json = json.loads(result_json.output)

    assert len(output_json["scorers"]) == 3
    scorers_by_name = {s["name"]: s for s in output_json["scorers"]}

    assert scorers_by_name["no_desc_judge"]["description"] is None
    assert scorers_by_name["quality_judge"]["description"] == "Evaluates response quality"
    assert scorers_by_name["safety_judge"]["description"] == "Checks for safety issues"

    result_table = runner.invoke(commands, ["list", "--experiment-id", experiment])
    assert result_table.exit_code == 0
    assert "Evaluates response quality" in result_table.output
    assert "Checks for safety issues" in result_table.output


def test_create_judge_basic(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "test_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered judge scorer 'test_judge'" in result.output
    assert experiment in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "test_judge" in scorer_names


def test_create_judge_with_model(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "custom_model_judge",
            "--instructions",
            "Check {{ inputs }} and {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered with correct model
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "custom_model_judge" in scorer_names

    # Get the judge and verify it uses the specified model
    judge = next(s for s in scorers if s.name == "custom_model_judge")
    assert judge.model == "openai:/gpt-4"


def test_create_judge_short_options(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "-n",
            "short_options_judge",
            "-i",
            "Evaluate {{ outputs }}",
            "-x",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "short_options_judge" in scorer_names


def test_create_judge_with_env_var(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "env_var_judge",
            "--instructions",
            "Check {{ outputs }}",
        ],
        env={"MLFLOW_EXPERIMENT_ID": experiment},
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify judge was registered
    scorers = list_scorers(experiment_id=experiment)
    scorer_names = [s.name for s in scorers]
    assert "env_var_judge" in scorer_names


@pytest.mark.parametrize(
    ("args", "missing_param"),
    [
        (["--instructions", "test", "--experiment-id", "123"], "name"),
        (["--name", "test", "--experiment-id", "123"], "instructions"),
        (["--name", "test", "--instructions", "test"], "experiment-id"),
    ],
)
def test_create_judge_missing_required_params(
    runner: CliRunner, args: list[str], missing_param: str
):
    result = runner.invoke(commands, ["register-llm-judge"] + args)

    assert result.exit_code != 0
    # Click typically shows "Missing option" for required parameters
    assert "missing" in result.output.lower() or "required" in result.output.lower()


def test_create_judge_invalid_prompt(runner: CliRunner, experiment: str):
    # Should raise MlflowException because make_judge validates that instructions
    # contain at least one variable
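    # (the runner fixture uses catch_exceptions=False, so the exception propagates out of invoke)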
    with pytest.raises(MlflowException, match="[Tt]emplate.*variable"):
        runner.invoke(
            commands,
            [
                "register-llm-judge",
                "--name",
                "invalid_judge",
                "--instructions",
                "This has no template variables",
                "--experiment-id",
                experiment,
            ],
        )


def test_create_judge_special_characters_in_name(runner: CliRunner, experiment: str):
    # Verify experiment has no judges initially
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 0

    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "judge-with_special.chars",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # Verify experiment has exactly one judge
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "judge-with_special.chars"


def test_create_judge_duplicate_registration(runner: CliRunner, experiment: str):
    # Create a judge
    result1 = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "duplicate_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )
    assert result1.exit_code == 0

    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "duplicate_judge"

    # Register the same judge again with the same name - should succeed (replaces the old one)
    result2 = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "duplicate_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--experiment-id",
            experiment,
        ],
    )
    assert result2.exit_code == 0

    # Verify there is still only one judge (the new one replaced the old one)
    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    assert scorers[0].name == "duplicate_judge"


def test_create_judge_with_description(runner: CliRunner, experiment: str):
    description = "Evaluates response quality and relevance"
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "judge_with_desc",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--description",
            description,
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    scorers = list_scorers(experiment_id=experiment)
    assert len(scorers) == 1
    judge = scorers[0]
    assert judge.name == "judge_with_desc"
    assert judge.description == description


def test_create_judge_with_description_short_flag(runner: CliRunner, experiment: str):
    description = "Checks for PII in outputs"
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "-n",
            "pii_judge",
            "-i",
            "Check {{ outputs }}",
            "-d",
            description,
            "-x",
            experiment,
        ],
    )

    assert result.exit_code == 0

    scorers = list_scorers(experiment_id=experiment)
    judge = next(s for s in scorers if s.name == "pii_judge")
    assert judge.description == description


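# Built-in scorer listing (--builtin) does not require an experiment.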
@pytest.mark.parametrize("output_format", ["table", "json"])
def test_list_builtin_scorers_output_formats(runner, output_format):
    args = ["list", "--builtin"]
    if output_format == "json":
        args.extend(["--output", "json"])

    result = runner.invoke(commands, args)
    assert result.exit_code == 0

    if output_format == "json":
        data = json.loads(result.output)
        assert "scorers" in data
        assert isinstance(data["scorers"], list)
        assert len(data["scorers"]) > 0

        # Verify each scorer has required fields
        for scorer_item in data["scorers"]:
            assert "name" in scorer_item
            assert "description" in scorer_item

        # Verify some builtin scorer names appear
        scorer_names = [s["name"] for s in data["scorers"]]
        assert "correctness" in scorer_names
        assert "relevance_to_query" in scorer_names
        assert "completeness" in scorer_names
    else:
        # Verify table headers
        assert "Scorer Name" in result.output
        assert "Description" in result.output

        # Verify some builtin scorer names appear
        assert "correctness" in result.output
        assert "relevance_to_query" in result.output
        assert "completeness" in result.output


def test_list_builtin_scorers_short_flag(runner):
    result = runner.invoke(commands, ["list", "-b"])
    assert result.exit_code == 0
    assert "Scorer Name" in result.output


def test_list_builtin_scorers_shows_all_available_scorers(runner):
    result = runner.invoke(commands, ["list", "--builtin", "--output", "json"])
    assert result.exit_code == 0

    expected_scorers = get_all_scorers()
    expected_names = {scorer.name for scorer in expected_scorers}

    data = json.loads(result.output)
    actual_names = {s["name"] for s in data["scorers"]}

    assert actual_names == expected_names


def test_list_scorers_mutually_exclusive_flags(runner, experiment):
    result = runner.invoke(commands, ["list", "--builtin", "--experiment-id", experiment])
    assert result.exit_code != 0
    assert "Cannot specify both --builtin and --experiment-id" in result.output


def test_list_scorers_requires_one_flag(runner):
    result = runner.invoke(commands, ["list"])
    assert result.exit_code != 0
    assert "Must specify either --builtin or --experiment-id" in result.output


def test_list_scorers_env_var_still_works(runner, experiment, monkeypatch):
    monkeypatch.setenv("MLFLOW_EXPERIMENT_ID", experiment)
    result = runner.invoke(commands, ["list"])
    assert result.exit_code == 0


def test_create_judge_with_base_url(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "proxy_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--base-url",
            "http://my-proxy:8080/v1",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    # base_url is not persisted, so the registered judge won't have it
    scorers = list_scorers(experiment_id=experiment)
    assert any(s.name == "proxy_judge" for s in scorers)


def test_create_judge_with_extra_headers(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--extra-headers",
            '{"X-Api-Key": "secret", "X-Org": "my-org"}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output

    scorers = list_scorers(experiment_id=experiment)
    assert any(s.name == "headers_judge" for s in scorers)


def test_create_judge_with_base_url_and_extra_headers(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "full_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--model",
            "openai:/gpt-4",
            "--base-url",
            "http://proxy:9090",
            "--extra-headers",
            '{"Authorization": "Bearer token"}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code == 0
    assert "Successfully created and registered" in result.output


def test_create_judge_invalid_extra_headers_json(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "bad_json_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            "not valid json",
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "Invalid JSON" in result.output


def test_create_judge_extra_headers_not_dict(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "array_headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            '["not", "a", "dict"]',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "Expected a JSON object" in result.output


def test_create_judge_extra_headers_non_string_values(runner: CliRunner, experiment: str):
    result = runner.invoke(
        commands,
        [
            "register-llm-judge",
            "--name",
            "non_string_headers_judge",
            "--instructions",
            "Evaluate {{ outputs }}",
            "--extra-headers",
            '{"Authorization": 123}',
            "--experiment-id",
            experiment,
        ],
    )

    assert result.exit_code != 0
    assert "must all be strings" in result.output