/ tests / cli / test_datasets.py
test_datasets.py
  1  import json
  2  
  3  import pytest
  4  from click.testing import CliRunner
  5  
  6  import mlflow
  7  from mlflow.cli.datasets import commands
  8  from mlflow.genai.datasets import create_dataset
  9  
 10  
 11  @pytest.fixture
 12  def runner():
 13      return CliRunner(catch_exceptions=False)
 14  
 15  
 16  @pytest.fixture
 17  def experiment():
 18      exp_id = mlflow.create_experiment("test_datasets_cli")
 19      yield exp_id
 20      mlflow.delete_experiment(exp_id)
 21  
 22  
 23  @pytest.fixture
 24  def dataset_a(experiment):
 25      return create_dataset(
 26          name="dataset_a",
 27          experiment_id=experiment,
 28          tags={"env": "production"},
 29      )
 30  
 31  
 32  @pytest.fixture
 33  def dataset_b(experiment):
 34      return create_dataset(
 35          name="dataset_b",
 36          experiment_id=experiment,
 37          tags={"env": "staging"},
 38      )
 39  
 40  
 41  def test_commands_group_exists():
 42      assert commands.name == "datasets"
 43      assert commands.help is not None
 44  
 45  
 46  def test_list_command_params():
 47      list_cmd = next((cmd for cmd in commands.commands.values() if cmd.name == "list"), None)
 48      assert list_cmd is not None
 49      param_names = {p.name for p in list_cmd.params}
 50      expected_params = {
 51          "experiment_id",
 52          "filter_string",
 53          "max_results",
 54          "order_by",
 55          "page_token",
 56          "output",
 57      }
 58      assert param_names == expected_params
 59  
 60  
 61  def test_list_datasets_table_output(runner: CliRunner, experiment: str, dataset_a):
 62      result = runner.invoke(commands, ["list", "--experiment-id", experiment])
 63  
 64      assert result.exit_code == 0
 65      assert dataset_a.dataset_id in result.output
 66      assert "dataset_a" in result.output
 67  
 68  
 69  def test_list_datasets_json_output(runner: CliRunner, experiment: str, dataset_a):
 70      result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "json"])
 71  
 72      assert result.exit_code == 0
 73  
 74      expected = {
 75          "datasets": [
 76              {
 77                  "dataset_id": dataset_a.dataset_id,
 78                  "name": "dataset_a",
 79                  "digest": dataset_a.digest,
 80                  "created_time": dataset_a.created_time,
 81                  "last_update_time": dataset_a.last_update_time,
 82                  "created_by": dataset_a.created_by,
 83                  "last_updated_by": dataset_a.last_updated_by,
 84                  "tags": dataset_a.tags,
 85              }
 86          ],
 87          "next_page_token": None,
 88      }
 89      assert json.loads(result.output) == expected
 90  
 91  
 92  def test_list_datasets_empty_results(runner: CliRunner, experiment: str):
 93      result = runner.invoke(commands, ["list", "--experiment-id", experiment])
 94  
 95      assert result.exit_code == 0
 96  
 97  
 98  def test_list_datasets_json_empty_results(runner: CliRunner, experiment: str):
 99      result = runner.invoke(commands, ["list", "--experiment-id", experiment, "--output", "json"])
100  
101      assert result.exit_code == 0
102      output_json = json.loads(result.output)
103      assert output_json == {"datasets": [], "next_page_token": None}
104  
105  
def test_list_datasets_with_experiment_id_env_var(runner: CliRunner, experiment: str, dataset_a):
    """``list`` accepts the experiment ID from ``MLFLOW_EXPERIMENT_ID`` instead of a flag."""
    env_overrides = {"MLFLOW_EXPERIMENT_ID": experiment}
    outcome = runner.invoke(commands, ["list"], env=env_overrides)

    assert outcome.exit_code == 0
    assert dataset_a.dataset_id in outcome.output
111  
112  
def test_list_datasets_missing_experiment_id(runner: CliRunner):
    """``list`` fails with a "Missing option" error when no experiment ID is given.

    The command also reads the experiment ID from ``MLFLOW_EXPERIMENT_ID``, so
    the variable is removed for this invocation. Otherwise a value inherited
    from the developer's or CI environment would satisfy the option and make
    the test fail for reasons unrelated to the code under test.
    """
    # A ``None`` value tells CliRunner to unset the variable while the command runs.
    result = runner.invoke(commands, ["list"], env={"MLFLOW_EXPERIMENT_ID": None})

    assert result.exit_code != 0
    assert "Missing option '--experiment-id' / '-x'" in result.output
118  
119  
def test_list_datasets_invalid_output_format(runner: CliRunner, experiment: str):
    """``list`` rejects an ``--output`` value other than ``table`` or ``json``."""
    cli_args = ["list", "--experiment-id", experiment, "--output", "invalid"]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code != 0
    expected_message = "'invalid' is not one of 'table', 'json'"
    assert expected_message in outcome.output
125  
126  
def test_list_datasets_with_filter_string(runner: CliRunner, experiment: str, dataset_a, dataset_b):
    """``--filter-string`` on the name shows ``dataset_a`` and hides ``dataset_b``."""
    name_filter = "name = 'dataset_a'"
    cli_args = ["list", "--experiment-id", experiment, "--filter-string", name_filter]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code == 0
    printed = outcome.output
    assert "dataset_a" in printed
    assert "dataset_b" not in printed
136  
137  
def test_list_datasets_with_max_results(runner: CliRunner, experiment: str, dataset_a, dataset_b):
    """``--max-results 1`` prints a single dataset line although two datasets exist."""
    cli_args = ["list", "--experiment-id", experiment, "--max-results", "1"]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code == 0
    # Count the output lines that mention a dataset name created by the fixtures.
    dataset_line_count = sum(1 for row in outcome.output.split("\n") if "dataset_" in row)
    assert dataset_line_count == 1
144  
145  
def test_list_datasets_with_order_by(runner: CliRunner, experiment: str, dataset_a, dataset_b):
    """``--order-by "name ASC"`` prints ``dataset_a`` before ``dataset_b``."""
    result = runner.invoke(
        commands, ["list", "--experiment-id", experiment, "--order-by", "name ASC"]
    )

    assert result.exit_code == 0
    a_pos = result.output.find("dataset_a")
    b_pos = result.output.find("dataset_b")
    # ``str.find`` returns -1 when the text is absent. Without these checks a
    # missing ``dataset_a`` (-1) would compare as "before" a present
    # ``dataset_b`` and the ordering assertion would pass vacuously.
    assert a_pos != -1, "dataset_a not found in output"
    assert b_pos != -1, "dataset_b not found in output"
    assert a_pos < b_pos
155  
156  
def test_list_datasets_short_option_x(runner: CliRunner, experiment: str, dataset_a):
    """The short ``-x`` flag is accepted in place of ``--experiment-id``."""
    cli_args = ["list", "-x", experiment]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code == 0
    assert dataset_a.dataset_id in outcome.output
162  
163  
def test_list_datasets_multiple_datasets(runner: CliRunner, experiment: str, dataset_a, dataset_b):
    """``list`` prints the ID and name of every dataset in the experiment."""
    cli_args = ["list", "--experiment-id", experiment]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code == 0
    expected_fragments = (
        dataset_a.dataset_id,
        "dataset_a",
        dataset_b.dataset_id,
        "dataset_b",
    )
    for fragment in expected_fragments:
        assert fragment in outcome.output
172  
173  
def test_list_datasets_json_multiple_datasets(
    runner: CliRunner, experiment: str, dataset_a, dataset_b
):
    """``list --output json`` returns both datasets, identified by name."""
    cli_args = ["list", "--experiment-id", experiment, "--output", "json"]
    outcome = runner.invoke(commands, cli_args)

    assert outcome.exit_code == 0
    payload = json.loads(outcome.output)
    listed = payload["datasets"]
    assert len(listed) == 2

    listed_names = {entry["name"] for entry in listed}
    assert listed_names == {"dataset_a", "dataset_b"}