/ mlflow / cli / datasets.py
datasets.py
  1  import json
  2  from typing import Any, Literal
  3  
  4  import click
  5  
  6  from mlflow import MlflowClient
  7  from mlflow.environment_variables import MLFLOW_EXPERIMENT_ID
  8  from mlflow.utils.string_utils import _create_table
  9  from mlflow.utils.time import conv_longdate_to_str
 10  
 11  EXPERIMENT_ID = click.option(
 12      "--experiment-id",
 13      "-x",
 14      envvar=MLFLOW_EXPERIMENT_ID.name,
 15      type=click.STRING,
 16      required=True,
 17      help="Experiment ID to list datasets for. Can be set via MLFLOW_EXPERIMENT_ID env var.",
 18  )
 19  
 20  
 21  def _format_datasets_as_json(datasets) -> dict[str, Any]:
 22      """Format datasets as a JSON-serializable dictionary."""
 23      return {
 24          "datasets": [
 25              {
 26                  "dataset_id": ds.dataset_id,
 27                  "name": ds.name,
 28                  "digest": ds.digest,
 29                  "created_time": ds.created_time,
 30                  "last_update_time": ds.last_update_time,
 31                  "created_by": ds.created_by,
 32                  "last_updated_by": ds.last_updated_by,
 33                  "tags": ds.tags,
 34              }
 35              for ds in datasets
 36          ],
 37          "next_page_token": datasets.token,
 38      }
 39  
 40  
 41  def _format_datasets_as_table(datasets) -> tuple[list[list[str]], list[str]]:
 42      """Format datasets as table rows with headers."""
 43      headers = ["Dataset ID", "Name", "Created", "Last Updated", "Created By"]
 44      rows = []
 45      for ds in datasets:
 46          created = conv_longdate_to_str(ds.created_time) if ds.created_time else ""
 47          updated = conv_longdate_to_str(ds.last_update_time) if ds.last_update_time else ""
 48          rows.append([ds.dataset_id, ds.name, created, updated, ds.created_by or ""])
 49      return rows, headers
 50  
 51  
 52  @click.group("datasets")
 53  def commands():
 54      """Manage GenAI evaluation datasets."""
 55  
 56  
 57  @commands.command("list")
 58  @EXPERIMENT_ID
 59  @click.option(
 60      "--filter-string",
 61      type=click.STRING,
 62      help="Filter string (e.g., \"name LIKE 'qa_%'\").",
 63  )
 64  @click.option(
 65      "--max-results",
 66      type=click.INT,
 67      default=50,
 68      help="Maximum results (default: 50).",
 69  )
 70  @click.option(
 71      "--order-by",
 72      type=click.STRING,
 73      help="Columns to order by (e.g., 'last_update_time DESC').",
 74  )
 75  @click.option(
 76      "--page-token",
 77      type=click.STRING,
 78      help="Pagination token.",
 79  )
 80  @click.option(
 81      "--output",
 82      type=click.Choice(["table", "json"]),
 83      default="table",
 84      help="Output format.",
 85  )
 86  def list_datasets(
 87      experiment_id: str,
 88      filter_string: str | None = None,
 89      max_results: int = 50,
 90      order_by: str | None = None,
 91      page_token: str | None = None,
 92      output: Literal["table", "json"] = "table",
 93  ) -> None:
 94      """
 95      List GenAI evaluation datasets associated with an experiment.
 96  
 97      \b
 98      Examples:
 99      # List datasets in experiment 1
100      mlflow datasets list --experiment-id 1
101  
102      \b
103      # Using environment variable
104      export MLFLOW_EXPERIMENT_ID=1
105      mlflow datasets list --max-results 10
106  
107      \b
108      # Filter datasets by name pattern
109      mlflow datasets list --experiment-id 1 --filter-string "name LIKE 'qa_%'"
110  
111      \b
112      # Order results by last update time
113      mlflow datasets list --experiment-id 1 --order-by "last_update_time DESC"
114  
115      \b
116      # Output as JSON
117      mlflow datasets list --experiment-id 1 --output json
118      """
119      client = MlflowClient()
120      order_by_list = [o.strip() for o in order_by.split(",")] if order_by else None
121  
122      datasets = client.search_datasets(
123          experiment_ids=[experiment_id],
124          filter_string=filter_string,
125          max_results=max_results,
126          order_by=order_by_list,
127          page_token=page_token,
128      )
129  
130      if output == "json":
131          result = _format_datasets_as_json(datasets)
132          click.echo(json.dumps(result, indent=2))
133      else:
134          rows, headers = _format_datasets_as_table(datasets)
135          click.echo(_create_table(rows, headers=headers))
136  
137          if datasets.token:
138              click.echo(f"\nNext page token: {datasets.token}")