datasets.py
import json
from typing import Any, Literal

import click

from mlflow import MlflowClient
from mlflow.environment_variables import MLFLOW_EXPERIMENT_ID
from mlflow.utils.string_utils import _create_table
from mlflow.utils.time import conv_longdate_to_str

# Reusable ``--experiment-id`` / ``-x`` option. It is required, but click will
# also accept the value from the environment variable named by
# ``MLFLOW_EXPERIMENT_ID.name``.
EXPERIMENT_ID = click.option(
    "--experiment-id",
    "-x",
    envvar=MLFLOW_EXPERIMENT_ID.name,
    type=click.STRING,
    required=True,
    help="Experiment ID to list datasets for. Can be set via MLFLOW_EXPERIMENT_ID env var.",
)


def _format_datasets_as_json(datasets) -> dict[str, Any]:
    """Format datasets as a JSON-serializable dictionary.

    Args:
        datasets: Iterable of dataset objects that also exposes a ``token``
            attribute (the value returned by ``MlflowClient.search_datasets``
            in this module).

    Returns:
        A dict with a ``"datasets"`` list (one dict of selected attributes per
        dataset) and a ``"next_page_token"`` entry taken from ``datasets.token``.
    """
    return {
        "datasets": [
            {
                "dataset_id": ds.dataset_id,
                "name": ds.name,
                "digest": ds.digest,
                "created_time": ds.created_time,
                "last_update_time": ds.last_update_time,
                "created_by": ds.created_by,
                "last_updated_by": ds.last_updated_by,
                "tags": ds.tags,
            }
            for ds in datasets
        ],
        "next_page_token": datasets.token,
    }


def _format_datasets_as_table(datasets) -> tuple[list[list[str]], list[str]]:
    """Format datasets as table rows with headers.

    Args:
        datasets: Iterable of dataset objects.

    Returns:
        A ``(rows, headers)`` tuple. Each row holds the dataset ID, name,
        creation time, last update time and creator; falsy timestamps and a
        falsy creator are rendered as empty strings.
    """
    headers = ["Dataset ID", "Name", "Created", "Last Updated", "Created By"]
    rows = []
    for ds in datasets:
        created = conv_longdate_to_str(ds.created_time) if ds.created_time else ""
        updated = conv_longdate_to_str(ds.last_update_time) if ds.last_update_time else ""
        rows.append([ds.dataset_id, ds.name, created, updated, ds.created_by or ""])
    return rows, headers


def _parse_order_by(order_by: str | None) -> list[str] | None:
    """Split a comma-separated ``--order-by`` value into individual clauses.

    Blank clauses (from a trailing comma, doubled commas or a whitespace-only
    value) are dropped so that empty strings are never forwarded to the search
    call.

    Args:
        order_by: Raw option value, or ``None`` when the option was not given.

    Returns:
        The stripped, non-empty clauses in their original order, or ``None``
        when no usable clause remains.
    """
    if not order_by:
        return None
    clauses = [clause.strip() for clause in order_by.split(",")]
    return [clause for clause in clauses if clause] or None


@click.group("datasets")
def commands():
    """Manage GenAI evaluation datasets."""


# NOTE: the docstring of ``list_datasets`` is rendered by click as the command
# help text; the ``\b`` markers stop click from re-wrapping the example blocks.
@commands.command("list")
@EXPERIMENT_ID
@click.option(
    "--filter-string",
    type=click.STRING,
    help="Filter string (e.g., \"name LIKE 'qa_%'\").",
)
@click.option(
    "--max-results",
    type=click.INT,
    default=50,
    help="Maximum results (default: 50).",
)
@click.option(
    "--order-by",
    type=click.STRING,
    help="Columns to order by (e.g., 'last_update_time DESC').",
)
@click.option(
    "--page-token",
    type=click.STRING,
    help="Pagination token.",
)
@click.option(
    "--output",
    type=click.Choice(["table", "json"]),
    default="table",
    help="Output format.",
)
def list_datasets(
    experiment_id: str,
    filter_string: str | None = None,
    max_results: int = 50,
    order_by: str | None = None,
    page_token: str | None = None,
    output: Literal["table", "json"] = "table",
) -> None:
    """
    List GenAI evaluation datasets associated with an experiment.

    \b
    Examples:
    # List datasets in experiment 1
    mlflow datasets list --experiment-id 1

    \b
    # Using environment variable
    export MLFLOW_EXPERIMENT_ID=1
    mlflow datasets list --max-results 10

    \b
    # Filter datasets by name pattern
    mlflow datasets list --experiment-id 1 --filter-string "name LIKE 'qa_%'"

    \b
    # Order results by last update time
    mlflow datasets list --experiment-id 1 --order-by "last_update_time DESC"

    \b
    # Output as JSON
    mlflow datasets list --experiment-id 1 --output json
    """
    client = MlflowClient()

    datasets = client.search_datasets(
        experiment_ids=[experiment_id],
        filter_string=filter_string,
        max_results=max_results,
        # --order-by is one comma-separated string on the CLI, while the
        # client call is given a list of clauses (or None when unset).
        order_by=_parse_order_by(order_by),
        page_token=page_token,
    )

    if output == "json":
        # The JSON payload already carries "next_page_token"; nothing else is
        # written so that stdout stays a single parseable JSON document.
        click.echo(json.dumps(_format_datasets_as_json(datasets), indent=2))
        return

    rows, headers = _format_datasets_as_table(datasets)
    click.echo(_create_table(rows, headers=headers))

    # Table output has no slot for the token, so surface it as a trailing hint.
    if datasets.token:
        click.echo(f"\nNext page token: {datasets.token}")