dataset.py
import json
from abc import abstractmethod
from typing import Any

from mlflow.data.dataset_source import DatasetSource
from mlflow.entities import Dataset as DatasetEntity


class Dataset:
    """
    Base representation of a dataset used with MLflow Tracking.

    A dataset carries a name, a digest (hash), a schema, and a profile, together with
    information about its source (e.g. the S3 bucket or managed Delta table the data was
    derived from). Most datasets also expose features and targets for training and
    evaluation.
    """

    def __init__(self, source: DatasetSource, name: str | None = None, digest: str | None = None):
        """
        Base constructor for a dataset. Every subclass must invoke it.

        Args:
            source: The source the dataset was derived from.
            name: Optional dataset name. When omitted, the ``name`` property falls back to
                ``"dataset"``.
            digest: Optional digest. When missing or empty, it is obtained from
                ``_compute_digest()``.
        """
        self._name = name
        self._source = source
        # Subclasses are expected to call super().__init__() only after setting every
        # attribute that digest computation relies on, because the fallback below may
        # invoke ``_compute_digest()`` immediately.
        if not digest:
            digest = self._compute_digest()
        self._digest = digest

    @abstractmethod
    def _compute_digest(self) -> str:
        """Compute a digest for the dataset.

        Invoked by the constructor when the caller does not supply a digest.

        Returns:
            A string digest for the dataset. A maximum length of 10 characters is
            recommended, with 8 characters being ideal.

        """

    def to_dict(self) -> dict[str, str]:
        """Build the config dictionary describing the dataset.

        Subclasses should override this method to contribute additional fields to the
        config dict, e.g., schema, profile, etc.

        Returns:
            A string dictionary with the fields ``name``, ``digest``, ``source`` (the JSON
            form of the source), and ``source_type``.
        """
        config: dict[str, str] = {"name": self.name, "digest": self.digest}
        config["source"] = self.source.to_json()
        config["source_type"] = self.source._get_source_type()
        return config

    def to_json(self) -> str:
        """
        Serialize the :py:class:`~mlflow.data.dataset.Dataset` to JSON.

        Returns:
            The JSON string encoding of the dictionary produced by ``to_dict()``.
        """
        config = self.to_dict()
        return json.dumps(config)

    def _get_source_type(self) -> str:
        """Return the type of the source underlying this dataset."""
        underlying_source = self.source
        return underlying_source._get_source_type()

    @property
    def name(self) -> str:
        """
        The dataset's name, e.g. ``"iris_data"``, ``"myschema.mycatalog.mytable@v1"``, etc.
        Defaults to ``"dataset"`` when no name was provided.
        """
        return "dataset" if self._name is None else self._name

    @property
    def digest(self) -> str:
        """
        A unique hash or fingerprint identifying the dataset, e.g. ``"498c7496"``.
        """
        return self._digest

    @property
    def source(self) -> DatasetSource:
        """
        The dataset's source information, as an instance of
        :py:class:`~mlflow.data.dataset_source.DatasetSource`. For example, this may be the
        S3 location or the name of the managed Delta Table the dataset was derived from.
        """
        return self._source

    @property
    @abstractmethod
    def profile(self) -> Any | None:
        """
        Optional summary statistics for the dataset, such as the number of rows in a table
        or the mean / median / std of each table column.
        """

    @property
    @abstractmethod
    def schema(self) -> Any | None:
        """
        Optional schema for the dataset, such as an instance of
        :py:class:`mlflow.types.Schema` describing the dataset's features and targets.
        """

    def _to_mlflow_entity(self) -> DatasetEntity:
        """
        Convert the dataset into its tracking entity form.

        Returns:
            A `mlflow.entities.Dataset` instance built from the fields of ``to_dict()``.
        """
        config = self.to_dict()
        # These keys are always expected in the config; a missing one raises KeyError.
        required = {key: config[key] for key in ("name", "digest", "source_type", "source")}
        # Subclasses may or may not add these; absent ones are passed through as None.
        optional = {key: config.get(key) for key in ("schema", "profile")}
        return DatasetEntity(**required, **optional)