/ mlflow / data / dataset.py
dataset.py
  1  import json
  2  from abc import abstractmethod
  3  from typing import Any
  4  
  5  from mlflow.data.dataset_source import DatasetSource
  6  from mlflow.entities import Dataset as DatasetEntity
  7  
  8  
  9  class Dataset:
 10      """
 11      Represents a dataset for use with MLflow Tracking, including the name, digest (hash),
 12      schema, and profile of the dataset as well as source information (e.g. the S3 bucket or
 13      managed Delta table from which the dataset was derived). Most datasets expose features
 14      and targets for training and evaluation as well.
 15      """
 16  
 17      def __init__(self, source: DatasetSource, name: str | None = None, digest: str | None = None):
 18          """
 19          Base constructor for a dataset. All subclasses must call this constructor.
 20          """
 21          self._name = name
 22          self._source = source
 23          # Note: Subclasses should call super() once they've initialized all of
 24          # the class attributes necessary for digest computation
 25          self._digest = digest or self._compute_digest()
 26  
 27      @abstractmethod
 28      def _compute_digest(self) -> str:
 29          """Computes a digest for the dataset. Called if the user doesn't supply
 30          a digest when constructing the dataset.
 31  
 32          Returns:
 33              A string digest for the dataset. We recommend a maximum digest length
 34              of 10 characters with an ideal length of 8 characters.
 35  
 36          """
 37  
 38      def to_dict(self) -> dict[str, str]:
 39          """Create config dictionary for the dataset.
 40  
 41          Subclasses should override this method to provide additional fields in the config dict,
 42          e.g., schema, profile, etc.
 43  
 44          Returns a string dictionary containing the following fields: name, digest, source, source
 45          type.
 46          """
 47          return {
 48              "name": self.name,
 49              "digest": self.digest,
 50              "source": self.source.to_json(),
 51              "source_type": self.source._get_source_type(),
 52          }
 53  
 54      def to_json(self) -> str:
 55          """
 56          Obtains a JSON string representation of the :py:class:`Dataset
 57          <mlflow.data.dataset.Dataset>`.
 58  
 59          Returns:
 60              A JSON string representation of the :py:class:`Dataset <mlflow.data.dataset.Dataset>`.
 61          """
 62  
 63          return json.dumps(self.to_dict())
 64  
 65      def _get_source_type(self) -> str:
 66          """Returns the type of the dataset's underlying source."""
 67  
 68          return self.source._get_source_type()
 69  
 70      @property
 71      def name(self) -> str:
 72          """
 73          The name of the dataset, e.g. ``"iris_data"``, ``"myschema.mycatalog.mytable@v1"``, etc.
 74          """
 75          if self._name is not None:
 76              return self._name
 77          else:
 78              return "dataset"
 79  
 80      @property
 81      def digest(self) -> str:
 82          """
 83          A unique hash or fingerprint of the dataset, e.g. ``"498c7496"``.
 84          """
 85          return self._digest
 86  
 87      @property
 88      def source(self) -> DatasetSource:
 89          """
 90          Information about the dataset's source, represented as an instance of
 91          :py:class:`DatasetSource <mlflow.data.dataset_source.DatasetSource>`. For example, this
 92          may be the S3 location or the name of the managed Delta Table from which the dataset
 93          was derived.
 94          """
 95          return self._source
 96  
 97      @property
 98      @abstractmethod
 99      def profile(self) -> Any | None:
100          """
101          Optional summary statistics for the dataset, such as the number of rows in a table, the
102          mean / median / std of each table column, etc.
103          """
104  
105      @property
106      @abstractmethod
107      def schema(self) -> Any | None:
108          """
109          Optional dataset schema, such as an instance of :py:class:`mlflow.types.Schema` representing
110          the features and targets of the dataset.
111          """
112  
113      def _to_mlflow_entity(self) -> DatasetEntity:
114          """
115          Returns:
116              A `mlflow.entities.Dataset` instance representing the dataset.
117          """
118          dataset_dict = self.to_dict()
119          return DatasetEntity(
120              name=dataset_dict["name"],
121              digest=dataset_dict["digest"],
122              source_type=dataset_dict["source_type"],
123              source=dataset_dict["source"],
124              schema=dataset_dict.get("schema"),
125              profile=dataset_dict.get("profile"),
126          )