/ mlflow / data / meta_dataset.py
meta_dataset.py
  1  import hashlib
  2  import json
  3  from typing import Any
  4  
  5  from mlflow.data.dataset import Dataset
  6  from mlflow.data.dataset_source import DatasetSource
  7  from mlflow.types import Schema
  8  
  9  
 10  class MetaDataset(Dataset):
 11      """Dataset that only contains metadata.
 12  
 13      This class is used to represent a dataset that only contains metadata, which is useful when
 14      users only want to log metadata to MLflow without logging the actual data. For example, users
 15      build a custom dataset from a text file publicly hosted in the Internet, and they want to log
 16      the text file's URL to MLflow for future tracking instead of the dataset itself.
 17  
 18      Args:
 19          source: dataset source of type `DatasetSource`, indicates where the data is from.
 20          name: name of the dataset. If not specified, a name is automatically generated.
 21          digest: digest (hash, fingerprint) of the dataset. If not specified, a digest is
 22              automatically computed.
 23          schame: schema of the dataset.
 24  
 25      .. code-block:: python
 26          :caption: Create a MetaDataset
 27  
 28          import mlflow
 29  
 30          mlflow.set_experiment("/test-mlflow-meta-dataset")
 31  
 32          source = mlflow.data.http_dataset_source.HTTPDatasetSource(
 33              url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 34          )
 35          ds = mlflow.data.meta_dataset.MetaDataset(source)
 36  
 37          with mlflow.start_run() as run:
 38              mlflow.log_input(ds)
 39  
 40      .. code-block:: python
 41          :caption: Create a MetaDataset with schema
 42  
 43          import mlflow
 44  
 45          mlflow.set_experiment("/test-mlflow-meta-dataset")
 46  
 47          source = mlflow.data.http_dataset_source.HTTPDatasetSource(
 48              url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 49          )
 50          schema = Schema([
 51              ColSpec(type=mlflow.types.DataType.string, name="text"),
 52              ColSpec(type=mlflow.types.DataType.integer, name="label"),
 53          ])
 54          ds = mlflow.data.meta_dataset.MetaDataset(source, schema=schema)
 55  
 56          with mlflow.start_run() as run:
 57              mlflow.log_input(ds)
 58      """
 59  
 60      def __init__(
 61          self,
 62          source: DatasetSource,
 63          name: str | None = None,
 64          digest: str | None = None,
 65          schema: Schema | None = None,
 66      ):
 67          # Set `self._schema` before calling the superclass constructor because
 68          # `self._compute_digest` depends on `self._schema`.
 69          self._schema = schema
 70          super().__init__(source=source, name=name, digest=digest)
 71  
 72      def _compute_digest(self) -> str:
 73          """Computes a digest for the dataset.
 74  
 75          The digest computation of `MetaDataset` is based on the dataset's name, source, source type,
 76          and schema instead of the actual data. Basically we compute the sha256 hash of the config
 77          dict.
 78          """
 79          config = {
 80              "name": self.name,
 81              "source": self.source.to_json(),
 82              "source_type": self.source._get_source_type(),
 83              "schema": self.schema.to_dict() if self.schema else "",
 84          }
 85          return hashlib.sha256(json.dumps(config).encode("utf-8")).hexdigest()[:8]
 86  
 87      @property
 88      def schema(self) -> Any | None:
 89          """Returns the schema of the dataset."""
 90          return self._schema
 91  
 92      def to_dict(self) -> dict[str, str]:
 93          """Create config dictionary for the MetaDataset.
 94  
 95          Returns a string dictionary containing the following fields: name, digest, source, source
 96          type, schema, and profile.
 97          """
 98          config = super().to_dict()
 99          if self.schema:
100              schema = json.dumps({"mlflow_colspec": self.schema.to_dict()}) if self.schema else None
101              config["schema"] = schema
102          return config