# meta_dataset.py
import hashlib
import json
from typing import Any

from mlflow.data.dataset import Dataset
from mlflow.data.dataset_source import DatasetSource
from mlflow.types import Schema


class MetaDataset(Dataset):
    """Dataset that only contains metadata.

    This class is used to represent a dataset that only contains metadata, which is useful when
    users only want to log metadata to MLflow without logging the actual data. For example, users
    build a custom dataset from a text file publicly hosted on the Internet, and they want to log
    the text file's URL to MLflow for future tracking instead of the dataset itself.

    Args:
        source: dataset source of type `DatasetSource`, indicates where the data is from.
        name: name of the dataset. If not specified, a name is automatically generated.
        digest: digest (hash, fingerprint) of the dataset. If not specified, a digest is
            automatically computed.
        schema: schema of the dataset.

    .. code-block:: python
        :caption: Create a MetaDataset

        import mlflow

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)

    .. code-block:: python
        :caption: Create a MetaDataset with schema

        import mlflow
        from mlflow.types import ColSpec, Schema

        mlflow.set_experiment("/test-mlflow-meta-dataset")

        source = mlflow.data.http_dataset_source.HTTPDatasetSource(
            url="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        )
        schema = Schema(
            [
                ColSpec(type=mlflow.types.DataType.string, name="text"),
                ColSpec(type=mlflow.types.DataType.integer, name="label"),
            ]
        )
        ds = mlflow.data.meta_dataset.MetaDataset(source, schema=schema)

        with mlflow.start_run() as run:
            mlflow.log_input(ds)
    """

    def __init__(
        self,
        source: DatasetSource,
        name: str | None = None,
        digest: str | None = None,
        schema: Schema | None = None,
    ):
        # Set `self._schema` before calling the superclass constructor because
        # `self._compute_digest` (invoked when `digest` is not supplied) reads
        # `self._schema` via the `schema` property.
        self._schema = schema
        super().__init__(source=source, name=name, digest=digest)

    def _compute_digest(self) -> str:
        """Compute a digest (hash, fingerprint) for the dataset.

        Since a `MetaDataset` carries no actual data, the digest is derived from the dataset's
        name, serialized source, source type, and schema: we take the SHA-256 hash of the
        JSON-encoded config dict, truncated to the first 8 hex characters. An absent schema
        contributes an empty string so the digest stays deterministic.
        """
        config = {
            "name": self.name,
            "source": self.source.to_json(),
            "source_type": self.source._get_source_type(),
            "schema": self.schema.to_dict() if self.schema else "",
        }
        return hashlib.sha256(json.dumps(config).encode("utf-8")).hexdigest()[:8]

    @property
    def schema(self) -> Schema | None:
        """Returns the schema of the dataset, or ``None`` if no schema was provided."""
        return self._schema

    def to_dict(self) -> dict[str, str]:
        """Create a config dictionary for the MetaDataset.

        Returns a string dictionary containing the fields produced by ``Dataset.to_dict``
        (name, digest, source, and source type), plus a ``"schema"`` entry when a schema was
        provided. The schema is JSON-serialized under the ``"mlflow_colspec"`` key.
        """
        config = super().to_dict()
        if self.schema:
            # The enclosing `if` already guarantees `self.schema` is truthy, so the
            # original redundant inner `... if self.schema else None` check is dropped.
            config["schema"] = json.dumps({"mlflow_colspec": self.schema.to_dict()})
        return config