# tests/models/test_model_input_examples.py
  1  import json
  2  import math
  3  from io import StringIO
  4  from unittest import mock
  5  
  6  import numpy as np
  7  import pandas as pd
  8  import pytest
  9  import sklearn.neighbors as knn
 10  from scipy.sparse import csc_matrix, csr_matrix
 11  from sklearn import datasets
 12  from sklearn.base import BaseEstimator, ClassifierMixin
 13  
 14  import mlflow
 15  from mlflow.models import Model
 16  from mlflow.models.signature import ModelSignature, infer_signature
 17  from mlflow.models.utils import (
 18      _Example,
 19      _read_sparse_matrix_from_json,
 20      parse_inputs_data,
 21  )
 22  from mlflow.types import DataType
 23  from mlflow.types.schema import ColSpec, Schema, TensorSpec
 24  from mlflow.types.utils import TensorsNotSupportedException
 25  from mlflow.utils.file_utils import TempDir
 26  from mlflow.utils.proto_json_utils import dataframe_from_raw_json
 27  
 28  
 29  @pytest.fixture
 30  def pandas_df_with_all_types():
 31      df = pd.DataFrame({
 32          "boolean": [True, False, True],
 33          "integer": np.array([1, 2, 3], np.int32),
 34          "long": np.array([1, 2, 3], np.int64),
 35          "float": np.array([math.pi, 2 * math.pi, 3 * math.pi], np.float32),
 36          "double": [math.pi, 2 * math.pi, 3 * math.pi],
 37          "binary": [bytes([1, 2, 3]), bytes([4, 5, 6]), bytes([7, 8, 9])],
 38          "string": ["a", "b", "c"],
 39          "boolean_ext": [True, False, True],
 40          "integer_ext": [1, 2, 3],
 41          "string_ext": ["a", "b", "c"],
 42          "array": np.array(["a", "b", "c"]),
 43      })
 44      df["boolean_ext"] = df["boolean_ext"].astype("boolean")
 45      df["integer_ext"] = df["integer_ext"].astype("Int64")
 46      df["string_ext"] = df["string_ext"].astype("string")
 47      return df
 48  
 49  
 50  @pytest.fixture
 51  def df_without_columns():
 52      return pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6], 2: [7, 8, 9]})
 53  
 54  
 55  @pytest.fixture
 56  def df_with_nan():
 57      return pd.DataFrame({
 58          "boolean": [True, False, True],
 59          "integer": np.array([1, 2, 3], np.int32),
 60          "long": np.array([1, 2, 3], np.int64),
 61          "float": np.array([np.nan, 2 * math.pi, 3 * math.pi], np.float32),
 62          "double": [math.pi, np.nan, 3 * math.pi],
 63          "binary": [bytes([1, 2, 3]), bytes([4, 5, 6]), bytes([7, 8, 9])],
 64          "string": ["a", "b", "c"],
 65      })
 66  
 67  
 68  @pytest.fixture
 69  def dict_of_ndarrays():
 70      return {
 71          "1D": np.arange(0, 12, 0.5),
 72          "2D": np.arange(0, 12, 0.5).reshape(3, 8),
 73          "3D": np.arange(0, 12, 0.5).reshape(2, 3, 4),
 74          "4D": np.arange(0, 12, 0.5).reshape(3, 2, 2, 2),
 75      }
 76  
 77  
 78  @pytest.fixture
 79  def dict_of_ndarrays_with_nans():
 80      return {
 81          "1D": np.array([0.5, np.nan, 2.0]),
 82          "2D": np.array([[0.1, 0.2], [np.nan, 0.5]]),
 83          "3D": np.array([[[0.1, np.nan], [0.3, 0.4]], [[np.nan, 0.6], [0.7, np.nan]]]),
 84      }
 85  
 86  
 87  @pytest.fixture
 88  def dict_of_sparse_matrix():
 89      return {
 90          "sparse_matrix_csc": csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8)),
 91          "sparse_matrix_csr": csr_matrix(np.arange(0, 12, 0.5).reshape(3, 8)),
 92      }
 93  
 94  
def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    """Round-trip input examples of several container types through save/load.

    Covers a DataFrame with all supported dtypes, a dict of 1-D arrays,
    single numpy arrays, multidimensional arrays, a rejected list of
    arrays, and a dict of scalars.
    """
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        # Saved payload is pandas "split" style JSON: columns + data keys.
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            assert set(data.keys()) == {"columns", "data"}
        parsed_df = dataframe_from_raw_json(tmp.path(filename), schema=sig.inputs)
        pd.testing.assert_frame_equal(pandas_df_with_all_types, parsed_df, check_dtype=False)
        # the frame read without schema should match except for the binary values
        pd.testing.assert_frame_equal(
            parsed_df.drop(columns=["binary"]),
            dataframe_from_raw_json(tmp.path(filename)).drop(columns=["binary"]),
            check_dtype=False,
        )

    # NB: Drop columns that cannot be encoded by NumpyEncoder in proto_json_utils.py
    new_df = pandas_df_with_all_types.drop(columns=["boolean_ext", "integer_ext", "string_ext"])

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: new_df[name].values for name in new_df.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_dict = parse_inputs_data(tmp.path(filename))
        assert d.keys() == parsed_dict.keys()
        # Asserting binary will fail since it is converted to base64 encoded strings.
        # The check above suffices that the binary input is stored.
        del d["binary"]
        for key in d:
            np.testing.assert_array_equal(d[key], parsed_dict[key])

    # input passed as numpy array, one column at a time
    new_df = pandas_df_with_all_types.drop(columns=["binary"])
    for col in new_df:
        input_example = new_df[col].to_numpy()
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(parsed_ary, input_example)

    # pass multidimensional array (ranks 1 through 4)
    for col in dict_of_ndarrays:
        input_example = dict_of_ndarrays[col]
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(parsed_ary, input_example)

    # pass multidimensional array as a list: rejected with a dedicated exception
    example = np.array([[1, 2, 3]])
    with pytest.raises(
        TensorsNotSupportedException,
        match=r"Numpy arrays in list are not supported as input examples.",
    ):
        _Example([example, example])

    # pass dict with scalars: stored verbatim as a JSON object
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        with open(tmp.path(filename)) as f:
            parsed_data = json.load(f)
        assert example == parsed_data
169  
170  
def test_pandas_orients_for_input_examples(
    pandas_df_with_all_types, df_without_columns, dict_of_ndarrays
):
    """Check the pandas orient recorded in example metadata matches the payload.

    Named-column frames are stored with the "split" orient, column-less frames
    with "values", and plain dicts of scalars as a raw JSON object.
    """
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "split"
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            # Re-serialize and let pandas parse it with the recorded orient.
            dataframe = pd.read_json(
                StringIO(json.dumps(data)), orient=example.info["pandas_orient"], precise_float=True
            )
            # binary is excluded: it is converted to base64 encoded strings on save.
            pd.testing.assert_frame_equal(
                pandas_df_with_all_types.drop(columns=["binary"]),
                dataframe.drop(columns=["binary"]),
                check_dtype=False,
            )

    with TempDir() as tmp:
        example = _Example(df_without_columns)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "values"
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            assert set(data.keys()) == {"data"}
            # NOTE: when no column names are provided (i.e. values orient),
            # saving an example adds a "data" key rather than directly storing the plain data
            data = data["data"]
            dataframe = pd.read_json(
                StringIO(json.dumps(data)), orient=example.info["pandas_orient"]
            )
            pd.testing.assert_frame_equal(dataframe, df_without_columns, check_dtype=False)

    # pass dict with scalars: stored verbatim as a JSON object
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        assert x.info["type"] == "json_object"
        with open(tmp.path(filename)) as f:
            parsed_json = json.load(f)
            assert parsed_json == example
219  
220  
def test_sparse_matrix_input_examples(dict_of_sparse_matrix):
    """Saved sparse-matrix examples round-trip through JSON for CSC and CSR."""
    for matrix_type, matrix in dict_of_sparse_matrix.items():
        with TempDir() as tmp:
            saved = _Example(matrix)
            saved.save(tmp.path())
            # The recorded example type names the sparse format.
            assert saved.info["type"] == matrix_type
            loaded = _read_sparse_matrix_from_json(
                tmp.path(saved.info["artifact_path"]), matrix_type
            )
            np.testing.assert_array_equal(loaded.toarray(), matrix.toarray())
230  
231  
def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    """NaN values survive the save/load round trip for frames and tensors."""
    # test setting example with data frame with NaN values in it
    sig = infer_signature(df_with_nan)
    with TempDir() as tmp:
        example = _Example(df_with_nan)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "split"
        # Smoke-check that the stored payload is readable with the recorded orient.
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            assert set(data.keys()) == {"columns", "data"}
            pd.read_json(StringIO(json.dumps(data)), orient=example.info["pandas_orient"])

        parsed_df = dataframe_from_raw_json(tmp.path(filename), schema=sig.inputs)

        # by definition of NaN, NaN == NaN is False but NaN != NaN is True
        pd.testing.assert_frame_equal(df_with_nan, parsed_df, check_dtype=False)
        # the frame read without schema should match except for the binary values
        no_schema_df = dataframe_from_raw_json(tmp.path(filename))
        a = parsed_df.drop(columns=["binary"])
        b = no_schema_df.drop(columns=["binary"])
        pd.testing.assert_frame_equal(a, b, check_dtype=False)

    # pass multidimensional array (ranks 1 through 3, each with NaNs)
    for col in dict_of_ndarrays_with_nans:
        input_example = dict_of_ndarrays_with_nans[col]
        sig = infer_signature(input_example)
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            assert example.info["type"] == "ndarray"
            # With the inferred schema, NaNs come back as NaNs.
            parsed_ary = parse_inputs_data(tmp.path(filename), schema=sig.inputs)
            assert np.array_equal(parsed_ary, input_example, equal_nan=True)

            # without a schema/dtype specified, the resulting tensor will keep the None type
            no_schema_df = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(
                no_schema_df, np.where(np.isnan(input_example), None, input_example)
            )
273  
274  
class DummySklearnModel(BaseEstimator, ClassifierMixin):
    """No-op classifier whose predictions are int64 zeros of a configurable shape."""

    def __init__(self, output_shape=(1,)):
        # Per-sample shape appended to the batch dimension in `predict`.
        self.output_shape = output_shape

    def fit(self, X, y=None):
        # Nothing to learn; return self to satisfy the sklearn estimator API.
        return self

    def predict(self, X):
        # One all-zero prediction of `output_shape` per input row.
        batch = X.shape[0]
        return np.zeros((batch, *self.output_shape), dtype=np.dtype("int64"))
286  
287  
@pytest.mark.parametrize(
    ("input_is_tabular", "output_shape", "expected_signature"),
    [
        # When the input example is column-based, output 1D numpy arrays are
        # interpreted as `ColSpec`s
        (
            True,
            (),
            ModelSignature(
                inputs=Schema([ColSpec(name="feature", type=DataType.string)]),
                outputs=Schema([ColSpec(type=DataType.long)]),
            ),
        ),
        # But if the output numpy array has higher dimensions, fallback to interpreting the model
        # output as `TensorSpec`s.
        (
            True,
            (2,),
            ModelSignature(
                inputs=Schema([ColSpec(name="feature", type=DataType.string)]),
                outputs=Schema([TensorSpec(np.dtype("int64"), (-1, 2))]),
            ),
        ),
        # If the input example is tensor-based, interpret output numpy arrays as `TensorSpec`s
        (
            False,
            (),
            ModelSignature(
                inputs=Schema([TensorSpec(np.dtype("int64"), (-1, 1))]),
                outputs=Schema([TensorSpec(np.dtype("int64"), (-1,))]),
            ),
        ),
    ],
)
def test_infer_signature_with_input_example(input_is_tabular, output_shape, expected_signature):
    """Signatures inferred from an input example match the expected specs.

    Logs a dummy model with either a tabular (DataFrame) or tensor (ndarray)
    example and compares the signature stored on the logged model.
    """
    model = DummySklearnModel(output_shape=output_shape)
    artifact_path = "model"
    example = pd.DataFrame({"feature": ["value"]}) if input_is_tabular else np.array([[1]])

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(model, name=artifact_path, input_example=example)

    mlflow_model = Model.load(model_info.model_uri)
    assert mlflow_model.signature == expected_signature
331  
332  
def test_infer_signature_from_example_can_be_disabled():
    """Passing signature=False suppresses signature inference from the example."""
    with mlflow.start_run():
        info = mlflow.sklearn.log_model(
            DummySklearnModel(output_shape=()),
            name="model",
            input_example=np.array([[1]]),
            signature=False,
        )

    assert Model.load(info.model_uri).signature is None
345  
346  
def test_infer_signature_raises_if_predict_on_input_example_fails(monkeypatch):
    """A failing predict() during example validation is surfaced as a warning."""
    # Outside of MLflow's own test mode, validation failures warn instead of raising.
    monkeypatch.setenv("MLFLOW_TESTING", "false")

    class ErrorModel(BaseEstimator, ClassifierMixin):
        def fit(self, X, y=None):
            return self

        def predict(self, X):
            raise Exception("oh no!")

    with mock.patch("mlflow.models.model._logger.warning") as mock_warning:
        with mlflow.start_run():
            mlflow.sklearn.log_model(ErrorModel(), name="model", input_example=np.array([[1]]))
        warned_messages = [call[0][0] for call in mock_warning.call_args_list]
        assert any("Failed to validate serving input example" in msg for msg in warned_messages)
364  
365  
@pytest.fixture(scope="module")
def iris_model():
    """KNN classifier fitted once per module on the iris dataset."""
    features, target = datasets.load_iris(return_X_y=True, as_frame=True)
    return knn.KNeighborsClassifier().fit(features, target)
370  
371  
@pytest.mark.parametrize(
    "input_example",
    [
        # Plain dict of feature-name -> scalar value.
        {
            "sepal length (cm)": 5.1,
            "sepal width (cm)": 3.5,
            "petal length (cm)": 1.4,
            "petal width (cm)": 0.2,
        },
        # Single-row frame with positional (unnamed) columns.
        pd.DataFrame([[5.1, 3.5, 1.4, 0.2]]),
        # Single-row frame with named columns.
        pd.DataFrame(
            {
                "sepal length (cm)": 5.1,
                "sepal width (cm)": 3.5,
                "petal length (cm)": 1.4,
                "petal width (cm)": 0.2,
            },
            index=[0],
        ),
    ],
)
def test_infer_signature_on_multi_column_input_examples(input_example, iris_model):
    """Multi-column examples infer four double input columns and a long output."""
    artifact_path = "model"

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            iris_model, name=artifact_path, input_example=input_example
        )

    mlflow_model = Model.load(model_info.model_uri)
    input_columns = mlflow_model.signature.inputs.inputs
    assert len(input_columns) == 4
    assert all(col.type == DataType.double for col in input_columns)
    assert mlflow_model.signature.outputs == Schema([ColSpec(type=DataType.long)])
406  
407  
@pytest.mark.parametrize(
    "input_example",
    ["some string", bytes([1, 2, 3])],
)
def test_infer_signature_on_scalar_input_examples(input_example):
    """Scalar string/bytes examples infer an unnamed single-column signature."""

    class IdentitySklearnModel(BaseEstimator, ClassifierMixin):
        """Echoes DataFrame inputs back unchanged; rejects everything else."""

        def fit(self, X, y=None):
            return self

        def predict(self, X):
            if isinstance(X, pd.DataFrame):
                return X
            raise Exception("Unsupported input type")

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            IdentitySklearnModel(), name="model", input_example=input_example
        )

    signature = Model.load(model_info.model_uri).signature
    assert isinstance(signature, ModelSignature)
    # Scalar examples produce a single, unnamed input column.
    assert signature.inputs.inputs[0].name is None
    expected_type = DataType.string if isinstance(input_example, str) else DataType.binary
    assert signature == ModelSignature(
        inputs=Schema([ColSpec(type=expected_type)]),
        outputs=Schema([ColSpec(name=0, type=expected_type)]),
    )
    # test that a single string still passes pyfunc schema enforcement
    mlflow.pyfunc.load_model(model_info.model_uri).predict(input_example)