# test_model_input_examples.py
import json
import math
from io import StringIO
from unittest import mock

import numpy as np
import pandas as pd
import pytest
import sklearn.neighbors as knn
from scipy.sparse import csc_matrix, csr_matrix
from sklearn import datasets
from sklearn.base import BaseEstimator, ClassifierMixin

import mlflow
from mlflow.models import Model
from mlflow.models.signature import ModelSignature, infer_signature
from mlflow.models.utils import (
    _Example,
    _read_sparse_matrix_from_json,
    parse_inputs_data,
)
from mlflow.types import DataType
from mlflow.types.schema import ColSpec, Schema, TensorSpec
from mlflow.types.utils import TensorsNotSupportedException
from mlflow.utils.file_utils import TempDir
from mlflow.utils.proto_json_utils import dataframe_from_raw_json


@pytest.fixture
def pandas_df_with_all_types():
    """DataFrame with one column per supported data type, including the
    pandas extension (nullable) dtypes ``boolean``, ``Int64`` and ``string``.
    """
    df = pd.DataFrame({
        "boolean": [True, False, True],
        "integer": np.array([1, 2, 3], np.int32),
        "long": np.array([1, 2, 3], np.int64),
        "float": np.array([math.pi, 2 * math.pi, 3 * math.pi], np.float32),
        "double": [math.pi, 2 * math.pi, 3 * math.pi],
        "binary": [bytes([1, 2, 3]), bytes([4, 5, 6]), bytes([7, 8, 9])],
        "string": ["a", "b", "c"],
        "boolean_ext": [True, False, True],
        "integer_ext": [1, 2, 3],
        "string_ext": ["a", "b", "c"],
        "array": np.array(["a", "b", "c"]),
    })
    # Convert the *_ext columns to pandas extension dtypes.
    df["boolean_ext"] = df["boolean_ext"].astype("boolean")
    df["integer_ext"] = df["integer_ext"].astype("Int64")
    df["string_ext"] = df["string_ext"].astype("string")
    return df


@pytest.fixture
def df_without_columns():
    """DataFrame whose column labels are plain integer positions (no names)."""
    return pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6], 2: [7, 8, 9]})


@pytest.fixture
def df_with_nan():
    """DataFrame like ``pandas_df_with_all_types`` but with NaN values in the
    ``float`` and ``double`` columns (and without extension dtypes)."""
    return pd.DataFrame({
        "boolean": [True, False, True],
        "integer": np.array([1, 2, 3], np.int32),
        "long": np.array([1, 2, 3], np.int64),
        "float": np.array([np.nan, 2 * math.pi, 3 * math.pi], np.float32),
        "double": [math.pi, np.nan, 3 * math.pi],
        "binary": [bytes([1, 2, 3]), bytes([4, 5, 6]), bytes([7, 8, 9])],
        "string": ["a", "b", "c"],
    })


@pytest.fixture
def dict_of_ndarrays():
    """Dict of float ndarrays of the same 24 values reshaped to 1D through 4D."""
    return {
        "1D": np.arange(0, 12, 0.5),
        "2D": np.arange(0, 12, 0.5).reshape(3, 8),
        "3D": np.arange(0, 12, 0.5).reshape(2, 3, 4),
        "4D": np.arange(0, 12, 0.5).reshape(3, 2, 2, 2),
    }


@pytest.fixture
def dict_of_ndarrays_with_nans():
    """Dict of 1D/2D/3D float ndarrays containing NaN entries."""
    return {
        "1D": np.array([0.5, np.nan, 2.0]),
        "2D": np.array([[0.1, 0.2], [np.nan, 0.5]]),
        "3D": np.array([[[0.1, np.nan], [0.3, 0.4]], [[np.nan, 0.6], [0.7, np.nan]]]),
    }


@pytest.fixture
def dict_of_sparse_matrix():
    """Dict holding the same 3x8 matrix as scipy CSC and CSR sparse matrices."""
    return {
        "sparse_matrix_csc": csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8)),
        "sparse_matrix_csr": csr_matrix(np.arange(0, 12, 0.5).reshape(3, 8)),
    }


def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    """Input examples of various shapes (DataFrame, dict of arrays, ndarray,
    dict of scalars) round-trip through ``_Example.save`` and the parsers."""
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            assert set(data.keys()) == {"columns", "data"}
        parsed_df = dataframe_from_raw_json(tmp.path(filename), schema=sig.inputs)
        pd.testing.assert_frame_equal(pandas_df_with_all_types, parsed_df, check_dtype=False)
        # the frame read without schema should match except for the binary values
        pd.testing.assert_frame_equal(
            parsed_df.drop(columns=["binary"]),
            dataframe_from_raw_json(tmp.path(filename)).drop(columns=["binary"]),
            check_dtype=False,
        )

    # NB: Drop columns that cannot be encoded by proto_json_utils.py's NumpyEncoder
    new_df = pandas_df_with_all_types.drop(columns=["boolean_ext", "integer_ext", "string_ext"])

    # pass the input as dictionary instead
    with TempDir() as tmp:
        d = {name: new_df[name].values for name in new_df.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_dict = parse_inputs_data(tmp.path(filename))
        assert d.keys() == parsed_dict.keys()
        # Asserting binary will fail since it is converted to base64 encoded strings.
        # The check above suffices that the binary input is stored.
        del d["binary"]
        for key in d:
            np.testing.assert_array_equal(d[key], parsed_dict[key])

    # input passed as numpy array
    new_df = pandas_df_with_all_types.drop(columns=["binary"])
    for col in new_df:
        input_example = new_df[col].to_numpy()
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(parsed_ary, input_example)

    # pass multidimensional array
    for col in dict_of_ndarrays:
        input_example = dict_of_ndarrays[col]
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(parsed_ary, input_example)

    # pass multidimensional array as a list
    example = np.array([[1, 2, 3]])
    with pytest.raises(
        TensorsNotSupportedException,
        match=r"Numpy arrays in list are not supported as input examples.",
    ):
        _Example([example, example])

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        with open(tmp.path(filename)) as f:
            parsed_data = json.load(f)
        assert example == parsed_data


def test_pandas_orients_for_input_examples(
    pandas_df_with_all_types, df_without_columns, dict_of_ndarrays
):
    """Saved examples record the pandas orient ("split" vs "values") and the
    example type, and the saved JSON is readable with that orient."""
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "split"
        with open(tmp.path(filename)) as f:
            data = json.load(f)
        dataframe = pd.read_json(
            StringIO(json.dumps(data)), orient=example.info["pandas_orient"], precise_float=True
        )
        pd.testing.assert_frame_equal(
            pandas_df_with_all_types.drop(columns=["binary"]),
            dataframe.drop(columns=["binary"]),
            check_dtype=False,
        )

    with TempDir() as tmp:
        example = _Example(df_without_columns)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "values"
        with open(tmp.path(filename)) as f:
            data = json.load(f)
        assert set(data.keys()) == {"data"}
        # NOTE: when no column names are provided (i.e. values orient),
        # saving an example adds a "data" key rather than directly storing the plain data
        data = data["data"]
        dataframe = pd.read_json(
            StringIO(json.dumps(data)), orient=example.info["pandas_orient"]
        )
        pd.testing.assert_frame_equal(dataframe, df_without_columns, check_dtype=False)

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        assert x.info["type"] == "json_object"
        with open(tmp.path(filename)) as f:
            parsed_json = json.load(f)
        assert parsed_json == example


def test_sparse_matrix_input_examples(dict_of_sparse_matrix):
    """CSC and CSR sparse-matrix examples round-trip through save/read."""
    for example_type, input_example in dict_of_sparse_matrix.items():
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            assert example.info["type"] == example_type
            parsed_matrix = _read_sparse_matrix_from_json(tmp.path(filename), example_type)
            np.testing.assert_array_equal(parsed_matrix.toarray(), input_example.toarray())


def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    """Examples containing NaN values round-trip both with and without a schema."""
    # test setting example with data frame with NaN values in it
    sig = infer_signature(df_with_nan)
    with TempDir() as tmp:
        example = _Example(df_with_nan)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        assert example.info["type"] == "dataframe"
        assert example.info["pandas_orient"] == "split"
        with open(tmp.path(filename)) as f:
            data = json.load(f)
            assert set(data.keys()) == {"columns", "data"}
        pd.read_json(StringIO(json.dumps(data)), orient=example.info["pandas_orient"])

        parsed_df = dataframe_from_raw_json(tmp.path(filename), schema=sig.inputs)

        # by definition of NaN, NaN == NaN is False but NaN != NaN is True
        pd.testing.assert_frame_equal(df_with_nan, parsed_df, check_dtype=False)
        # the frame read without schema should match except for the binary values
        no_schema_df = dataframe_from_raw_json(tmp.path(filename))
        a = parsed_df.drop(columns=["binary"])
        b = no_schema_df.drop(columns=["binary"])
        pd.testing.assert_frame_equal(a, b, check_dtype=False)

    # pass multidimensional array
    for col in dict_of_ndarrays_with_nans:
        input_example = dict_of_ndarrays_with_nans[col]
        sig = infer_signature(input_example)
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            assert example.info["type"] == "ndarray"
            parsed_ary = parse_inputs_data(tmp.path(filename), schema=sig.inputs)
            assert np.array_equal(parsed_ary, input_example, equal_nan=True)

            # without a schema/dtype specified, the resulting tensor will keep the None type
            no_schema_df = parse_inputs_data(tmp.path(filename))
            np.testing.assert_array_equal(
                no_schema_df, np.where(np.isnan(input_example), None, input_example)
            )


class DummySklearnModel(BaseEstimator, ClassifierMixin):
    """Stub sklearn classifier that predicts int64 zeros of a fixed
    per-sample ``output_shape`` (appended to the batch dimension)."""

    def __init__(self, output_shape=(1,)):
        # Per-sample shape of predict()'s output; () means a 1D output.
        self.output_shape = output_shape

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        n_samples = X.shape[0]
        full_output_shape = (n_samples,) + self.output_shape
        return np.zeros(full_output_shape, dtype=np.dtype("int64"))


@pytest.mark.parametrize(
    ("input_is_tabular", "output_shape", "expected_signature"),
    [
        # When the input example is column-based, output 1D numpy arrays are interpreted `ColSpec`s
        (
            True,
            (),
            ModelSignature(
                inputs=Schema([ColSpec(name="feature", type=DataType.string)]),
                outputs=Schema([ColSpec(type=DataType.long)]),
            ),
        ),
        # But if the output numpy array has higher dimensions, fallback to interpreting the model
        # output as `TensorSpec`s.
        (
            True,
            (2,),
            ModelSignature(
                inputs=Schema([ColSpec(name="feature", type=DataType.string)]),
                outputs=Schema([TensorSpec(np.dtype("int64"), (-1, 2))]),
            ),
        ),
        # If the input example is tensor-based, interpret output numpy arrays as `TensorSpec`s
        (
            False,
            (),
            ModelSignature(
                inputs=Schema([TensorSpec(np.dtype("int64"), (-1, 1))]),
                outputs=Schema([TensorSpec(np.dtype("int64"), (-1,))]),
            ),
        ),
    ],
)
def test_infer_signature_with_input_example(input_is_tabular, output_shape, expected_signature):
    """Logging a model with an input example infers the expected signature."""
    model = DummySklearnModel(output_shape=output_shape)
    artifact_path = "model"
    example = pd.DataFrame({"feature": ["value"]}) if input_is_tabular else np.array([[1]])

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(model, name=artifact_path, input_example=example)

    mlflow_model = Model.load(model_info.model_uri)
    assert mlflow_model.signature == expected_signature


def test_infer_signature_from_example_can_be_disabled():
    """Passing ``signature=False`` suppresses signature inference entirely."""
    artifact_path = "model"
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            DummySklearnModel(output_shape=()),
            name=artifact_path,
            input_example=np.array([[1]]),
            signature=False,
        )

    mlflow_model = Model.load(model_info.model_uri)
    assert mlflow_model.signature is None


def test_infer_signature_raises_if_predict_on_input_example_fails(monkeypatch):
    """A model whose predict() raises during example validation produces a
    warning log rather than failing the log_model call."""
    monkeypatch.setenv("MLFLOW_TESTING", "false")

    class ErrorModel(BaseEstimator, ClassifierMixin):
        def fit(self, X, y=None):
            return self

        def predict(self, X):
            raise Exception("oh no!")

    with mock.patch("mlflow.models.model._logger.warning") as mock_warning:
        with mlflow.start_run():
            mlflow.sklearn.log_model(ErrorModel(), name="model", input_example=np.array([[1]]))
        assert any(
            "Failed to validate serving input example" in call[0][0]
            for call in mock_warning.call_args_list
        )


@pytest.fixture(scope="module")
def iris_model():
    """KNN classifier fitted once per module on the iris dataset."""
    X, y = datasets.load_iris(return_X_y=True, as_frame=True)
    return knn.KNeighborsClassifier().fit(X, y)


@pytest.mark.parametrize(
    "input_example",
    [
        {
            "sepal length (cm)": 5.1,
            "sepal width (cm)": 3.5,
            "petal length (cm)": 1.4,
            "petal width (cm)": 0.2,
        },
        pd.DataFrame([[5.1, 3.5, 1.4, 0.2]]),
        pd.DataFrame(
            {
                "sepal length (cm)": 5.1,
                "sepal width (cm)": 3.5,
                "petal length (cm)": 1.4,
                "petal width (cm)": 0.2,
            },
            index=[0],
        ),
    ],
)
def test_infer_signature_on_multi_column_input_examples(input_example, iris_model):
    """Dict and DataFrame examples each infer a 4-column double input schema
    and a long output schema."""
    artifact_path = "model"

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            iris_model, name=artifact_path, input_example=input_example
        )

    mlflow_model = Model.load(model_info.model_uri)
    input_columns = mlflow_model.signature.inputs.inputs
    assert len(input_columns) == 4
    assert all(col.type == DataType.double for col in input_columns)
    assert mlflow_model.signature.outputs == Schema([ColSpec(type=DataType.long)])


@pytest.mark.parametrize(
    "input_example",
    ["some string", bytes([1, 2, 3])],
)
def test_infer_signature_on_scalar_input_examples(input_example):
    """Scalar string/bytes examples infer unnamed string/binary ColSpecs and
    still pass pyfunc schema enforcement."""

    class IdentitySklearnModel(BaseEstimator, ClassifierMixin):
        def fit(self, X, y=None):
            return self

        def predict(self, X):
            # Echo the DataFrame back; anything else is unsupported.
            if isinstance(X, pd.DataFrame):
                return X
            raise Exception("Unsupported input type")

    artifact_path = "model"

    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            IdentitySklearnModel(), name=artifact_path, input_example=input_example
        )

    mlflow_model = Model.load(model_info.model_uri)
    signature = mlflow_model.signature
    assert isinstance(signature, ModelSignature)
    assert signature.inputs.inputs[0].name is None
    t = DataType.string if isinstance(input_example, str) else DataType.binary
    assert signature == ModelSignature(
        inputs=Schema([ColSpec(type=t)]),
        outputs=Schema([ColSpec(name=0, type=t)]),
    )
    # test that a single string still passes pyfunc schema enforcement
    mlflow.pyfunc.load_model(model_info.model_uri).predict(input_example)