test_pyfunc_schema_enforcement.py
1 import base64 2 import datetime 3 import decimal 4 import json 5 import os 6 import re 7 from unittest import mock 8 9 import cloudpickle 10 import numpy as np 11 import pandas as pd 12 import pytest 13 import sklearn.linear_model 14 from packaging.version import Version 15 16 import mlflow 17 import mlflow.pyfunc.scoring_server as pyfunc_scoring_server 18 from mlflow.exceptions import MlflowException 19 from mlflow.models import ( 20 Model, 21 ModelSignature, 22 convert_input_example_to_serving_input, 23 infer_signature, 24 ) 25 from mlflow.models.utils import ( 26 _enforce_params_schema, 27 _enforce_schema, 28 ) 29 from mlflow.pyfunc import PyFuncModel 30 from mlflow.pyfunc.scoring_server import is_unified_llm_input 31 from mlflow.tracking.artifact_utils import _download_artifact_from_uri 32 from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema, TensorSpec 33 from mlflow.types.schema import AnyType, Array, Map, Object, Property 34 from mlflow.utils.proto_json_utils import dump_input_data 35 36 from tests.helper_functions import pyfunc_scoring_endpoint 37 from tests.pyfunc.utils import score_model_in_process 38 from tests.tracing.helper import get_traces 39 40 41 class TestModel: 42 @staticmethod 43 def predict(pdf, params=None): 44 return pdf 45 46 47 @pytest.fixture(scope="module") 48 def sample_params_basic(): 49 return { 50 "str_param": "str_a", 51 "int_param": np.int32(1), 52 "bool_param": True, 53 "double_param": 1.0, 54 "float_param": np.float32(0.1), 55 "long_param": 100, 56 "datetime_param": np.datetime64("2023-06-26 00:00:00"), 57 "str_list": ["a", "b", "c"], 58 "bool_list": [True, False], 59 "double_array": np.array([1.0, 2.0]), 60 } 61 62 63 @pytest.fixture(scope="module") 64 def param_schema_basic(): 65 return ParamSchema([ 66 ParamSpec("str_param", DataType.string, "str_a", None), 67 ParamSpec("int_param", DataType.integer, np.int32(1), None), 68 ParamSpec("bool_param", DataType.boolean, True, None), 69 ParamSpec("double_param", DataType.double, 1.0, None), 70 ParamSpec("float_param", DataType.float, np.float32(0.1), None), 71 ParamSpec("long_param", DataType.long, 100, None), 72 ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-26 00:00:00"), None), 73 ParamSpec("str_list", DataType.string, ["a", "b", "c"], (-1,)), 74 ParamSpec("bool_list", DataType.boolean, [True, False], (-1,)), 75 ParamSpec("double_array", DataType.double, [1.0, 2.0], (-1,)), 76 ]) 77 78 79 class PythonModelWithBasicParams(mlflow.pyfunc.PythonModel): 80 def predict(self, context, model_input, params=None): 81 assert isinstance(params, dict) 82 assert isinstance(params["str_param"], str) 83 assert isinstance(params["int_param"], int) 84 assert isinstance(params["bool_param"], bool) 85 assert isinstance(params["double_param"], float) 86 assert isinstance(params["float_param"], float) 87 assert isinstance(params["long_param"], int) 88 assert isinstance(params["datetime_param"], datetime.datetime) 89 assert isinstance(params["str_list"], list) 90 assert all(isinstance(x, str) for x in params["str_list"]) 91 assert isinstance(params["bool_list"], list) 92 assert all(isinstance(x, bool) for x in params["bool_list"]) 93 assert isinstance(params["double_array"], list) 94 assert all(isinstance(x, float) for x in params["double_array"]) 95 return params 96 97 98 @pytest.fixture(scope="module") 99 def sample_params_with_arrays(): 100 return { 101 "int_array": np.array([np.int32(1), np.int32(2)]), 102 "double_array": np.array([1.0, 2.0]), 103 "float_array": np.array([np.float32(1.0), np.float32(2.0)]), 104 "long_array": np.array([1, 2]), 105 "datetime_array": np.array([ 106 np.datetime64("2023-06-26 00:00:00"), 107 np.datetime64("2023-06-26 00:00:00"), 108 ]), 109 } 110 111 112 class PythonModelWithArrayParams(mlflow.pyfunc.PythonModel): 113 def predict(self, context, model_input, params=None): 114 assert isinstance(params, dict) 115 assert all(isinstance(x, int) for x in params["int_array"]) 116 assert all(isinstance(x, float) for x in params["double_array"]) 117 assert all(isinstance(x, float) for x in params["float_array"]) 118 assert all(isinstance(x, int) for x in params["long_array"]) 119 assert all(isinstance(x, datetime.datetime) for x in params["datetime_array"]) 120 return params 121 122 123 def test_schema_enforcement_single_column_2d_array(): 124 X = np.array([[1], [2], [3]]) 125 y = np.array([1, 2, 3]) 126 model = sklearn.linear_model.LinearRegression() 127 model.fit(X, y) 128 signature = infer_signature(X, y) 129 assert signature.inputs.inputs[0].shape == (-1, 1) 130 assert signature.outputs.inputs[0].shape == (-1,) 131 132 with mlflow.start_run(): 133 model_info = mlflow.sklearn.log_model(model, name="model", signature=signature) 134 135 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 136 pdf = pd.DataFrame(X) 137 np.testing.assert_almost_equal(loaded_model.predict(pdf), model.predict(pdf)) 138 139 140 def test_column_schema_enforcement(): 141 m = Model() 142 input_schema = Schema([ 143 ColSpec("integer", "a"), 144 ColSpec("long", "b"), 145 ColSpec("float", "c"), 146 ColSpec("double", "d"), 147 ColSpec("boolean", "e"), 148 ColSpec("string", "g"), 149 ColSpec("binary", "f"), 150 ColSpec("datetime", "h"), 151 ]) 152 m.signature = ModelSignature(inputs=input_schema) 153 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 154 pdf = pd.DataFrame( 155 data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]], 156 columns=["b", "d", "a", "c", "e", "g", "f", "h"], 157 dtype=object, 158 ) 159 pdf["a"] = pdf["a"].astype(np.int32) 160 pdf["b"] = pdf["b"].astype(np.int64) 161 pdf["c"] = pdf["c"].astype(np.float32) 162 pdf["d"] = pdf["d"].astype(np.float64) 163 pdf["h"] = pdf["h"].astype(np.dtype("datetime64[ns]")) 164 # test that missing column raises 165 match_missing_inputs = "Model is missing inputs" 166 with pytest.raises(MlflowException, match=match_missing_inputs): 167 res = pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]]) 168 169 # test that extra column is ignored 170 pdf["x"] = 1 171 172 # test that columns are reordered, extra column is ignored 173 res = pyfunc_model.predict(pdf) 174 assert all((res == pdf[input_schema.input_names()]).all()) 175 176 expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types())) 177 # MLflow datetime type in input_schema does not encode precision, so add it for assertions 178 expected_types["h"] = np.dtype("datetime64[ns]") 179 # object cannot be converted to pandas Strings at the moment 180 expected_types["f"] = object 181 expected_types["g"] = object 182 actual_types = res.dtypes.to_dict() 183 assert expected_types == actual_types 184 185 # Test conversions 186 # 1. long -> integer raises 187 pdf["a"] = pdf["a"].astype(np.int64) 188 match_incompatible_inputs = "Incompatible input types" 189 with pytest.raises(MlflowException, match=match_incompatible_inputs): 190 pyfunc_model.predict(pdf) 191 pdf["a"] = pdf["a"].astype(np.int32) 192 # 2. integer -> long works 193 pdf["b"] = pdf["b"].astype(np.int32) 194 res = pyfunc_model.predict(pdf) 195 assert all((res == pdf[input_schema.input_names()]).all()) 196 assert res.dtypes.to_dict() == expected_types 197 pdf["b"] = pdf["b"].astype(np.int64) 198 199 # 3. unsigned int -> long works 200 pdf["b"] = pdf["b"].astype(np.uint32) 201 res = pyfunc_model.predict(pdf) 202 assert all((res == pdf[input_schema.input_names()]).all()) 203 assert res.dtypes.to_dict() == expected_types 204 pdf["b"] = pdf["b"].astype(np.int64) 205 206 # 4. unsigned int -> int raises 207 pdf["a"] = pdf["a"].astype(np.uint32) 208 with pytest.raises(MlflowException, match=match_incompatible_inputs): 209 pyfunc_model.predict(pdf) 210 pdf["a"] = pdf["a"].astype(np.int32) 211 212 # 5. double -> float raises 213 pdf["c"] = pdf["c"].astype(np.float64) 214 with pytest.raises(MlflowException, match=match_incompatible_inputs): 215 pyfunc_model.predict(pdf) 216 pdf["c"] = pdf["c"].astype(np.float32) 217 218 # 6. float -> double works, double -> float does not 219 pdf["d"] = pdf["d"].astype(np.float32) 220 res = pyfunc_model.predict(pdf) 221 assert res.dtypes.to_dict() == expected_types 222 pdf["d"] = pdf["d"].astype(np.float64) 223 pdf["c"] = pdf["c"].astype(np.float64) 224 with pytest.raises(MlflowException, match=match_incompatible_inputs): 225 pyfunc_model.predict(pdf) 226 pdf["c"] = pdf["c"].astype(np.float32) 227 228 # 7. int -> float raises 229 pdf["c"] = pdf["c"].astype(np.int32) 230 with pytest.raises(MlflowException, match=match_incompatible_inputs): 231 pyfunc_model.predict(pdf) 232 pdf["c"] = pdf["c"].astype(np.float32) 233 234 # 8. int -> double works 235 pdf["d"] = pdf["d"].astype(np.int32) 236 pyfunc_model.predict(pdf) 237 assert all((res == pdf[input_schema.input_names()]).all()) 238 assert res.dtypes.to_dict() == expected_types 239 240 # 9. long -> double raises 241 pdf["d"] = pdf["d"].astype(np.int64) 242 with pytest.raises(MlflowException, match=match_incompatible_inputs): 243 pyfunc_model.predict(pdf) 244 pdf["d"] = pdf["d"].astype(np.float64) 245 246 # 10. any float -> any int raises 247 pdf["a"] = pdf["a"].astype(np.float32) 248 with pytest.raises(MlflowException, match=match_incompatible_inputs): 249 pyfunc_model.predict(pdf) 250 # 10. any float -> any int raises 251 pdf["a"] = pdf["a"].astype(np.float64) 252 with pytest.raises(MlflowException, match=match_incompatible_inputs): 253 pyfunc_model.predict(pdf) 254 pdf["a"] = pdf["a"].astype(np.int32) 255 pdf["b"] = pdf["b"].astype(np.float64) 256 with pytest.raises(MlflowException, match=match_incompatible_inputs): 257 pyfunc_model.predict(pdf) 258 pdf["b"] = pdf["b"].astype(np.int64) 259 260 pdf["b"] = pdf["b"].astype(np.float64) 261 with pytest.raises(MlflowException, match=match_incompatible_inputs): 262 pyfunc_model.predict(pdf) 263 pdf["b"] = pdf["b"].astype(np.int64) 264 265 # 11. objects work 266 pdf["b"] = pdf["b"].astype(object) 267 pdf["d"] = pdf["d"].astype(object) 268 pdf["e"] = pdf["e"].astype(object) 269 pdf["f"] = pdf["f"].astype(object) 270 pdf["g"] = pdf["g"].astype(object) 271 res = pyfunc_model.predict(pdf) 272 assert res.dtypes.to_dict() == expected_types 273 274 # 12. datetime64[D] (date only) -> datetime64[x] works 275 pdf["h"] = pdf["h"].values.astype("datetime64[D]") 276 res = pyfunc_model.predict(pdf) 277 assert res.dtypes.to_dict() == expected_types 278 pdf["h"] = pdf["h"].astype("datetime64[s]") 279 280 # 13. np.ndarrays can be converted to dataframe but have no columns 281 with pytest.raises(MlflowException, match=match_missing_inputs): 282 pyfunc_model.predict(pdf.values) 283 284 # 14. dictionaries of str -> list/nparray work, 285 # including extraneous multi-dimensional arrays and lists 286 arr = np.array([1, 2, 3]) 287 d = { 288 "a": arr.astype("int32"), 289 "b": arr.astype("int64"), 290 "c": arr.astype("float32"), 291 "d": arr.astype("float64"), 292 "e": [True, False, True], 293 "g": ["a", "b", "c"], 294 "f": [bytes(0), bytes(1), bytes(1)], 295 "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64), 296 # Extraneous multi-dimensional numpy array should be silently dropped 297 "i": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 298 # Extraneous multi-dimensional list should be silently dropped 299 "j": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], 300 } 301 res = pyfunc_model.predict(d) 302 assert res.dtypes.to_dict() == expected_types 303 304 # 15. dictionaries of str -> list[list] fail 305 d = { 306 "a": [arr.astype("int32")], 307 "b": [arr.astype("int64")], 308 "c": [arr.astype("float32")], 309 "d": [arr.astype("float64")], 310 "e": [[True, False, True]], 311 "g": np.array([["a", "b", "c"]]), 312 "f": [[bytes(0), bytes(1), bytes(1)]], 313 "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)], 314 } 315 with pytest.raises(MlflowException, match=match_incompatible_inputs): 316 pyfunc_model.predict(d) 317 318 # 16. conversion to dataframe fails 319 d = { 320 "a": [1], 321 "b": [1, 2], 322 "c": [1, 2, 3], 323 } 324 with pytest.raises( 325 MlflowException, 326 match="This model contains a column-based signature, which suggests a DataFrame input.", 327 ): 328 pyfunc_model.predict(d) 329 330 # 17. conversion from Decimal to float is allowed since numpy currently has no support for the 331 # data type. 332 pdf["d"] = [decimal.Decimal(1.0)] 333 res = pyfunc_model.predict(pdf) 334 assert res.dtypes.to_dict() == expected_types 335 336 337 def _compare_exact_tensor_dict_input(d1, d2): 338 """Return whether two dicts of np arrays are exactly equal""" 339 if d1.keys() != d2.keys(): 340 return False 341 return all(np.array_equal(d1[key], d2[key]) for key in d1) 342 343 344 def test_tensor_multi_named_schema_enforcement(): 345 m = Model() 346 input_schema = Schema([ 347 TensorSpec(np.dtype(np.uint64), (-1, 5), "a"), 348 TensorSpec(np.dtype(np.short), (-1, 2), "b"), 349 TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"), 350 ]) 351 m.signature = ModelSignature(inputs=input_schema) 352 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 353 inp = { 354 "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64), 355 "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short), 356 "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32), 357 } 358 359 # test that missing column raises 360 inp1 = inp.copy() 361 with pytest.raises(MlflowException, match="Model is missing inputs"): 362 pyfunc_model.predict(inp1.pop("b")) 363 364 # test that extra column is ignored 365 inp2 = inp.copy() 366 inp2["x"] = 1 367 368 # test that extra column is removed 369 res = pyfunc_model.predict(inp2) 370 assert res == {k: v for k, v in inp.items() if k in {"a", "b", "c"}} 371 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 372 actual_types = {k: v.dtype for k, v in res.items()} 373 assert expected_types == actual_types 374 375 # test that variable axes are supported 376 inp3 = { 377 "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64), 378 "b": np.array([[0, 0], [1, 1]], dtype=np.short), 379 "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32), 380 } 381 res = pyfunc_model.predict(inp3) 382 assert _compare_exact_tensor_dict_input(res, inp3) 383 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 384 actual_types = {k: v.dtype for k, v in res.items()} 385 assert expected_types == actual_types 386 387 # test that type casting is not supported 388 inp4 = inp.copy() 389 inp4["a"] = inp4["a"].astype(np.int32) 390 with pytest.raises( 391 MlflowException, match="dtype of input int32 does not match expected dtype uint64" 392 ): 393 pyfunc_model.predict(inp4) 394 395 # test wrong shape 396 inp5 = { 397 "a": np.array([[0, 0, 0, 0]], dtype=np.uint), 398 "b": np.array([[0, 0], [1, 1]], dtype=np.short), 399 "c": np.array([[[0, 0]]], dtype=np.float32), 400 } 401 with pytest.raises( 402 MlflowException, 403 match=re.escape("Shape of input (1, 4) does not match expected shape (-1, 5)"), 404 ): 405 pyfunc_model.predict(inp5) 406 407 # test non-dictionary input 408 inp6 = [ 409 np.array([[0, 0, 0, 0, 0]], dtype=np.uint64), 410 np.array([[0, 0], [1, 1]], dtype=np.short), 411 np.array([[[0, 0]]], dtype=np.float32), 412 ] 413 with pytest.raises( 414 MlflowException, match=re.escape("Model is missing inputs ['a', 'b', 'c'].") 415 ): 416 pyfunc_model.predict(inp6) 417 418 # test empty ndarray does not work 419 inp7 = inp.copy() 420 inp7["a"] = np.array([]) 421 with pytest.raises( 422 MlflowException, match=re.escape("Shape of input (0,) does not match expected shape") 423 ): 424 pyfunc_model.predict(inp7) 425 426 # test dictionary of str -> list does not work 427 inp8 = {k: list(v) for k, v in inp.items()} 428 match = ( 429 r"This model contains a tensor-based model signature with input names.+" 430 r"suggests a dictionary input mapping input name to a numpy array, but a dict" 431 r" with value type <class 'list'> was found" 432 ) 433 with pytest.raises(MlflowException, match=match): 434 pyfunc_model.predict(inp8) 435 436 # test dataframe input fails at shape enforcement 437 pdf = pd.DataFrame(data=[[1, 2, 3]], columns=["a", "b", "c"]) 438 pdf["a"] = pdf["a"].astype(np.uint64) 439 pdf["b"] = pdf["b"].astype(np.short) 440 pdf["c"] = pdf["c"].astype(np.float32) 441 with pytest.raises( 442 MlflowException, 443 match=re.escape( 444 "The input pandas dataframe column 'a' contains scalar values, which requires the " 445 "shape to be (-1,) or (-1, 1), but got tensor spec shape of (-1, 5)" 446 ), 447 ): 448 pyfunc_model.predict(pdf) 449 450 451 def test_schema_enforcement_single_named_tensor_schema(): 452 m = Model() 453 input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2, 3), "a")]) 454 m.signature = ModelSignature(inputs=input_schema) 455 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 456 input_array = np.array(range(12), dtype=np.uint64).reshape((2, 2, 3)) 457 inp = { 458 "a": input_array, 459 } 460 461 # sanity test that dictionary with correct input works 462 res = pyfunc_model.predict(inp) 463 assert res == inp 464 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 465 actual_types = {k: v.dtype for k, v in res.items()} 466 assert expected_types == actual_types 467 468 # test single np.ndarray input works and is converted to dictionary 469 res = pyfunc_model.predict(inp["a"]) 470 assert res == inp 471 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 472 actual_types = {k: v.dtype for k, v in res.items()} 473 assert expected_types == actual_types 474 475 # test list does not work 476 with pytest.raises(MlflowException, match="Model is missing inputs"): 477 pyfunc_model.predict(input_array.tolist()) 478 479 480 def test_schema_enforcement_single_unnamed_tensor_schema(): 481 m = Model() 482 input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 3))]) 483 m.signature = ModelSignature(inputs=input_schema) 484 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 485 486 input_array = np.array(range(6), dtype=np.uint64).reshape((2, 3)) 487 488 # test single np.ndarray input works and is converted to dictionary 489 res = pyfunc_model.predict(input_array) 490 np.testing.assert_array_equal(res, input_array) 491 expected_types = input_schema.input_types()[0] 492 assert expected_types == res.dtype 493 494 input_df = pd.DataFrame(input_array, columns=["c1", "c2", "c3"]) 495 res = pyfunc_model.predict(input_df) 496 np.testing.assert_array_equal(res, input_array) 497 assert expected_types == res.dtype 498 499 input_df = input_df.drop("c3", axis=1) 500 with pytest.raises( 501 expected_exception=MlflowException, 502 match=re.escape( 503 "This model contains a model signature with an unnamed input. Since the " 504 "input data is a pandas DataFrame containing multiple columns, " 505 "the input shape must be of the structure " 506 "(-1, number_of_dataframe_columns). " 507 "Instead, the input DataFrame passed had 2 columns and " 508 "an input shape of (-1, 3) with all values within the " 509 "DataFrame of scalar type. Please adjust the passed in DataFrame to " 510 "match the expected structure", 511 ), 512 ): 513 pyfunc_model.predict(input_df) 514 515 516 def test_schema_enforcement_named_tensor_schema_1d(): 517 m = Model() 518 input_schema = Schema([ 519 TensorSpec(np.dtype(np.uint64), (-1,), "a"), 520 TensorSpec(np.dtype(np.float32), (-1,), "b"), 521 ]) 522 m.signature = ModelSignature(inputs=input_schema) 523 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 524 pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) 525 pdf["a"] = pdf["a"].astype(np.uint64) 526 pdf["b"] = pdf["a"].astype(np.float32) 527 d_inp = { 528 "a": np.array(pdf["a"], dtype=np.uint64), 529 "b": np.array(pdf["b"], dtype=np.float32), 530 } 531 532 # test dataframe input works for 1d tensor specs and input is converted to dict 533 res = pyfunc_model.predict(pdf) 534 assert _compare_exact_tensor_dict_input(res, d_inp) 535 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 536 actual_types = {k: v.dtype for k, v in res.items()} 537 assert expected_types == actual_types 538 539 wrong_m = Model() 540 wrong_m.signature = ModelSignature( 541 inputs=Schema([ 542 TensorSpec(np.dtype(np.uint64), (-1, 2), "a"), 543 TensorSpec(np.dtype(np.float32), (-1,), "b"), 544 ]) 545 ) 546 wrong_pyfunc_model = PyFuncModel(model_meta=wrong_m, model_impl=TestModel()) 547 with pytest.raises( 548 expected_exception=MlflowException, 549 match=re.escape( 550 "The input pandas dataframe column 'a' contains scalar " 551 "values, which requires the shape to be (-1,) or (-1, 1), but got tensor spec " 552 "shape of (-1, 2)." 553 ), 554 ): 555 wrong_pyfunc_model.predict(pdf) 556 557 wrong_m.signature.inputs = Schema([ 558 TensorSpec(np.dtype(np.uint64), (2, -1), "a"), 559 TensorSpec(np.dtype(np.float32), (-1,), "b"), 560 ]) 561 with pytest.raises( 562 expected_exception=MlflowException, 563 match=re.escape( 564 "For pandas dataframe input, the first dimension of shape must be a variable " 565 "dimension and other dimensions must be fixed, but in model signature the shape " 566 "of input a is (2, -1)." 567 ), 568 ): 569 wrong_pyfunc_model.predict(pdf) 570 571 # test that dictionary works too 572 res = pyfunc_model.predict(d_inp) 573 assert res == d_inp 574 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 575 actual_types = {k: v.dtype for k, v in res.items()} 576 assert expected_types == actual_types 577 578 579 def test_schema_enforcement_named_tensor_schema_multidimensional(): 580 m = Model() 581 input_schema = Schema([ 582 TensorSpec(np.dtype(np.uint64), (-1, 2, 3), "a"), 583 TensorSpec(np.dtype(np.float32), (-1, 3, 4), "b"), 584 ]) 585 m.signature = ModelSignature(inputs=input_schema) 586 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 587 data_a = np.array(range(12), dtype=np.uint64) 588 data_b = np.array(range(24), dtype=np.float32) + 10.0 589 pdf = pd.DataFrame({ 590 "a": data_a.reshape(-1, 2 * 3).tolist(), 591 "b": data_b.reshape(-1, 3 * 4).tolist(), 592 }) 593 d_inp = { 594 "a": data_a.reshape((-1, 2, 3)), 595 "b": data_b.reshape((-1, 3, 4)), 596 } 597 598 # test dataframe input works for 1d tensor specs and input is converted to dict 599 res = pyfunc_model.predict(pdf) 600 assert _compare_exact_tensor_dict_input(res, d_inp) 601 602 # test dataframe input works for 1d tensor specs and input is converted to dict 603 pdf_contains_numpy_array = pd.DataFrame({ 604 "a": list(data_a.reshape(-1, 2 * 3)), 605 "b": list(data_b.reshape(-1, 3 * 4)), 606 }) 607 res = pyfunc_model.predict(pdf_contains_numpy_array) 608 assert _compare_exact_tensor_dict_input(res, d_inp) 609 610 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 611 actual_types = {k: v.dtype for k, v in res.items()} 612 assert expected_types == actual_types 613 614 with pytest.raises( 615 expected_exception=MlflowException, 616 match=re.escape( 617 "The value in the Input DataFrame column 'a' could not be converted to the expected " 618 "shape of: '(-1, 2, 3)'. Ensure that each of the input list elements are of uniform " 619 "length and that the data can be coerced to the tensor type 'uint64'" 620 ), 621 ): 622 pyfunc_model.predict( 623 pdf.assign(a=np.array(range(16), dtype=np.uint64).reshape(-1, 8).tolist()) 624 ) 625 626 # test that dictionary works too 627 res = pyfunc_model.predict(d_inp) 628 assert res == d_inp 629 expected_types = dict(zip(input_schema.input_names(), input_schema.input_types())) 630 actual_types = {k: v.dtype for k, v in res.items()} 631 assert expected_types == actual_types 632 633 634 def test_missing_value_hint_is_displayed_when_it_should(): 635 m = Model() 636 input_schema = Schema([ColSpec("integer", "a")]) 637 m.signature = ModelSignature(inputs=input_schema) 638 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 639 pdf = pd.DataFrame(data=[[1], [None]], columns=["a"]) 640 match = "Incompatible input types" 641 with pytest.raises(MlflowException, match=match) as ex: 642 pyfunc_model.predict(pdf) 643 hint = "Hint: the type mismatch is likely caused by missing values." 644 assert hint in str(ex.value.message) 645 pdf = pd.DataFrame(data=[[1.5], [None]], columns=["a"]) 646 with pytest.raises(MlflowException, match=match) as ex: 647 pyfunc_model.predict(pdf) 648 assert hint not in str(ex.value.message) 649 pdf = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64) 650 with pytest.raises(MlflowException, match=match) as ex: 651 pyfunc_model.predict(pdf) 652 assert hint not in str(ex.value.message) 653 654 655 def test_column_schema_enforcement_no_col_names(): 656 m = Model() 657 input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")]) 658 m.signature = ModelSignature(inputs=input_schema) 659 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 660 test_data = [[1.0, 2.0, 3.0]] 661 662 # Can call with just a list 663 pd.testing.assert_frame_equal(pyfunc_model.predict(test_data), pd.DataFrame(test_data)) 664 665 # Or can call with a DataFrame without column names 666 pd.testing.assert_frame_equal( 667 pyfunc_model.predict(pd.DataFrame(test_data)), pd.DataFrame(test_data) 668 ) 669 670 # # Or can call with a np.ndarray 671 pd.testing.assert_frame_equal( 672 pyfunc_model.predict(pd.DataFrame(test_data).values), pd.DataFrame(test_data) 673 ) 674 675 # Or with column names! 676 pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"]) 677 pd.testing.assert_frame_equal(pyfunc_model.predict(pdf), pdf) 678 679 # Must provide the right number of arguments 680 with pytest.raises(MlflowException, match="the provided value only has 2 inputs."): 681 pyfunc_model.predict([[1.0, 2.0]]) 682 683 # Must provide the right types 684 with pytest.raises(MlflowException, match="Can not safely convert int64 to float64"): 685 pyfunc_model.predict([[1, 2, 3]]) 686 687 # Can only provide data type that can be converted to dataframe... 688 with pytest.raises(MlflowException, match="Expected input to be DataFrame. Found: set"): 689 pyfunc_model.predict({1, 2, 3}) 690 691 # 9. dictionaries of str -> list/nparray work 692 d = {"a": [1.0], "b": [2.0], "c": [3.0]} 693 pd.testing.assert_frame_equal(pyfunc_model.predict(d), pd.DataFrame(d)) 694 695 696 def test_tensor_schema_enforcement_no_col_names(): 697 m = Model() 698 input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 3))]) 699 m.signature = ModelSignature(inputs=input_schema) 700 pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel()) 701 test_data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) 702 703 # Can call with numpy array of correct shape 704 np.testing.assert_array_equal(pyfunc_model.predict(test_data), test_data) 705 706 # Or can call with a dataframe 707 np.testing.assert_array_equal(pyfunc_model.predict(pd.DataFrame(test_data)), test_data) 708 709 # Can not call with a list 710 with pytest.raises( 711 MlflowException, 712 match="This model contains a tensor-based model signature with no input names", 713 ): 714 pyfunc_model.predict([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) 715 716 # Can not call with a dict 717 with pytest.raises( 718 MlflowException, 719 match="This model contains a tensor-based model signature with no input names", 720 ): 721 pyfunc_model.predict({"blah": test_data}) 722 723 # Can not call with a np.ndarray of a wrong shape 724 with pytest.raises( 725 MlflowException, 726 match=re.escape("Shape of input (2, 2) does not match expected shape (-1, 3)"), 727 ): 728 pyfunc_model.predict(np.array([[1.0, 2.0], [4.0, 5.0]])) 729 730 # Can not call with a np.ndarray of a wrong type 731 with pytest.raises( 732 MlflowException, match="dtype of input uint32 does not match expected dtype float32" 733 ): 734 pyfunc_model.predict(test_data.astype(np.uint32)) 735 736 # Can call with a np.ndarray with more elements along variable axis 737 test_data2 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=np.float32) 738 np.testing.assert_array_equal(pyfunc_model.predict(test_data2), test_data2) 739 740 # Can not call with an empty ndarray 741 with pytest.raises( 742 MlflowException, match=re.escape("Shape of input () does not match expected shape (-1, 3)") 743 ): 744 pyfunc_model.predict(np.ndarray([])) 745 746 747 @pytest.mark.parametrize("orient", ["records"]) 748 def test_schema_enforcement_for_inputs_style_orientation_of_dataframe(orient): 749 # Test Dict[str, List[Any]] 750 test_signature = { 751 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 752 "outputs": '[{"name": "response", "type": "string"}]', 753 } 754 signature = ModelSignature.from_dict(test_signature) 755 data = {"a": [4, 5, 6], "b": ["a", "b", "c"]} 756 pd_data = pd.DataFrame(data) 757 check = _enforce_schema(data, signature.inputs) 758 pd.testing.assert_frame_equal(check, pd_data) 759 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 760 pd.testing.assert_frame_equal(pd_check, pd_data) 761 762 # Test Dict[str, str] 763 test_signature = { 764 "inputs": '[{"name": "a", "type": "string"}]', 765 "outputs": '[{"name": "response", "type": "string"}]', 766 } 767 signature = ModelSignature.from_dict(test_signature) 768 data = {"a": "Hi there!"} 769 pd_data = pd.DataFrame([data]) 770 check = _enforce_schema(data, signature.inputs) 771 pd.testing.assert_frame_equal(check, pd_data) 772 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 773 pd.testing.assert_frame_equal(pd_check, pd_data) 774 775 # Test List[Dict[str, Union[str, List[str]]]] 776 test_signature = { 777 "inputs": '[{"name": "query", "type": "string"}, {"name": "inputs", "type": "string"}]', 778 } 779 signature = ModelSignature.from_dict(test_signature) 780 data = [{"query": ["test_query1", "test_query2"], "inputs": "test input"}] 781 pd_data = pd.DataFrame(data) 782 check = _enforce_schema(data, signature.inputs) 783 pd.testing.assert_frame_equal(check, pd_data) 784 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 785 pd.testing.assert_frame_equal(pd_check, pd_data) 786 787 # Test List[str] 788 test_signature = { 789 "inputs": '[{"type": "string"}]', 790 "outputs": '[{"name": "response", "type": "string"}]', 791 } 792 signature = ModelSignature.from_dict(test_signature) 793 data = ["a", "b", "c"] 794 pd_data = pd.DataFrame(data) 795 check = _enforce_schema(data, signature.inputs) 796 pd.testing.assert_frame_equal(check, pd_data) 797 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 798 pd.testing.assert_frame_equal(pd_check, pd_data) 799 800 # Test Dict[str, np.ndarray] 801 test_signature = { 802 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 803 "outputs": '[{"name": "response", "type": "string"}]', 804 } 805 signature = ModelSignature.from_dict(test_signature) 806 data = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])} 807 pd_data = pd.DataFrame(data) 808 check = _enforce_schema(data, signature.inputs) 809 pd.testing.assert_frame_equal(check, pd_data) 810 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 811 pd.testing.assert_frame_equal(pd_check, pd_data) 812 813 # Test Dict[str, <scalar>] (support added in MLflow 2.3.0) 814 test_signature = { 815 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 816 "outputs": '[{"name": "response", "type": "string"}]', 817 } 818 signature = ModelSignature.from_dict(test_signature) 819 data = {"a": 12, "b": "a"} 820 pd_data = pd.DataFrame([data]) 821 check = _enforce_schema(data, signature.inputs) 822 pd.testing.assert_frame_equal(check, pd_data) 823 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 824 pd.testing.assert_frame_equal(pd_check, pd_data) 825 826 # Test Dict[str, np.ndarray] where array.size == 1 827 test_signature = { 828 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 829 "outputs": '[{"name": "response", "type": "string"}]', 830 } 831 signature = ModelSignature.from_dict(test_signature) 832 data = {"a": np.array([12]), "b": np.array(["a"])} 833 pd_data = pd.DataFrame(data) 834 check = _enforce_schema(data, signature.inputs) 835 pd.testing.assert_frame_equal(check, pd_data) 836 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 837 pd.testing.assert_frame_equal(pd_check, pd_data) 838 839 # Test Dict[str, np.ndarray] where primitives are supplied 840 test_signature = { 841 "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]', 842 "outputs": '[{"name": "response", "type": "string"}]', 843 } 844 signature = ModelSignature.from_dict(test_signature) 845 # simulates the structure that model serving will convert the data to when using 846 # a Dict[str, str] with a scalar singular value string 847 data = {"a": np.array("a"), "b": np.array("b")} 848 pd_data = pd.DataFrame([data]) 849 check = _enforce_schema(data, signature.inputs) 850 pd.testing.assert_frame_equal(check, pd_data) 851 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 852 pd.testing.assert_frame_equal(pd_check, pd_data) 853 854 # Assert that the Dict[str, np.ndarray] casing with primitive does not work on anything 855 # but a single string. 856 test_signature = { 857 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "long"}]', 858 "outputs": '[{"name": "response", "type": "string"}]', 859 } 860 signature = ModelSignature.from_dict(test_signature) 861 data = {"a": np.array(1), "b": np.array(2)} 862 pd_data = pd.DataFrame([data]) 863 # Schema enforcement explicitly only provides support for strings that meet primitives in 864 # np.arrays criteria. All other data types should fail. 865 with pytest.raises(MlflowException, match="This model contains a column-based"): 866 _enforce_schema(data, signature.inputs) 867 with pytest.raises(MlflowException, match="Incompatible input types for column a. Can not"): 868 _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 869 870 # Test bytes 871 test_signature = { 872 "inputs": '[{"name": "audio", "type": "binary"}]', 873 "outputs": '[{"name": "response", "type": "string"}]', 874 } 875 signature = ModelSignature.from_dict(test_signature) 876 data = {"audio": b"Hi I am a bytes string"} 877 pd_data = pd.DataFrame([data]) 878 check = _enforce_schema(data, signature.inputs) 879 pd.testing.assert_frame_equal(check, pd_data) 880 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 881 pd.testing.assert_frame_equal(pd_check, pd_data) 882 883 # Test base64 encoded 884 test_signature = { 885 "inputs": '[{"name": "audio", "type": "binary"}]', 886 "outputs": '[{"name": "response", "type": "string"}]', 887 } 888 signature = ModelSignature.from_dict(test_signature) 889 data = {"audio": base64.b64encode(b"Hi I am a bytes string").decode("ascii")} 890 pd_data = pd.DataFrame([data]) 891 check = _enforce_schema(data, signature.inputs) 892 pd.testing.assert_frame_equal(check, pd_data) 893 pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs) 894 pd.testing.assert_frame_equal(pd_check, pd_data) 895 896 897 def test_schema_enforcement_for_optional_columns(): 898 input_schema = Schema([ 899 ColSpec("double", "a"), 900 ColSpec("double", "b"), 901 ColSpec("string", "c", required=False), 902 ColSpec("long", "d", required=False), 903 ]) 904 signature = ModelSignature(inputs=input_schema) 905 test_data_with_all_cols = {"a": [1.0], "b": [1.0], "c": ["something"], "d": [2]} 906 test_data_with_only_required_cols = {"a": [1.0], "b": [1.0]} 907 test_data_with_one_optional_col = {"a": [1.0], "b": [1.0], "d": [2]} 908 909 for data in [ 910 test_data_with_all_cols, 911 test_data_with_only_required_cols, 912 test_data_with_one_optional_col, 913 ]: 914 pd_data = pd.DataFrame(data) 915 check = _enforce_schema(pd_data, signature.inputs) 916 pd.testing.assert_frame_equal(check, pd_data) 917 918 # Ensure wrong data type for optional column throws 919 test_bad_data = {"a": [1.0], "b": [1.0], "d": ["not the right type"]} 920 pd_data = pd.DataFrame(test_bad_data) 921 with pytest.raises(MlflowException, match="Incompatible input types for column d."): 922 _enforce_schema(pd_data, signature.inputs) 923 924 # Ensure it still validates for required columns 925 test_missing_required = {"b": [2.0], "c": ["something"]} 926 pd_data = pd.DataFrame(test_missing_required) 927 with pytest.raises(MlflowException, match="Model is missing inputs"): 928 _enforce_schema(pd_data, signature.inputs) 929 930 931 def test_schema_enforcement_for_list_inputs_back_compatibility_check(): 932 # Test Dict[str, scalar or List[str]] 933 test_signature = { 934 "inputs": '[{"name": "prompt", "type": "string"}, {"name": "stop", "type": "string"}]', 935 "outputs": '[{"type": "string"}]', 936 } 937 signature = ModelSignature.from_dict(test_signature) 938 data = {"prompt": "this is the prompt", "stop": ["a", "b"]} 939 pd_data = pd.DataFrame([data]) 940 check = _enforce_schema(data, signature.inputs) 941 pd.testing.assert_frame_equal(check, pd_data) 942 943 # Test Dict[str, List[str]] 944 test_signature = { 945 "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]', 946 "outputs": '[{"name": "response", "type": "string"}]', 947 } 948 signature = ModelSignature.from_dict(test_signature) 949 data = {"a": ["Hi there!"], "b": ["Hello there", "Bye!"]} 950 pd_data = pd.DataFrame([data]) 951 check = _enforce_schema(data, signature.inputs) 952 pd.testing.assert_frame_equal(check, pd_data) 953 954 # Test Dict[str, List[binary]] with bytes 955 test_signature = { 956 "inputs": '[{"name": "audio", "type": "binary"}]', 957 "outputs": '[{"name": "response", "type": "string"}]', 958 } 959 signature = ModelSignature.from_dict(test_signature) 960 data = {"audio": [b"Hi I am a bytes string"]} 961 pd_data = pd.DataFrame([data]) 962 pd_check = _enforce_schema(pd_data, signature.inputs) 963 pd.testing.assert_frame_equal(pd_check, pd_data) 964 965 # Test Dict[str, List[binary]] with base64 encoded 966 test_signature = { 967 "inputs": '[{"name": "audio", "type": "binary"}]', 968 "outputs": '[{"name": "response", "type": "string"}]', 969 } 970 signature = ModelSignature.from_dict(test_signature) 971 data = {"audio": [base64.b64encode(b"Hi I am a bytes string").decode("ascii")]} 972 pd_data = pd.DataFrame([data]) 973 pd_check = _enforce_schema(pd_data, signature.inputs) 974 pd.testing.assert_frame_equal(pd_check, pd_data) 975 976 # Test Dict[str, List[Any]] 977 test_signature = { 978 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 979 "outputs": '[{"name": "response", "type": "string"}]', 980 } 981 signature = ModelSignature.from_dict(test_signature) 982 data = {"a": [4, 5, 6], "b": ["a", "b", "c"]} 983 pd_data = pd.DataFrame(data) 984 pd_check = _enforce_schema(data, signature.inputs) 985 pd.testing.assert_frame_equal(pd_check, pd_data) 986 987 # Test Dict[str, np.ndarray] 988 test_signature = { 989 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 990 "outputs": '[{"name": "response", "type": "string"}]', 991 } 992 signature = ModelSignature.from_dict(test_signature) 993 data = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])} 994 pd_data = pd.DataFrame(data) 995 pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs) 996 pd.testing.assert_frame_equal(pd_check, pd_data) 997 998 # Test Dict[str, np.ndarray] where array.size == 1 999 test_signature = { 1000 "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]', 1001 "outputs": '[{"name": "response", "type": "string"}]', 1002 } 1003 signature = ModelSignature.from_dict(test_signature) 1004 data = {"a": np.array([12]), "b": np.array(["a"])} 1005 pd_data = pd.DataFrame(data) 1006 pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs) 1007 pd.testing.assert_frame_equal(pd_check, pd_data) 1008 1009 # Test Dict[str, np.ndarray] where primitives are supplied 1010 test_signature = { 1011 "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]', 1012 "outputs": '[{"name": "response", "type": "string"}]', 1013 } 1014 signature = ModelSignature.from_dict(test_signature) 1015 # simulates the structure that model serving will convert the data to when using 1016 # a Dict[str, str] with a scalar singular value string 1017 data = {"a": np.array("a"), "b": np.array("b")} 1018 pd_data = pd.DataFrame([data]) 1019 pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs) 1020 pd.testing.assert_frame_equal(pd_check, pd_data) 1021 1022 1023 def test_schema_enforcement_for_list_inputs(): 1024 # Test Dict[str, scalar or List[str]] 1025 test_signature = { 1026 "inputs": '[{"type": "string", "name": "prompt", "required": true}, ' 1027 '{"type": "array", "items": {"type": "string"}, ' 1028 '"name": "stop", "required": true}]', 1029 "outputs": '[{"type": "string", "required": true}]', 1030 } 1031 signature = ModelSignature.from_dict(test_signature) 1032 data = {"prompt": "this is the prompt", "stop": ["a", "b"]} 1033 output = "this is the output" 1034 assert signature == infer_signature(data, output) 1035 pd_data = pd.DataFrame([data]) 1036 check = _enforce_schema(data, signature.inputs) 1037 pd.testing.assert_frame_equal(check, pd_data) 1038 1039 # Test Dict[str, List[str]] 1040 test_signature = { 1041 "inputs": '[{"type": "array", "items": {"type": "string"}, ' 1042 '"name": "a", "required": true}, ' 1043 '{"type": "array", "items": {"type": "string"}, ' 1044 '"name": "b", "required": true}]', 1045 "outputs": '[{"type": "string", "required": true}]', 1046 } 1047 signature = ModelSignature.from_dict(test_signature) 1048 data = {"a": ["Hi there!"], "b": ["Hello there", "Bye!"]} 1049 assert signature == infer_signature(data, output) 1050 pd_data = pd.DataFrame([data]) 1051 check = _enforce_schema(data, signature.inputs) 1052 pd.testing.assert_frame_equal(check, pd_data) 1053 1054 # Test Dict[str, List[binary]] with bytes 1055 test_signature = { 1056 "inputs": '[{"type": "array", "items": {"type": "binary"}, ' 1057 '"name": "audio", "required": true}]', 1058 "outputs": '[{"type": "string", "required": true}]', 1059 } 1060 signature = ModelSignature.from_dict(test_signature) 1061 data = {"audio": [b"Hi I am a bytes string"]} 1062 assert signature == infer_signature(data, output) 1063 pd_data = pd.DataFrame([data]) 1064 check = _enforce_schema(data, signature.inputs) 1065 pd.testing.assert_frame_equal(check, pd_data) 1066 1067 # Test Dict[str, List[binary]] with base64 encoded 1068 test_signature = { 1069 "inputs": '[{"type": "array", "items": {"type": "binary"}, ' 1070 '"name": "audio", "required": true}]', 1071 "outputs": '[{"type": "string", "required": true}]', 1072 } 1073 signature = ModelSignature.from_dict(test_signature) 1074 data = {"audio": [base64.b64encode(b"Hi I am a bytes string")]} 1075 assert signature == infer_signature(data, output) 1076 pd_data = pd.DataFrame([data]) 1077 check = _enforce_schema(data, signature.inputs) 1078 pd.testing.assert_frame_equal(check, pd_data) 1079 1080 # Test Dict[str, List[Any]] 1081 test_signature = { 1082 "inputs": '[{"type": "array", "items": {"type": "long"}, ' 1083 '"name": "a", "required": true}, ' 1084 '{"type": "array", "items": {"type": "string"}, ' 1085 '"name": "b", "required": true}]', 1086 "outputs": '[{"type": "string", "required": true}]', 1087 } 1088 signature = ModelSignature.from_dict(test_signature) 1089 data = {"a": [4, 5, 6], "b": ["a", "b", "c"]} 1090 assert signature == infer_signature(data, output) 1091 pd_data = pd.DataFrame([data]) 1092 check = _enforce_schema(data, signature.inputs) 1093 pd.testing.assert_frame_equal(check, pd_data) 1094 1095 # Test Dict[str, np.ndarray] 1096 test_signature = { 1097 "inputs": '[{"name": "a", "type": "tensor", "tensor-spec": ' 1098 '{"dtype": "int64", "shape": [-1]}}, ' 1099 '{"name": "b", "type": "tensor", "tensor-spec": ' 1100 '{"dtype": "str", "shape": [-1]}}]', 1101 "outputs": '[{"type": "string", "required": true}]', 1102 } 1103 signature = ModelSignature.from_dict(test_signature) 1104 data = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])} 1105 pd_check = _enforce_schema(data, signature.inputs) 1106 assert pd_check == data 1107 1108 # Test Dict[str, np.ndarray] where array.size == 1 1109 test_signature = { 1110 "inputs": '[{"name": "a", "type": "tensor", "tensor-spec": ' 1111 '{"dtype": "int64", "shape": [-1]}}, ' 1112 '{"name": "b", "type": "tensor", "tensor-spec": ' 1113 '{"dtype": "str", "shape": [-1]}}]', 1114 "outputs": '[{"type": "string", "required": true}]', 1115 } 1116 signature = ModelSignature.from_dict(test_signature) 1117 data = {"a": np.array([12]), "b": np.array(["a"])} 1118 pd_check = _enforce_schema(data, signature.inputs) 1119 assert pd_check == data 1120 1121 1122 def test_enforce_schema_warns_with_extra_fields(): 1123 schema = Schema([ColSpec("string", "a")]) 1124 with mock.patch("mlflow.models.utils._logger.warning") as mock_warning: 1125 _enforce_schema({"a": "hi", "b": "bye"}, schema) 1126 mock_warning.assert_called_once_with( 1127 "Found extra inputs in the model input that are not defined in the model " 1128 "signature: `['b']`. These inputs will be ignored." 1129 ) 1130 1131 1132 def test_enforce_params_schema_with_success(): 1133 # Correct parameters & schema 1134 test_parameters = { 1135 "str_param": "str_a", 1136 "int_param": np.int32(1), 1137 "bool_param": True, 1138 "double_param": 1.0, 1139 "float_param": np.float32(0.1), 1140 "long_param": 100, 1141 "datetime_param": np.datetime64("2023-06-26 00:00:00"), 1142 "str_list": ["a", "b", "c"], 1143 "bool_list": [True, False], 1144 "object": {"a": 1, "b": ["x", "y"], "c": {"d": 2}}, 1145 } 1146 test_schema = ParamSchema([ 1147 ParamSpec("str_param", DataType.string, "str_a", None), 1148 ParamSpec("int_param", DataType.integer, np.int32(1), None), 1149 ParamSpec("bool_param", DataType.boolean, True, None), 1150 ParamSpec("double_param", DataType.double, 1.0, None), 1151 ParamSpec("float_param", DataType.float, np.float32(0.1), None), 1152 ParamSpec("long_param", DataType.long, 100, None), 1153 ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-26 00:00:00"), None), 1154 ParamSpec("str_list", DataType.string, ["a", "b", "c"], (-1,)), 1155 ParamSpec("bool_list", DataType.boolean, [True, False], (-1,)), 1156 ParamSpec( 1157 "object", 1158 Object([ 1159 Property("a", DataType.long), 1160 Property("b", Array(DataType.string)), 1161 Property("c", Object([Property("d", DataType.long)])), 1162 ]), 1163 {"a": 1, "b": ["x", "y"], "c": {"d": 2}}, 1164 None, 1165 ), 1166 ]) 1167 assert _enforce_params_schema(test_parameters, test_schema) == test_parameters 1168 1169 # Correct parameters & schema with array 1170 params = { 1171 "double_array": np.array([1.0, 2.0]), 1172 "float_array": np.array([np.float32(1.0), np.float32(2.0)]), 1173 "long_array": np.array([1, 2]), 1174 "datetime_array": np.array([ 1175 np.datetime64("2023-06-26 00:00:00"), 1176 np.datetime64("2023-06-26 00:00:00"), 1177 ]), 1178 } 1179 schema = ParamSchema([ 1180 ParamSpec("double_array", DataType.double, np.array([1.0, 2.0]), (-1,)), 1181 ParamSpec( 1182 "float_array", DataType.float, np.array([np.float32(1.0), np.float32(2.0)]), (-1,) 1183 ), 1184 ParamSpec("long_array", DataType.long, np.array([1, 2]), (-1,)), 1185 ParamSpec( 1186 "datetime_array", 1187 DataType.datetime, 1188 np.array([np.datetime64("2023-06-26 00:00:00"), np.datetime64("2023-06-26 00:00:00")]), 1189 (-1,), 1190 ), 1191 ]) 1192 for param, value in params.items(): 1193 assert (_enforce_params_schema(params, schema)[param] == value).all() 1194 1195 # Converting parameters value type to corresponding schema type 1196 # 1. int -> long, float, double 1197 assert _enforce_params_schema({"double_param": np.int32(1)}, test_schema)["double_param"] == 1.0 1198 assert _enforce_params_schema({"float_param": np.int32(1)}, test_schema)["float_param"] == 1.0 1199 assert _enforce_params_schema({"long_param": np.int32(1)}, test_schema)["long_param"] == 1 1200 # With array 1201 for param in ["double_array", "float_array", "long_array"]: 1202 assert ( 1203 _enforce_params_schema({param: [np.int32(1), np.int32(2)]}, schema)[param] 1204 == params[param] 1205 ).all() 1206 assert ( 1207 _enforce_params_schema({param: np.array([np.int32(1), np.int32(2)])}, schema)[param] 1208 == params[param] 1209 ).all() 1210 1211 # 2. long -> float, double 1212 assert _enforce_params_schema({"double_param": 1}, test_schema)["double_param"] == 1.0 1213 assert _enforce_params_schema({"float_param": 1}, test_schema)["float_param"] == 1.0 1214 # With array 1215 for param in ["double_array", "float_array"]: 1216 assert (_enforce_params_schema({param: [1, 2]}, schema)[param] == params[param]).all() 1217 assert ( 1218 _enforce_params_schema({param: np.array([1, 2])}, schema)[param] == params[param] 1219 ).all() 1220 1221 # 3. float -> double 1222 assert ( 1223 _enforce_params_schema({"double_param": np.float32(1)}, test_schema)["double_param"] == 1.0 1224 ) 1225 assert np.isclose( 1226 _enforce_params_schema({"double_param": np.float32(0.1)}, test_schema)["double_param"], 1227 0.1, 1228 atol=1e-6, 1229 ) 1230 # With array 1231 assert ( 1232 _enforce_params_schema({"double_array": [np.float32(1), np.float32(2)]}, schema)[ 1233 "double_array" 1234 ] 1235 == params["double_array"] 1236 ).all() 1237 assert ( 1238 _enforce_params_schema({"double_array": np.array([np.float32(1), np.float32(2)])}, schema)[ 1239 "double_array" 1240 ] 1241 == params["double_array"] 1242 ).all() 1243 1244 # 4. any -> datetime (try conversion) 1245 assert _enforce_params_schema({"datetime_param": "2023-07-01 00:00:00"}, test_schema)[ 1246 "datetime_param" 1247 ] == np.datetime64("2023-07-01 00:00:00") 1248 1249 # With array 1250 assert ( 1251 _enforce_params_schema( 1252 {"datetime_array": ["2023-06-26 00:00:00", "2023-06-26 00:00:00"]}, schema 1253 )["datetime_array"] 1254 == params["datetime_array"] 1255 ).all() 1256 assert ( 1257 _enforce_params_schema( 1258 {"datetime_array": np.array(["2023-06-26 00:00:00", "2023-06-26 00:00:00"])}, schema 1259 )["datetime_array"] 1260 == params["datetime_array"] 1261 ).all() 1262 1263 # Add default values if the parameter is not provided 1264 test_parameters = {"a": "str_a"} 1265 test_schema = ParamSchema([ 1266 ParamSpec("a", DataType.string, ""), 1267 ParamSpec("b", DataType.long, 1), 1268 ]) 1269 updated_parameters = {"b": 1} 1270 updated_parameters.update(test_parameters) 1271 assert _enforce_params_schema(test_parameters, test_schema) == updated_parameters 1272 1273 # Ignore values not specified in ParamSchema and log warning 1274 test_parameters = {"a": "str_a", "invalid_param": "value"} 1275 test_schema = ParamSchema([ParamSpec("a", DataType.string, "")]) 1276 with mock.patch("mlflow.models.utils._logger.warning") as mock_warning: 1277 assert _enforce_params_schema(test_parameters, test_schema) == {"a": "str_a"} 1278 mock_warning.assert_called_once_with( 1279 "Unrecognized params ['invalid_param'] are ignored for inference. " 1280 "Supported params are: {'a'}. " 1281 "To enable them, please add corresponding schema in ModelSignature." 1282 ) 1283 1284 # Converting parameters keys to string if it is not 1285 test_parameters = {1: 1.0} 1286 test_schema = ParamSchema([ParamSpec("1", DataType.double, 1.0)]) 1287 assert _enforce_params_schema(test_parameters, test_schema) == {"1": 1.0} 1288 1289 1290 def test_enforce_params_schema_add_default_values(): 1291 class MyModel(mlflow.pyfunc.PythonModel): 1292 def predict(self, context, model_input, params): 1293 return list(params.values()) 1294 1295 params = {"str_param": "string", "int_array": [1, 2, 3]} 1296 signature = infer_signature(["input"], params=params) 1297 1298 with mlflow.start_run(): 1299 model_info = mlflow.pyfunc.log_model( 1300 name="my_model", python_model=MyModel(), signature=signature 1301 ) 1302 1303 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 1304 1305 # Not passing params -- predict with default values 1306 loaded_predict = loaded_model.predict(["input"]) 1307 assert loaded_predict == ["string", [1, 2, 3]] 1308 1309 # Passing some params -- add default values 1310 loaded_predict = loaded_model.predict(["input"], params={"str_param": "new_string"}) 1311 assert loaded_predict == ["new_string", [1, 2, 3]] 1312 1313 # Passing all params -- override 1314 loaded_predict = loaded_model.predict( 1315 ["input"], params={"str_param": "new_string", "int_array": [4, 5, 6]} 1316 ) 1317 assert loaded_predict == ["new_string", [4, 5, 6]] 1318 1319 # Raise warning for unrecognized params 1320 with mock.patch("mlflow.models.utils._logger.warning") as mock_warning: 1321 loaded_predict = loaded_model.predict(["input"], params={"new_param": "new_string"}) 1322 mock_warning.assert_called_once() 1323 assert ( 1324 "Unrecognized params ['new_param'] are ignored for inference" 1325 in mock_warning.call_args[0][0] 1326 ) 1327 assert loaded_predict == ["string", [1, 2, 3]] 1328 1329 1330 def test_enforce_params_schema_errors(): 1331 # Raise error when failing to convert value to DataType.datetime 1332 test_schema = ParamSchema([ 1333 ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-06")) 1334 ]) 1335 with pytest.raises( 1336 MlflowException, 1337 match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`", 1338 ): 1339 _enforce_params_schema({"datetime_param": 1.0}, test_schema) 1340 # With array 1341 test_schema = ParamSchema([ 1342 ParamSpec( 1343 "datetime_array", 1344 DataType.datetime, 1345 np.array([np.datetime64("2023-06-06"), np.datetime64("2023-06-06")]), 1346 (-1,), 1347 ) 1348 ]) 1349 with pytest.raises( 1350 MlflowException, 1351 match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`", 1352 ): 1353 _enforce_params_schema({"datetime_array": [1.0, 2.0]}, test_schema) 1354 1355 # Raise error when failing to convert value to DataType.float 1356 test_schema = ParamSchema([ParamSpec("float_param", DataType.float, np.float32(1))]) 1357 with pytest.raises( 1358 MlflowException, match=r"Failed to validate type and shape for 'float_param'" 1359 ): 1360 _enforce_params_schema({"float_param": "a"}, test_schema) 1361 # With array 1362 test_schema = ParamSchema([ 1363 ParamSpec("float_array", DataType.float, np.array([np.float32(1), np.float32(2)]), (-1,)) 1364 ]) 1365 with pytest.raises( 1366 MlflowException, match=r"Failed to validate type and shape for 'float_array'" 1367 ): 1368 _enforce_params_schema( 1369 {"float_array": [np.float32(1), np.float32(2), np.float64(3)]}, test_schema 1370 ) 1371 1372 # Raise error for any other conversions 1373 error_msg = r"Failed to validate type and shape for 'int_param'" 1374 test_schema = ParamSchema([ParamSpec("int_param", DataType.long, np.int32(1))]) 1375 with pytest.raises(MlflowException, match=error_msg): 1376 _enforce_params_schema({"int_param": np.float32(1)}, test_schema) 1377 with pytest.raises(MlflowException, match=error_msg): 1378 _enforce_params_schema({"int_param": "1"}, test_schema) 1379 with pytest.raises(MlflowException, match=error_msg): 1380 _enforce_params_schema({"int_param": np.datetime64("2023-06-06")}, test_schema) 1381 1382 error_msg = r"Failed to validate type and shape for 'str_param'" 1383 test_schema = ParamSchema([ParamSpec("str_param", DataType.string, "1")]) 1384 with pytest.raises(MlflowException, match=error_msg): 1385 _enforce_params_schema({"str_param": np.float32(1)}, test_schema) 1386 with pytest.raises(MlflowException, match=error_msg): 1387 _enforce_params_schema({"str_param": b"string"}, test_schema) 1388 with pytest.raises(MlflowException, match=error_msg): 1389 _enforce_params_schema({"str_param": np.datetime64("2023-06-06")}, test_schema) 1390 1391 # Raise error if parameters is not dictionary 1392 with pytest.raises(MlflowException, match=r"Parameters must be a dictionary. Got type 'int'."): 1393 _enforce_params_schema(100, test_schema) 1394 1395 # Raise error if invalid parameters are passed 1396 test_parameters = {"a": True, "b": (1, 2), "c": b"test"} 1397 test_schema = ParamSchema([ 1398 ParamSpec("a", DataType.boolean, False), 1399 ParamSpec("b", DataType.string, [], (-1,)), 1400 ParamSpec("c", DataType.string, ""), 1401 ]) 1402 with pytest.raises( 1403 MlflowException, 1404 match=re.escape( 1405 "Value must be a 1D array with shape (-1,) for param 'b': string " 1406 "(default: []) (shape: (-1,)), received tuple" 1407 ), 1408 ): 1409 _enforce_params_schema(test_parameters, test_schema) 1410 # Raise error for non-1D array 1411 with pytest.raises(MlflowException, match=r"received list with ndim 2"): 1412 _enforce_params_schema( 1413 {"a": [[1, 2], [3, 4]]}, ParamSchema([ParamSpec("a", DataType.long, [], (-1,))]) 1414 ) 1415 1416 1417 def test_enforce_params_schema_warns_with_model_without_params(): 1418 class MyModel(mlflow.pyfunc.PythonModel): 1419 def predict(self, context, model_input, params=None): 1420 return list(params.values()) if isinstance(params, dict) else None 1421 1422 params = {"str_param": "string", "int_array": [1, 2, 3], "123": 123} 1423 signature = infer_signature(["input"]) 1424 1425 with mlflow.start_run(): 1426 model_info = mlflow.pyfunc.log_model( 1427 name="model1", python_model=MyModel(), signature=signature 1428 ) 1429 1430 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 1431 1432 with mock.patch("mlflow.models.utils._logger.warning") as mock_warning: 1433 loaded_model.predict(["input"], params=params) 1434 mock_warning.assert_called_with( 1435 "`params` can only be specified at inference time if the model signature defines a params " 1436 "schema. This model does not define a params schema. Ignoring provided params: " 1437 "['str_param', 'int_array', '123']" 1438 ) 1439 1440 1441 def test_enforce_params_schema_errors_with_model_with_params(): 1442 class MyModel(mlflow.pyfunc.PythonModel): 1443 def predict(self, context, model_input, params=None): 1444 return list(params.values()) if isinstance(params, dict) else None 1445 1446 params = {"str_param": "string", "int_array": [1, 2, 3], "123": 123} 1447 signature = infer_signature(["input"], params=params) 1448 1449 with mlflow.start_run(): 1450 model_info = mlflow.pyfunc.log_model( 1451 name="test_model", python_model=MyModel(), signature=signature 1452 ) 1453 1454 loaded_model_with_params = mlflow.pyfunc.load_model(model_info.model_uri) 1455 with pytest.raises(MlflowException, match=r"Parameters must be a dictionary. Got type 'list'"): 1456 loaded_model_with_params.predict(["input"], params=[1, 2, 3]) 1457 1458 with mock.patch("mlflow.models.utils._logger.warning") as mock_warning: 1459 loaded_model_with_params.predict(["input"], params={123: 456}) 1460 mock_warning.assert_called_with( 1461 "Keys in parameters should be of type `str`, but received non-string keys." 1462 "Converting all keys to string..." 1463 ) 1464 1465 1466 def test_param_spec_with_success(): 1467 # Normal cases 1468 assert ParamSpec("a", DataType.long, 1).default == 1 1469 assert ParamSpec("a", DataType.string, "1").default == "1" 1470 assert ParamSpec("a", DataType.boolean, True).default is True 1471 assert ParamSpec("a", DataType.double, 1.0).default == 1.0 1472 assert ParamSpec("a", DataType.float, np.float32(1)).default == 1 1473 assert ParamSpec("a", DataType.datetime, np.datetime64("2023-06-06")).default == datetime.date( 1474 2023, 6, 6 1475 ) 1476 assert ParamSpec( 1477 "a", DataType.datetime, np.datetime64("2023-06-06 00:00:00") 1478 ).default == datetime.datetime(2023, 6, 6, 0, 0, 0) 1479 assert ParamSpec("a", DataType.integer, np.int32(1)).default == 1 1480 1481 # Convert default value type if it is not consistent with provided type 1482 # 1. int -> long, float, double 1483 assert ParamSpec("a", DataType.long, np.int32(1)).default == 1 1484 assert ParamSpec("a", DataType.float, np.int32(1)).default == 1.0 1485 assert ParamSpec("a", DataType.double, np.int32(1)).default == 1.0 1486 # 2. long -> float, double 1487 assert ParamSpec("a", DataType.float, 1).default == 1.0 1488 assert ParamSpec("a", DataType.double, 1).default == 1.0 1489 # 3. float -> double 1490 assert ParamSpec("a", DataType.double, np.float32(1)).default == 1.0 1491 # 4. any -> datetime (try conversion) 1492 assert ParamSpec("a", DataType.datetime, "2023-07-01 00:00:00").default == np.datetime64( 1493 "2023-07-01 00:00:00" 1494 ) 1495 1496 1497 def test_param_spec_errors(): 1498 # Raise error if default value can not be converted to specified type 1499 with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"): 1500 ParamSpec("a", DataType.integer, "1.0") 1501 with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"): 1502 ParamSpec("a", DataType.integer, [1.0, 2.0], (-1,)) 1503 with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"): 1504 ParamSpec("a", DataType.string, True) 1505 with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"): 1506 ParamSpec("a", DataType.string, [1.0, 2.0], (-1,)) 1507 with pytest.raises(MlflowException, match=r"Binary type is not supported for parameters"): 1508 ParamSpec("a", DataType.binary, 1.0) 1509 with pytest.raises(MlflowException, match=r"Failed to convert value"): 1510 ParamSpec("a", DataType.datetime, 1.0) 1511 with pytest.raises(MlflowException, match=r"Failed to convert value"): 1512 ParamSpec("a", DataType.datetime, [1.0, 2.0], (-1,)) 1513 with pytest.raises(MlflowException, match=r"Failed to convert value to `DataType.datetime`"): 1514 ParamSpec("a", DataType.datetime, np.datetime64("20230606")) 1515 1516 # Raise error if shape is not specified for list value 1517 with pytest.raises( 1518 MlflowException, 1519 match=re.escape("Value must be a scalar for type `DataType.long`"), 1520 ): 1521 ParamSpec("a", DataType.long, [1, 2, 3], shape=None) 1522 with pytest.raises( 1523 MlflowException, 1524 match=re.escape("Value must be a scalar for type `DataType.integer`"), 1525 ): 1526 ParamSpec("a", DataType.integer, np.array([1, 2, 3]), shape=None) 1527 1528 # Raise error if shape is specified for scalar value 1529 with pytest.raises( 1530 MlflowException, 1531 match=re.escape( 1532 "Value must be a 1D array with shape (-1,) for param 'a': boolean (default: True) " 1533 "(shape: (-1,)), received bool" 1534 ), 1535 ): 1536 ParamSpec("a", DataType.boolean, True, shape=(-1,)) 1537 1538 # Raise error if shape specified is not allowed 1539 with pytest.raises( 1540 MlflowException, 1541 match=r"Shape must be None for scalar or dictionary value, " 1542 r"or \(-1,\) for 1D array value", 1543 ): 1544 ParamSpec("a", DataType.boolean, [True, False], (2,)) 1545 1546 # Raise error if default value is not scalar or 1D array 1547 with pytest.raises( 1548 MlflowException, 1549 match=re.escape( 1550 "Value must be a 1D array with shape (-1,) for param 'a': boolean (default: {'a': 1}) " 1551 "(shape: (-1,)), received dict" 1552 ), 1553 ): 1554 ParamSpec("a", DataType.boolean, {"a": 1}, (-1,)) 1555 1556 1557 def test_enforce_schema_in_python_model_predict(sample_params_basic, param_schema_basic): 1558 test_params = sample_params_basic 1559 test_schema = param_schema_basic 1560 signature = infer_signature(["input1"], params=test_params) 1561 with mlflow.start_run(): 1562 model_info = mlflow.pyfunc.log_model( 1563 name="test_model", 1564 python_model=PythonModelWithBasicParams(), 1565 signature=signature, 1566 ) 1567 assert signature.params == test_schema 1568 1569 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 1570 loaded_predict = loaded_model.predict(["a", "b"], params=test_params) 1571 for param, value in test_params.items(): 1572 if param == "double_array": 1573 assert (loaded_predict[param] == value).all() 1574 else: 1575 assert loaded_predict[param] == value 1576 1577 # Automatically convert type if it's not consistent with schema 1578 # 1. int -> long, float, double 1579 params_int = { 1580 "double_param": np.int32(1), 1581 "float_param": np.int32(1), 1582 "long_param": np.int32(1), 1583 } 1584 expected_params_int = { 1585 "double_param": 1.0, 1586 "float_param": np.float32(1), 1587 "long_param": 1, 1588 } 1589 loaded_predict = loaded_model.predict(["a", "b"], params=params_int) 1590 for param in params_int: 1591 assert loaded_predict[param] == expected_params_int[param] 1592 1593 # 2. long -> float, double 1594 params_long = { 1595 "double_param": 1, 1596 "float_param": 1, 1597 } 1598 expected_params_long = { 1599 "double_param": 1.0, 1600 "float_param": np.float32(1), 1601 } 1602 loaded_predict = loaded_model.predict(["a", "b"], params=params_long) 1603 for param in params_long: 1604 assert loaded_predict[param] == expected_params_long[param] 1605 1606 # 3. float -> double 1607 assert ( 1608 loaded_model.predict( 1609 ["a", "b"], 1610 params={ 1611 "double_param": np.float32(1), 1612 }, 1613 )["double_param"] 1614 == 1.0 1615 ) 1616 1617 # 4. any -> datetime (try conversion) 1618 assert loaded_model.predict( 1619 ["a", "b"], 1620 params={ 1621 "datetime_param": "2023-06-26 00:00:00", 1622 }, 1623 )["datetime_param"] == np.datetime64("2023-06-26 00:00:00") 1624 1625 1626 def test_schema_enforcement_all_feature_types_pandas(): 1627 data = { 1628 "long": [1, 2, 3], 1629 "bool": [True, False, False], 1630 "string": ["a", "b", "c"], 1631 "datetime": [pd.Timestamp("2020-07-14 00:00:00")] * 3, 1632 "bool_nullable": [True, None, False], 1633 "string_nullable": ["a", "b", None], 1634 "double_nullable": [1.0, 2.0, None], 1635 } 1636 df = pd.DataFrame.from_dict(data) 1637 schema = Schema([ 1638 ColSpec(DataType.long, "long"), 1639 ColSpec(DataType.boolean, "bool"), 1640 ColSpec(DataType.string, "string"), 1641 ColSpec(DataType.datetime, "datetime"), 1642 ColSpec(DataType.boolean, "bool_nullable", required=False), 1643 ColSpec(DataType.string, "string_nullable", required=False), 1644 ColSpec(DataType.double, "double_nullable", required=False), 1645 ]) 1646 pd.testing.assert_frame_equal(_enforce_schema(df, schema), df, check_dtype=False) 1647 1648 1649 def test_enforce_schema_in_python_model_serving(sample_params_basic): 1650 signature = infer_signature(["input1"], params=sample_params_basic) 1651 with mlflow.start_run(): 1652 model_info = mlflow.pyfunc.log_model( 1653 name="test_model", 1654 python_model=PythonModelWithBasicParams(), 1655 signature=signature, 1656 ) 1657 1658 # params in payload should be json serializable 1659 test_params = { 1660 "str_param": "str_a", 1661 "int_param": 1, 1662 "bool_param": True, 1663 "double_param": 1.0, 1664 "float_param": 0.1, 1665 "long_param": 100, 1666 "datetime_param": datetime.datetime(2023, 6, 6, 0, 0, 0), 1667 "str_list": ["a", "b", "c"], 1668 "bool_list": [True, False], 1669 "double_array": np.array([1.0, 2.0]), 1670 } 1671 response = score_model_in_process( 1672 model_info.model_uri, 1673 data=dump_input_data(["a", "b"], params=test_params), 1674 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1675 ) 1676 assert response.status_code == 200 1677 prediction = json.loads(response.content.decode("utf-8"))["predictions"] 1678 for param, value in test_params.items(): 1679 if param == "double_array": 1680 assert (prediction[param] == value).all() 1681 elif param == "datetime_param": 1682 assert prediction[param] == value.isoformat() 1683 else: 1684 assert prediction[param] == value 1685 1686 # Test invalid params for model serving 1687 with pytest.raises(TypeError, match=r"Object of type int32 is not JSON serializable"): 1688 dump_input_data(["a", "b"], params={"int_param": np.int32(1)}) 1689 1690 response = score_model_in_process( 1691 model_info.model_uri, 1692 data=dump_input_data(["a", "b"], params={"double_param": "invalid"}), 1693 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1694 ) 1695 assert response.status_code == 400 1696 assert ( 1697 "Failed to validate type and shape for 'double_param'" 1698 in json.loads(response.content.decode("utf-8"))["message"] 1699 ) 1700 1701 # Can not pass bytes to request 1702 with pytest.raises(TypeError, match=r"Object of type bytes is not JSON serializable"): 1703 score_model_in_process( 1704 model_info.model_uri, 1705 data=dump_input_data(["a", "b"], params={"str_param": b"bytes"}), 1706 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1707 ) 1708 1709 1710 def test_python_model_serving_compatible(tmp_path): 1711 """ 1712 # Code for logging the model in mlflow 2.4.0 1713 import mlflow 1714 from mlflow.models import infer_signature 1715 1716 class MyModel(mlflow.pyfunc.PythonModel): 1717 def predict(self, context, model_input): 1718 return model_input 1719 1720 with mlflow.start_run(): 1721 model_info = mlflow.pyfunc.log_model( 1722 python_model = MyModel(), 1723 artifact_path = "test_model", 1724 signature = infer_signature(["input"]), 1725 registered_model_name="model") 1726 """ 1727 tmp_path.joinpath("MLmodel").write_text( 1728 """ 1729 artifact_path: test_model 1730 flavors: 1731 python_function: 1732 cloudpickle_version: 2.2.1 1733 env: 1734 conda: conda.yaml 1735 virtualenv: python_env.yaml 1736 loader_module: mlflow.pyfunc.model 1737 python_model: python_model.pkl 1738 python_version: 3.8.16 1739 mlflow_version: 2.4.0 1740 model_uuid: 3cbde93be0114644a6ec900c64cab39d 1741 run_id: 3f87fdff03524c19908c3a47fb99f9cd 1742 signature: 1743 inputs: '[{"type": "string"}]' 1744 outputs: null 1745 utc_time_created: '2023-07-13 01:29:55.467561' 1746 """ 1747 ) 1748 tmp_path.joinpath("python_env.yaml").write_text( 1749 """ 1750 python: 3.8.16 1751 build_dependencies: 1752 - pip==23.1.2 1753 - setuptools==56.0.0 1754 - wheel==0.40.0 1755 dependencies: 1756 - -r requirements.txt 1757 """ 1758 ) 1759 tmp_path.joinpath("requirements.txt").write_text( 1760 """ 1761 mlflow==2.4.0 1762 cloudpickle==2.2.1 1763 """ 1764 ) 1765 1766 class MyModel(mlflow.pyfunc.PythonModel): 1767 def predict(self, context, model_input): 1768 return model_input 1769 1770 python_model = MyModel() 1771 1772 with open(tmp_path / "python_model.pkl", "wb") as out: 1773 cloudpickle.dump(python_model, out) 1774 1775 assert Version(mlflow.__version__) > Version("2.4.0") 1776 model_uri = str(tmp_path) 1777 pyfunc_loaded = mlflow.pyfunc.load_model(model_uri) 1778 1779 assert pyfunc_loaded.metadata.signature == ModelSignature(Schema([ColSpec("string")])) 1780 1781 # predict is compatible 1782 local_predict = pyfunc_loaded.predict(["input"]) 1783 assert local_predict.values[0].tolist() == ["input"] 1784 1785 # model serving is compatible 1786 response = score_model_in_process( 1787 model_uri, 1788 data=dump_input_data(["a", "b"]), 1789 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1790 ) 1791 assert response.status_code == 200 1792 prediction = json.loads(response.content.decode("utf-8"))["predictions"] 1793 assert prediction == [{"0": "a"}, {"0": "b"}] 1794 1795 1796 def test_function_python_model_serving_compatible(tmp_path): 1797 """ 1798 # Code for logging the model in mlflow 2.4.0 1799 import mlflow 1800 from mlflow.models import infer_signature 1801 1802 def my_model(model_input): 1803 return model_input 1804 1805 with mlflow.start_run(): 1806 model_info = mlflow.pyfunc.log_model( 1807 python_model = my_model, 1808 artifact_path = "test_model", 1809 signature = infer_signature(["input"]), 1810 registered_model_name="model", 1811 input_example=["input"]) 1812 """ 1813 tmp_path.joinpath("MLmodel").write_text( 1814 """ 1815 artifact_path: test_model 1816 flavors: 1817 python_function: 1818 cloudpickle_version: 2.2.1 1819 env: 1820 conda: conda.yaml 1821 virtualenv: python_env.yaml 1822 loader_module: mlflow.pyfunc.model 1823 python_model: python_model.pkl 1824 python_version: 3.8.16 1825 mlflow_version: 2.4.0 1826 model_uuid: f19b9a51a34a453282e53ca41d384964 1827 run_id: 9fd7b6e125a547fdbb4505f15e8259ed 1828 saved_input_example_info: 1829 artifact_path: input_example.json 1830 pandas_orient: split 1831 type: dataframe 1832 signature: 1833 inputs: '[{"type": "string"}]' 1834 outputs: null 1835 utc_time_created: '2023-07-14 10:18:44.353510' 1836 """ 1837 ) 1838 tmp_path.joinpath("python_env.yaml").write_text( 1839 """ 1840 python: 3.8.16 1841 build_dependencies: 1842 - pip==23.1.2 1843 - setuptools==56.0.0 1844 - wheel==0.40.0 1845 dependencies: 1846 - -r requirements.txt 1847 """ 1848 ) 1849 tmp_path.joinpath("requirements.txt").write_text( 1850 """ 1851 mlflow==2.4.0 1852 cloudpickle==2.2.1 1853 pandas==2.0.3 1854 """ 1855 ) 1856 tmp_path.joinpath("input_example.json").write_text( 1857 """ 1858 {"data": [["input"]]} 1859 """ 1860 ) 1861 1862 def my_model(model_input): 1863 return model_input 1864 1865 from mlflow.pyfunc.model import _FunctionPythonModel 1866 1867 python_model = _FunctionPythonModel(my_model, signature=infer_signature(["input"])) 1868 1869 with open(tmp_path / "python_model.pkl", "wb") as out: 1870 cloudpickle.dump(python_model, out) 1871 1872 assert Version(mlflow.__version__) > Version("2.4.0") 1873 model_uri = str(tmp_path) 1874 pyfunc_loaded = mlflow.pyfunc.load_model(model_uri) 1875 1876 assert pyfunc_loaded.metadata.signature == ModelSignature(Schema([ColSpec("string")])) 1877 1878 # predict is compatible 1879 local_predict = pyfunc_loaded.predict(["input"]) 1880 assert local_predict.values[0].tolist() == ["input"] 1881 1882 # model serving is compatible 1883 response = score_model_in_process( 1884 model_uri, 1885 data=dump_input_data(["a", "b"]), 1886 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1887 ) 1888 assert response.status_code == 200 1889 prediction = json.loads(response.content.decode("utf-8"))["predictions"] 1890 assert prediction == [{"0": "a"}, {"0": "b"}] 1891 1892 1893 def test_enforce_schema_with_arrays_in_python_model_predict(sample_params_with_arrays): 1894 params = sample_params_with_arrays 1895 signature = infer_signature(["input1"], params=params) 1896 with mlflow.start_run(): 1897 model_info = mlflow.pyfunc.log_model( 1898 name="test_model", 1899 python_model=PythonModelWithArrayParams(), 1900 signature=signature, 1901 ) 1902 1903 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 1904 loaded_predict = loaded_model.predict(["a", "b"], params=params) 1905 for param, value in params.items(): 1906 assert (loaded_predict[param] == value).all() 1907 1908 # Automatically convert type if it's not consistent with schema 1909 # 1. int -> long, float, double 1910 for param in ["double_array", "float_array", "long_array"]: 1911 loaded_predict = loaded_model.predict( 1912 ["a", "b"], params={param: np.array([np.int32(1), np.int32(2)])} 1913 ) 1914 assert (loaded_predict[param] == params[param]).all() 1915 # 2. long -> float, double 1916 for param in ["double_array", "float_array"]: 1917 loaded_predict = loaded_model.predict(["a", "b"], params={param: np.array([1, 2])}) 1918 assert (loaded_predict[param] == params[param]).all() 1919 # 3. float -> double 1920 loaded_predict = loaded_model.predict( 1921 ["a", "b"], params={"double_array": np.array([np.float32(1), np.float32(2)])} 1922 ) 1923 assert (loaded_predict["double_array"] == params["double_array"]).all() 1924 # 4. any -> datetime (try conversion) 1925 loaded_predict = loaded_model.predict( 1926 ["a", "b"], 1927 params={"datetime_array": np.array(["2023-06-26 00:00:00", "2023-06-26 00:00:00"])}, 1928 ) 1929 assert (loaded_predict["datetime_array"] == params["datetime_array"]).all() 1930 1931 # Raise error if failing to convert the type 1932 with pytest.raises( 1933 MlflowException, 1934 match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`", 1935 ): 1936 loaded_model.predict(["a", "b"], params={"datetime_array": [1.0, 2.0]}) 1937 with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'int_array'"): 1938 loaded_model.predict(["a", "b"], params={"int_array": np.array([1.0, 2.0])}) 1939 with pytest.raises( 1940 MlflowException, match=r"Failed to validate type and shape for 'float_array'" 1941 ): 1942 loaded_model.predict(["a", "b"], params={"float_array": [True, False]}) 1943 with pytest.raises( 1944 MlflowException, match=r"Failed to validate type and shape for 'double_array'" 1945 ): 1946 loaded_model.predict(["a", "b"], params={"double_array": [1.0, "2.0"]}) 1947 1948 1949 def test_enforce_schema_with_arrays_in_python_model_serving(sample_params_with_arrays): 1950 params = sample_params_with_arrays 1951 signature = infer_signature(["input1"], params=params) 1952 with mlflow.start_run(): 1953 model_info = mlflow.pyfunc.log_model( 1954 name="test_model", 1955 python_model=PythonModelWithArrayParams(), 1956 signature=signature, 1957 ) 1958 1959 with pyfunc_scoring_endpoint( 1960 model_info.model_uri, extra_args=["--env-manager", "local"] 1961 ) as endpoint: 1962 response = endpoint.invoke( 1963 data=dump_input_data(["a", "b"], params=params), 1964 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1965 ) 1966 assert response.status_code == 200 1967 prediction = json.loads(response.content.decode("utf-8"))["predictions"] 1968 for param, value in params.items(): 1969 if param == "datetime_array": 1970 assert prediction[param] == list(map(np.datetime_as_string, value)) 1971 else: 1972 assert (prediction[param] == value).all() 1973 1974 # Test invalid params for model serving 1975 response = endpoint.invoke( 1976 data=dump_input_data(["a", "b"], params={"datetime_array": [1.0, 2.0]}), 1977 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1978 ) 1979 assert response.status_code == 400 1980 assert ( 1981 "Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`" 1982 in json.loads(response.content.decode("utf-8"))["message"] 1983 ) 1984 1985 response = endpoint.invoke( 1986 data=dump_input_data(["a", "b"], params={"int_array": np.array([1.0, 2.0])}), 1987 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1988 ) 1989 assert response.status_code == 400 1990 assert ( 1991 "Failed to validate type and shape for 'int_array'" 1992 in json.loads(response.content.decode("utf-8"))["message"] 1993 ) 1994 1995 response = endpoint.invoke( 1996 data=dump_input_data(["a", "b"], params={"float_array": [True, False]}), 1997 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 1998 ) 1999 assert response.status_code == 400 2000 assert ( 2001 "Failed to validate type and shape for 'float_array'" 2002 in json.loads(response.content.decode("utf-8"))["message"] 2003 ) 2004 2005 response = endpoint.invoke( 2006 data=dump_input_data(["a", "b"], params={"double_array": [1.0, "2.0"]}), 2007 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2008 ) 2009 assert response.status_code == 400 2010 assert ( 2011 "Failed to validate type and shape for 'double_array'" 2012 in json.loads(response.content.decode("utf-8"))["message"] 2013 ) 2014 2015 2016 @pytest.mark.parametrize( 2017 ("example", "input_schema", "output_schema"), 2018 [ 2019 ( 2020 ["input1", "input2", "input3"], 2021 Schema([ColSpec(DataType.string)]), 2022 Schema([ColSpec(DataType.string, 0)]), 2023 ), 2024 ( 2025 [{"a": "a", "b": "b"}, {"a": "b"}], 2026 Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b", required=False)]), 2027 Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b", required=False)]), 2028 ), 2029 ( 2030 {"a": ["a", "b", "c"], "b": "b"}, 2031 Schema([ColSpec(Array(DataType.string), "a"), ColSpec(DataType.string, "b")]), 2032 Schema([ColSpec(Array(DataType.string), "a"), ColSpec(DataType.string, "b")]), 2033 ), 2034 ( 2035 pd.DataFrame({"a": ["a", "b", "c"], "b": "b"}), 2036 Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b")]), 2037 Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b")]), 2038 ), 2039 ], 2040 ) 2041 def test_pyfunc_model_input_example_with_params( 2042 sample_params_basic, param_schema_basic, tmp_path, example, input_schema, output_schema 2043 ): 2044 class MyModel(mlflow.pyfunc.PythonModel): 2045 def predict(self, context, model_input, params=None): 2046 return model_input 2047 2048 with mlflow.start_run(): 2049 model_info = mlflow.pyfunc.log_model( 2050 name="test_model", 2051 python_model=MyModel(), 2052 input_example=(example, sample_params_basic), 2053 ) 2054 2055 # Test _infer_signature_from_input_example 2056 assert model_info.signature.inputs == input_schema 2057 assert model_info.signature.outputs == output_schema 2058 assert model_info.signature.params == param_schema_basic 2059 2060 # Test predict 2061 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2062 prediction = loaded_model.predict(example) 2063 expected_df = pd.DataFrame([example] if isinstance(example, dict) else example) 2064 pd.testing.assert_frame_equal(prediction, expected_df) 2065 2066 # Test saved example 2067 local_path = _download_artifact_from_uri(model_info.model_uri, output_path=tmp_path) 2068 mlflow_model = Model.load(os.path.join(local_path, "MLmodel")) 2069 loaded_example = mlflow_model.load_input_example(local_path) 2070 if isinstance(example, list) and all(np.isscalar(x) for x in example): 2071 np.testing.assert_equal(loaded_example, example) 2072 else: 2073 if isinstance(example, pd.DataFrame): 2074 pd.testing.assert_frame_equal(loaded_example, example) 2075 else: 2076 assert loaded_example == example 2077 2078 for test_example in ["saved_example", "manual_example"]: 2079 if test_example == "saved_example": 2080 payload = mlflow_model.get_serving_input(local_path) 2081 else: 2082 if isinstance(example, pd.DataFrame): 2083 payload = json.dumps({"dataframe_split": example.to_dict(orient="split")}) 2084 else: 2085 payload = json.dumps({"inputs": example}) 2086 2087 response = score_model_in_process( 2088 model_info.model_uri, 2089 data=payload, 2090 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2091 ) 2092 assert response.status_code == 200, response.content 2093 result = json.loads(response.content.decode("utf-8"))["predictions"] 2094 result = pd.DataFrame(result).values.tolist()[0] 2095 np.testing.assert_equal(result, expected_df.values.tolist()[0]) 2096 2097 2098 def test_invalid_input_example_warn_when_model_logging(): 2099 class MyModel(mlflow.pyfunc.PythonModel): 2100 def predict(self, context, model_input, params=None): 2101 # List[str] is converted to pandas DataFrame 2102 # after schema enforcement, so this is invalid 2103 assert isinstance(model_input, list) 2104 return "string" 2105 2106 with mock.patch("mlflow.models.model._logger.warning") as mock_warning: 2107 with mlflow.start_run(): 2108 mlflow.pyfunc.log_model( 2109 name="test_model", 2110 python_model=MyModel(), 2111 input_example=["some string"], 2112 ) 2113 assert any( 2114 "Failed to validate serving input example" in call[0][0] 2115 for call in mock_warning.call_args_list 2116 ) 2117 2118 2119 def assert_equal(a, b): 2120 if isinstance(a, pd.DataFrame): 2121 pd.testing.assert_frame_equal(a, b) 2122 elif isinstance(a, np.ndarray) or isinstance(b, np.ndarray): 2123 np.testing.assert_equal(a, b) 2124 elif isinstance(a, dict): 2125 assert a.keys() == b.keys() 2126 for key in a: 2127 assert_equal(a[key], b[key]) 2128 else: 2129 assert a == b 2130 2131 2132 @pytest.mark.parametrize( 2133 ("example", "signature", "expected_input", "expected_output"), 2134 [ 2135 ( 2136 pd.DataFrame({"a": ["input1", "input2", "input3"]}), 2137 ModelSignature( 2138 Schema([ColSpec(DataType.string, "a")]), Schema([ColSpec(DataType.string)]) 2139 ), 2140 pd.DataFrame({"a": ["input1", "input2", "input3"]}), 2141 "string output", 2142 ), 2143 ( 2144 np.array([1, 2, 3]), 2145 ModelSignature( 2146 Schema([TensorSpec(np.dtype("int64"), (-1,))]), 2147 Schema([TensorSpec(np.dtype("float64"), (-1,))]), 2148 ), 2149 np.array([1, 2, 3]), 2150 np.array([1.0, 2.0, 3.0]), 2151 ), 2152 ( 2153 np.array([1, 2, 3, np.nan]), 2154 ModelSignature( 2155 Schema([TensorSpec(np.dtype("float64"), (-1,))]), 2156 Schema([TensorSpec(np.dtype("float64"), (-1,))]), 2157 ), 2158 np.array([1, 2, 3, np.nan]), 2159 np.array([1.0, 2.0, 3.0, np.nan]), 2160 ), 2161 ( 2162 {"a": np.array([1, 2, 3])}, 2163 ModelSignature( 2164 Schema([TensorSpec(np.dtype("int64"), (-1,), "a")]), 2165 Schema([TensorSpec(np.dtype("float64"), (-1,), "b")]), 2166 ), 2167 {"a": np.array([1, 2, 3])}, 2168 {"b": np.array([1.0, 2.0, 3.0])}, 2169 ), 2170 ( 2171 ["input1", "input2", "input3"], 2172 ModelSignature(Schema([ColSpec(DataType.string)]), Schema([ColSpec(DataType.string)])), 2173 # This is due to _enforce_schema 2174 pd.DataFrame(["input1", "input2", "input3"]), 2175 ["input1", "input2", "input3"], 2176 ), 2177 ( 2178 [{"a": ["sentence1", "sentence2"], "b": ["answer1", "answer2"]}], 2179 ModelSignature( 2180 Schema([ 2181 ColSpec(Array(DataType.string), "a"), 2182 ColSpec(Array(DataType.string), "b"), 2183 ]), 2184 Schema([ColSpec(DataType.string, "output")]), 2185 ), 2186 pd.DataFrame([{"a": ["sentence1", "sentence2"], "b": ["answer1", "answer2"]}]), 2187 {"output": "some prediction"}, 2188 ), 2189 ( 2190 {"messages": [{"role": "user", "content": "some question"}]}, 2191 ModelSignature( 2192 Schema([ 2193 ColSpec( 2194 Array( 2195 Object([ 2196 Property("role", DataType.string), 2197 Property("content", DataType.string), 2198 ]) 2199 ), 2200 "messages", 2201 ) 2202 ]), 2203 Schema([ColSpec(DataType.string, "output")]), 2204 ), 2205 # we assume the field is array so we need another list wrapper 2206 pd.DataFrame([{"messages": [{"role": "user", "content": "some question"}]}]), 2207 {"output": "some prediction"}, 2208 ), 2209 ], 2210 ) 2211 def test_input_example_validation_during_logging( 2212 tmp_path, example, signature, expected_input, expected_output 2213 ): 2214 from mlflow.models import validate_serving_input 2215 2216 class MyModel(mlflow.pyfunc.PythonModel): 2217 def predict(self, context, model_input, params=None): 2218 assert_equal(model_input, expected_input) 2219 return expected_output 2220 2221 with mlflow.start_run(): 2222 model_info = mlflow.pyfunc.log_model( 2223 name="test_model", 2224 python_model=MyModel(), 2225 input_example=example, 2226 ) 2227 assert model_info.signature == signature 2228 2229 mlflow_model = Model.load(model_info.model_uri) 2230 local_path = _download_artifact_from_uri(model_info.model_uri, output_path=tmp_path) 2231 serving_input_example = mlflow_model.get_serving_input(local_path) 2232 response = score_model_in_process( 2233 model_info.model_uri, 2234 data=serving_input_example, 2235 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2236 ) 2237 assert response.status_code == 200, response.content 2238 if is_unified_llm_input(example): 2239 result = json.loads(response.content.decode("utf-8")) 2240 else: 2241 result = json.loads(response.content.decode("utf-8"))["predictions"] 2242 assert_equal(result, expected_output) 2243 2244 # make sure validate_serving_input has the same output 2245 assert convert_input_example_to_serving_input(example) == serving_input_example 2246 result = validate_serving_input(model_info.model_uri, serving_input_example) 2247 assert_equal(result, expected_output) 2248 2249 2250 def test_pyfunc_schema_inference_not_generate_trace(): 2251 # Test that the model logging call does not generate a trace. 2252 # When input example is provided, we run prediction to infer 2253 # the model signature, but it should not generate a trace. 2254 class MyModel(mlflow.pyfunc.PythonModel): 2255 @mlflow.trace() 2256 def predict(self, context, model_input): 2257 return model_input 2258 2259 with mlflow.start_run(): 2260 model_info = mlflow.pyfunc.log_model( 2261 name="test_model", 2262 python_model=MyModel(), 2263 input_example=["input"], 2264 ) 2265 2266 # No trace should be generated 2267 traces = get_traces() 2268 assert len(traces) == 0 2269 2270 # Normal prediction should emit a trace 2271 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2272 loaded_model.predict("input") 2273 traces = get_traces() 2274 assert len(traces) == 1 2275 2276 2277 @pytest.mark.parametrize( 2278 ("data", "schema"), 2279 [ 2280 ({"a": np.array([1, 2, 3])}, Schema([ColSpec(DataType.long, name="a")])), 2281 ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])), 2282 ( 2283 {"query": ["sentence_1", "sentence_2"]}, 2284 Schema([ColSpec(DataType.string, name="query")]), 2285 ), 2286 ( 2287 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2288 Schema([ 2289 ColSpec(DataType.string, name="query"), 2290 ColSpec(DataType.string, name="table"), 2291 ]), 2292 ), 2293 ( 2294 [{"query": "sentence"}, {"query": "sentence"}], 2295 Schema([ColSpec(DataType.string, name="query")]), 2296 ), 2297 ( 2298 [ 2299 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2300 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2301 ], 2302 Schema([ 2303 ColSpec(DataType.string, name="query"), 2304 ColSpec(DataType.string, name="table"), 2305 ]), 2306 ), 2307 ], 2308 ) 2309 def test_pyfunc_model_schema_enforcement_with_dicts_and_lists(data, schema): 2310 class MyModel(mlflow.pyfunc.PythonModel): 2311 def predict(self, context, model_input, params=None): 2312 return model_input 2313 2314 signature = ModelSignature(schema) 2315 with mlflow.start_run(): 2316 model_info = mlflow.pyfunc.log_model( 2317 name="test_model", 2318 python_model=MyModel(), 2319 signature=signature, 2320 ) 2321 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2322 prediction = loaded_model.predict(data) 2323 if isinstance(data, dict) and all( 2324 isinstance(x, str) or (isinstance(x, list) and all(isinstance(y, str) for y in x)) 2325 for x in data.values() 2326 ): 2327 df = pd.DataFrame([data]) 2328 else: 2329 df = pd.DataFrame(data) 2330 pd.testing.assert_frame_equal(prediction, df) 2331 2332 # Test pandas DataFrame input 2333 prediction = loaded_model.predict(df) 2334 pd.testing.assert_frame_equal(prediction, df) 2335 2336 2337 @pytest.mark.parametrize( 2338 ("data", "schema"), 2339 [ 2340 ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])), 2341 ( 2342 {"query": ["sentence_1", "sentence_2"]}, 2343 Schema([ColSpec(DataType.string, name="query")]), 2344 ), 2345 ( 2346 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2347 Schema([ 2348 ColSpec(DataType.string, name="query"), 2349 ColSpec(DataType.string, name="table"), 2350 ]), 2351 ), 2352 ], 2353 ) 2354 # `instances` is an invalid key for schema with MLflow < 2.9.0 2355 @pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"]) 2356 def test_pyfunc_model_serving_with_dicts(data, schema, format_key): 2357 class MyModel(mlflow.pyfunc.PythonModel): 2358 def predict(self, context, model_input, params=None): 2359 return model_input 2360 2361 signature = ModelSignature(schema) 2362 with mlflow.start_run(): 2363 model_info = mlflow.pyfunc.log_model( 2364 name="test_model", 2365 python_model=MyModel(), 2366 signature=signature, 2367 ) 2368 2369 df = ( 2370 pd.DataFrame([data]) 2371 if all(isinstance(x, str) for x in data.values()) 2372 else pd.DataFrame(data) 2373 ) 2374 if format_key == "inputs": 2375 payload = {format_key: data} 2376 elif format_key in ("dataframe_split", "dataframe_records"): 2377 payload = {format_key: df.to_dict(orient=format_key[10:])} 2378 2379 response = score_model_in_process( 2380 model_info.model_uri, 2381 data=json.dumps(payload), 2382 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2383 ) 2384 assert response.status_code == 200, response.content 2385 result = json.loads(response.content.decode("utf-8"))["predictions"] 2386 # This is not consistent with batch inference df 2387 pd.testing.assert_frame_equal(pd.DataFrame(result), df) 2388 2389 2390 @pytest.mark.parametrize( 2391 ("data", "schema"), 2392 [ 2393 ( 2394 [{"query": "sentence"}, {"query": "sentence"}], 2395 Schema([ColSpec(DataType.string, name="query")]), 2396 ), 2397 ( 2398 [ 2399 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2400 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2401 ], 2402 Schema([ 2403 ColSpec(DataType.string, name="query"), 2404 ColSpec(DataType.string, name="table"), 2405 ]), 2406 ), 2407 ], 2408 ) 2409 # `inputs`` is an invalid key for schema with MLflow < 2.9.0 2410 @pytest.mark.parametrize("format_key", ["instances", "dataframe_split", "dataframe_records"]) 2411 def test_pyfunc_model_serving_with_lists_of_dicts(data, schema, format_key): 2412 class MyModel(mlflow.pyfunc.PythonModel): 2413 def predict(self, context, model_input, params=None): 2414 return model_input 2415 2416 signature = ModelSignature(schema) 2417 with mlflow.start_run(): 2418 model_info = mlflow.pyfunc.log_model( 2419 name="test_model", 2420 python_model=MyModel(), 2421 signature=signature, 2422 ) 2423 2424 df = pd.DataFrame(data) 2425 if format_key == "instances": 2426 payload = {format_key: data} 2427 elif format_key in ("dataframe_split", "dataframe_records"): 2428 payload = {format_key: df.to_dict(orient=format_key[10:])} 2429 2430 response = score_model_in_process( 2431 model_info.model_uri, 2432 data=json.dumps(payload), 2433 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2434 ) 2435 assert response.status_code == 200, response.content 2436 result = json.loads(response.content.decode("utf-8"))["predictions"] 2437 pd.testing.assert_frame_equal(pd.DataFrame(result), df) 2438 2439 2440 @pytest.mark.parametrize( 2441 ("data", "schema"), 2442 [ 2443 ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])), 2444 ( 2445 {"query": ["sentence_1", "sentence_2"]}, 2446 Schema([ColSpec(Array(DataType.string), name="query")]), 2447 ), 2448 ( 2449 {"query": {"a": "a", "b": 1}}, 2450 Schema([ 2451 ColSpec( 2452 Object([Property("a", DataType.string), Property("b", DataType.long)]), 2453 "query", 2454 ) 2455 ]), 2456 ), 2457 ( 2458 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2459 Schema([ 2460 ColSpec(Array(DataType.string), name="query"), 2461 ColSpec(DataType.string, name="table"), 2462 ]), 2463 ), 2464 ( 2465 {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]}, 2466 Schema([ 2467 ColSpec( 2468 Array( 2469 Object([ 2470 Property("name", DataType.string), 2471 Property("age", DataType.long, required=False), 2472 ]) 2473 ), 2474 name="query", 2475 ), 2476 ColSpec(Array(DataType.string), name="table"), 2477 ]), 2478 ), 2479 ( 2480 [{"query": "sentence"}, {"query": "sentence"}], 2481 Schema([ColSpec(DataType.string, name="query")]), 2482 ), 2483 ( 2484 [ 2485 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2486 {"query": ["sentence_1", "sentence_2"]}, 2487 ], 2488 Schema([ 2489 ColSpec(Array(DataType.string), name="query"), 2490 ColSpec(DataType.string, name="table", required=False), 2491 ]), 2492 ), 2493 ], 2494 ) 2495 def test_pyfunc_model_schema_enforcement_with_objects_and_arrays(data, schema): 2496 class MyModel(mlflow.pyfunc.PythonModel): 2497 def load_context(self, context): 2498 self.pipeline = "pipeline" 2499 2500 def predict(self, context, model_input, params=None): 2501 assert self.pipeline == "pipeline" 2502 return model_input 2503 2504 signature = infer_signature(data) 2505 assert signature.inputs == schema 2506 pdf = pd.DataFrame(data if isinstance(data, list) else [data]) 2507 assert infer_signature(pdf).inputs == schema 2508 2509 with mlflow.start_run(): 2510 model_info = mlflow.pyfunc.log_model( 2511 name="test_model", 2512 python_model=MyModel(), 2513 signature=signature, 2514 ) 2515 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2516 prediction = loaded_model.predict(data) 2517 df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame([data]) 2518 pd.testing.assert_frame_equal(prediction, df) 2519 2520 # Test pandas DataFrame input 2521 prediction = loaded_model.predict(df) 2522 pd.testing.assert_frame_equal(prediction, df) 2523 2524 2525 @pytest.mark.parametrize( 2526 "data", 2527 [ 2528 {"query": "sentence"}, 2529 {"query": ["sentence_1", "sentence_2"]}, 2530 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2531 {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]}, 2532 [{"query": "sentence"}, {"query": "sentence"}], 2533 [ 2534 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2535 {"query": ["sentence_1", "sentence_2"]}, 2536 ], 2537 [ 2538 {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]}, 2539 {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]}, 2540 ], 2541 ], 2542 ) 2543 @pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"]) 2544 def test_pyfunc_model_scoring_with_objects_and_arrays(data, format_key): 2545 class MyModel(mlflow.pyfunc.PythonModel): 2546 def predict(self, context, model_input, params=None): 2547 return model_input 2548 2549 with mlflow.start_run(): 2550 model_info = mlflow.pyfunc.log_model( 2551 name="test_model", 2552 python_model=MyModel(), 2553 signature=infer_signature(data), 2554 ) 2555 2556 df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame([data]) 2557 2558 if format_key == "inputs": 2559 payload = {format_key: data} 2560 elif format_key == "dataframe_split": 2561 payload = {format_key: df.to_dict(orient="split")} 2562 elif format_key == "dataframe_records": 2563 payload = {format_key: df.to_dict(orient="records")} 2564 2565 response = score_model_in_process( 2566 model_info.model_uri, 2567 data=json.dumps(payload), 2568 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2569 ) 2570 assert response.status_code == 200, response.content 2571 result = json.loads(response.content.decode("utf-8"))["predictions"] 2572 expected_result = df.to_dict(orient="records") 2573 np.testing.assert_equal(result, expected_result) 2574 2575 2576 @pytest.mark.parametrize( 2577 "data", 2578 [ 2579 {"query": "sentence"}, 2580 {"query": ["sentence_1", "sentence_2"]}, 2581 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2582 {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]}, 2583 [{"query": "sentence"}, {"query": "sentence"}], 2584 ], 2585 ) 2586 def test_pyfunc_model_scoring_with_objects_and_arrays_instances(data): 2587 class MyModel(mlflow.pyfunc.PythonModel): 2588 def predict(self, context, model_input, params=None): 2589 return model_input 2590 2591 with mlflow.start_run(): 2592 model_info = mlflow.pyfunc.log_model( 2593 name="test_model", 2594 python_model=MyModel(), 2595 signature=infer_signature(data), 2596 ) 2597 2598 df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame([data]) 2599 response = score_model_in_process( 2600 model_info.model_uri, 2601 data=json.dumps({"instances": data}), 2602 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2603 ) 2604 assert response.status_code == 200, response.content 2605 result = json.loads(response.content.decode("utf-8"))["predictions"] 2606 expected_result = df.to_dict(orient="records") 2607 np.testing.assert_equal(result, expected_result) 2608 2609 2610 @pytest.mark.parametrize( 2611 "data", 2612 [ 2613 [{"query": {"a": "b"}, "name": "A"}, {"query": {"a": "c"}, "name": "B"}], 2614 [ 2615 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2616 {"query": ["sentence_1", "sentence_2"]}, 2617 ], 2618 [ 2619 {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]}, 2620 {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]}, 2621 ], 2622 ], 2623 ) 2624 def test_pyfunc_model_scoring_with_objects_and_arrays_instances_errors(data): 2625 class MyModel(mlflow.pyfunc.PythonModel): 2626 def predict(self, context, model_input, params=None): 2627 return model_input 2628 2629 with mlflow.start_run(): 2630 model_info = mlflow.pyfunc.log_model( 2631 name="test_model", 2632 python_model=MyModel(), 2633 signature=infer_signature(data), 2634 ) 2635 2636 response = score_model_in_process( 2637 model_info.model_uri, 2638 data=json.dumps({"instances": data}), 2639 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2640 ) 2641 assert response.status_code == 400, response.content 2642 assert "Failed to enforce schema" in json.loads(response.content.decode("utf-8"))["message"] 2643 2644 2645 @pytest.mark.parametrize( 2646 ("data", "schema"), 2647 [ 2648 ( 2649 [{"query": "question1"}, {"query": "question2"}], 2650 Schema([ColSpec(DataType.string, "query")]), 2651 ), 2652 ( 2653 [{"query": ["sentence_1", "sentence_2"]}, {"query": ["sentence_1", "sentence_2"]}], 2654 Schema([ColSpec(DataType.string, "query")]), 2655 ), 2656 ( 2657 [ 2658 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2659 {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2660 ], 2661 Schema([ColSpec(DataType.string, "query"), ColSpec(DataType.string, "table")]), 2662 ), 2663 ], 2664 ) 2665 def test_pyfunc_model_scoring_instances_backwards_compatibility(data, schema): 2666 class MyModel(mlflow.pyfunc.PythonModel): 2667 def predict(self, context, model_input, params=None): 2668 return model_input 2669 2670 with mlflow.start_run(): 2671 model_info = mlflow.pyfunc.log_model( 2672 name="test_model", 2673 python_model=MyModel(), 2674 signature=ModelSignature(schema), 2675 ) 2676 2677 response = score_model_in_process( 2678 model_info.model_uri, 2679 data=json.dumps({"instances": data}), 2680 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2681 ) 2682 assert response.status_code == 200, response.content 2683 result = json.loads(response.content.decode("utf-8"))["predictions"] 2684 np.testing.assert_equal(result, data) 2685 2686 2687 @pytest.mark.parametrize( 2688 ("data", "schema"), 2689 [ 2690 ( 2691 { 2692 "netsed_list": [ 2693 [["a", "b"], ["c", "d"]], 2694 [["e", "f"], ["g"]], 2695 ] 2696 }, 2697 Schema([ColSpec(Array(Array(DataType.string)), name="netsed_list")]), 2698 ), 2699 ( 2700 { 2701 "numpy_2d_array": [ 2702 np.array([[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]]) 2703 ] 2704 }, 2705 Schema([ColSpec(Array(Array(DataType.integer)), name="numpy_2d_array")]), 2706 ), 2707 ( 2708 {"list_of_np_array": [[np.array(["a", "b"])], [np.array(["c", "d"])]]}, 2709 Schema([ColSpec(Array(Array(DataType.string)), name="list_of_np_array")]), 2710 ), 2711 ], 2712 ) 2713 def test_pyfunc_model_schema_enforcement_nested_array(data, schema): 2714 class MyModel(mlflow.pyfunc.PythonModel): 2715 def predict(self, context, model_input, params=None): 2716 return model_input 2717 2718 df = pd.DataFrame.from_records(data) 2719 signature = infer_signature(df) 2720 assert signature.inputs == schema 2721 2722 with mlflow.start_run(): 2723 model_info = mlflow.pyfunc.log_model( 2724 name="test_model", 2725 python_model=MyModel(), 2726 signature=signature, 2727 ) 2728 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2729 prediction = loaded_model.predict(df) 2730 pd.testing.assert_frame_equal(prediction, df) 2731 2732 2733 @pytest.mark.parametrize( 2734 ("data", "schema"), 2735 [ 2736 ( 2737 { 2738 "simple_map": [ 2739 {"a": 3, "b": 4}, 2740 {}, 2741 {"c": 5}, 2742 ] 2743 }, 2744 Schema([ColSpec(Map(value_type=DataType.long), name="simple_map")]), 2745 ), 2746 ( 2747 { 2748 "simple_map": [ 2749 {"a": 3, "b": 4}, 2750 {}, 2751 {"c": 5}, 2752 ] 2753 }, 2754 Schema([ColSpec(Map(value_type=DataType.long))]), # Unnamed column 2755 ), 2756 ( 2757 { 2758 "nested_map": [ 2759 {"a": {"a1": 3, "a2": 4}, "b": {"b1": 5}}, 2760 {}, 2761 {"c": {}}, 2762 ] 2763 }, 2764 Schema([ColSpec(Map(value_type=Map(value_type=DataType.long)), name="nested_map")]), 2765 ), 2766 ( 2767 { 2768 "array_in_map": [ 2769 {"a": [1, 2, 3], "b": [4, 5]}, 2770 {}, 2771 {"c": []}, 2772 ] 2773 }, 2774 Schema([ColSpec(Map(value_type=Array(dtype=DataType.long)), name="array_in_map")]), 2775 ), 2776 ( 2777 { 2778 "object_in_map": [ 2779 {"a": {"key1": "a1", "key2": 1}, "b": {"key1": "b1"}}, 2780 {}, 2781 {"c": {"key1": "c1"}}, 2782 ] 2783 }, 2784 Schema([ 2785 ColSpec( 2786 Map( 2787 value_type=Object([ 2788 Property("key1", DataType.string), 2789 Property("key2", DataType.long, required=False), 2790 ]) 2791 ), 2792 name="object_in_map", 2793 ) 2794 ]), 2795 ), 2796 ( 2797 { 2798 "map_in_array": [ 2799 [{"a": 3, "b": 4}, {"c": 5}], 2800 [], 2801 [{"d": 6}], 2802 ] 2803 }, 2804 Schema([ColSpec(Array(dtype=Map(value_type=DataType.long)), name="map_in_array")]), 2805 ), 2806 ( 2807 { 2808 "map_in_object": [ 2809 {"key1": {"a": 3, "b": 4}, "key2": {"c": 5}}, 2810 {"key1": {"d": 6}}, 2811 ] 2812 }, 2813 Schema([ 2814 ColSpec( 2815 Object([ 2816 Property("key1", Map(value_type=DataType.long)), 2817 Property("key2", Map(value_type=DataType.long), required=False), 2818 ]), 2819 name="map_in_object", 2820 ) 2821 ]), 2822 ), 2823 ], 2824 ) 2825 @pytest.mark.parametrize("format_key", ["dataframe_split", "dataframe_records"]) 2826 def test_pyfunc_model_schema_enforcement_map_type(data, schema, format_key): 2827 class MyModel(mlflow.pyfunc.PythonModel): 2828 def predict(self, context, model_input, params=None): 2829 return model_input 2830 2831 df = pd.DataFrame.from_records(data) 2832 2833 with mlflow.start_run(): 2834 model_info = mlflow.pyfunc.log_model( 2835 name="test_model", 2836 python_model=MyModel(), 2837 signature=ModelSignature(inputs=schema, outputs=schema), 2838 ) 2839 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2840 prediction = loaded_model.predict(df) 2841 pd.testing.assert_frame_equal(prediction, df) 2842 2843 if format_key == "dataframe_split": 2844 payload = {format_key: df.to_dict(orient="split")} 2845 elif format_key == "dataframe_records": 2846 payload = {format_key: df.to_dict(orient="records")} 2847 2848 class CustomJsonEncoder(json.JSONEncoder): 2849 def default(self, o): 2850 import numpy as np 2851 2852 if isinstance(o, np.int64): 2853 return int(o) 2854 2855 return super().default(o) 2856 2857 response = score_model_in_process( 2858 model_info.model_uri, 2859 data=json.dumps(payload, cls=CustomJsonEncoder), 2860 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2861 ) 2862 assert response.status_code == 200, response.content 2863 result = json.loads(response.content.decode("utf-8"))["predictions"] 2864 expected_result = df.to_dict(orient="records") 2865 np.testing.assert_equal(result, expected_result) 2866 2867 2868 @pytest.mark.parametrize( 2869 ("data", "schema"), 2870 [ 2871 ( 2872 [ 2873 { 2874 "object_column": {"query": ["sentence_1", "sentence_2"], "table": "some_table"}, 2875 "string_column": "some_string", 2876 "array_column": [{"name": "value"}, {"name": "value"}], 2877 }, 2878 { 2879 "object_column": {"query": ["sentence_1", "sentence_2"]}, 2880 "string_column": "some_string", 2881 "array_column": [{"name": "value"}], 2882 }, 2883 ], 2884 Schema([ 2885 ColSpec( 2886 Object([ 2887 Property("query", Array(DataType.string)), 2888 Property("table", DataType.string, required=False), 2889 ]), 2890 "object_column", 2891 ), 2892 ColSpec(DataType.string, "string_column"), 2893 ColSpec( 2894 Array(Object([Property("name", DataType.string)])), 2895 "array_column", 2896 ), 2897 ]), 2898 ), 2899 ], 2900 ) 2901 @pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"]) 2902 def test_pyfunc_model_schema_enforcement_complex(data, schema, format_key): 2903 class MyModel(mlflow.pyfunc.PythonModel): 2904 def predict(self, context, model_input, params=None): 2905 return model_input 2906 2907 df = pd.DataFrame.from_records(data) 2908 signature = infer_signature(df) 2909 assert signature.inputs == schema 2910 2911 with mlflow.start_run(): 2912 model_info = mlflow.pyfunc.log_model( 2913 name="test_model", 2914 python_model=MyModel(), 2915 signature=signature, 2916 ) 2917 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 2918 prediction = loaded_model.predict(df) 2919 pd.testing.assert_frame_equal(prediction, df) 2920 2921 if format_key == "inputs": 2922 payload = {format_key: data} 2923 elif format_key == "dataframe_split": 2924 payload = {format_key: df.to_dict(orient="split")} 2925 elif format_key == "dataframe_records": 2926 payload = {format_key: df.to_dict(orient="records")} 2927 2928 response = score_model_in_process( 2929 model_info.model_uri, 2930 data=json.dumps(payload), 2931 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 2932 ) 2933 assert response.status_code == 200, response.content 2934 result = json.loads(response.content.decode("utf-8"))["predictions"] 2935 expected_result = df.to_dict(orient="records") 2936 np.testing.assert_equal(result, expected_result) 2937 2938 2939 def test_zero_or_one_longs_convert_to_floats(): 2940 zeros = pd.DataFrame([{"temperature": 0}, {"temperature": 0.9}, {"temperature": 1}, {}]) 2941 schema = Schema([ColSpec(DataType.double, name="temperature", required=False)]) 2942 data = _enforce_schema(zeros, schema) 2943 pd.testing.assert_series_equal( 2944 data["temperature"], pd.Series([0.0, 0.9, 1.0, np.nan], dtype=np.float64), check_names=False 2945 ) 2946 2947 2948 @pytest.mark.parametrize( 2949 ("input_example", "expected_schema", "payload_example"), 2950 [ 2951 ({"a": None}, Schema([ColSpec(type=AnyType(), name="a", required=False)]), {"a": "string"}), 2952 ( 2953 {"a": [None, []]}, 2954 Schema([ColSpec(Array(AnyType()), name="a", required=False)]), 2955 {"a": ["abc", "123"]}, 2956 ), 2957 ( 2958 {"a": [None]}, 2959 Schema([ColSpec(type=Array(AnyType()), name="a", required=False)]), 2960 {"a": ["abc"]}, 2961 ), 2962 ( 2963 {"a": [None, "string"]}, 2964 Schema([ColSpec(type=Array(DataType.string), name="a", required=False)]), 2965 {"a": ["abc"]}, 2966 ), 2967 ( 2968 {"a": {"x": None}}, 2969 Schema([ColSpec(type=Object([Property("x", AnyType(), required=False)]), name="a")]), 2970 {"a": {"x": 234}}, 2971 ), 2972 ( 2973 [ 2974 { 2975 "messages": [ 2976 { 2977 "content": "You are a helpful assistant.", 2978 "additional_kwargs": {}, 2979 "response_metadata": {}, 2980 "type": "system", 2981 "name": None, 2982 "id": None, 2983 }, 2984 { 2985 "content": "What would you like to ask?", 2986 "additional_kwargs": {}, 2987 "response_metadata": {}, 2988 "type": "ai", 2989 "name": None, 2990 "id": None, 2991 "example": False, 2992 "tool_calls": [], 2993 "invalid_tool_calls": [], 2994 "usage_metadata": None, 2995 }, 2996 { 2997 "content": "Who owns MLflow?", 2998 "additional_kwargs": {}, 2999 "response_metadata": {}, 3000 "type": "human", 3001 "name": None, 3002 "id": None, 3003 "example": False, 3004 }, 3005 ], 3006 "text": "Hello?", 3007 } 3008 ], 3009 Schema([ 3010 ColSpec( 3011 Array( 3012 Object( 3013 properties=[ 3014 Property("content", DataType.string), 3015 Property("additional_kwargs", AnyType(), required=False), 3016 Property("response_metadata", AnyType(), required=False), 3017 Property("type", DataType.string), 3018 Property("name", AnyType(), required=False), 3019 Property("id", AnyType(), required=False), 3020 Property("example", DataType.boolean, required=False), 3021 Property("tool_calls", AnyType(), required=False), 3022 Property("invalid_tool_calls", AnyType(), required=False), 3023 Property("usage_metadata", AnyType(), required=False), 3024 ] 3025 ) 3026 ), 3027 name="messages", 3028 ), 3029 ColSpec(DataType.string, name="text"), 3030 ]), 3031 [ 3032 { 3033 "messages": [ 3034 { 3035 "content": "You are a helpful assistant.", 3036 "additional_kwargs": {"x": "x"}, 3037 "response_metadata": {"y": "y"}, 3038 "type": "system", 3039 "name": "test", 3040 "id": 1234567, 3041 "tool_calls": [{"tool1": "abc"}], 3042 "invalid_tool_calls": ["tool2", "tool3"], 3043 }, 3044 ], 3045 "text": "Hello?", 3046 } 3047 ], 3048 ), 3049 ], 3050 ) 3051 def test_schema_enforcement_for_anytype(input_example, expected_schema, payload_example): 3052 class MyModel(mlflow.pyfunc.PythonModel): 3053 def predict(self, context, model_input, params=None): 3054 return model_input 3055 3056 with mlflow.start_run(): 3057 model_info = mlflow.pyfunc.log_model( 3058 name="test_model", 3059 python_model=MyModel(), 3060 input_example=input_example, 3061 ) 3062 assert model_info.signature.inputs == expected_schema 3063 loaded_model = mlflow.pyfunc.load_model(model_info.model_uri) 3064 prediction = loaded_model.predict(payload_example) 3065 df = ( 3066 pd.DataFrame(payload_example) 3067 if isinstance(payload_example, list) 3068 else pd.DataFrame([payload_example]) 3069 ) 3070 pd.testing.assert_frame_equal(prediction, df) 3071 3072 data = convert_input_example_to_serving_input(payload_example) 3073 response = score_model_in_process( 3074 model_info.model_uri, 3075 data=data, 3076 content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, 3077 ) 3078 assert response.status_code == 200, response.content 3079 result = json.loads(response.content.decode("utf-8"))["predictions"] 3080 expected_result = df.to_dict(orient="records") 3081 np.testing.assert_equal(result, expected_result)