# tests/pyfunc/test_pyfunc_schema_enforcement.py
   1  import base64
   2  import datetime
   3  import decimal
   4  import json
   5  import os
   6  import re
   7  from unittest import mock
   8  
   9  import cloudpickle
  10  import numpy as np
  11  import pandas as pd
  12  import pytest
  13  import sklearn.linear_model
  14  from packaging.version import Version
  15  
  16  import mlflow
  17  import mlflow.pyfunc.scoring_server as pyfunc_scoring_server
  18  from mlflow.exceptions import MlflowException
  19  from mlflow.models import (
  20      Model,
  21      ModelSignature,
  22      convert_input_example_to_serving_input,
  23      infer_signature,
  24  )
  25  from mlflow.models.utils import (
  26      _enforce_params_schema,
  27      _enforce_schema,
  28  )
  29  from mlflow.pyfunc import PyFuncModel
  30  from mlflow.pyfunc.scoring_server import is_unified_llm_input
  31  from mlflow.tracking.artifact_utils import _download_artifact_from_uri
  32  from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema, TensorSpec
  33  from mlflow.types.schema import AnyType, Array, Map, Object, Property
  34  from mlflow.utils.proto_json_utils import dump_input_data
  35  
  36  from tests.helper_functions import pyfunc_scoring_endpoint
  37  from tests.pyfunc.utils import score_model_in_process
  38  from tests.tracing.helper import get_traces
  39  
  40  
  41  class TestModel:
  42      @staticmethod
  43      def predict(pdf, params=None):
  44          return pdf
  45  
  46  
  47  @pytest.fixture(scope="module")
  48  def sample_params_basic():
  49      return {
  50          "str_param": "str_a",
  51          "int_param": np.int32(1),
  52          "bool_param": True,
  53          "double_param": 1.0,
  54          "float_param": np.float32(0.1),
  55          "long_param": 100,
  56          "datetime_param": np.datetime64("2023-06-26 00:00:00"),
  57          "str_list": ["a", "b", "c"],
  58          "bool_list": [True, False],
  59          "double_array": np.array([1.0, 2.0]),
  60      }
  61  
  62  
  63  @pytest.fixture(scope="module")
  64  def param_schema_basic():
  65      return ParamSchema([
  66          ParamSpec("str_param", DataType.string, "str_a", None),
  67          ParamSpec("int_param", DataType.integer, np.int32(1), None),
  68          ParamSpec("bool_param", DataType.boolean, True, None),
  69          ParamSpec("double_param", DataType.double, 1.0, None),
  70          ParamSpec("float_param", DataType.float, np.float32(0.1), None),
  71          ParamSpec("long_param", DataType.long, 100, None),
  72          ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-26 00:00:00"), None),
  73          ParamSpec("str_list", DataType.string, ["a", "b", "c"], (-1,)),
  74          ParamSpec("bool_list", DataType.boolean, [True, False], (-1,)),
  75          ParamSpec("double_array", DataType.double, [1.0, 2.0], (-1,)),
  76      ])
  77  
  78  
  79  class PythonModelWithBasicParams(mlflow.pyfunc.PythonModel):
  80      def predict(self, context, model_input, params=None):
  81          assert isinstance(params, dict)
  82          assert isinstance(params["str_param"], str)
  83          assert isinstance(params["int_param"], int)
  84          assert isinstance(params["bool_param"], bool)
  85          assert isinstance(params["double_param"], float)
  86          assert isinstance(params["float_param"], float)
  87          assert isinstance(params["long_param"], int)
  88          assert isinstance(params["datetime_param"], datetime.datetime)
  89          assert isinstance(params["str_list"], list)
  90          assert all(isinstance(x, str) for x in params["str_list"])
  91          assert isinstance(params["bool_list"], list)
  92          assert all(isinstance(x, bool) for x in params["bool_list"])
  93          assert isinstance(params["double_array"], list)
  94          assert all(isinstance(x, float) for x in params["double_array"])
  95          return params
  96  
  97  
  98  @pytest.fixture(scope="module")
  99  def sample_params_with_arrays():
 100      return {
 101          "int_array": np.array([np.int32(1), np.int32(2)]),
 102          "double_array": np.array([1.0, 2.0]),
 103          "float_array": np.array([np.float32(1.0), np.float32(2.0)]),
 104          "long_array": np.array([1, 2]),
 105          "datetime_array": np.array([
 106              np.datetime64("2023-06-26 00:00:00"),
 107              np.datetime64("2023-06-26 00:00:00"),
 108          ]),
 109      }
 110  
 111  
 112  class PythonModelWithArrayParams(mlflow.pyfunc.PythonModel):
 113      def predict(self, context, model_input, params=None):
 114          assert isinstance(params, dict)
 115          assert all(isinstance(x, int) for x in params["int_array"])
 116          assert all(isinstance(x, float) for x in params["double_array"])
 117          assert all(isinstance(x, float) for x in params["float_array"])
 118          assert all(isinstance(x, int) for x in params["long_array"])
 119          assert all(isinstance(x, datetime.datetime) for x in params["datetime_array"])
 120          return params
 121  
 122  
 123  def test_schema_enforcement_single_column_2d_array():
 124      X = np.array([[1], [2], [3]])
 125      y = np.array([1, 2, 3])
 126      model = sklearn.linear_model.LinearRegression()
 127      model.fit(X, y)
 128      signature = infer_signature(X, y)
 129      assert signature.inputs.inputs[0].shape == (-1, 1)
 130      assert signature.outputs.inputs[0].shape == (-1,)
 131  
 132      with mlflow.start_run():
 133          model_info = mlflow.sklearn.log_model(model, name="model", signature=signature)
 134  
 135      loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
 136      pdf = pd.DataFrame(X)
 137      np.testing.assert_almost_equal(loaded_model.predict(pdf), model.predict(pdf))
 138  
 139  
 140  def test_column_schema_enforcement():
 141      m = Model()
 142      input_schema = Schema([
 143          ColSpec("integer", "a"),
 144          ColSpec("long", "b"),
 145          ColSpec("float", "c"),
 146          ColSpec("double", "d"),
 147          ColSpec("boolean", "e"),
 148          ColSpec("string", "g"),
 149          ColSpec("binary", "f"),
 150          ColSpec("datetime", "h"),
 151      ])
 152      m.signature = ModelSignature(inputs=input_schema)
 153      pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
 154      pdf = pd.DataFrame(
 155          data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
 156          columns=["b", "d", "a", "c", "e", "g", "f", "h"],
 157          dtype=object,
 158      )
 159      pdf["a"] = pdf["a"].astype(np.int32)
 160      pdf["b"] = pdf["b"].astype(np.int64)
 161      pdf["c"] = pdf["c"].astype(np.float32)
 162      pdf["d"] = pdf["d"].astype(np.float64)
 163      pdf["h"] = pdf["h"].astype(np.dtype("datetime64[ns]"))
 164      # test that missing column raises
 165      match_missing_inputs = "Model is missing inputs"
 166      with pytest.raises(MlflowException, match=match_missing_inputs):
 167          res = pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
 168  
 169      # test that extra column is ignored
 170      pdf["x"] = 1
 171  
 172      # test that columns are reordered, extra column is ignored
 173      res = pyfunc_model.predict(pdf)
 174      assert all((res == pdf[input_schema.input_names()]).all())
 175  
 176      expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
 177      # MLflow datetime type in input_schema does not encode precision, so add it for assertions
 178      expected_types["h"] = np.dtype("datetime64[ns]")
 179      # object cannot be converted to pandas Strings at the moment
 180      expected_types["f"] = object
 181      expected_types["g"] = object
 182      actual_types = res.dtypes.to_dict()
 183      assert expected_types == actual_types
 184  
 185      # Test conversions
 186      # 1. long -> integer raises
 187      pdf["a"] = pdf["a"].astype(np.int64)
 188      match_incompatible_inputs = "Incompatible input types"
 189      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 190          pyfunc_model.predict(pdf)
 191      pdf["a"] = pdf["a"].astype(np.int32)
 192      # 2. integer -> long works
 193      pdf["b"] = pdf["b"].astype(np.int32)
 194      res = pyfunc_model.predict(pdf)
 195      assert all((res == pdf[input_schema.input_names()]).all())
 196      assert res.dtypes.to_dict() == expected_types
 197      pdf["b"] = pdf["b"].astype(np.int64)
 198  
 199      # 3. unsigned int -> long works
 200      pdf["b"] = pdf["b"].astype(np.uint32)
 201      res = pyfunc_model.predict(pdf)
 202      assert all((res == pdf[input_schema.input_names()]).all())
 203      assert res.dtypes.to_dict() == expected_types
 204      pdf["b"] = pdf["b"].astype(np.int64)
 205  
 206      # 4. unsigned int -> int raises
 207      pdf["a"] = pdf["a"].astype(np.uint32)
 208      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 209          pyfunc_model.predict(pdf)
 210      pdf["a"] = pdf["a"].astype(np.int32)
 211  
 212      # 5. double -> float raises
 213      pdf["c"] = pdf["c"].astype(np.float64)
 214      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 215          pyfunc_model.predict(pdf)
 216      pdf["c"] = pdf["c"].astype(np.float32)
 217  
 218      # 6. float -> double works, double -> float does not
 219      pdf["d"] = pdf["d"].astype(np.float32)
 220      res = pyfunc_model.predict(pdf)
 221      assert res.dtypes.to_dict() == expected_types
 222      pdf["d"] = pdf["d"].astype(np.float64)
 223      pdf["c"] = pdf["c"].astype(np.float64)
 224      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 225          pyfunc_model.predict(pdf)
 226      pdf["c"] = pdf["c"].astype(np.float32)
 227  
 228      # 7. int -> float raises
 229      pdf["c"] = pdf["c"].astype(np.int32)
 230      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 231          pyfunc_model.predict(pdf)
 232      pdf["c"] = pdf["c"].astype(np.float32)
 233  
 234      # 8. int -> double works
 235      pdf["d"] = pdf["d"].astype(np.int32)
 236      pyfunc_model.predict(pdf)
 237      assert all((res == pdf[input_schema.input_names()]).all())
 238      assert res.dtypes.to_dict() == expected_types
 239  
 240      # 9. long -> double raises
 241      pdf["d"] = pdf["d"].astype(np.int64)
 242      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 243          pyfunc_model.predict(pdf)
 244      pdf["d"] = pdf["d"].astype(np.float64)
 245  
 246      # 10. any float -> any int raises
 247      pdf["a"] = pdf["a"].astype(np.float32)
 248      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 249          pyfunc_model.predict(pdf)
 250      # 10. any float -> any int raises
 251      pdf["a"] = pdf["a"].astype(np.float64)
 252      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 253          pyfunc_model.predict(pdf)
 254      pdf["a"] = pdf["a"].astype(np.int32)
 255      pdf["b"] = pdf["b"].astype(np.float64)
 256      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 257          pyfunc_model.predict(pdf)
 258      pdf["b"] = pdf["b"].astype(np.int64)
 259  
 260      pdf["b"] = pdf["b"].astype(np.float64)
 261      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 262          pyfunc_model.predict(pdf)
 263      pdf["b"] = pdf["b"].astype(np.int64)
 264  
 265      # 11. objects work
 266      pdf["b"] = pdf["b"].astype(object)
 267      pdf["d"] = pdf["d"].astype(object)
 268      pdf["e"] = pdf["e"].astype(object)
 269      pdf["f"] = pdf["f"].astype(object)
 270      pdf["g"] = pdf["g"].astype(object)
 271      res = pyfunc_model.predict(pdf)
 272      assert res.dtypes.to_dict() == expected_types
 273  
 274      # 12. datetime64[D] (date only) -> datetime64[x] works
 275      pdf["h"] = pdf["h"].values.astype("datetime64[D]")
 276      res = pyfunc_model.predict(pdf)
 277      assert res.dtypes.to_dict() == expected_types
 278      pdf["h"] = pdf["h"].astype("datetime64[s]")
 279  
 280      # 13. np.ndarrays can be converted to dataframe but have no columns
 281      with pytest.raises(MlflowException, match=match_missing_inputs):
 282          pyfunc_model.predict(pdf.values)
 283  
 284      # 14. dictionaries of str -> list/nparray work,
 285      # including extraneous multi-dimensional arrays and lists
 286      arr = np.array([1, 2, 3])
 287      d = {
 288          "a": arr.astype("int32"),
 289          "b": arr.astype("int64"),
 290          "c": arr.astype("float32"),
 291          "d": arr.astype("float64"),
 292          "e": [True, False, True],
 293          "g": ["a", "b", "c"],
 294          "f": [bytes(0), bytes(1), bytes(1)],
 295          "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
 296          # Extraneous multi-dimensional numpy array should be silently dropped
 297          "i": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
 298          # Extraneous multi-dimensional list should be silently dropped
 299          "j": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
 300      }
 301      res = pyfunc_model.predict(d)
 302      assert res.dtypes.to_dict() == expected_types
 303  
 304      # 15. dictionaries of str -> list[list] fail
 305      d = {
 306          "a": [arr.astype("int32")],
 307          "b": [arr.astype("int64")],
 308          "c": [arr.astype("float32")],
 309          "d": [arr.astype("float64")],
 310          "e": [[True, False, True]],
 311          "g": np.array([["a", "b", "c"]]),
 312          "f": [[bytes(0), bytes(1), bytes(1)]],
 313          "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
 314      }
 315      with pytest.raises(MlflowException, match=match_incompatible_inputs):
 316          pyfunc_model.predict(d)
 317  
 318      # 16. conversion to dataframe fails
 319      d = {
 320          "a": [1],
 321          "b": [1, 2],
 322          "c": [1, 2, 3],
 323      }
 324      with pytest.raises(
 325          MlflowException,
 326          match="This model contains a column-based signature, which suggests a DataFrame input.",
 327      ):
 328          pyfunc_model.predict(d)
 329  
 330      # 17. conversion from Decimal to float is allowed since numpy currently has no support for the
 331      #  data type.
 332      pdf["d"] = [decimal.Decimal(1.0)]
 333      res = pyfunc_model.predict(pdf)
 334      assert res.dtypes.to_dict() == expected_types
 335  
 336  
 337  def _compare_exact_tensor_dict_input(d1, d2):
 338      """Return whether two dicts of np arrays are exactly equal"""
 339      if d1.keys() != d2.keys():
 340          return False
 341      return all(np.array_equal(d1[key], d2[key]) for key in d1)
 342  
 343  
def test_tensor_multi_named_schema_enforcement():
    """Schema enforcement for a signature with three named tensor inputs.

    Covers missing/extra keys, variable (-1) axes, strict dtype and shape
    matching (no casting), and rejection of list, dict-of-list, and
    DataFrame payloads that do not fit the tensor specs.
    """
    m = Model()
    input_schema = Schema([
        TensorSpec(np.dtype(np.uint64), (-1, 5), "a"),
        TensorSpec(np.dtype(np.short), (-1, 2), "b"),
        TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short),
        "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32),
    }

    # test that missing column raises
    # NOTE: `inp1.pop("b")` hands the popped *array* (not the dict) to
    # predict, so named inputs are missing either way.
    inp1 = inp.copy()
    with pytest.raises(MlflowException, match="Model is missing inputs"):
        pyfunc_model.predict(inp1.pop("b"))

    # test that extra column is ignored
    inp2 = inp.copy()
    inp2["x"] = 1

    # test that extra column is removed
    res = pyfunc_model.predict(inp2)
    # NOTE(review): dict equality over ndarray values relies on the returned
    # arrays being the identical objects as the inputs (identity fast-path);
    # confirm if this assertion ever becomes flaky.
    assert res == {k: v for k, v in inp.items() if k in {"a", "b", "c"}}
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that variable axes are supported
    inp3 = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32),
    }
    res = pyfunc_model.predict(inp3)
    assert _compare_exact_tensor_dict_input(res, inp3)
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that type casting is not supported
    inp4 = inp.copy()
    inp4["a"] = inp4["a"].astype(np.int32)
    with pytest.raises(
        MlflowException, match="dtype of input int32 does not match expected dtype uint64"
    ):
        pyfunc_model.predict(inp4)

    # test wrong shape
    inp5 = {
        "a": np.array([[0, 0, 0, 0]], dtype=np.uint),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]]], dtype=np.float32),
    }
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input (1, 4) does not match expected shape (-1, 5)"),
    ):
        pyfunc_model.predict(inp5)

    # test non-dictionary input
    inp6 = [
        np.array([[0, 0, 0, 0, 0]], dtype=np.uint64),
        np.array([[0, 0], [1, 1]], dtype=np.short),
        np.array([[[0, 0]]], dtype=np.float32),
    ]
    with pytest.raises(
        MlflowException, match=re.escape("Model is missing inputs ['a', 'b', 'c'].")
    ):
        pyfunc_model.predict(inp6)

    # test empty ndarray does not work
    inp7 = inp.copy()
    inp7["a"] = np.array([])
    with pytest.raises(
        MlflowException, match=re.escape("Shape of input (0,) does not match expected shape")
    ):
        pyfunc_model.predict(inp7)

    # test dictionary of str -> list does not work
    inp8 = {k: list(v) for k, v in inp.items()}
    match = (
        r"This model contains a tensor-based model signature with input names.+"
        r"suggests a dictionary input mapping input name to a numpy array, but a dict"
        r" with value type <class 'list'> was found"
    )
    with pytest.raises(MlflowException, match=match):
        pyfunc_model.predict(inp8)

    # test dataframe input fails at shape enforcement
    # (scalar columns can only satisfy (-1,) or (-1, 1) tensor specs)
    pdf = pd.DataFrame(data=[[1, 2, 3]], columns=["a", "b", "c"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    pdf["b"] = pdf["b"].astype(np.short)
    pdf["c"] = pdf["c"].astype(np.float32)
    with pytest.raises(
        MlflowException,
        match=re.escape(
            "The input pandas dataframe column 'a' contains scalar values, which requires the "
            "shape to be (-1,) or (-1, 1), but got tensor spec shape of (-1, 5)"
        ),
    ):
        pyfunc_model.predict(pdf)
 449  
 450  
 451  def test_schema_enforcement_single_named_tensor_schema():
 452      m = Model()
 453      input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2, 3), "a")])
 454      m.signature = ModelSignature(inputs=input_schema)
 455      pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
 456      input_array = np.array(range(12), dtype=np.uint64).reshape((2, 2, 3))
 457      inp = {
 458          "a": input_array,
 459      }
 460  
 461      # sanity test that dictionary with correct input works
 462      res = pyfunc_model.predict(inp)
 463      assert res == inp
 464      expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
 465      actual_types = {k: v.dtype for k, v in res.items()}
 466      assert expected_types == actual_types
 467  
 468      # test single np.ndarray input works and is converted to dictionary
 469      res = pyfunc_model.predict(inp["a"])
 470      assert res == inp
 471      expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
 472      actual_types = {k: v.dtype for k, v in res.items()}
 473      assert expected_types == actual_types
 474  
 475      # test list does not work
 476      with pytest.raises(MlflowException, match="Model is missing inputs"):
 477          pyfunc_model.predict(input_array.tolist())
 478  
 479  
def test_schema_enforcement_single_unnamed_tensor_schema():
    """An unnamed (-1, 3) uint64 tensor signature accepts a raw ndarray or a
    3-column DataFrame, and rejects a DataFrame whose column count does not
    match the expected shape.
    """
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 3))])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    input_array = np.array(range(6), dtype=np.uint64).reshape((2, 3))

    # test single np.ndarray input works and is converted to dictionary
    res = pyfunc_model.predict(input_array)
    np.testing.assert_array_equal(res, input_array)
    expected_types = input_schema.input_types()[0]
    assert expected_types == res.dtype

    # A DataFrame with a matching number of columns is accepted and converted
    # back to the expected tensor.
    input_df = pd.DataFrame(input_array, columns=["c1", "c2", "c3"])
    res = pyfunc_model.predict(input_df)
    np.testing.assert_array_equal(res, input_array)
    assert expected_types == res.dtype

    # Dropping a column makes the DataFrame incompatible with shape (-1, 3).
    input_df = input_df.drop("c3", axis=1)
    with pytest.raises(
        expected_exception=MlflowException,
        match=re.escape(
            "This model contains a model signature with an unnamed input. Since the "
            "input data is a pandas DataFrame containing multiple columns, "
            "the input shape must be of the structure "
            "(-1, number_of_dataframe_columns). "
            "Instead, the input DataFrame passed had 2 columns and "
            "an input shape of (-1, 3) with all values within the "
            "DataFrame of scalar type. Please adjust the passed in DataFrame to "
            "match the expected structure",
        ),
    ):
        pyfunc_model.predict(input_df)
 514  
 515  
 516  def test_schema_enforcement_named_tensor_schema_1d():
 517      m = Model()
 518      input_schema = Schema([
 519          TensorSpec(np.dtype(np.uint64), (-1,), "a"),
 520          TensorSpec(np.dtype(np.float32), (-1,), "b"),
 521      ])
 522      m.signature = ModelSignature(inputs=input_schema)
 523      pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
 524      pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
 525      pdf["a"] = pdf["a"].astype(np.uint64)
 526      pdf["b"] = pdf["a"].astype(np.float32)
 527      d_inp = {
 528          "a": np.array(pdf["a"], dtype=np.uint64),
 529          "b": np.array(pdf["b"], dtype=np.float32),
 530      }
 531  
 532      # test dataframe input works for 1d tensor specs and input is converted to dict
 533      res = pyfunc_model.predict(pdf)
 534      assert _compare_exact_tensor_dict_input(res, d_inp)
 535      expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
 536      actual_types = {k: v.dtype for k, v in res.items()}
 537      assert expected_types == actual_types
 538  
 539      wrong_m = Model()
 540      wrong_m.signature = ModelSignature(
 541          inputs=Schema([
 542              TensorSpec(np.dtype(np.uint64), (-1, 2), "a"),
 543              TensorSpec(np.dtype(np.float32), (-1,), "b"),
 544          ])
 545      )
 546      wrong_pyfunc_model = PyFuncModel(model_meta=wrong_m, model_impl=TestModel())
 547      with pytest.raises(
 548          expected_exception=MlflowException,
 549          match=re.escape(
 550              "The input pandas dataframe column 'a' contains scalar "
 551              "values, which requires the shape to be (-1,) or (-1, 1), but got tensor spec "
 552              "shape of (-1, 2)."
 553          ),
 554      ):
 555          wrong_pyfunc_model.predict(pdf)
 556  
 557      wrong_m.signature.inputs = Schema([
 558          TensorSpec(np.dtype(np.uint64), (2, -1), "a"),
 559          TensorSpec(np.dtype(np.float32), (-1,), "b"),
 560      ])
 561      with pytest.raises(
 562          expected_exception=MlflowException,
 563          match=re.escape(
 564              "For pandas dataframe input, the first dimension of shape must be a variable "
 565              "dimension and other dimensions must be fixed, but in model signature the shape "
 566              "of input a is (2, -1)."
 567          ),
 568      ):
 569          wrong_pyfunc_model.predict(pdf)
 570  
 571      # test that dictionary works too
 572      res = pyfunc_model.predict(d_inp)
 573      assert res == d_inp
 574      expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
 575      actual_types = {k: v.dtype for k, v in res.items()}
 576      assert expected_types == actual_types
 577  
 578  
def test_schema_enforcement_named_tensor_schema_multidimensional():
    """Multidimensional named tensor specs: DataFrame rows holding flattened
    lists (or numpy arrays) are reshaped to the spec shapes; rows that cannot
    be reshaped raise; dict-of-array input passes through.
    """
    m = Model()
    input_schema = Schema([
        TensorSpec(np.dtype(np.uint64), (-1, 2, 3), "a"),
        TensorSpec(np.dtype(np.float32), (-1, 3, 4), "b"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    data_a = np.array(range(12), dtype=np.uint64)
    data_b = np.array(range(24), dtype=np.float32) + 10.0
    pdf = pd.DataFrame({
        "a": data_a.reshape(-1, 2 * 3).tolist(),
        "b": data_b.reshape(-1, 3 * 4).tolist(),
    })
    d_inp = {
        "a": data_a.reshape((-1, 2, 3)),
        "b": data_b.reshape((-1, 3, 4)),
    }

    # DataFrame rows holding flattened *lists* are reshaped to the spec
    # shapes and the input is converted to a dict of arrays.
    res = pyfunc_model.predict(pdf)
    assert _compare_exact_tensor_dict_input(res, d_inp)

    # Same as above, but with each row holding a flattened *numpy array*.
    pdf_contains_numpy_array = pd.DataFrame({
        "a": list(data_a.reshape(-1, 2 * 3)),
        "b": list(data_b.reshape(-1, 3 * 4)),
    })
    res = pyfunc_model.predict(pdf_contains_numpy_array)
    assert _compare_exact_tensor_dict_input(res, d_inp)

    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # Rows of length 8 cannot be reshaped to (-1, 2, 3), so enforcement raises.
    with pytest.raises(
        expected_exception=MlflowException,
        match=re.escape(
            "The value in the Input DataFrame column 'a' could not be converted to the expected "
            "shape of: '(-1, 2, 3)'. Ensure that each of the input list elements are of uniform "
            "length and that the data can be coerced to the tensor type 'uint64'"
        ),
    ):
        pyfunc_model.predict(
            pdf.assign(a=np.array(range(16), dtype=np.uint64).reshape(-1, 8).tolist())
        )

    # test that dictionary works too
    res = pyfunc_model.predict(d_inp)
    assert res == d_inp
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
 632  
 633  
 634  def test_missing_value_hint_is_displayed_when_it_should():
 635      m = Model()
 636      input_schema = Schema([ColSpec("integer", "a")])
 637      m.signature = ModelSignature(inputs=input_schema)
 638      pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
 639      pdf = pd.DataFrame(data=[[1], [None]], columns=["a"])
 640      match = "Incompatible input types"
 641      with pytest.raises(MlflowException, match=match) as ex:
 642          pyfunc_model.predict(pdf)
 643      hint = "Hint: the type mismatch is likely caused by missing values."
 644      assert hint in str(ex.value.message)
 645      pdf = pd.DataFrame(data=[[1.5], [None]], columns=["a"])
 646      with pytest.raises(MlflowException, match=match) as ex:
 647          pyfunc_model.predict(pdf)
 648      assert hint not in str(ex.value.message)
 649      pdf = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64)
 650      with pytest.raises(MlflowException, match=match) as ex:
 651          pyfunc_model.predict(pdf)
 652      assert hint not in str(ex.value.message)
 653  
 654  
def test_column_schema_enforcement_no_col_names():
    """With unnamed ColSpecs, enforcement matches columns positionally and
    accepts lists, DataFrames (with or without names), ndarrays, and dicts
    of columns, while rejecting wrong arity, unsafe dtypes, and
    non-tabular input.
    """
    m = Model()
    input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]

    # Can call with just a list
    pd.testing.assert_frame_equal(pyfunc_model.predict(test_data), pd.DataFrame(test_data))

    # Or can call with a DataFrame without column names
    pd.testing.assert_frame_equal(
        pyfunc_model.predict(pd.DataFrame(test_data)), pd.DataFrame(test_data)
    )

    # Or can call with a np.ndarray
    pd.testing.assert_frame_equal(
        pyfunc_model.predict(pd.DataFrame(test_data).values), pd.DataFrame(test_data)
    )

    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    pd.testing.assert_frame_equal(pyfunc_model.predict(pdf), pdf)

    # Must provide the right number of arguments
    with pytest.raises(MlflowException, match="the provided value only has 2 inputs."):
        pyfunc_model.predict([[1.0, 2.0]])

    # Must provide the right types
    # (int64 -> float64 is rejected as not value-exact by numpy's safe-cast rules)
    with pytest.raises(MlflowException, match="Can not safely convert int64 to float64"):
        pyfunc_model.predict([[1, 2, 3]])

    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException, match="Expected input to be DataFrame. Found: set"):
        pyfunc_model.predict({1, 2, 3})

    # Dictionaries of str -> list/nparray also work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    pd.testing.assert_frame_equal(pyfunc_model.predict(d), pd.DataFrame(d))
 694  
 695  
def test_tensor_schema_enforcement_no_col_names():
    """An unnamed (-1, 3) float32 TensorSpec accepts ndarrays and DataFrames
    of matching shape/dtype, and rejects lists, dicts, wrong shapes, and
    wrong dtypes.
    """
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 3))])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)

    # Can call with numpy array of correct shape
    np.testing.assert_array_equal(pyfunc_model.predict(test_data), test_data)

    # Or can call with a dataframe
    np.testing.assert_array_equal(pyfunc_model.predict(pd.DataFrame(test_data)), test_data)

    # Can not call with a list
    with pytest.raises(
        MlflowException,
        match="This model contains a tensor-based model signature with no input names",
    ):
        pyfunc_model.predict([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

    # Can not call with a dict
    with pytest.raises(
        MlflowException,
        match="This model contains a tensor-based model signature with no input names",
    ):
        pyfunc_model.predict({"blah": test_data})

    # Can not call with a np.ndarray of a wrong shape
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input (2, 2) does not match expected shape (-1, 3)"),
    ):
        pyfunc_model.predict(np.array([[1.0, 2.0], [4.0, 5.0]]))

    # Can not call with a np.ndarray of a wrong type
    with pytest.raises(
        MlflowException, match="dtype of input uint32 does not match expected dtype float32"
    ):
        pyfunc_model.predict(test_data.astype(np.uint32))

    # Can call with a np.ndarray with more elements along variable axis
    test_data2 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=np.float32)
    np.testing.assert_array_equal(pyfunc_model.predict(test_data2), test_data2)

    # Can not call with an empty ndarray
    # (np.ndarray([]) constructs a 0-d array, hence the reported shape ())
    with pytest.raises(
        MlflowException, match=re.escape("Shape of input () does not match expected shape (-1, 3)")
    ):
        pyfunc_model.predict(np.ndarray([]))
 745  
 746  
 747  @pytest.mark.parametrize("orient", ["records"])
 748  def test_schema_enforcement_for_inputs_style_orientation_of_dataframe(orient):
 749      # Test Dict[str, List[Any]]
 750      test_signature = {
 751          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 752          "outputs": '[{"name": "response", "type": "string"}]',
 753      }
 754      signature = ModelSignature.from_dict(test_signature)
 755      data = {"a": [4, 5, 6], "b": ["a", "b", "c"]}
 756      pd_data = pd.DataFrame(data)
 757      check = _enforce_schema(data, signature.inputs)
 758      pd.testing.assert_frame_equal(check, pd_data)
 759      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 760      pd.testing.assert_frame_equal(pd_check, pd_data)
 761  
 762      # Test Dict[str, str]
 763      test_signature = {
 764          "inputs": '[{"name": "a", "type": "string"}]',
 765          "outputs": '[{"name": "response", "type": "string"}]',
 766      }
 767      signature = ModelSignature.from_dict(test_signature)
 768      data = {"a": "Hi there!"}
 769      pd_data = pd.DataFrame([data])
 770      check = _enforce_schema(data, signature.inputs)
 771      pd.testing.assert_frame_equal(check, pd_data)
 772      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 773      pd.testing.assert_frame_equal(pd_check, pd_data)
 774  
 775      # Test List[Dict[str, Union[str, List[str]]]]
 776      test_signature = {
 777          "inputs": '[{"name": "query", "type": "string"}, {"name": "inputs", "type": "string"}]',
 778      }
 779      signature = ModelSignature.from_dict(test_signature)
 780      data = [{"query": ["test_query1", "test_query2"], "inputs": "test input"}]
 781      pd_data = pd.DataFrame(data)
 782      check = _enforce_schema(data, signature.inputs)
 783      pd.testing.assert_frame_equal(check, pd_data)
 784      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 785      pd.testing.assert_frame_equal(pd_check, pd_data)
 786  
 787      # Test List[str]
 788      test_signature = {
 789          "inputs": '[{"type": "string"}]',
 790          "outputs": '[{"name": "response", "type": "string"}]',
 791      }
 792      signature = ModelSignature.from_dict(test_signature)
 793      data = ["a", "b", "c"]
 794      pd_data = pd.DataFrame(data)
 795      check = _enforce_schema(data, signature.inputs)
 796      pd.testing.assert_frame_equal(check, pd_data)
 797      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 798      pd.testing.assert_frame_equal(pd_check, pd_data)
 799  
 800      # Test Dict[str, np.ndarray]
 801      test_signature = {
 802          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 803          "outputs": '[{"name": "response", "type": "string"}]',
 804      }
 805      signature = ModelSignature.from_dict(test_signature)
 806      data = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])}
 807      pd_data = pd.DataFrame(data)
 808      check = _enforce_schema(data, signature.inputs)
 809      pd.testing.assert_frame_equal(check, pd_data)
 810      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 811      pd.testing.assert_frame_equal(pd_check, pd_data)
 812  
 813      # Test Dict[str, <scalar>] (support added in MLflow 2.3.0)
 814      test_signature = {
 815          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 816          "outputs": '[{"name": "response", "type": "string"}]',
 817      }
 818      signature = ModelSignature.from_dict(test_signature)
 819      data = {"a": 12, "b": "a"}
 820      pd_data = pd.DataFrame([data])
 821      check = _enforce_schema(data, signature.inputs)
 822      pd.testing.assert_frame_equal(check, pd_data)
 823      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 824      pd.testing.assert_frame_equal(pd_check, pd_data)
 825  
 826      # Test Dict[str, np.ndarray] where array.size == 1
 827      test_signature = {
 828          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 829          "outputs": '[{"name": "response", "type": "string"}]',
 830      }
 831      signature = ModelSignature.from_dict(test_signature)
 832      data = {"a": np.array([12]), "b": np.array(["a"])}
 833      pd_data = pd.DataFrame(data)
 834      check = _enforce_schema(data, signature.inputs)
 835      pd.testing.assert_frame_equal(check, pd_data)
 836      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 837      pd.testing.assert_frame_equal(pd_check, pd_data)
 838  
 839      # Test Dict[str, np.ndarray] where primitives are supplied
 840      test_signature = {
 841          "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]',
 842          "outputs": '[{"name": "response", "type": "string"}]',
 843      }
 844      signature = ModelSignature.from_dict(test_signature)
 845      # simulates the structure that model serving will convert the data to when using
 846      # a Dict[str, str] with a scalar singular value string
 847      data = {"a": np.array("a"), "b": np.array("b")}
 848      pd_data = pd.DataFrame([data])
 849      check = _enforce_schema(data, signature.inputs)
 850      pd.testing.assert_frame_equal(check, pd_data)
 851      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 852      pd.testing.assert_frame_equal(pd_check, pd_data)
 853  
 854      # Assert that the Dict[str, np.ndarray] casing with primitive does not work on anything
 855      # but a single string.
 856      test_signature = {
 857          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "long"}]',
 858          "outputs": '[{"name": "response", "type": "string"}]',
 859      }
 860      signature = ModelSignature.from_dict(test_signature)
 861      data = {"a": np.array(1), "b": np.array(2)}
 862      pd_data = pd.DataFrame([data])
 863      # Schema enforcement explicitly only provides support for strings that meet primitives in
 864      # np.arrays criteria. All other data types should fail.
 865      with pytest.raises(MlflowException, match="This model contains a column-based"):
 866          _enforce_schema(data, signature.inputs)
 867      with pytest.raises(MlflowException, match="Incompatible input types for column a. Can not"):
 868          _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 869  
 870      # Test bytes
 871      test_signature = {
 872          "inputs": '[{"name": "audio", "type": "binary"}]',
 873          "outputs": '[{"name": "response", "type": "string"}]',
 874      }
 875      signature = ModelSignature.from_dict(test_signature)
 876      data = {"audio": b"Hi I am a bytes string"}
 877      pd_data = pd.DataFrame([data])
 878      check = _enforce_schema(data, signature.inputs)
 879      pd.testing.assert_frame_equal(check, pd_data)
 880      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 881      pd.testing.assert_frame_equal(pd_check, pd_data)
 882  
 883      # Test base64 encoded
 884      test_signature = {
 885          "inputs": '[{"name": "audio", "type": "binary"}]',
 886          "outputs": '[{"name": "response", "type": "string"}]',
 887      }
 888      signature = ModelSignature.from_dict(test_signature)
 889      data = {"audio": base64.b64encode(b"Hi I am a bytes string").decode("ascii")}
 890      pd_data = pd.DataFrame([data])
 891      check = _enforce_schema(data, signature.inputs)
 892      pd.testing.assert_frame_equal(check, pd_data)
 893      pd_check = _enforce_schema(pd_data.to_dict(orient=orient), signature.inputs)
 894      pd.testing.assert_frame_equal(pd_check, pd_data)
 895  
 896  
 897  def test_schema_enforcement_for_optional_columns():
 898      input_schema = Schema([
 899          ColSpec("double", "a"),
 900          ColSpec("double", "b"),
 901          ColSpec("string", "c", required=False),
 902          ColSpec("long", "d", required=False),
 903      ])
 904      signature = ModelSignature(inputs=input_schema)
 905      test_data_with_all_cols = {"a": [1.0], "b": [1.0], "c": ["something"], "d": [2]}
 906      test_data_with_only_required_cols = {"a": [1.0], "b": [1.0]}
 907      test_data_with_one_optional_col = {"a": [1.0], "b": [1.0], "d": [2]}
 908  
 909      for data in [
 910          test_data_with_all_cols,
 911          test_data_with_only_required_cols,
 912          test_data_with_one_optional_col,
 913      ]:
 914          pd_data = pd.DataFrame(data)
 915          check = _enforce_schema(pd_data, signature.inputs)
 916          pd.testing.assert_frame_equal(check, pd_data)
 917  
 918      # Ensure wrong data type for optional column throws
 919      test_bad_data = {"a": [1.0], "b": [1.0], "d": ["not the right type"]}
 920      pd_data = pd.DataFrame(test_bad_data)
 921      with pytest.raises(MlflowException, match="Incompatible input types for column d."):
 922          _enforce_schema(pd_data, signature.inputs)
 923  
 924      # Ensure it still validates for required columns
 925      test_missing_required = {"b": [2.0], "c": ["something"]}
 926      pd_data = pd.DataFrame(test_missing_required)
 927      with pytest.raises(MlflowException, match="Model is missing inputs"):
 928          _enforce_schema(pd_data, signature.inputs)
 929  
 930  
 931  def test_schema_enforcement_for_list_inputs_back_compatibility_check():
 932      # Test Dict[str, scalar or List[str]]
 933      test_signature = {
 934          "inputs": '[{"name": "prompt", "type": "string"}, {"name": "stop", "type": "string"}]',
 935          "outputs": '[{"type": "string"}]',
 936      }
 937      signature = ModelSignature.from_dict(test_signature)
 938      data = {"prompt": "this is the prompt", "stop": ["a", "b"]}
 939      pd_data = pd.DataFrame([data])
 940      check = _enforce_schema(data, signature.inputs)
 941      pd.testing.assert_frame_equal(check, pd_data)
 942  
 943      # Test Dict[str, List[str]]
 944      test_signature = {
 945          "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]',
 946          "outputs": '[{"name": "response", "type": "string"}]',
 947      }
 948      signature = ModelSignature.from_dict(test_signature)
 949      data = {"a": ["Hi there!"], "b": ["Hello there", "Bye!"]}
 950      pd_data = pd.DataFrame([data])
 951      check = _enforce_schema(data, signature.inputs)
 952      pd.testing.assert_frame_equal(check, pd_data)
 953  
 954      # Test Dict[str, List[binary]] with bytes
 955      test_signature = {
 956          "inputs": '[{"name": "audio", "type": "binary"}]',
 957          "outputs": '[{"name": "response", "type": "string"}]',
 958      }
 959      signature = ModelSignature.from_dict(test_signature)
 960      data = {"audio": [b"Hi I am a bytes string"]}
 961      pd_data = pd.DataFrame([data])
 962      pd_check = _enforce_schema(pd_data, signature.inputs)
 963      pd.testing.assert_frame_equal(pd_check, pd_data)
 964  
 965      # Test Dict[str, List[binary]] with base64 encoded
 966      test_signature = {
 967          "inputs": '[{"name": "audio", "type": "binary"}]',
 968          "outputs": '[{"name": "response", "type": "string"}]',
 969      }
 970      signature = ModelSignature.from_dict(test_signature)
 971      data = {"audio": [base64.b64encode(b"Hi I am a bytes string").decode("ascii")]}
 972      pd_data = pd.DataFrame([data])
 973      pd_check = _enforce_schema(pd_data, signature.inputs)
 974      pd.testing.assert_frame_equal(pd_check, pd_data)
 975  
 976      # Test Dict[str, List[Any]]
 977      test_signature = {
 978          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 979          "outputs": '[{"name": "response", "type": "string"}]',
 980      }
 981      signature = ModelSignature.from_dict(test_signature)
 982      data = {"a": [4, 5, 6], "b": ["a", "b", "c"]}
 983      pd_data = pd.DataFrame(data)
 984      pd_check = _enforce_schema(data, signature.inputs)
 985      pd.testing.assert_frame_equal(pd_check, pd_data)
 986  
 987      # Test Dict[str, np.ndarray]
 988      test_signature = {
 989          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
 990          "outputs": '[{"name": "response", "type": "string"}]',
 991      }
 992      signature = ModelSignature.from_dict(test_signature)
 993      data = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])}
 994      pd_data = pd.DataFrame(data)
 995      pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs)
 996      pd.testing.assert_frame_equal(pd_check, pd_data)
 997  
 998      # Test Dict[str, np.ndarray] where array.size == 1
 999      test_signature = {
1000          "inputs": '[{"name": "a", "type": "long"}, {"name": "b", "type": "string"}]',
1001          "outputs": '[{"name": "response", "type": "string"}]',
1002      }
1003      signature = ModelSignature.from_dict(test_signature)
1004      data = {"a": np.array([12]), "b": np.array(["a"])}
1005      pd_data = pd.DataFrame(data)
1006      pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs)
1007      pd.testing.assert_frame_equal(pd_check, pd_data)
1008  
1009      # Test Dict[str, np.ndarray] where primitives are supplied
1010      test_signature = {
1011          "inputs": '[{"name": "a", "type": "string"}, {"name": "b", "type": "string"}]',
1012          "outputs": '[{"name": "response", "type": "string"}]',
1013      }
1014      signature = ModelSignature.from_dict(test_signature)
1015      # simulates the structure that model serving will convert the data to when using
1016      # a Dict[str, str] with a scalar singular value string
1017      data = {"a": np.array("a"), "b": np.array("b")}
1018      pd_data = pd.DataFrame([data])
1019      pd_check = _enforce_schema(pd_data.to_dict(orient="list"), signature.inputs)
1020      pd.testing.assert_frame_equal(pd_check, pd_data)
1021  
1022  
def test_schema_enforcement_for_list_inputs():
    """Array-typed signatures: list payloads infer the expected signature and enforce
    to DataFrames; tensor signatures hand ndarray payloads back unchanged."""
    expected_output = "this is the output"

    # Dict[str, scalar or List[str]]
    signature = ModelSignature.from_dict({
        "inputs": '[{"type": "string", "name": "prompt", "required": true}, '
        '{"type": "array", "items": {"type": "string"}, '
        '"name": "stop", "required": true}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"prompt": "this is the prompt", "stop": ["a", "b"]}
    assert signature == infer_signature(payload, expected_output)
    pd.testing.assert_frame_equal(
        _enforce_schema(payload, signature.inputs), pd.DataFrame([payload])
    )

    # Dict[str, List[str]]
    signature = ModelSignature.from_dict({
        "inputs": '[{"type": "array", "items": {"type": "string"}, '
        '"name": "a", "required": true}, '
        '{"type": "array", "items": {"type": "string"}, '
        '"name": "b", "required": true}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"a": ["Hi there!"], "b": ["Hello there", "Bye!"]}
    assert signature == infer_signature(payload, expected_output)
    pd.testing.assert_frame_equal(
        _enforce_schema(payload, signature.inputs), pd.DataFrame([payload])
    )

    # Dict[str, List[binary]] with raw bytes
    signature = ModelSignature.from_dict({
        "inputs": '[{"type": "array", "items": {"type": "binary"}, '
        '"name": "audio", "required": true}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"audio": [b"Hi I am a bytes string"]}
    assert signature == infer_signature(payload, expected_output)
    pd.testing.assert_frame_equal(
        _enforce_schema(payload, signature.inputs), pd.DataFrame([payload])
    )

    # Dict[str, List[binary]] with base64-encoded bytes (left encoded, not decoded)
    signature = ModelSignature.from_dict({
        "inputs": '[{"type": "array", "items": {"type": "binary"}, '
        '"name": "audio", "required": true}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"audio": [base64.b64encode(b"Hi I am a bytes string")]}
    assert signature == infer_signature(payload, expected_output)
    pd.testing.assert_frame_equal(
        _enforce_schema(payload, signature.inputs), pd.DataFrame([payload])
    )

    # Dict[str, List[Any]]
    signature = ModelSignature.from_dict({
        "inputs": '[{"type": "array", "items": {"type": "long"}, '
        '"name": "a", "required": true}, '
        '{"type": "array", "items": {"type": "string"}, '
        '"name": "b", "required": true}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"a": [4, 5, 6], "b": ["a", "b", "c"]}
    assert signature == infer_signature(payload, expected_output)
    pd.testing.assert_frame_equal(
        _enforce_schema(payload, signature.inputs), pd.DataFrame([payload])
    )

    # Dict[str, np.ndarray]: tensor signatures return the payload untouched
    signature = ModelSignature.from_dict({
        "inputs": '[{"name": "a", "type": "tensor", "tensor-spec": '
        '{"dtype": "int64", "shape": [-1]}}, '
        '{"name": "b", "type": "tensor", "tensor-spec": '
        '{"dtype": "str", "shape": [-1]}}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"a": np.array([1, 2, 3]), "b": np.array(["a", "b", "c"])}
    assert _enforce_schema(payload, signature.inputs) == payload

    # Dict[str, np.ndarray] where array.size == 1
    signature = ModelSignature.from_dict({
        "inputs": '[{"name": "a", "type": "tensor", "tensor-spec": '
        '{"dtype": "int64", "shape": [-1]}}, '
        '{"name": "b", "type": "tensor", "tensor-spec": '
        '{"dtype": "str", "shape": [-1]}}]',
        "outputs": '[{"type": "string", "required": true}]',
    })
    payload = {"a": np.array([12]), "b": np.array(["a"])}
    assert _enforce_schema(payload, signature.inputs) == payload
1120  
1121  
def test_enforce_schema_warns_with_extra_fields():
    """Inputs absent from the schema are dropped, with exactly one warning logged."""
    schema = Schema([ColSpec("string", "a")])
    with mock.patch("mlflow.models.utils._logger.warning") as warn:
        _enforce_schema({"a": "hi", "b": "bye"}, schema)
    # The mock retains its call record after the patch exits.
    warn.assert_called_once_with(
        "Found extra inputs in the model input that are not defined in the model "
        "signature: `['b']`. These inputs will be ignored."
    )
1130  
1131  
def test_enforce_params_schema_with_success():
    """Validate `_enforce_params_schema` happy paths: exact-type params pass through,
    numeric types widen (int -> long/float/double, long -> float/double,
    float -> double), strings convert to datetime, missing params get schema
    defaults, unknown params are dropped with a warning, and non-string keys are
    stringified."""
    # Correct parameters & schema: every value already matches its declared type,
    # so enforcement returns the params unchanged.
    test_parameters = {
        "str_param": "str_a",
        "int_param": np.int32(1),
        "bool_param": True,
        "double_param": 1.0,
        "float_param": np.float32(0.1),
        "long_param": 100,
        "datetime_param": np.datetime64("2023-06-26 00:00:00"),
        "str_list": ["a", "b", "c"],
        "bool_list": [True, False],
        "object": {"a": 1, "b": ["x", "y"], "c": {"d": 2}},
    }
    test_schema = ParamSchema([
        ParamSpec("str_param", DataType.string, "str_a", None),
        ParamSpec("int_param", DataType.integer, np.int32(1), None),
        ParamSpec("bool_param", DataType.boolean, True, None),
        ParamSpec("double_param", DataType.double, 1.0, None),
        ParamSpec("float_param", DataType.float, np.float32(0.1), None),
        ParamSpec("long_param", DataType.long, 100, None),
        ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-26 00:00:00"), None),
        ParamSpec("str_list", DataType.string, ["a", "b", "c"], (-1,)),
        ParamSpec("bool_list", DataType.boolean, [True, False], (-1,)),
        ParamSpec(
            "object",
            Object([
                Property("a", DataType.long),
                Property("b", Array(DataType.string)),
                Property("c", Object([Property("d", DataType.long)])),
            ]),
            {"a": 1, "b": ["x", "y"], "c": {"d": 2}},
            None,
        ),
    ])
    assert _enforce_params_schema(test_parameters, test_schema) == test_parameters

    # Correct parameters & schema with array-shaped ((-1,)) params
    params = {
        "double_array": np.array([1.0, 2.0]),
        "float_array": np.array([np.float32(1.0), np.float32(2.0)]),
        "long_array": np.array([1, 2]),
        "datetime_array": np.array([
            np.datetime64("2023-06-26 00:00:00"),
            np.datetime64("2023-06-26 00:00:00"),
        ]),
    }
    schema = ParamSchema([
        ParamSpec("double_array", DataType.double, np.array([1.0, 2.0]), (-1,)),
        ParamSpec(
            "float_array", DataType.float, np.array([np.float32(1.0), np.float32(2.0)]), (-1,)
        ),
        ParamSpec("long_array", DataType.long, np.array([1, 2]), (-1,)),
        ParamSpec(
            "datetime_array",
            DataType.datetime,
            np.array([np.datetime64("2023-06-26 00:00:00"), np.datetime64("2023-06-26 00:00:00")]),
            (-1,),
        ),
    ])
    # Element-wise comparison because the enforced values are ndarrays.
    for param, value in params.items():
        assert (_enforce_params_schema(params, schema)[param] == value).all()

    # Converting parameters value type to corresponding schema type
    # 1. int -> long, float, double
    assert _enforce_params_schema({"double_param": np.int32(1)}, test_schema)["double_param"] == 1.0
    assert _enforce_params_schema({"float_param": np.int32(1)}, test_schema)["float_param"] == 1.0
    assert _enforce_params_schema({"long_param": np.int32(1)}, test_schema)["long_param"] == 1
    # With array
    for param in ["double_array", "float_array", "long_array"]:
        assert (
            _enforce_params_schema({param: [np.int32(1), np.int32(2)]}, schema)[param]
            == params[param]
        ).all()
        assert (
            _enforce_params_schema({param: np.array([np.int32(1), np.int32(2)])}, schema)[param]
            == params[param]
        ).all()

    # 2. long -> float, double
    assert _enforce_params_schema({"double_param": 1}, test_schema)["double_param"] == 1.0
    assert _enforce_params_schema({"float_param": 1}, test_schema)["float_param"] == 1.0
    # With array
    for param in ["double_array", "float_array"]:
        assert (_enforce_params_schema({param: [1, 2]}, schema)[param] == params[param]).all()
        assert (
            _enforce_params_schema({param: np.array([1, 2])}, schema)[param] == params[param]
        ).all()

    # 3. float -> double
    assert (
        _enforce_params_schema({"double_param": np.float32(1)}, test_schema)["double_param"] == 1.0
    )
    # float32(0.1) is not exactly 0.1 as a double, so compare with tolerance.
    assert np.isclose(
        _enforce_params_schema({"double_param": np.float32(0.1)}, test_schema)["double_param"],
        0.1,
        atol=1e-6,
    )
    # With array
    assert (
        _enforce_params_schema({"double_array": [np.float32(1), np.float32(2)]}, schema)[
            "double_array"
        ]
        == params["double_array"]
    ).all()
    assert (
        _enforce_params_schema({"double_array": np.array([np.float32(1), np.float32(2)])}, schema)[
            "double_array"
        ]
        == params["double_array"]
    ).all()

    # 4. any -> datetime (try conversion)
    assert _enforce_params_schema({"datetime_param": "2023-07-01 00:00:00"}, test_schema)[
        "datetime_param"
    ] == np.datetime64("2023-07-01 00:00:00")

    # With array
    assert (
        _enforce_params_schema(
            {"datetime_array": ["2023-06-26 00:00:00", "2023-06-26 00:00:00"]}, schema
        )["datetime_array"]
        == params["datetime_array"]
    ).all()
    assert (
        _enforce_params_schema(
            {"datetime_array": np.array(["2023-06-26 00:00:00", "2023-06-26 00:00:00"])}, schema
        )["datetime_array"]
        == params["datetime_array"]
    ).all()

    # Add default values if the parameter is not provided
    test_parameters = {"a": "str_a"}
    test_schema = ParamSchema([
        ParamSpec("a", DataType.string, ""),
        ParamSpec("b", DataType.long, 1),
    ])
    updated_parameters = {"b": 1}
    updated_parameters.update(test_parameters)
    assert _enforce_params_schema(test_parameters, test_schema) == updated_parameters

    # Ignore values not specified in ParamSchema and log warning
    test_parameters = {"a": "str_a", "invalid_param": "value"}
    test_schema = ParamSchema([ParamSpec("a", DataType.string, "")])
    with mock.patch("mlflow.models.utils._logger.warning") as mock_warning:
        assert _enforce_params_schema(test_parameters, test_schema) == {"a": "str_a"}
        mock_warning.assert_called_once_with(
            "Unrecognized params ['invalid_param'] are ignored for inference. "
            "Supported params are: {'a'}. "
            "To enable them, please add corresponding schema in ModelSignature."
        )

    # Converting parameters keys to string if it is not
    test_parameters = {1: 1.0}
    test_schema = ParamSchema([ParamSpec("1", DataType.double, 1.0)])
    assert _enforce_params_schema(test_parameters, test_schema) == {"1": 1.0}
1288  
1289  
def test_enforce_params_schema_add_default_values():
    """Params omitted at predict time fall back to the signature defaults; unknown
    params warn and are ignored."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params):
            return list(params.values())

    default_params = {"str_param": "string", "int_array": [1, 2, 3]}
    signature = infer_signature(["input"], params=default_params)

    with mlflow.start_run():
        info = mlflow.pyfunc.log_model(
            name="my_model", python_model=MyModel(), signature=signature
        )
    model = mlflow.pyfunc.load_model(info.model_uri)

    # No params supplied -> every default is used.
    assert model.predict(["input"]) == ["string", [1, 2, 3]]

    # Partial params -> the missing one falls back to its default.
    assert model.predict(["input"], params={"str_param": "new_string"}) == [
        "new_string",
        [1, 2, 3],
    ]

    # Full params -> defaults fully overridden.
    assert model.predict(
        ["input"], params={"str_param": "new_string", "int_array": [4, 5, 6]}
    ) == ["new_string", [4, 5, 6]]

    # Unknown params log a warning and do not affect the result.
    with mock.patch("mlflow.models.utils._logger.warning") as warn:
        result = model.predict(["input"], params={"new_param": "new_string"})
    warn.assert_called_once()
    assert "Unrecognized params ['new_param'] are ignored for inference" in warn.call_args[0][0]
    assert result == ["string", [1, 2, 3]]
1328  
1329  
def test_enforce_params_schema_errors():
    """Invalid param values passed to ``_enforce_params_schema`` raise MlflowException.

    Covers: unconvertible datetime values (scalar and array), type/shape mismatches
    for float/long/string params, non-dict params objects, and non-1D array values.
    """
    # Raise error when failing to convert value to DataType.datetime
    test_schema = ParamSchema([
        ParamSpec("datetime_param", DataType.datetime, np.datetime64("2023-06-06"))
    ])
    with pytest.raises(
        MlflowException,
        match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`",
    ):
        _enforce_params_schema({"datetime_param": 1.0}, test_schema)
    # With array
    test_schema = ParamSchema([
        ParamSpec(
            "datetime_array",
            DataType.datetime,
            np.array([np.datetime64("2023-06-06"), np.datetime64("2023-06-06")]),
            (-1,),
        )
    ])
    with pytest.raises(
        MlflowException,
        match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`",
    ):
        _enforce_params_schema({"datetime_array": [1.0, 2.0]}, test_schema)

    # Raise error when failing to convert value to DataType.float
    test_schema = ParamSchema([ParamSpec("float_param", DataType.float, np.float32(1))])
    with pytest.raises(
        MlflowException, match=r"Failed to validate type and shape for 'float_param'"
    ):
        _enforce_params_schema({"float_param": "a"}, test_schema)
    # With array: a float64 element must not silently narrow to float32
    test_schema = ParamSchema([
        ParamSpec("float_array", DataType.float, np.array([np.float32(1), np.float32(2)]), (-1,))
    ])
    with pytest.raises(
        MlflowException, match=r"Failed to validate type and shape for 'float_array'"
    ):
        _enforce_params_schema(
            {"float_array": [np.float32(1), np.float32(2), np.float64(3)]}, test_schema
        )

    # Raise error for any other conversions
    error_msg = r"Failed to validate type and shape for 'int_param'"
    test_schema = ParamSchema([ParamSpec("int_param", DataType.long, np.int32(1))])
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"int_param": np.float32(1)}, test_schema)
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"int_param": "1"}, test_schema)
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"int_param": np.datetime64("2023-06-06")}, test_schema)

    error_msg = r"Failed to validate type and shape for 'str_param'"
    test_schema = ParamSchema([ParamSpec("str_param", DataType.string, "1")])
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"str_param": np.float32(1)}, test_schema)
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"str_param": b"string"}, test_schema)
    with pytest.raises(MlflowException, match=error_msg):
        _enforce_params_schema({"str_param": np.datetime64("2023-06-06")}, test_schema)

    # Raise error if parameters is not dictionary
    with pytest.raises(MlflowException, match=r"Parameters must be a dictionary. Got type 'int'."):
        _enforce_params_schema(100, test_schema)

    # Raise error if invalid parameters are passed
    test_parameters = {"a": True, "b": (1, 2), "c": b"test"}
    test_schema = ParamSchema([
        ParamSpec("a", DataType.boolean, False),
        ParamSpec("b", DataType.string, [], (-1,)),
        ParamSpec("c", DataType.string, ""),
    ])
    with pytest.raises(
        MlflowException,
        match=re.escape(
            "Value must be a 1D array with shape (-1,) for param 'b': string "
            "(default: []) (shape: (-1,)), received tuple"
        ),
    ):
        _enforce_params_schema(test_parameters, test_schema)
    # Raise error for non-1D array
    with pytest.raises(MlflowException, match=r"received list with ndim 2"):
        _enforce_params_schema(
            {"a": [[1, 2], [3, 4]]}, ParamSchema([ParamSpec("a", DataType.long, [], (-1,))])
        )
1415  
1416  
def test_enforce_params_schema_warns_with_model_without_params():
    """Params supplied at inference are dropped with a warning when the signature has no params schema."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return list(params.values()) if isinstance(params, dict) else None

    # The signature below is inferred without params, so these must be ignored.
    inference_params = {"str_param": "string", "int_array": [1, 2, 3], "123": 123}
    no_params_signature = infer_signature(["input"])

    with mlflow.start_run():
        info = mlflow.pyfunc.log_model(
            name="model1", python_model=MyModel(), signature=no_params_signature
        )

    model = mlflow.pyfunc.load_model(info.model_uri)

    with mock.patch("mlflow.models.utils._logger.warning") as mock_warning:
        model.predict(["input"], params=inference_params)
    expected_warning = (
        "`params` can only be specified at inference time if the model signature defines a params "
        "schema. This model does not define a params schema. Ignoring provided params: "
        "['str_param', 'int_array', '123']"
    )
    mock_warning.assert_called_with(expected_warning)
1439  
1440  
def test_enforce_params_schema_errors_with_model_with_params():
    """Non-dict params raise; non-string param keys are coerced to strings with a warning."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return list(params.values()) if isinstance(params, dict) else None

    default_params = {"str_param": "string", "int_array": [1, 2, 3], "123": 123}
    sig = infer_signature(["input"], params=default_params)

    with mlflow.start_run():
        info = mlflow.pyfunc.log_model(
            name="test_model", python_model=MyModel(), signature=sig
        )

    model = mlflow.pyfunc.load_model(info.model_uri)

    # A list is not a valid params container.
    with pytest.raises(MlflowException, match=r"Parameters must be a dictionary. Got type 'list'"):
        model.predict(["input"], params=[1, 2, 3])

    # Integer keys are tolerated but trigger a string-conversion warning.
    with mock.patch("mlflow.models.utils._logger.warning") as mock_warning:
        model.predict(["input"], params={123: 456})
    expected_warning = (
        "Keys in parameters should be of type `str`, but received non-string keys."
        "Converting all keys to string..."
    )
    mock_warning.assert_called_with(expected_warning)
1464  
1465  
def test_param_spec_with_success():
    """ParamSpec keeps type-consistent defaults and widens compatible numeric defaults."""
    # Defaults that already match the declared DataType are preserved.
    matching_cases = [
        (DataType.long, 1, 1),
        (DataType.string, "1", "1"),
        (DataType.double, 1.0, 1.0),
        (DataType.float, np.float32(1), 1),
        (DataType.integer, np.int32(1), 1),
    ]
    for dtype, default, expected in matching_cases:
        assert ParamSpec("a", dtype, default).default == expected
    assert ParamSpec("a", DataType.boolean, True).default is True
    # Date-only and full-timestamp datetime64 defaults round-trip to datetime objects.
    assert ParamSpec("a", DataType.datetime, np.datetime64("2023-06-06")).default == datetime.date(
        2023, 6, 6
    )
    assert ParamSpec(
        "a", DataType.datetime, np.datetime64("2023-06-06 00:00:00")
    ).default == datetime.datetime(2023, 6, 6, 0, 0, 0)

    # Defaults of a narrower numeric type are widened to the declared type:
    # int -> long/float/double, long -> float/double, float -> double.
    widening_cases = [
        (DataType.long, np.int32(1), 1),
        (DataType.float, np.int32(1), 1.0),
        (DataType.double, np.int32(1), 1.0),
        (DataType.float, 1, 1.0),
        (DataType.double, 1, 1.0),
        (DataType.double, np.float32(1), 1.0),
    ]
    for dtype, default, expected in widening_cases:
        assert ParamSpec("a", dtype, default).default == expected

    # Anything convertible is tried for datetime.
    assert ParamSpec("a", DataType.datetime, "2023-07-01 00:00:00").default == np.datetime64(
        "2023-07-01 00:00:00"
    )
1495  
1496  
def test_param_spec_errors():
    """Constructing a ParamSpec with an incompatible default/shape raises MlflowException."""
    # Raise error if default value can not be converted to specified type
    with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"):
        ParamSpec("a", DataType.integer, "1.0")
    with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"):
        ParamSpec("a", DataType.integer, [1.0, 2.0], (-1,))
    with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"):
        ParamSpec("a", DataType.string, True)
    with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'a'"):
        ParamSpec("a", DataType.string, [1.0, 2.0], (-1,))
    with pytest.raises(MlflowException, match=r"Binary type is not supported for parameters"):
        ParamSpec("a", DataType.binary, 1.0)
    with pytest.raises(MlflowException, match=r"Failed to convert value"):
        ParamSpec("a", DataType.datetime, 1.0)
    with pytest.raises(MlflowException, match=r"Failed to convert value"):
        ParamSpec("a", DataType.datetime, [1.0, 2.0], (-1,))
    # "20230606" is not a valid ISO datetime string for np.datetime64
    with pytest.raises(MlflowException, match=r"Failed to convert value to `DataType.datetime`"):
        ParamSpec("a", DataType.datetime, np.datetime64("20230606"))

    # Raise error if shape is not specified for list value
    with pytest.raises(
        MlflowException,
        match=re.escape("Value must be a scalar for type `DataType.long`"),
    ):
        ParamSpec("a", DataType.long, [1, 2, 3], shape=None)
    with pytest.raises(
        MlflowException,
        match=re.escape("Value must be a scalar for type `DataType.integer`"),
    ):
        ParamSpec("a", DataType.integer, np.array([1, 2, 3]), shape=None)

    # Raise error if shape is specified for scalar value
    with pytest.raises(
        MlflowException,
        match=re.escape(
            "Value must be a 1D array with shape (-1,) for param 'a': boolean (default: True) "
            "(shape: (-1,)), received bool"
        ),
    ):
        ParamSpec("a", DataType.boolean, True, shape=(-1,))

    # Raise error if shape specified is not allowed: only None or (-1,) are valid
    with pytest.raises(
        MlflowException,
        match=r"Shape must be None for scalar or dictionary value, "
        r"or \(-1,\) for 1D array value",
    ):
        ParamSpec("a", DataType.boolean, [True, False], (2,))

    # Raise error if default value is not scalar or 1D array
    with pytest.raises(
        MlflowException,
        match=re.escape(
            "Value must be a 1D array with shape (-1,) for param 'a': boolean (default: {'a': 1}) "
            "(shape: (-1,)), received dict"
        ),
    ):
        ParamSpec("a", DataType.boolean, {"a": 1}, (-1,))
1555  
1556  
def test_enforce_schema_in_python_model_predict(sample_params_basic, param_schema_basic):
    """Params are validated and type-coerced against the params schema during predict.

    Uses the module fixtures `sample_params_basic`/`param_schema_basic` and the
    `PythonModelWithBasicParams` model, which echoes the params it receives.
    """
    test_params = sample_params_basic
    test_schema = param_schema_basic
    signature = infer_signature(["input1"], params=test_params)
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=PythonModelWithBasicParams(),
            signature=signature,
        )
    assert signature.params == test_schema

    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    loaded_predict = loaded_model.predict(["a", "b"], params=test_params)
    for param, value in test_params.items():
        if param == "double_array":
            # numpy arrays need elementwise comparison
            assert (loaded_predict[param] == value).all()
        else:
            assert loaded_predict[param] == value

    # Automatically convert type if it's not consistent with schema
    # 1. int -> long, float, double
    params_int = {
        "double_param": np.int32(1),
        "float_param": np.int32(1),
        "long_param": np.int32(1),
    }
    expected_params_int = {
        "double_param": 1.0,
        "float_param": np.float32(1),
        "long_param": 1,
    }
    loaded_predict = loaded_model.predict(["a", "b"], params=params_int)
    for param in params_int:
        assert loaded_predict[param] == expected_params_int[param]

    # 2. long -> float, double
    params_long = {
        "double_param": 1,
        "float_param": 1,
    }
    expected_params_long = {
        "double_param": 1.0,
        "float_param": np.float32(1),
    }
    loaded_predict = loaded_model.predict(["a", "b"], params=params_long)
    for param in params_long:
        assert loaded_predict[param] == expected_params_long[param]

    # 3. float -> double
    assert (
        loaded_model.predict(
            ["a", "b"],
            params={
                "double_param": np.float32(1),
            },
        )["double_param"]
        == 1.0
    )

    # 4. any -> datetime (try conversion)
    assert loaded_model.predict(
        ["a", "b"],
        params={
            "datetime_param": "2023-06-26 00:00:00",
        },
    )["datetime_param"] == np.datetime64("2023-06-26 00:00:00")
1624  
1625  
def test_schema_enforcement_all_feature_types_pandas():
    """Schema enforcement is a pass-through for a DataFrame that already matches the schema."""
    # One row per column: (name, DataType, values, required) — covers scalar types
    # plus nullable (required=False) boolean/string/double variants.
    columns = [
        ("long", DataType.long, [1, 2, 3], True),
        ("bool", DataType.boolean, [True, False, False], True),
        ("string", DataType.string, ["a", "b", "c"], True),
        ("datetime", DataType.datetime, [pd.Timestamp("2020-07-14 00:00:00")] * 3, True),
        ("bool_nullable", DataType.boolean, [True, None, False], False),
        ("string_nullable", DataType.string, ["a", "b", None], False),
        ("double_nullable", DataType.double, [1.0, 2.0, None], False),
    ]
    df = pd.DataFrame.from_dict({name: values for name, _, values, _ in columns})
    schema = Schema([
        ColSpec(dtype, name, required=required) for name, dtype, _, required in columns
    ])
    pd.testing.assert_frame_equal(_enforce_schema(df, schema), df, check_dtype=False)
1647  
1648  
def test_enforce_schema_in_python_model_serving(sample_params_basic):
    """Params round-trip through the scoring server and invalid params yield HTTP 400."""
    signature = infer_signature(["input1"], params=sample_params_basic)
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=PythonModelWithBasicParams(),
            signature=signature,
        )

    # params in payload should be json serializable
    test_params = {
        "str_param": "str_a",
        "int_param": 1,
        "bool_param": True,
        "double_param": 1.0,
        "float_param": 0.1,
        "long_param": 100,
        "datetime_param": datetime.datetime(2023, 6, 6, 0, 0, 0),
        "str_list": ["a", "b", "c"],
        "bool_list": [True, False],
        "double_array": np.array([1.0, 2.0]),
    }
    response = score_model_in_process(
        model_info.model_uri,
        data=dump_input_data(["a", "b"], params=test_params),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200
    prediction = json.loads(response.content.decode("utf-8"))["predictions"]
    for param, value in test_params.items():
        if param == "double_array":
            assert (prediction[param] == value).all()
        elif param == "datetime_param":
            # datetimes are serialized as ISO strings in the JSON response
            assert prediction[param] == value.isoformat()
        else:
            assert prediction[param] == value

    # Test invalid params for model serving: numpy scalars are not JSON serializable
    with pytest.raises(TypeError, match=r"Object of type int32 is not JSON serializable"):
        dump_input_data(["a", "b"], params={"int_param": np.int32(1)})

    # A param that fails schema validation surfaces as an HTTP 400 with a message
    response = score_model_in_process(
        model_info.model_uri,
        data=dump_input_data(["a", "b"], params={"double_param": "invalid"}),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 400
    assert (
        "Failed to validate type and shape for 'double_param'"
        in json.loads(response.content.decode("utf-8"))["message"]
    )

    # Can not pass bytes to request
    with pytest.raises(TypeError, match=r"Object of type bytes is not JSON serializable"):
        score_model_in_process(
            model_info.model_uri,
            data=dump_input_data(["a", "b"], params={"str_param": b"bytes"}),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
1708  
1709  
def test_python_model_serving_compatible(tmp_path):
    """
    # Code for logging the model in mlflow 2.4.0
    import mlflow
    from mlflow.models import infer_signature

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
                    python_model = MyModel(),
                    artifact_path = "test_model",
                    signature = infer_signature(["input"]),
                    registered_model_name="model")
    """
    # Reconstruct on disk the artifacts a 2.4.0 run (docstring above) would have
    # written, to verify the current mlflow version can still load and serve them.
    tmp_path.joinpath("MLmodel").write_text(
        """
artifact_path: test_model
flavors:
  python_function:
    cloudpickle_version: 2.2.1
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.pyfunc.model
    python_model: python_model.pkl
    python_version: 3.8.16
mlflow_version: 2.4.0
model_uuid: 3cbde93be0114644a6ec900c64cab39d
run_id: 3f87fdff03524c19908c3a47fb99f9cd
signature:
  inputs: '[{"type": "string"}]'
  outputs: null
utc_time_created: '2023-07-13 01:29:55.467561'
        """
    )
    tmp_path.joinpath("python_env.yaml").write_text(
        """
python: 3.8.16
build_dependencies:
    - pip==23.1.2
    - setuptools==56.0.0
    - wheel==0.40.0
dependencies:
    - -r requirements.txt
        """
    )
    tmp_path.joinpath("requirements.txt").write_text(
        """
mlflow==2.4.0
cloudpickle==2.2.1
        """
    )

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input):
            return model_input

    python_model = MyModel()

    # Stand-in for the python_model.pkl the 2.4.0 run would have produced.
    with open(tmp_path / "python_model.pkl", "wb") as out:
        cloudpickle.dump(python_model, out)

    # The test only makes sense when running a newer mlflow than the logged one.
    assert Version(mlflow.__version__) > Version("2.4.0")
    model_uri = str(tmp_path)
    pyfunc_loaded = mlflow.pyfunc.load_model(model_uri)

    assert pyfunc_loaded.metadata.signature == ModelSignature(Schema([ColSpec("string")]))

    # predict is compatible
    local_predict = pyfunc_loaded.predict(["input"])
    assert local_predict.values[0].tolist() == ["input"]

    # model serving is compatible
    response = score_model_in_process(
        model_uri,
        data=dump_input_data(["a", "b"]),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200
    prediction = json.loads(response.content.decode("utf-8"))["predictions"]
    assert prediction == [{"0": "a"}, {"0": "b"}]
1794  
1795  
def test_function_python_model_serving_compatible(tmp_path):
    """
    # Code for logging the model in mlflow 2.4.0
    import mlflow
    from mlflow.models import infer_signature

    def my_model(model_input):
        return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
                    python_model = my_model,
                    artifact_path = "test_model",
                    signature = infer_signature(["input"]),
                    registered_model_name="model",
                    input_example=["input"])
    """
    # Reconstruct on disk the artifacts a 2.4.0 run (docstring above) would have
    # written for a *function-based* model, including the saved input example.
    tmp_path.joinpath("MLmodel").write_text(
        """
artifact_path: test_model
flavors:
  python_function:
    cloudpickle_version: 2.2.1
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.pyfunc.model
    python_model: python_model.pkl
    python_version: 3.8.16
mlflow_version: 2.4.0
model_uuid: f19b9a51a34a453282e53ca41d384964
run_id: 9fd7b6e125a547fdbb4505f15e8259ed
saved_input_example_info:
  artifact_path: input_example.json
  pandas_orient: split
  type: dataframe
signature:
  inputs: '[{"type": "string"}]'
  outputs: null
utc_time_created: '2023-07-14 10:18:44.353510'
        """
    )
    tmp_path.joinpath("python_env.yaml").write_text(
        """
python: 3.8.16
build_dependencies:
    - pip==23.1.2
    - setuptools==56.0.0
    - wheel==0.40.0
dependencies:
    - -r requirements.txt
        """
    )
    tmp_path.joinpath("requirements.txt").write_text(
        """
mlflow==2.4.0
cloudpickle==2.2.1
pandas==2.0.3
        """
    )
    tmp_path.joinpath("input_example.json").write_text(
        """
{"data": [["input"]]}
        """
    )

    def my_model(model_input):
        return model_input

    from mlflow.pyfunc.model import _FunctionPythonModel

    # Plain functions are wrapped into _FunctionPythonModel before pickling,
    # mirroring what log_model(python_model=my_model, ...) does internally.
    python_model = _FunctionPythonModel(my_model, signature=infer_signature(["input"]))

    with open(tmp_path / "python_model.pkl", "wb") as out:
        cloudpickle.dump(python_model, out)

    # The test only makes sense when running a newer mlflow than the logged one.
    assert Version(mlflow.__version__) > Version("2.4.0")
    model_uri = str(tmp_path)
    pyfunc_loaded = mlflow.pyfunc.load_model(model_uri)

    assert pyfunc_loaded.metadata.signature == ModelSignature(Schema([ColSpec("string")]))

    # predict is compatible
    local_predict = pyfunc_loaded.predict(["input"])
    assert local_predict.values[0].tolist() == ["input"]

    # model serving is compatible
    response = score_model_in_process(
        model_uri,
        data=dump_input_data(["a", "b"]),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200
    prediction = json.loads(response.content.decode("utf-8"))["predictions"]
    assert prediction == [{"0": "a"}, {"0": "b"}]
1891  
1892  
def test_enforce_schema_with_arrays_in_python_model_predict(sample_params_with_arrays):
    """Array-valued params are validated, coerced where safe, and rejected otherwise.

    Uses the `sample_params_with_arrays` fixture and `PythonModelWithArrayParams`,
    which echoes the params it receives.
    """
    params = sample_params_with_arrays
    signature = infer_signature(["input1"], params=params)
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=PythonModelWithArrayParams(),
            signature=signature,
        )

    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    loaded_predict = loaded_model.predict(["a", "b"], params=params)
    for param, value in params.items():
        assert (loaded_predict[param] == value).all()

    # Automatically convert type if it's not consistent with schema
    # 1. int -> long, float, double
    for param in ["double_array", "float_array", "long_array"]:
        loaded_predict = loaded_model.predict(
            ["a", "b"], params={param: np.array([np.int32(1), np.int32(2)])}
        )
        assert (loaded_predict[param] == params[param]).all()
    # 2. long -> float, double
    for param in ["double_array", "float_array"]:
        loaded_predict = loaded_model.predict(["a", "b"], params={param: np.array([1, 2])})
        assert (loaded_predict[param] == params[param]).all()
    # 3. float -> double
    loaded_predict = loaded_model.predict(
        ["a", "b"], params={"double_array": np.array([np.float32(1), np.float32(2)])}
    )
    assert (loaded_predict["double_array"] == params["double_array"]).all()
    # 4. any -> datetime (try conversion)
    loaded_predict = loaded_model.predict(
        ["a", "b"],
        params={"datetime_array": np.array(["2023-06-26 00:00:00", "2023-06-26 00:00:00"])},
    )
    assert (loaded_predict["datetime_array"] == params["datetime_array"]).all()

    # Raise error if failing to convert the type
    with pytest.raises(
        MlflowException,
        match=r"Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`",
    ):
        loaded_model.predict(["a", "b"], params={"datetime_array": [1.0, 2.0]})
    # Narrowing conversions (float -> int, bool -> float, str -> double) are rejected
    with pytest.raises(MlflowException, match=r"Failed to validate type and shape for 'int_array'"):
        loaded_model.predict(["a", "b"], params={"int_array": np.array([1.0, 2.0])})
    with pytest.raises(
        MlflowException, match=r"Failed to validate type and shape for 'float_array'"
    ):
        loaded_model.predict(["a", "b"], params={"float_array": [True, False]})
    with pytest.raises(
        MlflowException, match=r"Failed to validate type and shape for 'double_array'"
    ):
        loaded_model.predict(["a", "b"], params={"double_array": [1.0, "2.0"]})
1947  
1948  
def test_enforce_schema_with_arrays_in_python_model_serving(sample_params_with_arrays):
    """Array-valued params round-trip through a scoring endpoint; invalid ones yield HTTP 400."""
    params = sample_params_with_arrays
    signature = infer_signature(["input1"], params=params)
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=PythonModelWithArrayParams(),
            signature=signature,
        )

    with pyfunc_scoring_endpoint(
        model_info.model_uri, extra_args=["--env-manager", "local"]
    ) as endpoint:
        response = endpoint.invoke(
            data=dump_input_data(["a", "b"], params=params),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 200
        prediction = json.loads(response.content.decode("utf-8"))["predictions"]
        for param, value in params.items():
            if param == "datetime_array":
                # datetimes come back as ISO strings in the JSON response
                assert prediction[param] == list(map(np.datetime_as_string, value))
            else:
                assert (prediction[param] == value).all()

        # Test invalid params for model serving
        response = endpoint.invoke(
            data=dump_input_data(["a", "b"], params={"datetime_array": [1.0, 2.0]}),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 400
        assert (
            "Failed to convert value `1.0` from type `<class 'float'>` to `DataType.datetime`"
            in json.loads(response.content.decode("utf-8"))["message"]
        )

        response = endpoint.invoke(
            data=dump_input_data(["a", "b"], params={"int_array": np.array([1.0, 2.0])}),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 400
        assert (
            "Failed to validate type and shape for 'int_array'"
            in json.loads(response.content.decode("utf-8"))["message"]
        )

        response = endpoint.invoke(
            data=dump_input_data(["a", "b"], params={"float_array": [True, False]}),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 400
        assert (
            "Failed to validate type and shape for 'float_array'"
            in json.loads(response.content.decode("utf-8"))["message"]
        )

        response = endpoint.invoke(
            data=dump_input_data(["a", "b"], params={"double_array": [1.0, "2.0"]}),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 400
        assert (
            "Failed to validate type and shape for 'double_array'"
            in json.loads(response.content.decode("utf-8"))["message"]
        )
2014  
2015  
@pytest.mark.parametrize(
    ("example", "input_schema", "output_schema"),
    [
        (
            ["input1", "input2", "input3"],
            Schema([ColSpec(DataType.string)]),
            Schema([ColSpec(DataType.string, 0)]),
        ),
        (
            [{"a": "a", "b": "b"}, {"a": "b"}],
            Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b", required=False)]),
            Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b", required=False)]),
        ),
        (
            {"a": ["a", "b", "c"], "b": "b"},
            Schema([ColSpec(Array(DataType.string), "a"), ColSpec(DataType.string, "b")]),
            Schema([ColSpec(Array(DataType.string), "a"), ColSpec(DataType.string, "b")]),
        ),
        (
            pd.DataFrame({"a": ["a", "b", "c"], "b": "b"}),
            Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b")]),
            Schema([ColSpec(DataType.string, "a"), ColSpec(DataType.string, "b")]),
        ),
    ],
)
def test_pyfunc_model_input_example_with_params(
    sample_params_basic, param_schema_basic, tmp_path, example, input_schema, output_schema
):
    """Logging with an (example, params) tuple infers the signature and saves a usable example.

    Verifies the inferred input/output/params schemas, local predict, the persisted
    input example, and serving with both the saved and a manually-built payload.
    """

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            input_example=(example, sample_params_basic),
        )

    # Test _infer_signature_from_input_example
    assert model_info.signature.inputs == input_schema
    assert model_info.signature.outputs == output_schema
    assert model_info.signature.params == param_schema_basic

    # Test predict
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    prediction = loaded_model.predict(example)
    expected_df = pd.DataFrame([example] if isinstance(example, dict) else example)
    pd.testing.assert_frame_equal(prediction, expected_df)

    # Test saved example
    local_path = _download_artifact_from_uri(model_info.model_uri, output_path=tmp_path)
    mlflow_model = Model.load(os.path.join(local_path, "MLmodel"))
    loaded_example = mlflow_model.load_input_example(local_path)
    if isinstance(example, list) and all(np.isscalar(x) for x in example):
        np.testing.assert_equal(loaded_example, example)
    else:
        if isinstance(example, pd.DataFrame):
            pd.testing.assert_frame_equal(loaded_example, example)
        else:
            assert loaded_example == example

    # Score both the example persisted at log time and a hand-built equivalent payload
    for test_example in ["saved_example", "manual_example"]:
        if test_example == "saved_example":
            payload = mlflow_model.get_serving_input(local_path)
        else:
            if isinstance(example, pd.DataFrame):
                payload = json.dumps({"dataframe_split": example.to_dict(orient="split")})
            else:
                payload = json.dumps({"inputs": example})

        response = score_model_in_process(
            model_info.model_uri,
            data=payload,
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
        )
        assert response.status_code == 200, response.content
        result = json.loads(response.content.decode("utf-8"))["predictions"]
        result = pd.DataFrame(result).values.tolist()[0]
        np.testing.assert_equal(result, expected_df.values.tolist()[0])
2096  
2097  
def test_invalid_input_example_warn_when_model_logging():
    """Logging a model whose predict rejects the schema-enforced input should
    emit a warning about the serving input example rather than fail."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            # List[str] is converted to a pandas DataFrame by schema
            # enforcement, so asserting list here makes the example invalid.
            assert isinstance(model_input, list)
            return "string"

    with mock.patch("mlflow.models.model._logger.warning") as mock_warning:
        with mlflow.start_run():
            mlflow.pyfunc.log_model(
                name="test_model",
                python_model=MyModel(),
                input_example=["some string"],
            )
        warning_messages = [call[0][0] for call in mock_warning.call_args_list]
        assert any(
            "Failed to validate serving input example" in message
            for message in warning_messages
        )
2117  
2118  
def assert_equal(a, b):
    """Assert that ``a`` equals ``b``.

    Dispatches to ``pd.testing.assert_frame_equal`` for DataFrames,
    ``np.testing.assert_equal`` when either side is an ndarray, recurses
    per-key into dicts, and falls back to plain ``==`` otherwise.
    """
    if isinstance(a, pd.DataFrame):
        pd.testing.assert_frame_equal(a, b)
        return
    if isinstance(a, np.ndarray) or isinstance(b, np.ndarray):
        np.testing.assert_equal(a, b)
        return
    if isinstance(a, dict):
        assert a.keys() == b.keys()
        for key, value in a.items():
            assert_equal(value, b[key])
        return
    assert a == b
2130  
2131  
@pytest.mark.parametrize(
    ("example", "signature", "expected_input", "expected_output"),
    [
        (
            pd.DataFrame({"a": ["input1", "input2", "input3"]}),
            ModelSignature(
                Schema([ColSpec(DataType.string, "a")]), Schema([ColSpec(DataType.string)])
            ),
            pd.DataFrame({"a": ["input1", "input2", "input3"]}),
            "string output",
        ),
        (
            np.array([1, 2, 3]),
            ModelSignature(
                Schema([TensorSpec(np.dtype("int64"), (-1,))]),
                Schema([TensorSpec(np.dtype("float64"), (-1,))]),
            ),
            np.array([1, 2, 3]),
            np.array([1.0, 2.0, 3.0]),
        ),
        (
            np.array([1, 2, 3, np.nan]),
            ModelSignature(
                Schema([TensorSpec(np.dtype("float64"), (-1,))]),
                Schema([TensorSpec(np.dtype("float64"), (-1,))]),
            ),
            np.array([1, 2, 3, np.nan]),
            np.array([1.0, 2.0, 3.0, np.nan]),
        ),
        (
            {"a": np.array([1, 2, 3])},
            ModelSignature(
                Schema([TensorSpec(np.dtype("int64"), (-1,), "a")]),
                Schema([TensorSpec(np.dtype("float64"), (-1,), "b")]),
            ),
            {"a": np.array([1, 2, 3])},
            {"b": np.array([1.0, 2.0, 3.0])},
        ),
        (
            ["input1", "input2", "input3"],
            ModelSignature(Schema([ColSpec(DataType.string)]), Schema([ColSpec(DataType.string)])),
            # This is due to _enforce_schema
            pd.DataFrame(["input1", "input2", "input3"]),
            ["input1", "input2", "input3"],
        ),
        (
            [{"a": ["sentence1", "sentence2"], "b": ["answer1", "answer2"]}],
            ModelSignature(
                Schema([
                    ColSpec(Array(DataType.string), "a"),
                    ColSpec(Array(DataType.string), "b"),
                ]),
                Schema([ColSpec(DataType.string, "output")]),
            ),
            pd.DataFrame([{"a": ["sentence1", "sentence2"], "b": ["answer1", "answer2"]}]),
            {"output": "some prediction"},
        ),
        (
            {"messages": [{"role": "user", "content": "some question"}]},
            ModelSignature(
                Schema([
                    ColSpec(
                        Array(
                            Object([
                                Property("role", DataType.string),
                                Property("content", DataType.string),
                            ])
                        ),
                        "messages",
                    )
                ]),
                Schema([ColSpec(DataType.string, "output")]),
            ),
            # we assume the field is array so we need another list wrapper
            pd.DataFrame([{"messages": [{"role": "user", "content": "some question"}]}]),
            {"output": "some prediction"},
        ),
    ],
)
def test_input_example_validation_during_logging(
    tmp_path, example, signature, expected_input, expected_output
):
    """Log a model with ``example`` and verify: the inferred signature,
    the serving round-trip of the saved input example, and that
    ``validate_serving_input`` agrees with the scoring-server output.
    """
    from mlflow.models import validate_serving_input

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            # Schema enforcement must have converted `example` into
            # `expected_input` by the time predict is invoked.
            assert_equal(model_input, expected_input)
            return expected_output

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            input_example=example,
        )
        # Signature is inferred from the input example during logging.
        assert model_info.signature == signature

    # Round-trip the example saved with the model through the scoring server.
    mlflow_model = Model.load(model_info.model_uri)
    local_path = _download_artifact_from_uri(model_info.model_uri, output_path=tmp_path)
    serving_input_example = mlflow_model.get_serving_input(local_path)
    response = score_model_in_process(
        model_info.model_uri,
        data=serving_input_example,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    # Unified LLM inputs return the raw payload; others wrap it in "predictions".
    if is_unified_llm_input(example):
        result = json.loads(response.content.decode("utf-8"))
    else:
        result = json.loads(response.content.decode("utf-8"))["predictions"]
    assert_equal(result, expected_output)

    # make sure validate_serving_input has the same output
    assert convert_input_example_to_serving_input(example) == serving_input_example
    result = validate_serving_input(model_info.model_uri, serving_input_example)
    assert_equal(result, expected_output)
2248  
2249  
def test_pyfunc_schema_inference_not_generate_trace():
    """Signature inference during logging runs predict internally, but that
    internal call must not emit a trace; a normal predict afterwards must."""

    class MyModel(mlflow.pyfunc.PythonModel):
        @mlflow.trace()
        def predict(self, context, model_input):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            input_example=["input"],
        )

    # The inference-time predict call must not have been traced.
    assert len(get_traces()) == 0

    # A user-initiated prediction should produce exactly one trace.
    loaded = mlflow.pyfunc.load_model(model_info.model_uri)
    loaded.predict("input")
    assert len(get_traces()) == 1
2275  
2276  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        ({"a": np.array([1, 2, 3])}, Schema([ColSpec(DataType.long, name="a")])),
        ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])),
        (
            {"query": ["sentence_1", "sentence_2"]},
            Schema([ColSpec(DataType.string, name="query")]),
        ),
        (
            {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            Schema([
                ColSpec(DataType.string, name="query"),
                ColSpec(DataType.string, name="table"),
            ]),
        ),
        (
            [{"query": "sentence"}, {"query": "sentence"}],
            Schema([ColSpec(DataType.string, name="query")]),
        ),
        (
            [
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            ],
            Schema([
                ColSpec(DataType.string, name="query"),
                ColSpec(DataType.string, name="table"),
            ]),
        ),
    ],
)
def test_pyfunc_model_schema_enforcement_with_dicts_and_lists(data, schema):
    """Dict and list-of-dict inputs are coerced into a DataFrame matching the
    declared column schema before reaching predict."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=ModelSignature(schema),
        )
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    prediction = loaded_model.predict(data)

    def _is_str_or_str_list(value):
        # A dict whose values are all strings or lists of strings is a single
        # row, so it needs to be wrapped in a list before DataFrame creation.
        if isinstance(value, str):
            return True
        return isinstance(value, list) and all(isinstance(item, str) for item in value)

    if isinstance(data, dict) and all(_is_str_or_str_list(v) for v in data.values()):
        df = pd.DataFrame([data])
    else:
        df = pd.DataFrame(data)
    pd.testing.assert_frame_equal(prediction, df)

    # A DataFrame input should pass through schema enforcement unchanged.
    pd.testing.assert_frame_equal(loaded_model.predict(df), df)
2335  
2336  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])),
        (
            {"query": ["sentence_1", "sentence_2"]},
            Schema([ColSpec(DataType.string, name="query")]),
        ),
        (
            {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            Schema([
                ColSpec(DataType.string, name="query"),
                ColSpec(DataType.string, name="table"),
            ]),
        ),
    ],
)
# `instances` is an invalid key for schema with MLflow < 2.9.0
@pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"])
def test_pyfunc_model_serving_with_dicts(data, schema, format_key):
    """Serve dict inputs through each supported JSON payload format and check
    the echoed predictions match the DataFrame view of the input."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=ModelSignature(schema),
        )

    # All-string dict values mean a single row; otherwise columns are lists.
    if all(isinstance(v, str) for v in data.values()):
        df = pd.DataFrame([data])
    else:
        df = pd.DataFrame(data)

    if format_key == "inputs":
        payload = {format_key: data}
    else:
        # Strip the "dataframe_" prefix to get the pandas orient name
        # ("split" or "records").
        payload = {format_key: df.to_dict(orient=format_key[len("dataframe_"):])}

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps(payload),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    # This is not consistent with batch inference df
    pd.testing.assert_frame_equal(pd.DataFrame(predictions), df)
2388  
2389  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        (
            [{"query": "sentence"}, {"query": "sentence"}],
            Schema([ColSpec(DataType.string, name="query")]),
        ),
        (
            [
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            ],
            Schema([
                ColSpec(DataType.string, name="query"),
                ColSpec(DataType.string, name="table"),
            ]),
        ),
    ],
)
# `inputs` is an invalid key for schema with MLflow < 2.9.0
@pytest.mark.parametrize("format_key", ["instances", "dataframe_split", "dataframe_records"])
def test_pyfunc_model_serving_with_lists_of_dicts(data, schema, format_key):
    """Serve list-of-dict inputs through `instances` and the dataframe payload
    formats, expecting the input rows to be echoed back."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=ModelSignature(schema),
        )

    df = pd.DataFrame(data)
    if format_key == "instances":
        payload = {format_key: data}
    else:
        # Strip the "dataframe_" prefix to get the pandas orient name.
        payload = {format_key: df.to_dict(orient=format_key[len("dataframe_"):])}

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps(payload),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    pd.testing.assert_frame_equal(pd.DataFrame(predictions), df)
2438  
2439  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        ({"query": "sentence"}, Schema([ColSpec(DataType.string, name="query")])),
        (
            {"query": ["sentence_1", "sentence_2"]},
            Schema([ColSpec(Array(DataType.string), name="query")]),
        ),
        (
            {"query": {"a": "a", "b": 1}},
            Schema([
                ColSpec(
                    Object([Property("a", DataType.string), Property("b", DataType.long)]),
                    "query",
                )
            ]),
        ),
        (
            {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            Schema([
                ColSpec(Array(DataType.string), name="query"),
                ColSpec(DataType.string, name="table"),
            ]),
        ),
        (
            {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]},
            Schema([
                ColSpec(
                    Array(
                        Object([
                            Property("name", DataType.string),
                            Property("age", DataType.long, required=False),
                        ])
                    ),
                    name="query",
                ),
                ColSpec(Array(DataType.string), name="table"),
            ]),
        ),
        (
            [{"query": "sentence"}, {"query": "sentence"}],
            Schema([ColSpec(DataType.string, name="query")]),
        ),
        (
            [
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
                {"query": ["sentence_1", "sentence_2"]},
            ],
            Schema([
                ColSpec(Array(DataType.string), name="query"),
                ColSpec(DataType.string, name="table", required=False),
            ]),
        ),
    ],
)
def test_pyfunc_model_schema_enforcement_with_objects_and_arrays(data, schema):
    """Object/Array column types are inferred identically from raw data and
    from the equivalent DataFrame, and enforcement yields that DataFrame."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def load_context(self, context):
            self.pipeline = "pipeline"

        def predict(self, context, model_input, params=None):
            # load_context must have been called before predict.
            assert self.pipeline == "pipeline"
            return model_input

    signature = infer_signature(data)
    assert signature.inputs == schema
    # Inferring from the DataFrame view must produce the same input schema.
    df = pd.DataFrame(data) if isinstance(data, list) else pd.DataFrame([data])
    assert infer_signature(df).inputs == schema

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=signature,
        )
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    pd.testing.assert_frame_equal(loaded_model.predict(data), df)

    # A DataFrame input should pass through enforcement unchanged.
    pd.testing.assert_frame_equal(loaded_model.predict(df), df)
2523  
2524  
@pytest.mark.parametrize(
    "data",
    [
        {"query": "sentence"},
        {"query": ["sentence_1", "sentence_2"]},
        {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
        {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]},
        [{"query": "sentence"}, {"query": "sentence"}],
        [
            {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            {"query": ["sentence_1", "sentence_2"]},
        ],
        [
            {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]},
            {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]},
        ],
    ],
)
@pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"])
def test_pyfunc_model_scoring_with_objects_and_arrays(data, format_key):
    """Score object/array payloads through each JSON format and compare the
    predictions to the record-oriented DataFrame view of the input."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=infer_signature(data),
        )

    # A bare dict is a single row; a list of dicts is one row per element.
    df = pd.DataFrame([data]) if isinstance(data, dict) else pd.DataFrame(data)

    orients = {"dataframe_split": "split", "dataframe_records": "records"}
    if format_key == "inputs":
        payload = {format_key: data}
    else:
        payload = {format_key: df.to_dict(orient=orients[format_key])}

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps(payload),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    np.testing.assert_equal(predictions, df.to_dict(orient="records"))
2574  
2575  
@pytest.mark.parametrize(
    "data",
    [
        {"query": "sentence"},
        {"query": ["sentence_1", "sentence_2"]},
        {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
        {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]},
        [{"query": "sentence"}, {"query": "sentence"}],
    ],
)
def test_pyfunc_model_scoring_with_objects_and_arrays_instances(data):
    """Score object/array payloads via the `instances` format and compare the
    predictions to the record-oriented DataFrame view of the input."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=infer_signature(data),
        )

    # A bare dict is one row; a list of dicts is one row per element.
    df = pd.DataFrame([data]) if isinstance(data, dict) else pd.DataFrame(data)
    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps({"instances": data}),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    np.testing.assert_equal(predictions, df.to_dict(orient="records"))
2608  
2609  
@pytest.mark.parametrize(
    "data",
    [
        [{"query": {"a": "b"}, "name": "A"}, {"query": {"a": "c"}, "name": "B"}],
        [
            {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            {"query": ["sentence_1", "sentence_2"]},
        ],
        [
            {"query": [{"name": "value"}, {"name": "value"}], "table": ["some_table"]},
            {"query": [{"name": "value", "age": 10}, {"name": "value"}], "table": ["some_table"]},
        ],
    ],
)
def test_pyfunc_model_scoring_with_objects_and_arrays_instances_errors(data):
    """These payloads cannot be represented in the `instances` format, so the
    scoring server must reject them with a schema-enforcement error."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=infer_signature(data),
        )

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps({"instances": data}),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 400, response.content
    message = json.loads(response.content.decode("utf-8"))["message"]
    assert "Failed to enforce schema" in message
2643  
2644  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        (
            [{"query": "question1"}, {"query": "question2"}],
            Schema([ColSpec(DataType.string, "query")]),
        ),
        (
            [{"query": ["sentence_1", "sentence_2"]}, {"query": ["sentence_1", "sentence_2"]}],
            Schema([ColSpec(DataType.string, "query")]),
        ),
        (
            [
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
                {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
            ],
            Schema([ColSpec(DataType.string, "query"), ColSpec(DataType.string, "table")]),
        ),
    ],
)
def test_pyfunc_model_scoring_instances_backwards_compatibility(data, schema):
    """Legacy `instances` payloads are still accepted against these declared
    schemas and the input rows are echoed back unchanged."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=ModelSignature(schema),
        )

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps({"instances": data}),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    np.testing.assert_equal(predictions, data)
2685  
2686  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        (
            {
                # Fixed typo: "netsed_list" -> "nested_list" (renamed
                # consistently with the schema's column name below).
                "nested_list": [
                    [["a", "b"], ["c", "d"]],
                    [["e", "f"], ["g"]],
                ]
            },
            Schema([ColSpec(Array(Array(DataType.string)), name="nested_list")]),
        ),
        (
            {
                "numpy_2d_array": [
                    np.array([[np.int32(1), np.int32(2)], [np.int32(3), np.int32(4)]])
                ]
            },
            Schema([ColSpec(Array(Array(DataType.integer)), name="numpy_2d_array")]),
        ),
        (
            {"list_of_np_array": [[np.array(["a", "b"])], [np.array(["c", "d"])]]},
            Schema([ColSpec(Array(Array(DataType.string)), name="list_of_np_array")]),
        ),
    ],
)
def test_pyfunc_model_schema_enforcement_nested_array(data, schema):
    """Nested arrays (lists of lists, numpy 2-D arrays, lists of numpy
    arrays) infer Array(Array(...)) schemas and pass enforcement unchanged."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    df = pd.DataFrame.from_records(data)
    signature = infer_signature(df)
    assert signature.inputs == schema

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=signature,
        )
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    prediction = loaded_model.predict(df)
    pd.testing.assert_frame_equal(prediction, df)
2731  
2732  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        (
            {
                "simple_map": [
                    {"a": 3, "b": 4},
                    {},
                    {"c": 5},
                ]
            },
            Schema([ColSpec(Map(value_type=DataType.long), name="simple_map")]),
        ),
        (
            {
                "simple_map": [
                    {"a": 3, "b": 4},
                    {},
                    {"c": 5},
                ]
            },
            Schema([ColSpec(Map(value_type=DataType.long))]),  # Unnamed column
        ),
        (
            {
                "nested_map": [
                    {"a": {"a1": 3, "a2": 4}, "b": {"b1": 5}},
                    {},
                    {"c": {}},
                ]
            },
            Schema([ColSpec(Map(value_type=Map(value_type=DataType.long)), name="nested_map")]),
        ),
        (
            {
                "array_in_map": [
                    {"a": [1, 2, 3], "b": [4, 5]},
                    {},
                    {"c": []},
                ]
            },
            Schema([ColSpec(Map(value_type=Array(dtype=DataType.long)), name="array_in_map")]),
        ),
        (
            {
                "object_in_map": [
                    {"a": {"key1": "a1", "key2": 1}, "b": {"key1": "b1"}},
                    {},
                    {"c": {"key1": "c1"}},
                ]
            },
            Schema([
                ColSpec(
                    Map(
                        value_type=Object([
                            Property("key1", DataType.string),
                            Property("key2", DataType.long, required=False),
                        ])
                    ),
                    name="object_in_map",
                )
            ]),
        ),
        (
            {
                "map_in_array": [
                    [{"a": 3, "b": 4}, {"c": 5}],
                    [],
                    [{"d": 6}],
                ]
            },
            Schema([ColSpec(Array(dtype=Map(value_type=DataType.long)), name="map_in_array")]),
        ),
        (
            {
                "map_in_object": [
                    {"key1": {"a": 3, "b": 4}, "key2": {"c": 5}},
                    {"key1": {"d": 6}},
                ]
            },
            Schema([
                ColSpec(
                    Object([
                        Property("key1", Map(value_type=DataType.long)),
                        Property("key2", Map(value_type=DataType.long), required=False),
                    ]),
                    name="map_in_object",
                )
            ]),
        ),
    ],
)
@pytest.mark.parametrize("format_key", ["dataframe_split", "dataframe_records"])
def test_pyfunc_model_schema_enforcement_map_type(data, schema, format_key):
    """Map-typed columns survive schema enforcement both for direct pyfunc
    predict and for the scoring server's dataframe payload formats."""

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    df = pd.DataFrame.from_records(data)

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=ModelSignature(inputs=schema, outputs=schema),
        )
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)
    prediction = loaded_model.predict(df)
    pd.testing.assert_frame_equal(prediction, df)

    if format_key == "dataframe_split":
        payload = {format_key: df.to_dict(orient="split")}
    elif format_key == "dataframe_records":
        payload = {format_key: df.to_dict(orient="records")}

    class CustomJsonEncoder(json.JSONEncoder):
        """JSON encoder that converts numpy integer scalars (which the stdlib
        `json` module cannot serialize) to built-in ints."""

        def default(self, o):
            # `np` is already imported at module scope; the previous local
            # re-import was redundant. np.integer covers np.int64 and the
            # other numpy integer scalar types produced by `to_dict`.
            if isinstance(o, np.integer):
                return int(o)

            return super().default(o)

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps(payload, cls=CustomJsonEncoder),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    result = json.loads(response.content.decode("utf-8"))["predictions"]
    expected_result = df.to_dict(orient="records")
    np.testing.assert_equal(result, expected_result)
2866  
2867  
@pytest.mark.parametrize(
    ("data", "schema"),
    [
        (
            [
                {
                    "object_column": {"query": ["sentence_1", "sentence_2"], "table": "some_table"},
                    "string_column": "some_string",
                    "array_column": [{"name": "value"}, {"name": "value"}],
                },
                {
                    "object_column": {"query": ["sentence_1", "sentence_2"]},
                    "string_column": "some_string",
                    "array_column": [{"name": "value"}],
                },
            ],
            Schema([
                ColSpec(
                    Object([
                        Property("query", Array(DataType.string)),
                        Property("table", DataType.string, required=False),
                    ]),
                    "object_column",
                ),
                ColSpec(DataType.string, "string_column"),
                ColSpec(
                    Array(Object([Property("name", DataType.string)])),
                    "array_column",
                ),
            ]),
        ),
    ],
)
@pytest.mark.parametrize("format_key", ["inputs", "dataframe_split", "dataframe_records"])
def test_pyfunc_model_schema_enforcement_complex(data, schema, format_key):
    """Schema enforcement for nested Object/Array columns.

    Checks that signature inference produces the expected schema and that both
    in-process prediction and every serving payload format echo the input.
    """

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    input_df = pd.DataFrame.from_records(data)
    inferred = infer_signature(input_df)
    assert inferred.inputs == schema

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            signature=inferred,
        )

    # In-process prediction must hand back the input frame unchanged.
    pyfunc_model = mlflow.pyfunc.load_model(model_info.model_uri)
    pd.testing.assert_frame_equal(pyfunc_model.predict(input_df), input_df)

    # Build the serving payload for the requested wire format.
    payload_builders = {
        "inputs": lambda: data,
        "dataframe_split": lambda: input_df.to_dict(orient="split"),
        "dataframe_records": lambda: input_df.to_dict(orient="records"),
    }
    payload = {format_key: payload_builders[format_key]()}

    response = score_model_in_process(
        model_info.model_uri,
        data=json.dumps(payload),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    np.testing.assert_equal(predictions, input_df.to_dict(orient="records"))
2937  
2938  
def test_zero_or_one_longs_convert_to_floats():
    """Integer 0/1 values in a double column are coerced to floats.

    A missing value (empty record) in the optional column must come out as NaN.
    """
    frame = pd.DataFrame([{"temperature": 0}, {"temperature": 0.9}, {"temperature": 1}, {}])
    schema = Schema([ColSpec(DataType.double, name="temperature", required=False)])
    enforced = _enforce_schema(frame, schema)
    expected = pd.Series([0.0, 0.9, 1.0, np.nan], dtype=np.float64)
    pd.testing.assert_series_equal(enforced["temperature"], expected, check_names=False)
2946  
2947  
@pytest.mark.parametrize(
    ("input_example", "expected_schema", "payload_example"),
    [
        ({"a": None}, Schema([ColSpec(type=AnyType(), name="a", required=False)]), {"a": "string"}),
        (
            {"a": [None, []]},
            Schema([ColSpec(Array(AnyType()), name="a", required=False)]),
            {"a": ["abc", "123"]},
        ),
        (
            {"a": [None]},
            Schema([ColSpec(type=Array(AnyType()), name="a", required=False)]),
            {"a": ["abc"]},
        ),
        (
            {"a": [None, "string"]},
            Schema([ColSpec(type=Array(DataType.string), name="a", required=False)]),
            {"a": ["abc"]},
        ),
        (
            {"a": {"x": None}},
            Schema([ColSpec(type=Object([Property("x", AnyType(), required=False)]), name="a")]),
            {"a": {"x": 234}},
        ),
        (
            [
                {
                    "messages": [
                        {
                            "content": "You are a helpful assistant.",
                            "additional_kwargs": {},
                            "response_metadata": {},
                            "type": "system",
                            "name": None,
                            "id": None,
                        },
                        {
                            "content": "What would you like to ask?",
                            "additional_kwargs": {},
                            "response_metadata": {},
                            "type": "ai",
                            "name": None,
                            "id": None,
                            "example": False,
                            "tool_calls": [],
                            "invalid_tool_calls": [],
                            "usage_metadata": None,
                        },
                        {
                            "content": "Who owns MLflow?",
                            "additional_kwargs": {},
                            "response_metadata": {},
                            "type": "human",
                            "name": None,
                            "id": None,
                            "example": False,
                        },
                    ],
                    "text": "Hello?",
                }
            ],
            Schema([
                ColSpec(
                    Array(
                        Object(
                            properties=[
                                Property("content", DataType.string),
                                Property("additional_kwargs", AnyType(), required=False),
                                Property("response_metadata", AnyType(), required=False),
                                Property("type", DataType.string),
                                Property("name", AnyType(), required=False),
                                Property("id", AnyType(), required=False),
                                Property("example", DataType.boolean, required=False),
                                Property("tool_calls", AnyType(), required=False),
                                Property("invalid_tool_calls", AnyType(), required=False),
                                Property("usage_metadata", AnyType(), required=False),
                            ]
                        )
                    ),
                    name="messages",
                ),
                ColSpec(DataType.string, name="text"),
            ]),
            [
                {
                    "messages": [
                        {
                            "content": "You are a helpful assistant.",
                            "additional_kwargs": {"x": "x"},
                            "response_metadata": {"y": "y"},
                            "type": "system",
                            "name": "test",
                            "id": 1234567,
                            "tool_calls": [{"tool1": "abc"}],
                            "invalid_tool_calls": ["tool2", "tool3"],
                        },
                    ],
                    "text": "Hello?",
                }
            ],
        ),
    ],
)
def test_schema_enforcement_for_anytype(input_example, expected_schema, payload_example):
    """AnyType columns inferred from None-bearing examples accept arbitrary payloads.

    The signature inferred from ``input_example`` must match ``expected_schema``,
    and a structurally different ``payload_example`` must still pass enforcement
    both in-process and through the scoring server.
    """

    class MyModel(mlflow.pyfunc.PythonModel):
        def predict(self, context, model_input, params=None):
            return model_input

    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            name="test_model",
            python_model=MyModel(),
            input_example=input_example,
        )
    assert model_info.signature.inputs == expected_schema

    # Enforcement wraps a single-record dict into a one-row frame.
    if isinstance(payload_example, list):
        expected_df = pd.DataFrame(payload_example)
    else:
        expected_df = pd.DataFrame([payload_example])

    pyfunc_model = mlflow.pyfunc.load_model(model_info.model_uri)
    pd.testing.assert_frame_equal(pyfunc_model.predict(payload_example), expected_df)

    serving_input = convert_input_example_to_serving_input(payload_example)
    response = score_model_in_process(
        model_info.model_uri,
        data=serving_input,
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON,
    )
    assert response.status_code == 200, response.content
    predictions = json.loads(response.content.decode("utf-8"))["predictions"]
    np.testing.assert_equal(predictions, expected_df.to_dict(orient="records"))