# mlflow/types/utils.py
import logging
import warnings
from collections import defaultdict
from copy import deepcopy
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import pydantic

from mlflow.exceptions import MlflowException
from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE
from mlflow.types import DataType
from mlflow.types.schema import (
    HAS_PYSPARK,
    AnyType,
    Array,
    ColSpec,
    Map,
    Object,
    ParamSchema,
    ParamSpec,
    Property,
    Schema,
    SparkMLVector,
    TensorSpec,
)

MULTIPLE_TYPES_ERROR_MSG = (
    "Expected all values in the list to be of the same type. To specify a model signature "
    "with a list containing elements of multiple types, define the signature manually "
    "using the Array(AnyType()) type from mlflow.models.schema."
)
_logger = logging.getLogger(__name__)


class TensorsNotSupportedException(MlflowException):
    def __init__(self, msg):
        super().__init__(f"Multidimensional arrays (aka tensors) are not supported. {msg}")


def _get_tensor_shape(data, variable_dimension: int | None = 0) -> tuple[int, ...]:
    """Infer the shape of the inputted data.

    This method creates the shape of the tensor to store in the TensorSpec. The variable dimension
    is assumed to be the first dimension by default. This assumption can be overridden by inputting
    a different variable dimension or `None` to represent that the input tensor does not contain a
    variable dimension.

    Args:
        data: Dataset to infer from.
        variable_dimension: An optional integer representing a variable dimension.

    Returns:
        tuple: Shape of the inputted data (including a variable dimension)
    """
    from scipy.sparse import csc_matrix, csr_matrix

    if not isinstance(data, (np.ndarray, csr_matrix, csc_matrix)):
        raise TypeError(f"Expected numpy.ndarray or csc/csr matrix, got '{type(data)}'.")
    variable_input_data_shape = data.shape
    if variable_dimension is not None:
        try:
            variable_input_data_shape = list(variable_input_data_shape)
            variable_input_data_shape[variable_dimension] = -1
        except IndexError:
            raise MlflowException(
                f"The specified variable_dimension {variable_dimension} is out of bounds with "
                f"respect to the number of dimensions {data.ndim} in the input dataset"
            )
    return tuple(variable_input_data_shape)


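# Illustrative usage sketch (added here for documentation; the `_example_*` helper below is
# not part of the original module): how the default and overridden variable dimensions affect
# the inferred tensor shape.
def _example_get_tensor_shape() -> None:
    data = np.zeros((3, 4))
    assert _get_tensor_shape(data) == (-1, 4)       # axis 0 is variable by default
    assert _get_tensor_shape(data, 1) == (3, -1)    # mark axis 1 as variable instead
    assert _get_tensor_shape(data, None) == (3, 4)  # no variable dimension

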
def clean_tensor_type(dtype: np.dtype):
    """
    This method strips away the size information stored in flexible datatypes such as np.str_ and
    np.bytes_. Other numpy dtypes are returned unchanged.

    Args:
        dtype: Numpy dtype of a tensor

    Returns:
        dtype: Cleaned numpy dtype
    """
    if not isinstance(dtype, np.dtype):
        raise TypeError(
            f"Expected `type` to be instance of `{np.dtype}`, received `{dtype.__class__}`"
        )

    # Special casing for np.str_ and np.bytes_
    if dtype.char == "U":
        return np.dtype("str")
    elif dtype.char == "S":
        return np.dtype("bytes")
    return dtype


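# Illustrative usage sketch (added; the helper below is not part of the original module):
# flexible string/bytes dtypes lose their length information, while fixed-width dtypes pass
# through unchanged.
def _example_clean_tensor_type() -> None:
    assert clean_tensor_type(np.array(["ab", "cde"]).dtype) == np.dtype("str")  # '<U3' -> '<U'
    assert clean_tensor_type(np.array([b"ab"]).dtype) == np.dtype("bytes")      # '|S2' -> '|S'
    assert clean_tensor_type(np.dtype(np.float64)) == np.dtype(np.float64)      # unchanged

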
def _infer_colspec_type(data: Any) -> DataType | Array | Object | AnyType:
    """
    Infer an MLflow ColSpec type from the dataset.

    Args:
        data: data to infer from.

    Returns:
        The inferred type: a scalar DataType, Array, Object, or AnyType.
    """
    dtype = _infer_datatype(data)

    if dtype is None:
        raise MlflowException(
            f"Numpy array must include at least one non-empty item. Invalid input `{data}`."
        )

    return dtype


class InvalidDataForSignatureInferenceError(MlflowException):
    def __init__(self, message):
        super().__init__(message=message, error_code=INVALID_PARAMETER_VALUE)


def _infer_datatype(data: Any) -> DataType | Array | Object | AnyType | None:
    """
    Infer the datatype of input data.

    Data type to inferred schema type mapping:
        - dict -> Object
        - list -> Array
        - numpy.ndarray -> Array
        - scalar -> DataType
        - None, empty dictionary/list -> AnyType

    .. Note::
        Empty numpy arrays are inferred as None for backward compatibility, since numpy
        arrays are used by some traditional ML flavors,
        e.g. numpy.array([]) -> None, numpy.array([[], []]) -> None.
        Empty lists, by contrast, are inferred as AnyType (rather than None) now that
        AnyType is supported, e.g. [] -> AnyType, [[], []] -> Array(Any).
    """
    if isinstance(data, pydantic.BaseModel):
        raise InvalidDataForSignatureInferenceError(
            message="MLflow does not support inferring model signature from input example "
            "with Pydantic objects. To use Pydantic objects, define your PythonModel's "
            "`predict` method with a Pydantic type hint, and model signature will be automatically "
            "inferred when logging the model. e.g. "
            "`def predict(self, model_input: list[PydanticType])`. Check "
            "https://mlflow.org/docs/latest/model/python_model.html#type-hint-usage-in-pythonmodel "
            "for more details."
        )

    if _is_none_or_nan(data) or (isinstance(data, (list, dict)) and not data):
        return AnyType()

    if isinstance(data, dict):
        properties = []
        for k, v in data.items():
            dtype = _infer_datatype(v)
            if dtype is None:
                raise MlflowException("Dictionary value must not be an empty numpy array.")
            properties.append(
                Property(name=k, dtype=dtype, required=not isinstance(dtype, AnyType))
            )
        return Object(properties=properties)

    if isinstance(data, (list, np.ndarray)):
        return _infer_array_datatype(data)

    return _infer_scalar_datatype(data)


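# Illustrative usage sketch (added; the helper below is not part of the original module):
# the container-to-type mapping described in the docstring above, exercised on a few literals.
def _example_infer_datatype() -> None:
    assert _infer_datatype("some sentence") == DataType.string   # scalar -> DataType
    inferred = _infer_datatype([1, 2, 3])                        # list -> Array
    assert isinstance(inferred, Array) and inferred.dtype == DataType.long
    assert isinstance(_infer_datatype({"key": "value"}), Object)  # dict -> Object
    assert isinstance(_infer_datatype(None), AnyType)             # missing value -> AnyType
    assert _infer_datatype(np.array([])) is None                  # empty ndarray -> None

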
def _infer_array_datatype(data: list[Any] | np.ndarray) -> Array | None:
    """Infer the schema of an array. A type is inferred as long as the list contains at
    least one item whose type can be determined, assuming the list has a homogeneous type.
    If the list is empty or the type of every item is undetermined (e.g. empty numpy
    arrays), None is returned to signal that the type could not be inferred.

    E.g.
        ["a", "b"] => Array(string)
        ["a", None] => Array(string)
        [["a", "b"], []] => Array(Array(string))
        [["a", "b"], None] => Array(Array(string))
        [] => None
        [None] => Array(Any)

    Args:
        data: data to infer from.

    Returns:
        Array(dtype) or None if undetermined
    """
    result = None
    for item in data:
        dtype = _infer_datatype(item)

        # Skip item with undetermined type
        if dtype is None:
            continue

        if result is None:
            result = Array(dtype)
        elif isinstance(result.dtype, (Array, Object, Map, AnyType)):
            try:
                result = Array(result.dtype._merge(dtype))
            except MlflowException as e:
                raise MlflowException.invalid_parameter_value(MULTIPLE_TYPES_ERROR_MSG) from e
        elif isinstance(result.dtype, DataType):
            if not isinstance(dtype, AnyType) and dtype != result.dtype:
                raise MlflowException.invalid_parameter_value(MULTIPLE_TYPES_ERROR_MSG)
        else:
            raise MlflowException.invalid_parameter_value(
                f"{dtype} is not a valid type for an item of a list or numpy array."
            )
    return result


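# Illustrative usage sketch (added; the helper below is not part of the original module):
# homogeneous lists infer cleanly, while mixing scalar types raises the error described by
# MULTIPLE_TYPES_ERROR_MSG at the top of this module.
def _example_infer_array_datatype() -> None:
    inferred = _infer_array_datatype(["a", "b"])
    assert isinstance(inferred, Array) and inferred.dtype == DataType.string
    assert _infer_array_datatype([]) is None  # undetermined
    try:
        _infer_array_datatype(["a", 1])  # mixed string/long items
        raise AssertionError("expected MlflowException for mixed types")
    except MlflowException as e:
        assert MULTIPLE_TYPES_ERROR_MSG in str(e)

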
# datetime is not included here
SCALAR_TO_DATATYPE_MAPPING = {
    bool: DataType.boolean,
    np.bool_: DataType.boolean,
    int: DataType.long,
    np.int64: DataType.long,
    np.int32: DataType.integer,
    float: DataType.double,
    np.float64: DataType.double,
    np.float32: DataType.float,
    str: DataType.string,
    np.str_: DataType.string,
    object: DataType.string,
    bytes: DataType.binary,
    np.bytes_: DataType.binary,
    bytearray: DataType.binary,
}


def _infer_scalar_datatype(data) -> DataType:
    if data_type := SCALAR_TO_DATATYPE_MAPPING.get(type(data)):
        return data_type
    if DataType.check_type(DataType.datetime, data):
        return DataType.datetime
    if HAS_PYSPARK:
        for data_type in DataType.all_types():
            if isinstance(data, type(data_type.to_spark())):
                return data_type
    raise MlflowException.invalid_parameter_value(
        f"Data {data} is not one of the supported DataType"
    )


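# Illustrative usage sketch (added; the helper below is not part of the original module):
# Python and numpy scalars map onto MLflow DataType values via SCALAR_TO_DATATYPE_MAPPING,
# while datetimes are handled separately through DataType.check_type.
def _example_infer_scalar_datatype() -> None:
    assert _infer_scalar_datatype(True) is DataType.boolean
    assert _infer_scalar_datatype(3) is DataType.long  # Python ints map to long
    assert _infer_scalar_datatype(np.float32(1.5)) is DataType.float
    assert _infer_scalar_datatype(np.datetime64("2024-01-01")) is DataType.datetime

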
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    Data passed in as a numpy array, a scipy sparse matrix, or a dictionary of
    (name -> numpy.ndarray) is represented by :py:class:`TensorSpec`. All other supported
    input types are represented by :py:class:`ColSpec`.

    A `TensorSpec` captures the data shape (default variable axis is 0), the data type (numpy.dtype)
    and an optional name for each individual tensor of the dataset.
    A `ColSpec` captures the data type (defined in :py:class:`DataType`) and an optional name for
    each individual column of the dataset.

    This method will raise an exception if the user data contains incompatible types or is not
    passed in one of the supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame
      - pandas.Series
      - numpy.ndarray
      - dictionary of (name -> numpy.ndarray)
      - pyspark.sql.DataFrame
      - scipy.sparse.csr_matrix/csc_matrix
      - DataType
      - List[DataType]
      - Dict[str, Union[DataType, List, Dict]]
      - List[Dict[str, Union[DataType, List, Dict]]]

    The last two formats are used to represent complex data structures. For example,

        Input Data:
            [
                {
                    'text': 'some sentence',
                    'ids': ['id1'],
                    'dict': {'key': 'value'}
                },
                {
                    'text': 'some sentence',
                    'ids': ['id1', 'id2'],
                    'dict': {'key': 'value', 'key2': 'value2'}
                },
            ]

        The corresponding pandas DataFrame representation should look like this:

                          text         ids                                dict
            0  some sentence       [id1]                    {'key': 'value'}
            1  some sentence  [id1, id2]  {'key': 'value', 'key2': 'value2'}

        The inferred schema should look like this:

            Schema([
                ColSpec(type=DataType.string, name='text'),
                ColSpec(type=Array(dtype=DataType.string), name='ids'),
                ColSpec(
                    type=Object([
                        Property(name='key', dtype=DataType.string),
                        Property(name='key2', dtype=DataType.string, required=False)
                    ]),
                    name='dict',
                ),
            ])

    The element types should be mappable to one of :py:class:`mlflow.types.DataType` for
    dataframes and to one of numpy types for tensors.

    Args:
        data: Dataset to infer from.

    Returns:
        Schema
    """
    from scipy.sparse import csc_matrix, csr_matrix

    # To keep backward compatibility with < 2.9.0, an empty list is inferred as string.
    #   ref: https://github.com/mlflow/mlflow/pull/10125#discussion_r1372751487
    if isinstance(data, list) and data == []:
        return Schema([ColSpec(DataType.string)])

    if isinstance(data, list) and all(isinstance(value, dict) for value in data):
        col_data_mapping = defaultdict(list)
        for item in data:
            for k, v in item.items():
                col_data_mapping[k].append(v)
        requiredness = {}
        for col in col_data_mapping:
            # A column is required only if it appears with a non-None value in every row.
            requiredness[col] = all(item.get(col) is not None for item in data)

        schema = Schema([
            ColSpec(_infer_colspec_type(values).dtype, name=name, required=requiredness[name])
            for name, values in col_data_mapping.items()
        ])

    elif isinstance(data, dict):
        # dictionary of (name -> numpy.ndarray)
        if all(isinstance(values, np.ndarray) for values in data.values()):
            schema = Schema([
                TensorSpec(
                    type=clean_tensor_type(ndarray.dtype),
                    shape=_get_tensor_shape(ndarray),
                    name=name,
                )
                for name, ndarray in data.items()
            ])
        # Dict[str, Union[DataType, List, Dict]]
        else:
            if any(not isinstance(key, str) for key in data):
                raise MlflowException("The dictionary keys are not all strings.")
            schema = Schema([
                ColSpec(
                    _infer_colspec_type(value),
                    name=name,
                    required=_infer_required(value),
                )
                for name, value in data.items()
            ])
    # pandas.Series
    elif isinstance(data, pd.Series):
        name = getattr(data, "name", None)
        schema = Schema([
            ColSpec(
                type=_infer_pandas_column(data),
                name=name,
                required=_infer_required(data),
            )
        ])
    # pandas.DataFrame
    elif isinstance(data, pd.DataFrame):
        schema = Schema([
            ColSpec(
                type=_infer_pandas_column(data[col]),
                name=col,
                required=_infer_required(data[col]),
            )
            for col in data.columns
        ])
    # numpy.ndarray
    elif isinstance(data, np.ndarray):
        schema = Schema([
            TensorSpec(type=clean_tensor_type(data.dtype), shape=_get_tensor_shape(data))
        ])
    # scipy.sparse.csr_matrix/csc_matrix
    elif isinstance(data, (csc_matrix, csr_matrix)):
        schema = Schema([
            TensorSpec(type=clean_tensor_type(data.data.dtype), shape=_get_tensor_shape(data))
        ])
    # pyspark.sql.DataFrame
    elif _is_spark_df(data):
        schema = Schema([
            ColSpec(
                type=_infer_spark_type(field.dataType, data, field.name),
                name=field.name,
                # Do not derive the `required` field from the Spark schema: Spark
                # columns are nullable by default, whereas ColSpec defaults to
                # required=True.
            )
            for field in data.schema.fields
        ])
    elif isinstance(data, list):
        # Treat the list as a single column
        # List[DataType]
        # e.g. ['some sentence', 'some sentence'] -> Schema([ColSpec(type=DataType.string)])
        # The corresponding pandas DataFrame representation should be pd.DataFrame(data)
        # We keep the default required=True because unnamed optional inputs are not allowed
        schema = Schema([ColSpec(_infer_colspec_type(data).dtype)])
    else:
        # DataType
        # e.g. "some sentence" -> Schema([ColSpec(type=DataType.string)])
        try:
            # We keep the default required=True because unnamed optional inputs are not allowed
            schema = Schema([ColSpec(_infer_colspec_type(data))])
        except MlflowException as e:
            raise MlflowException.invalid_parameter_value(
                "Failed to infer schema. Expected one of the following types:\n"
                "- pandas.DataFrame\n"
                "- pandas.Series\n"
                "- numpy.ndarray\n"
                "- dictionary of (name -> numpy.ndarray)\n"
                "- pyspark.sql.DataFrame\n"
                "- scipy.sparse.csr_matrix\n"
                "- scipy.sparse.csc_matrix\n"
                "- DataType\n"
                "- List[DataType]\n"
                "- Dict[str, Union[DataType, List, Dict]]\n"
                "- List[Dict[str, Union[DataType, List, Dict]]]\n"
                f"but got '{data}'.\n"
                f"Error: {e}",
            )
    if not schema.is_tensor_spec() and any(
        t in (DataType.integer, DataType.long) for t in schema.input_types()
    ):
        warnings.warn(
            "Hint: Inferred schema contains integer column(s). Integer columns in "
            "Python cannot represent missing values. If your input data contains "
            "missing values at inference time, it will be encoded as floats and will "
            "cause a schema enforcement error. The best way to avoid this problem is "
            "to infer the model schema based on a realistic data sample (training "
            "dataset) that includes missing values. Alternatively, you can declare "
            "integer columns as doubles (float64) whenever these columns may have "
            "missing values. See `Handling Integers With Missing Values "
            "<https://www.mlflow.org/docs/latest/models.html#"
            "handling-integers-with-missing-values>`_ for more details."
        )
    return schema


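# Illustrative usage sketch (added; the helper below is not part of the original module):
# a pandas DataFrame produces a column-based schema, while a dictionary of numpy arrays
# produces a tensor-based schema.
def _example_infer_schema() -> None:
    df = pd.DataFrame({"text": ["a", "b"], "score": [0.1, 0.2]})
    col_schema = _infer_schema(df)
    assert not col_schema.is_tensor_spec()
    assert [spec.name for spec in col_schema.inputs] == ["text", "score"]

    tensors = {"features": np.zeros((2, 3), dtype=np.float32)}
    tensor_schema = _infer_schema(tensors)
    assert tensor_schema.is_tensor_spec()
    assert tensor_schema.inputs[0].shape == (-1, 3)  # axis 0 is the variable dimension

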
def _infer_numpy_dtype(dtype) -> DataType:
    supported_types = np.dtype

    # noinspection PyBroadException
    try:
        from pandas.core.dtypes.base import ExtensionDtype

        supported_types = (np.dtype, ExtensionDtype)
    except ImportError:
        # This version of pandas does not support extension types
        pass
    if not isinstance(dtype, supported_types):
        raise TypeError(f"Expected numpy.dtype or pandas.ExtensionDtype, got '{type(dtype)}'.")

    if dtype.kind == "b":
        return DataType.boolean
    elif dtype.kind in {"i", "u"}:
        # int8/16/32 and uint8/16 map to DataType.integer; int64 and uint32 map to
        # DataType.long. uint64 cannot be represented safely and falls through to the
        # unsupported-type error below.
        if dtype.itemsize < 4 or (dtype.kind == "i" and dtype.itemsize == 4):
            return DataType.integer
        elif dtype.itemsize < 8 or (dtype.kind == "i" and dtype.itemsize == 8):
            return DataType.long
    elif dtype.kind == "f":
        if dtype.itemsize <= 4:
            return DataType.float
        elif dtype.itemsize <= 8:
            return DataType.double
    elif dtype.kind == "U":
        return DataType.string
    elif dtype.kind == "S":
        return DataType.binary
    elif dtype.kind == "O":
        raise Exception(
            "Can not infer object without looking at the values, call _map_numpy_array instead."
        )
    elif dtype.kind == "M":
        return DataType.datetime
    raise MlflowException(f"Unsupported numpy data type '{dtype}', kind '{dtype.kind}'")


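# Illustrative usage sketch (added; the helper below is not part of the original module):
# numpy dtypes map to MLflow DataTypes based on their kind and item size.
def _example_infer_numpy_dtype() -> None:
    assert _infer_numpy_dtype(np.dtype(np.int32)) is DataType.integer
    assert _infer_numpy_dtype(np.dtype(np.int64)) is DataType.long
    assert _infer_numpy_dtype(np.dtype(np.float32)) is DataType.float
    assert _infer_numpy_dtype(np.dtype(np.float64)) is DataType.double
    assert _infer_numpy_dtype(np.dtype("datetime64[ns]")) is DataType.datetime

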
def _is_none_or_nan(x):
    if isinstance(x, float):
        return np.isnan(x)
    # NB: We can't use pd.isna() because the input can be a series.
    return x is None or x is pd.NA or x is pd.NaT


def _infer_required(col) -> bool:
    if isinstance(col, (list, pd.Series)):
        return not any(_is_none_or_nan(x) for x in col)
    return not _is_none_or_nan(col)


def _infer_pandas_column(col: pd.Series) -> DataType:
    if not isinstance(col, pd.Series):
        raise TypeError(f"Expected pandas.Series, got '{type(col)}'.")
    if len(col.values.shape) > 1:
        raise MlflowException(f"Expected 1d array, got array with shape {col.shape}")

    if col.dtype.kind == "O":
        col = col.infer_objects()
    if col.dtype.kind == "O":
        try:
            # Convert the pandas Series to a list and infer its schema; the column's
            # type is the element type (dtype) of the inferred Array.
            arr_type = _infer_colspec_type(col.to_list())
            return arr_type.dtype
        except Exception as e:
            # For backwards compatibility, we fall back to string
            # if the provided array is of string type
            if pd.api.types.is_string_dtype(col):
                return DataType.string
            raise MlflowException(f"Failed to infer schema for pandas.Series {col}. Error: {e}")
    else:
        # NB: The following works for numpy types as well as pandas extension types.
        return _infer_numpy_dtype(col.dtype)


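# Illustrative usage sketch (added; the helper below is not part of the original module):
# object-dtype columns are inferred from their values, so a column of lists becomes an
# Array element type.
def _example_infer_pandas_column() -> None:
    assert _infer_pandas_column(pd.Series([1, 2, 3])) is DataType.long
    assert _infer_pandas_column(pd.Series(["a", "b"])) is DataType.string
    list_col_type = _infer_pandas_column(pd.Series([["a"], ["b", "c"]]))
    assert isinstance(list_col_type, Array) and list_col_type.dtype == DataType.string

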
def _infer_spark_type(x, data=None, col_name=None) -> DataType:
    import pyspark.sql.types
    from pyspark.ml.linalg import VectorUDT
    from pyspark.sql.functions import col, collect_list

    if isinstance(x, pyspark.sql.types.NumericType):
        if isinstance(x, pyspark.sql.types.IntegralType):
            if isinstance(x, pyspark.sql.types.LongType):
                return DataType.long
            else:
                return DataType.integer
        elif isinstance(x, pyspark.sql.types.FloatType):
            return DataType.float
        elif isinstance(x, pyspark.sql.types.DoubleType):
            return DataType.double
    elif isinstance(x, pyspark.sql.types.BooleanType):
        return DataType.boolean
    elif isinstance(x, pyspark.sql.types.StringType):
        return DataType.string
    elif isinstance(x, pyspark.sql.types.BinaryType):
        return DataType.binary
    # NB: Spark differentiates between dates and timestamps; we coerce both to DataType.datetime.
    elif isinstance(x, (pyspark.sql.types.DateType, pyspark.sql.types.TimestampType)):
        return DataType.datetime
    elif isinstance(x, pyspark.sql.types.ArrayType):
        return Array(_infer_spark_type(x.elementType))
    elif isinstance(x, pyspark.sql.types.StructType):
        return Object(
            properties=[
                Property(
                    name=f.name,
                    dtype=_infer_spark_type(f.dataType),
                    required=not f.nullable,
                )
                for f in x.fields
            ]
        )
    elif isinstance(x, pyspark.sql.types.MapType):
        if data is None or col_name is None:
            raise MlflowException("Cannot infer schema for MapType without data and column name.")
        # Map MapType to StructType
        # Note that MapType assumes all values are of the same type; if they are not,
        # Spark picks the first item's type and tries to convert the rest to it.
        # e.g.
        # >>> spark.createDataFrame([{"col": {"a": 1, "b": "b"}}]).show()
        # +-------------------+
        # |                col|
        # +-------------------+
        # |{a -> 1, b -> null}|
        # +-------------------+
        if isinstance(x.valueType, pyspark.sql.types.MapType):
            raise MlflowException(
                "Please construct spark DataFrame with schema using StructType "
                "for dictionary/map fields, MLflow schema inference only supports "
                "scalar, array and struct types."
            )

        merged_keys = (
            data.selectExpr(f"map_keys({col_name}) as keys")
            .agg(collect_list(col("keys")).alias("merged_keys"))
            .head()
            .merged_keys
        )
        keys = {key for sublist in merged_keys for key in sublist}
        return Object(
            properties=[
                Property(
                    name=k,
                    dtype=_infer_spark_type(x.valueType),
                )
                for k in keys
            ]
        )
    elif isinstance(x, VectorUDT):
        return SparkMLVector()
    else:
        raise MlflowException.invalid_parameter_value(
            f"Unsupported Spark Type '{type(x)}' for MLflow schema."
        )


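# Illustrative usage sketch (added; the helper below is not part of the original module and
# assumes pyspark is installed): Spark scalar types map to MLflow DataTypes, array types map
# to Array, and struct fields map to an Object whose properties mirror the fields.
def _example_infer_spark_type() -> None:
    from pyspark.sql import types as T

    assert _infer_spark_type(T.LongType()) is DataType.long
    arr = _infer_spark_type(T.ArrayType(T.StringType()))
    assert isinstance(arr, Array) and arr.dtype == DataType.string
    struct = T.StructType([T.StructField("name", T.StringType(), nullable=True)])
    obj = _infer_spark_type(struct)
    assert isinstance(obj, Object) and obj.properties[0].name == "name"

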
def _is_spark_df(x) -> bool:
    try:
        import pyspark.sql.dataframe

        if isinstance(x, pyspark.sql.dataframe.DataFrame):
            return True
    except ImportError:
        return False
    # For spark 4.0
    try:
        import pyspark.sql.connect.dataframe

        return isinstance(x, pyspark.sql.connect.dataframe.DataFrame)
    except ImportError:
        return False


def _validate_input_dictionary_contains_only_strings_and_lists_of_strings(data) -> None:
    # isinstance(True, int) is True
    invalid_keys = [
        key for key in data.keys() if not isinstance(key, (str, int)) or isinstance(key, bool)
    ]
    if invalid_keys:
        raise MlflowException(
            f"The dictionary keys are not all strings or indexes. Invalid keys: {invalid_keys}"
        )
    if any(isinstance(value, np.ndarray) for value in data.values()) and not all(
        isinstance(value, np.ndarray) for value in data.values()
    ):
        raise MlflowException("The dictionary values are not all numpy.ndarray.")

    invalid_values = [
        key
        for key, value in data.items()
        if (isinstance(value, list) and not all(isinstance(item, (str, bytes)) for item in value))
        or (not isinstance(value, (np.ndarray, list, str, bytes)))
    ]
    if invalid_values:
        raise MlflowException.invalid_parameter_value(
            "Invalid values in dictionary. If passing a dictionary containing strings, all "
            "values must be either strings or lists of strings. If passing a dictionary containing "
            "numeric values, the data must be enclosed in a numpy.ndarray. The following keys "
            f"in the input dictionary are invalid: {invalid_values}",
        )


def _is_list_str(type_hint: Any) -> bool:
    return type_hint in [
        List[str],  # noqa: UP006
        list[str],
    ]


def _is_list_dict_str(type_hint: Any) -> bool:
    return type_hint in [
        List[Dict[str, str]],  # noqa: UP006
        list[Dict[str, str]],  # noqa: UP006
        List[dict[str, str]],  # noqa: UP006
        list[dict[str, str]],
    ]


def _get_array_depth(l: Any) -> int:
    if isinstance(l, np.ndarray):
        return l.ndim
    if isinstance(l, list):
        return max(_get_array_depth(item) for item in l) + 1 if l else 1
    return 0


def _infer_type_and_shape(value):
    if isinstance(value, (list, np.ndarray)):
        ndim = _get_array_depth(value)
        if ndim != 1:
            raise MlflowException.invalid_parameter_value(
                f"Expected parameters to be 1D array or scalar, got {ndim}D array",
            )
        if all(DataType.check_type(DataType.datetime, v) for v in value):
            return DataType.datetime, (-1,)
        value_type = _infer_numpy_dtype(np.array(value).dtype)
        return value_type, (-1,)
    elif DataType.check_type(DataType.datetime, value):
        return DataType.datetime, None
    elif np.isscalar(value):
        try:
            value_type = _infer_numpy_dtype(np.array(value).dtype)
            return value_type, None
        except (Exception, MlflowException) as e:
            raise MlflowException.invalid_parameter_value(
                f"Failed to infer schema for parameter {value}: {e!r}"
            )
    elif isinstance(value, dict):
        # Reuse _infer_schema to infer the schema for a dict. Wrapping the value in a
        # dictionary is necessary to make sure it is inferred as an Object.
        schema = _infer_schema({"value": value})
        object_type = schema.inputs[0].type
        return object_type, None
    raise MlflowException.invalid_parameter_value(
        f"Expected parameters to be 1D array or scalar, got {type(value).__name__}",
    )


def _infer_param_schema(parameters: dict[str, Any]):
    if not isinstance(parameters, dict):
        raise MlflowException.invalid_parameter_value(
            f"Expected parameters to be dict, got {type(parameters).__name__}",
        )

    param_specs = []
    invalid_params = []
    for name, value in parameters.items():
        try:
            value_type, shape = _infer_type_and_shape(value)
            param_specs.append(
                ParamSpec(name=name, dtype=value_type, default=deepcopy(value), shape=shape)
            )
        except Exception as e:
            invalid_params.append((name, value, e))

    if invalid_params:
        raise MlflowException.invalid_parameter_value(
            f"Failed to infer schema for parameters: {invalid_params}",
        )

    return ParamSchema(param_specs)
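

# Illustrative usage sketch (added; the helper below is not part of the original module):
# scalar and 1D-list parameter defaults are captured as ParamSpecs, with a (-1,) shape for
# the list-valued parameter.
def _example_infer_param_schema() -> None:
    schema = _infer_param_schema({"temperature": 0.5, "stop": ["\n", "###"]})
    specs = {spec.name: spec for spec in schema.params}
    assert specs["temperature"].dtype == DataType.double and specs["temperature"].shape is None
    assert specs["stop"].dtype == DataType.string and specs["stop"].shape == (-1,)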