/ mlflow / error_classification.py
error_classification.py
  1  """Centralized error classification for MLflow exceptions.
  2  
  3  Maps error codes to sqlstate codes and error classes for structured error
  4  classification and observability. Client-side errors use the KAM0x/XXM0x
  5  namespace, while server/CP errors use the KAMCx/XXMCx namespace.
  6  
  7  Terminology:
  8      error_code: The existing MLflow error code from the protobuf definition
  9          (e.g., INVALID_PARAMETER_VALUE, INTERNAL_ERROR). Defined in
 10          mlflow/protos/databricks.proto. These are coarse-grained — many
 11          different failure modes share the same error_code.
 12  
 13      error_class: A more specific classification of the error (e.g.,
 14          SCHEMA_ENFORCEMENT_FAILED, ATTRIBUTE_NOT_FOUND). Defined in the
 15          ErrorClass enum below. When an error_class is not explicitly set
 16          at a raise site, it is auto-derived from the error_code.
 17  
 18      sqlstate: A 5-character code used by reliability dashboards to
 19          categorize errors (e.g., KAM01, XXMC0). Defined in the SqlState
 20          enum below. Derived automatically from error_class (if a specific
 21          mapping exists) or from error_code (generic fallback).
 22  
 23  Derivation chain in MlflowException.__init__:
 24      1. error_class: explicit value if provided, otherwise derived from error_code
 25      2. sqlstate: explicit value if provided, otherwise derived from error_class
 26         (via _ERROR_CLASS_TO_SQLSTATE), otherwise derived from error_code
 27         (via _CLIENT_ERROR_CODE_TO_SQLSTATE)
 28  
 29  When to override at a raise site:
 30      Most raise sites do NOT need to pass sqlstate or error_class — both are
 31      auto-derived from error_code. Only pass error_class when the error_code
 32      is too coarse to distinguish the specific failure. For example,
 33      INVALID_PARAMETER_VALUE is used for both schema enforcement failures and
 34      attribute lookup failures, so those raise sites pass error_class to
    get distinct sqlstate codes (KAM01 vs KAM04). Do not pass sqlstate
    directly at a raise site — let it be derived from error_class (or, when
    no specific mapping exists, from error_code).
 37  """
 38  
 39  from __future__ import annotations
 40  
 41  from enum import Enum
 42  
 43  
 44  class SqlState(str, Enum):
 45      """SQLSTATE codes for MLflow error classification."""
 46  
 47      # Client system errors (XXM0x)
 48      CLIENT_INTERNAL_ERROR = "XXM00"
 49  
 50      # Client user errors (KAM0x)
 51      CLIENT_ATTRIBUTE_NOT_FOUND = "KAM04"
 52      CLIENT_INVALID_PARAMETER = "KAM00"
 53      CLIENT_MODEL_SERIALIZATION_FAILED = "KAM03"
 54      CLIENT_PREDICTION_FUNCTION_FAILED = "KAM02"
 55      CLIENT_SCHEMA_ENFORCEMENT_FAILED = "KAM01"
 56  
 57      # CP/server system errors (XXMCx)
 58      CP_INTERNAL_ERROR = "XXMC0"
 59      CP_INVALID_STATE = "XXMC2"
 60      CP_TEMPORARILY_UNAVAILABLE = "XXMC1"
 61  
 62      # CP/server user errors (KAMCx)
 63      CP_INVALID_PARAMETER = "KAMC4"
 64      CP_PERMISSION_DENIED = "KAMC1"
 65      CP_REQUEST_RATE_LIMITED = "KAMC3"
 66      CP_RESOURCE_CONFLICT = "KAMC5"
 67      CP_RESOURCE_NOT_FOUND = "KAMC2"
 68  
 69      @classmethod
 70      def from_client_error_code(cls, error_code: str) -> str | None:
 71          result = _CLIENT_ERROR_CODE_TO_SQLSTATE.get(error_code)
 72          return result.value if result is not None else None
 73  
 74      @classmethod
 75      def from_cp_error_code(cls, error_code: str) -> str | None:
 76          result = _CP_ERROR_CODE_TO_SQLSTATE.get(error_code)
 77          return result.value if result is not None else None
 78  
 79      @classmethod
 80      def from_error_class(cls, error_class: str) -> str | None:
 81          result = _ERROR_CLASS_TO_SQLSTATE.get(error_class)
 82          return result.value if result is not None else None
 83  
 84  
 85  class ErrorClass(str, Enum):
 86      """Error class names for MLflow error classification."""
 87  
 88      # Client error classes
 89      ATTRIBUTE_NOT_FOUND = "ATTRIBUTE_NOT_FOUND"
 90      CLIENT_INTERNAL_ERROR = "CLIENT_INTERNAL_ERROR"
 91      FEATURE_DISABLED = "FEATURE_DISABLED"
 92      INVALID_PARAMETER_VALUE = "INVALID_PARAMETER_VALUE"
 93      MODEL_SERIALIZATION_FAILED = "MODEL_SERIALIZATION_FAILED"
 94      PERMISSION_DENIED = "PERMISSION_DENIED"
 95      PREDICTION_FUNCTION_FAILED = "PREDICTION_FUNCTION_FAILED"
 96      RESOURCE_ALREADY_EXISTS = "RESOURCE_ALREADY_EXISTS"
 97      RESOURCE_NOT_FOUND = "RESOURCE_NOT_FOUND"
 98      SCHEMA_ENFORCEMENT_FAILED = "SCHEMA_ENFORCEMENT_FAILED"
 99  
100      # CP error classes
101      CP_INTERNAL_ERROR = "CP_INTERNAL_ERROR"
102      CP_INVALID_PARAMETER_VALUE = "CP_INVALID_PARAMETER_VALUE"
103      CP_INVALID_STATE = "CP_INVALID_STATE"
104      CP_PERMISSION_DENIED = "CP_PERMISSION_DENIED"
105      CP_REQUEST_RATE_LIMITED = "CP_REQUEST_RATE_LIMITED"
106      CP_RESOURCE_CONFLICT = "CP_RESOURCE_CONFLICT"
107      CP_RESOURCE_NOT_FOUND = "CP_RESOURCE_NOT_FOUND"
108      CP_TEMPORARILY_UNAVAILABLE = "CP_TEMPORARILY_UNAVAILABLE"
109  
110      @classmethod
111      def from_client_error_code(cls, error_code: str) -> str | None:
112          result = _CLIENT_ERROR_CODE_TO_ERROR_CLASS.get(error_code)
113          return result.value if result is not None else None
114  
115      @classmethod
116      def from_cp_error_code(cls, error_code: str) -> str | None:
117          result = _CP_ERROR_CODE_TO_ERROR_CLASS.get(error_code)
118          return result.value if result is not None else None
119  
120  
# Client-side lookup: MLflow error_code name -> SQLSTATE.
# INTERNAL_ERROR, INVALID_STATE and TEMPORARILY_UNAVAILABLE are reported as the
# client system error; every other listed code is reported as the generic
# client invalid-parameter (user) error. Entries are kept in alphabetical order.
_CLIENT_ERROR_CODE_TO_SQLSTATE: dict[str, SqlState] = dict(
    BAD_REQUEST=SqlState.CLIENT_INVALID_PARAMETER,
    CUSTOMER_UNAUTHORIZED=SqlState.CLIENT_INVALID_PARAMETER,
    ENDPOINT_NOT_FOUND=SqlState.CLIENT_INVALID_PARAMETER,
    FEATURE_DISABLED=SqlState.CLIENT_INVALID_PARAMETER,
    INTERNAL_ERROR=SqlState.CLIENT_INTERNAL_ERROR,
    INVALID_PARAMETER_VALUE=SqlState.CLIENT_INVALID_PARAMETER,
    INVALID_STATE=SqlState.CLIENT_INTERNAL_ERROR,
    NOT_FOUND=SqlState.CLIENT_INVALID_PARAMETER,
    PERMISSION_DENIED=SqlState.CLIENT_INVALID_PARAMETER,
    RESOURCE_ALREADY_EXISTS=SqlState.CLIENT_INVALID_PARAMETER,
    RESOURCE_DOES_NOT_EXIST=SqlState.CLIENT_INVALID_PARAMETER,
    TEMPORARILY_UNAVAILABLE=SqlState.CLIENT_INTERNAL_ERROR,
)
136  
# Client-side lookup: MLflow error_code name -> ErrorClass.
# Several codes collapse onto one class (e.g. the three "not found" style codes
# all become RESOURCE_NOT_FOUND). Entries are kept in alphabetical order.
_CLIENT_ERROR_CODE_TO_ERROR_CLASS: dict[str, ErrorClass] = dict(
    BAD_REQUEST=ErrorClass.INVALID_PARAMETER_VALUE,
    CUSTOMER_UNAUTHORIZED=ErrorClass.PERMISSION_DENIED,
    ENDPOINT_NOT_FOUND=ErrorClass.RESOURCE_NOT_FOUND,
    FEATURE_DISABLED=ErrorClass.FEATURE_DISABLED,
    INTERNAL_ERROR=ErrorClass.CLIENT_INTERNAL_ERROR,
    INVALID_PARAMETER_VALUE=ErrorClass.INVALID_PARAMETER_VALUE,
    INVALID_STATE=ErrorClass.CLIENT_INTERNAL_ERROR,
    NOT_FOUND=ErrorClass.RESOURCE_NOT_FOUND,
    PERMISSION_DENIED=ErrorClass.PERMISSION_DENIED,
    RESOURCE_ALREADY_EXISTS=ErrorClass.RESOURCE_ALREADY_EXISTS,
    RESOURCE_DOES_NOT_EXIST=ErrorClass.RESOURCE_NOT_FOUND,
    TEMPORARILY_UNAVAILABLE=ErrorClass.CLIENT_INTERNAL_ERROR,
)
151  
# CP/server-side lookup: MLflow error_code name -> SQLSTATE.
# Unlike the client table, each failure category here has its own code
# (permission, not-found, rate-limit, conflict, ...). Entries are kept in
# alphabetical order.
_CP_ERROR_CODE_TO_SQLSTATE: dict[str, SqlState] = dict(
    BAD_REQUEST=SqlState.CP_INVALID_PARAMETER,
    CUSTOMER_UNAUTHORIZED=SqlState.CP_PERMISSION_DENIED,
    ENDPOINT_NOT_FOUND=SqlState.CP_RESOURCE_NOT_FOUND,
    INTERNAL_ERROR=SqlState.CP_INTERNAL_ERROR,
    INVALID_PARAMETER_VALUE=SqlState.CP_INVALID_PARAMETER,
    INVALID_STATE=SqlState.CP_INVALID_STATE,
    NOT_FOUND=SqlState.CP_RESOURCE_NOT_FOUND,
    PERMISSION_DENIED=SqlState.CP_PERMISSION_DENIED,
    REQUEST_LIMIT_EXCEEDED=SqlState.CP_REQUEST_RATE_LIMITED,
    RESOURCE_ALREADY_EXISTS=SqlState.CP_RESOURCE_CONFLICT,
    RESOURCE_CONFLICT=SqlState.CP_RESOURCE_CONFLICT,
    RESOURCE_DOES_NOT_EXIST=SqlState.CP_RESOURCE_NOT_FOUND,
    RESOURCE_EXHAUSTED=SqlState.CP_REQUEST_RATE_LIMITED,
    TEMPORARILY_UNAVAILABLE=SqlState.CP_TEMPORARILY_UNAVAILABLE,
    UNAUTHENTICATED=SqlState.CP_PERMISSION_DENIED,
)
170  
# CP/server-side lookup: MLflow error_code name -> ErrorClass.
# All targets are the CP_-prefixed error classes. Entries are kept in
# alphabetical order.
_CP_ERROR_CODE_TO_ERROR_CLASS: dict[str, ErrorClass] = dict(
    BAD_REQUEST=ErrorClass.CP_INVALID_PARAMETER_VALUE,
    CUSTOMER_UNAUTHORIZED=ErrorClass.CP_PERMISSION_DENIED,
    ENDPOINT_NOT_FOUND=ErrorClass.CP_RESOURCE_NOT_FOUND,
    INTERNAL_ERROR=ErrorClass.CP_INTERNAL_ERROR,
    INVALID_PARAMETER_VALUE=ErrorClass.CP_INVALID_PARAMETER_VALUE,
    INVALID_STATE=ErrorClass.CP_INVALID_STATE,
    NOT_FOUND=ErrorClass.CP_RESOURCE_NOT_FOUND,
    PERMISSION_DENIED=ErrorClass.CP_PERMISSION_DENIED,
    REQUEST_LIMIT_EXCEEDED=ErrorClass.CP_REQUEST_RATE_LIMITED,
    RESOURCE_ALREADY_EXISTS=ErrorClass.CP_RESOURCE_CONFLICT,
    RESOURCE_CONFLICT=ErrorClass.CP_RESOURCE_CONFLICT,
    RESOURCE_DOES_NOT_EXIST=ErrorClass.CP_RESOURCE_NOT_FOUND,
    RESOURCE_EXHAUSTED=ErrorClass.CP_REQUEST_RATE_LIMITED,
    TEMPORARILY_UNAVAILABLE=ErrorClass.CP_TEMPORARILY_UNAVAILABLE,
    UNAUTHENTICATED=ErrorClass.CP_PERMISSION_DENIED,
)
188  
# Specific error_class -> SQLSTATE overrides. Only error classes that need a
# dedicated code appear here; they are meant for raise sites whose error_code
# (e.g., INVALID_PARAMETER_VALUE) is too coarse to identify the failure. An
# error class with no entry falls back to the generic error_code derivation.
_ERROR_CLASS_TO_SQLSTATE: dict[str, SqlState] = dict(
    (
        (ErrorClass.ATTRIBUTE_NOT_FOUND, SqlState.CLIENT_ATTRIBUTE_NOT_FOUND),
        (ErrorClass.MODEL_SERIALIZATION_FAILED, SqlState.CLIENT_MODEL_SERIALIZATION_FAILED),
        (ErrorClass.PREDICTION_FUNCTION_FAILED, SqlState.CLIENT_PREDICTION_FUNCTION_FAILED),
        (ErrorClass.SCHEMA_ENFORCEMENT_FAILED, SqlState.CLIENT_SCHEMA_ENFORCEMENT_FAILED),
    )
)