# error_classification.py
"""Centralized error classification for MLflow exceptions.

Maps error codes to sqlstate codes and error classes for structured error
classification and observability. Client-side errors use the KAM0x/XXM0x
namespace, while server/CP errors use the KAMCx/XXMCx namespace.

Terminology:
    error_code: The existing MLflow error code from the protobuf definition
        (e.g., INVALID_PARAMETER_VALUE, INTERNAL_ERROR). Defined in
        mlflow/protos/databricks.proto. These are coarse-grained — many
        different failure modes share the same error_code.

    error_class: A more specific classification of the error (e.g.,
        SCHEMA_ENFORCEMENT_FAILED, ATTRIBUTE_NOT_FOUND). Defined in the
        ErrorClass enum below. When an error_class is not explicitly set
        at a raise site, it is auto-derived from the error_code.

    sqlstate: A 5-character code used by reliability dashboards to
        categorize errors (e.g., KAM01, XXMC0). Defined in the SqlState
        enum below. Derived automatically from error_class (if a specific
        mapping exists) or from error_code (generic fallback).

Derivation chain in MlflowException.__init__:
    1. error_class: explicit value if provided, otherwise derived from error_code
    2. sqlstate: explicit value if provided, otherwise derived from error_class
       (via _ERROR_CLASS_TO_SQLSTATE), otherwise derived from error_code
       (via _CLIENT_ERROR_CODE_TO_SQLSTATE)

When to override at a raise site:
    Most raise sites do NOT need to pass sqlstate or error_class — both are
    auto-derived from error_code. Only pass error_class when the error_code
    is too coarse to distinguish the specific failure. For example,
    INVALID_PARAMETER_VALUE is used for both schema enforcement failures and
    attribute lookup failures, so those raise sites pass error_class to
    get distinct sqlstate codes (KAM01 vs KAM04). Do not pass sqlstate
    directly — let it be derived from error_class (or, when no specific
    mapping exists, from error_code) so that the tables in this module
    remain the single source of truth.
"""

from __future__ import annotations

from collections.abc import Mapping
from enum import Enum


def _lookup_value(mapping: Mapping[str, Enum], key: object) -> str | None:
    """Return the ``.value`` of the enum member mapped to *key*, if any.

    Args:
        mapping: One of the module-level classification tables.
        key: The error_code or error_class to look up.

    Returns:
        The mapped member's value, or None when *key* has no entry.
    """
    try:
        member = mapping.get(key)
    except TypeError:
        # An unhashable key (e.g. a list) cannot be in the table. The module
        # docstring places these lookups inside MlflowException.__init__, so
        # report "unmapped" rather than raising over the error being built.
        return None
    return None if member is None else member.value


class SqlState(str, Enum):
    """SQLSTATE codes for MLflow error classification."""

    # Client system errors (XXM0x)
    CLIENT_INTERNAL_ERROR = "XXM00"

    # Client user errors (KAM0x)
    CLIENT_ATTRIBUTE_NOT_FOUND = "KAM04"
    CLIENT_INVALID_PARAMETER = "KAM00"
    CLIENT_MODEL_SERIALIZATION_FAILED = "KAM03"
    CLIENT_PREDICTION_FUNCTION_FAILED = "KAM02"
    CLIENT_SCHEMA_ENFORCEMENT_FAILED = "KAM01"

    # CP/server system errors (XXMCx)
    CP_INTERNAL_ERROR = "XXMC0"
    CP_INVALID_STATE = "XXMC2"
    CP_TEMPORARILY_UNAVAILABLE = "XXMC1"

    # CP/server user errors (KAMCx)
    CP_INVALID_PARAMETER = "KAMC4"
    CP_PERMISSION_DENIED = "KAMC1"
    CP_REQUEST_RATE_LIMITED = "KAMC3"
    CP_RESOURCE_CONFLICT = "KAMC5"
    CP_RESOURCE_NOT_FOUND = "KAMC2"

    @classmethod
    def from_client_error_code(cls, error_code: str) -> str | None:
        """Return the client-side sqlstate string for *error_code*, or None if unmapped."""
        return _lookup_value(_CLIENT_ERROR_CODE_TO_SQLSTATE, error_code)

    @classmethod
    def from_cp_error_code(cls, error_code: str) -> str | None:
        """Return the CP/server-side sqlstate string for *error_code*, or None if unmapped."""
        return _lookup_value(_CP_ERROR_CODE_TO_SQLSTATE, error_code)

    @classmethod
    def from_error_class(cls, error_class: str) -> str | None:
        """Return the sqlstate string specific to *error_class*, or None if it has none."""
        return _lookup_value(_ERROR_CLASS_TO_SQLSTATE, error_class)


class ErrorClass(str, Enum):
    """Error class names for MLflow error classification."""

    # Client error classes
    ATTRIBUTE_NOT_FOUND = "ATTRIBUTE_NOT_FOUND"
    CLIENT_INTERNAL_ERROR = "CLIENT_INTERNAL_ERROR"
    FEATURE_DISABLED = "FEATURE_DISABLED"
    INVALID_PARAMETER_VALUE = "INVALID_PARAMETER_VALUE"
    MODEL_SERIALIZATION_FAILED = "MODEL_SERIALIZATION_FAILED"
    PERMISSION_DENIED = "PERMISSION_DENIED"
    PREDICTION_FUNCTION_FAILED = "PREDICTION_FUNCTION_FAILED"
    RESOURCE_ALREADY_EXISTS = "RESOURCE_ALREADY_EXISTS"
    RESOURCE_NOT_FOUND = "RESOURCE_NOT_FOUND"
    SCHEMA_ENFORCEMENT_FAILED = "SCHEMA_ENFORCEMENT_FAILED"

    # CP error classes
    CP_INTERNAL_ERROR = "CP_INTERNAL_ERROR"
    CP_INVALID_PARAMETER_VALUE = "CP_INVALID_PARAMETER_VALUE"
    CP_INVALID_STATE = "CP_INVALID_STATE"
    CP_PERMISSION_DENIED = "CP_PERMISSION_DENIED"
    CP_REQUEST_RATE_LIMITED = "CP_REQUEST_RATE_LIMITED"
    CP_RESOURCE_CONFLICT = "CP_RESOURCE_CONFLICT"
    CP_RESOURCE_NOT_FOUND = "CP_RESOURCE_NOT_FOUND"
    CP_TEMPORARILY_UNAVAILABLE = "CP_TEMPORARILY_UNAVAILABLE"

    @classmethod
    def from_client_error_code(cls, error_code: str) -> str | None:
        """Return the client-side error class name for *error_code*, or None if unmapped."""
        return _lookup_value(_CLIENT_ERROR_CODE_TO_ERROR_CLASS, error_code)

    @classmethod
    def from_cp_error_code(cls, error_code: str) -> str | None:
        """Return the CP/server-side error class name for *error_code*, or None if unmapped."""
        return _lookup_value(_CP_ERROR_CODE_TO_ERROR_CLASS, error_code)


# Client-side mappings: error_code -> sqlstate or error_class
_CLIENT_ERROR_CODE_TO_SQLSTATE: dict[str, SqlState] = {
    "BAD_REQUEST": SqlState.CLIENT_INVALID_PARAMETER,
    "CUSTOMER_UNAUTHORIZED": SqlState.CLIENT_INVALID_PARAMETER,
    "ENDPOINT_NOT_FOUND": SqlState.CLIENT_INVALID_PARAMETER,
    "FEATURE_DISABLED": SqlState.CLIENT_INVALID_PARAMETER,
    "INTERNAL_ERROR": SqlState.CLIENT_INTERNAL_ERROR,
    "INVALID_PARAMETER_VALUE": SqlState.CLIENT_INVALID_PARAMETER,
    "INVALID_STATE": SqlState.CLIENT_INTERNAL_ERROR,
    "NOT_FOUND": SqlState.CLIENT_INVALID_PARAMETER,
    "PERMISSION_DENIED": SqlState.CLIENT_INVALID_PARAMETER,
    "RESOURCE_ALREADY_EXISTS": SqlState.CLIENT_INVALID_PARAMETER,
    "RESOURCE_DOES_NOT_EXIST": SqlState.CLIENT_INVALID_PARAMETER,
    "TEMPORARILY_UNAVAILABLE": SqlState.CLIENT_INTERNAL_ERROR,
}

_CLIENT_ERROR_CODE_TO_ERROR_CLASS: dict[str, ErrorClass] = {
    "BAD_REQUEST": ErrorClass.INVALID_PARAMETER_VALUE,
    "CUSTOMER_UNAUTHORIZED": ErrorClass.PERMISSION_DENIED,
    "ENDPOINT_NOT_FOUND": ErrorClass.RESOURCE_NOT_FOUND,
    "FEATURE_DISABLED": ErrorClass.FEATURE_DISABLED,
    "INTERNAL_ERROR": ErrorClass.CLIENT_INTERNAL_ERROR,
    "INVALID_PARAMETER_VALUE": ErrorClass.INVALID_PARAMETER_VALUE,
    "INVALID_STATE": ErrorClass.CLIENT_INTERNAL_ERROR,
    "NOT_FOUND": ErrorClass.RESOURCE_NOT_FOUND,
    "PERMISSION_DENIED": ErrorClass.PERMISSION_DENIED,
    "RESOURCE_ALREADY_EXISTS": ErrorClass.RESOURCE_ALREADY_EXISTS,
    "RESOURCE_DOES_NOT_EXIST": ErrorClass.RESOURCE_NOT_FOUND,
    "TEMPORARILY_UNAVAILABLE": ErrorClass.CLIENT_INTERNAL_ERROR,
}

# CP/server-side mappings: error_code -> sqlstate or error_class
_CP_ERROR_CODE_TO_SQLSTATE: dict[str, SqlState] = {
    "BAD_REQUEST": SqlState.CP_INVALID_PARAMETER,
    "CUSTOMER_UNAUTHORIZED": SqlState.CP_PERMISSION_DENIED,
    "ENDPOINT_NOT_FOUND": SqlState.CP_RESOURCE_NOT_FOUND,
    "INTERNAL_ERROR": SqlState.CP_INTERNAL_ERROR,
    "INVALID_PARAMETER_VALUE": SqlState.CP_INVALID_PARAMETER,
    "INVALID_STATE": SqlState.CP_INVALID_STATE,
    "NOT_FOUND": SqlState.CP_RESOURCE_NOT_FOUND,
    "PERMISSION_DENIED": SqlState.CP_PERMISSION_DENIED,
    "REQUEST_LIMIT_EXCEEDED": SqlState.CP_REQUEST_RATE_LIMITED,
    "RESOURCE_ALREADY_EXISTS": SqlState.CP_RESOURCE_CONFLICT,
    "RESOURCE_CONFLICT": SqlState.CP_RESOURCE_CONFLICT,
    "RESOURCE_DOES_NOT_EXIST": SqlState.CP_RESOURCE_NOT_FOUND,
    "RESOURCE_EXHAUSTED": SqlState.CP_REQUEST_RATE_LIMITED,
    "TEMPORARILY_UNAVAILABLE": SqlState.CP_TEMPORARILY_UNAVAILABLE,
    "UNAUTHENTICATED": SqlState.CP_PERMISSION_DENIED,
}

_CP_ERROR_CODE_TO_ERROR_CLASS: dict[str, ErrorClass] = {
    "BAD_REQUEST": ErrorClass.CP_INVALID_PARAMETER_VALUE,
    "CUSTOMER_UNAUTHORIZED": ErrorClass.CP_PERMISSION_DENIED,
    "ENDPOINT_NOT_FOUND": ErrorClass.CP_RESOURCE_NOT_FOUND,
    "INTERNAL_ERROR": ErrorClass.CP_INTERNAL_ERROR,
    "INVALID_PARAMETER_VALUE": ErrorClass.CP_INVALID_PARAMETER_VALUE,
    "INVALID_STATE": ErrorClass.CP_INVALID_STATE,
    "NOT_FOUND": ErrorClass.CP_RESOURCE_NOT_FOUND,
    "PERMISSION_DENIED": ErrorClass.CP_PERMISSION_DENIED,
    "REQUEST_LIMIT_EXCEEDED": ErrorClass.CP_REQUEST_RATE_LIMITED,
    "RESOURCE_ALREADY_EXISTS": ErrorClass.CP_RESOURCE_CONFLICT,
    "RESOURCE_CONFLICT": ErrorClass.CP_RESOURCE_CONFLICT,
    "RESOURCE_DOES_NOT_EXIST": ErrorClass.CP_RESOURCE_NOT_FOUND,
    "RESOURCE_EXHAUSTED": ErrorClass.CP_REQUEST_RATE_LIMITED,
    "TEMPORARILY_UNAVAILABLE": ErrorClass.CP_TEMPORARILY_UNAVAILABLE,
    "UNAUTHENTICATED": ErrorClass.CP_PERMISSION_DENIED,
}

# error_class -> sqlstate mapping for specific error patterns that override the
# generic auto-derive. Used at raise sites where the error_code (e.g.,
# INVALID_PARAMETER_VALUE) is too coarse to distinguish the specific failure.
_ERROR_CLASS_TO_SQLSTATE: dict[str, SqlState] = {
    ErrorClass.ATTRIBUTE_NOT_FOUND: SqlState.CLIENT_ATTRIBUTE_NOT_FOUND,
    ErrorClass.MODEL_SERIALIZATION_FAILED: SqlState.CLIENT_MODEL_SERIALIZATION_FAILED,
    ErrorClass.PREDICTION_FUNCTION_FAILED: SqlState.CLIENT_PREDICTION_FUNCTION_FAILED,
    ErrorClass.SCHEMA_ENFORCEMENT_FAILED: SqlState.CLIENT_SCHEMA_ENFORCEMENT_FAILED,
}