data_integrity_tests.py
from abc import ABC
from typing import ClassVar
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import numpy
import numpy as np
import pandas as pd
from pandas.core.dtypes.common import infer_dtype_from_object  # type: ignore[attr-defined]

from evidently.legacy.base_metric import ColumnName
from evidently.legacy.metrics import ColumnRegExpMetric
from evidently.legacy.metrics import ColumnSummaryMetric
from evidently.legacy.metrics import DatasetMissingValuesMetric
from evidently.legacy.metrics import DatasetSummaryMetric
from evidently.legacy.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValues
from evidently.legacy.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetricResult
from evidently.legacy.metrics.data_integrity.dataset_summary_metric import DatasetSummary
from evidently.legacy.model.widget import BaseWidgetInfo
from evidently.legacy.renderers.base_renderer import DetailsInfo
from evidently.legacy.renderers.base_renderer import TestHtmlInfo
from evidently.legacy.renderers.base_renderer import TestRenderer
from evidently.legacy.renderers.base_renderer import default_renderer
from evidently.legacy.tests.base_test import BaseCheckValueTest
from evidently.legacy.tests.base_test import ColumnCheckValueParameters
from evidently.legacy.tests.base_test import ConditionFromReferenceMixin
from evidently.legacy.tests.base_test import GroupData
from evidently.legacy.tests.base_test import GroupingTypes
from evidently.legacy.tests.base_test import Test
from evidently.legacy.tests.base_test import TestParameters
from evidently.legacy.tests.base_test import TestResult
from evidently.legacy.tests.base_test import TestStatus
from evidently.legacy.tests.base_test import TestValueCondition
from evidently.legacy.tests.utils import approx
from evidently.legacy.tests.utils import dataframes_to_table
from evidently.legacy.tests.utils import plot_dicts_to_table
from evidently.legacy.tests.utils import plot_value_counts_tables_ref_curr
from evidently.legacy.utils.data_preprocessing import DataDefinition
from evidently.legacy.utils.generators import BaseGenerator
from evidently.legacy.utils.types import Numeric
from evidently.legacy.utils.types import NumericApprox

# Register the "Data Integrity" group so every test in this module can refer to it by id.
DATA_INTEGRITY_GROUP = GroupData(id="data_integrity", title="Data Integrity", description="")
GroupingTypes.TestGroup.add_value(DATA_INTEGRITY_GROUP)


class BaseIntegrityValueTest(ConditionFromReferenceMixin[DatasetSummary], ABC):
    """Base class for integrity tests whose value is read from a ``DatasetSummaryMetric``."""

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    _metric: DatasetSummaryMetric

    def __init__(
        self,
        eq: Optional[NumericApprox] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Forward the condition arguments to the parent and attach a fresh ``DatasetSummaryMetric``."""
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self._metric = DatasetSummaryMetric()


class TestNumberOfColumns(BaseIntegrityValueTest):
    """Number of all columns in the data, including utility columns (id/index, datetime, target, predictions)."""

    class Config:
        type_alias = "evidently:test:TestNumberOfColumns"

    name: ClassVar = "Number of Columns"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: equal to the reference column count, or ``> 0`` without a reference."""
        if reference is not None:
            return TestValueCondition(eq=reference.number_of_columns)
        return TestValueCondition(gt=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of columns in the current dataset."""
        return self.metric.get_result().current.number_of_columns

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of columns is {value}. The test threshold is {self.get_condition()}."


@default_renderer(wrap_type=TestNumberOfColumns)
class TestNumberOfColumnsRenderer(TestRenderer):
    """HTML renderer for ``TestNumberOfColumns``: adds a table of column dtypes."""

    def render_html(self, obj: TestNumberOfColumns) -> TestHtmlInfo:
        """Extend the base HTML info with current (and, if present, reference) column types."""
        info = super().render_html(obj)
        columns = ["column name", "current dtype"]
        dict_curr = obj.metric.get_result().current.columns_type
        dict_ref = None
        reference_stats = obj.metric.get_result().reference

        if reference_stats is not None:
            # the reference adds one more table column
            dict_ref = reference_stats.columns_type
            columns = columns + ["reference dtype"]

        additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_column", "diff")
        info.details = additional_plots
        return info


class TestNumberOfRows(BaseIntegrityValueTest):
    """Number of rows in the data."""

    class Config:
        type_alias = "evidently:test:TestNumberOfRows"

    name: ClassVar = "Number of Rows"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: approximately the reference row count (relative=0.1), or ``> 30`` without one."""
        if reference is not None:
            return TestValueCondition(eq=approx(reference.number_of_rows, relative=0.1))

        return TestValueCondition(gt=30)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of rows in the current dataset."""
        return self.metric.get_result().current.number_of_rows

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of rows is {value}. The test threshold is {self.get_condition()}."
class BaseIntegrityMissingValuesValuesTest(ConditionFromReferenceMixin[DatasetMissingValues], ABC):
    """Base class for dataset-level missing-values tests backed by ``DatasetMissingValuesMetric``."""

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    _metric: DatasetMissingValuesMetric
    missing_values: Optional[list] = None
    replace: bool = True

    def __init__(
        self,
        missing_values: Optional[list] = None,
        replace: bool = True,
        eq: Optional[Numeric] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Store the missing-values options, forward the condition arguments, and build the metric.

        ``missing_values`` and ``replace`` are passed unchanged to ``DatasetMissingValuesMetric``.
        """
        self.missing_values = missing_values
        self.replace = replace
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self._metric = DatasetMissingValuesMetric(missing_values=self.missing_values, replace=self.replace)


class BaseTestMissingValuesRenderer(TestRenderer):
    """Common class for tests of missing values.
    Some tests have the same details visualizations.
    """

    # Readable labels for missing-value markers that would otherwise be shown raw in tables.
    MISSING_VALUES_NAMING_MAPPING = {
        None: "Pandas nulls (None, NAN, etc.)",
        "": '"" (empty string)',
        np.inf: 'Numpy "inf" value',
        -np.inf: 'Numpy "-inf" value',
    }

    @staticmethod
    def _get_number_and_percents_of_missing_values(missing_values_info: DatasetMissingValues) -> pd.DataFrame:
        """Get a data frame with missing values numbers and percents for the results table.

        The frame is indexed by column name and has two columns: ``value`` (the raw count)
        and ``display`` (the count followed by the share formatted as a percentage).
        """
        counts = missing_values_info.number_of_missing_values_by_column
        shares = missing_values_info.share_of_missing_values_by_column

        return pd.DataFrame.from_dict(
            {
                name: dict(
                    value=counts[name],
                    display=f"{counts[name]} ({shares[name] * 100:.2f}%)",
                )
                for name in counts.keys()
            },
            orient="index",
            columns=["value", "display"],
        )

    def get_table_with_missing_values_and_percents_by_column(
        self, info: TestHtmlInfo, metric_result: DatasetMissingValuesMetricResult, name: str
    ) -> TestHtmlInfo:
        """Get a table with missing values number and percents"""
        columns = ["column name", "current number of missing values"]
        dict_curr = self._get_number_and_percents_of_missing_values(metric_result.current)
        dict_ref = None
        reference_stats = metric_result.reference

        if reference_stats is not None:
            # add one more column and values for reference data
            columns.append("reference number of missing values")
            dict_ref = self._get_number_and_percents_of_missing_values(reference_stats)

        additional_plots = dataframes_to_table(dict_curr, dict_ref, columns, name)
        info.details = additional_plots
        return info

    def _replace_missing_values_to_description(self, values: dict) -> dict:
        """Replace missing values in the dict keys to human-readable string"""
        return {self.MISSING_VALUES_NAMING_MAPPING.get(k, k): v for k, v in values.items()}

    def get_table_with_number_of_missing_values_by_one_missing_value(
        self, info: TestHtmlInfo, current_missing_values: dict, reference_missing_values: Optional[dict], name: str
    ) -> TestHtmlInfo:
        """Get a table with the number of occurrences of each missing-value marker"""
        columns = ["missing value", "current number of missing values"]
        dict_curr = self._replace_missing_values_to_description(current_missing_values)
        dict_ref: Optional[dict] = None

        if reference_missing_values is not None:
            # add one more column and values for reference data
            columns.append("reference number of missing values")
            # relabel keys because None could be in keys, and it is not processed correctly in visual tables
            dict_ref = self._replace_missing_values_to_description(reference_missing_values)

        additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, name)
        info.details = additional_plots
        return info


class TestNumberOfDifferentMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a number of different encoded missing values."""

    class Config:
        type_alias = "evidently:test:TestNumberOfDifferentMissingValues"

    name: ClassVar = "Different Types of Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: equal to the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            return TestValueCondition(eq=reference.number_of_different_missing_values)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of different missing-value markers in the current dataset."""
        return self.metric.get_result().current.number_of_different_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return (
            f"The number of differently encoded types of missing values is {value}. "
            f"The test threshold is {self.get_condition()}."
        )


@default_renderer(wrap_type=TestNumberOfDifferentMissingValues)
class TestNumberOfDifferentMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestNumberOfDifferentMissingValues``."""

    def render_html(self, obj: TestNumberOfDifferentMissingValues) -> TestHtmlInfo:
        """Get a table with a missing value and number of the value in the dataset"""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        current_missing_values = metric_result.current.different_missing_values

        if metric_result.reference is None:
            reference_missing_values = None

        else:
            reference_missing_values = metric_result.reference.different_missing_values

        return self.get_table_with_number_of_missing_values_by_one_missing_value(
            info,
            current_missing_values,
            reference_missing_values,
            "number_of_different_missing_values",
        )


class TestNumberOfMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a number of missing values."""

    class Config:
        type_alias = "evidently:test:TestNumberOfMissingValues"

    name: ClassVar = "The Number of Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference count scaled to the current row count, or ``== 0``.

        The reference count is multiplied by ``current rows / reference rows`` and wrapped
        in ``approx(..., relative=0.1)``.
        """
        if reference is not None:
            curr_number_of_rows = self.metric.get_result().current.number_of_rows
            ref_number_of_rows = reference.number_of_rows
            # NOTE(review): a reference with zero rows makes this division fail — confirm it cannot reach here.
            mult = curr_number_of_rows / ref_number_of_rows
            return TestValueCondition(
                lte=approx(
                    reference.number_of_missing_values * mult,
                    relative=0.1,
                ),
            )

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of missing values in the current dataset."""
        return self.metric.get_result().current.number_of_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of missing values is {value}. The test threshold is {self.get_condition()}."
@default_renderer(wrap_type=TestNumberOfMissingValues)
class TestNumberOfMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestNumberOfMissingValues``."""

    def render_html(self, obj: TestNumberOfMissingValues) -> TestHtmlInfo:
        """Extend the base HTML info with the per-column missing-values table."""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        return self.get_table_with_missing_values_and_percents_by_column(
            info, metric_result, "number_of_missing_values"
        )


class TestShareOfMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a share of missing values."""

    class Config:
        type_alias = "evidently:test:TestShareOfMissingValues"

    name: ClassVar = "Share of Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference share (``approx``, relative=0.1), or ``== 0`` without one."""
        if reference is not None:
            return TestValueCondition(lte=approx(reference.share_of_missing_values, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the share of missing values in the current dataset."""
        return self.metric.get_result().current.share_of_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line; the value is formatted with ``.3g``."""
        return f"The share of missing values is {value:.3g}. The test threshold is {self.get_condition()}."
@default_renderer(wrap_type=TestShareOfMissingValues)
class TestShareOfMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestShareOfMissingValues``."""

    def render_html(self, obj: TestShareOfMissingValues) -> TestHtmlInfo:
        """Extend the base HTML info with the per-column missing-values table."""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        return self.get_table_with_missing_values_and_percents_by_column(info, metric_result, "share_of_missing_values")


class TestNumberOfColumnsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a number of columns with a missing value."""

    class Config:
        type_alias = "evidently:test:TestNumberOfColumnsWithMissingValues"

    name: ClassVar = "The Number of Columns With Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            return TestValueCondition(lte=reference.number_of_columns_with_missing_values)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of columns with missing values in the current dataset."""
        return self.metric.get_result().current.number_of_columns_with_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return (
            f"The number of columns with missing values is {value}. "
            f"The test threshold is {self.get_condition()}."
        )


@default_renderer(wrap_type=TestNumberOfColumnsWithMissingValues)
class TestNumberOfColumnsWithMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestNumberOfColumnsWithMissingValues``."""

    def render_html(self, obj: TestNumberOfColumnsWithMissingValues) -> TestHtmlInfo:
        """Extend the base HTML info with the per-column missing-values table."""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        return self.get_table_with_missing_values_and_percents_by_column(
            info, metric_result, "number_of_columns_with_missing_values"
        )


class TestShareOfColumnsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a share of columns with a missing value."""

    class Config:
        type_alias = "evidently:test:TestShareOfColumnsWithMissingValues"

    name: ClassVar = "The Share of Columns With Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            return TestValueCondition(lte=reference.share_of_columns_with_missing_values)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the share of columns with missing values in the current dataset."""
        return self.metric.get_result().current.share_of_columns_with_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line; the value is formatted with ``.3g``."""
        return (
            f"The share of columns with missing values is {value:.3g}. "
            f"The test threshold is {self.get_condition()}."
        )


@default_renderer(wrap_type=TestShareOfColumnsWithMissingValues)
class TestShareOfColumnsWithMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestShareOfColumnsWithMissingValues``."""

    def render_html(self, obj: TestShareOfColumnsWithMissingValues) -> TestHtmlInfo:
        """Extend the base HTML info with the per-column missing-values table."""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        return self.get_table_with_missing_values_and_percents_by_column(
            info, metric_result, "share_of_columns_with_missing_values"
        )


class TestNumberOfRowsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a number of rows with a missing value."""

    class Config:
        type_alias = "evidently:test:TestNumberOfRowsWithMissingValues"

    name: ClassVar = "The Number Of Rows With Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference count scaled to the current row count, or ``== 0``.

        The reference count is multiplied by ``current rows / reference rows`` and wrapped
        in ``approx(..., relative=0.1)``.
        """
        if reference is not None:
            curr_number_of_rows = self.metric.get_result().current.number_of_rows
            ref_number_of_rows = reference.number_of_rows
            # NOTE(review): a reference with zero rows makes this division fail — confirm it cannot reach here.
            mult = curr_number_of_rows / ref_number_of_rows
            return TestValueCondition(
                lte=approx(reference.number_of_rows_with_missing_values * mult, relative=0.1),
            )

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of rows with missing values in the current dataset."""
        return self.metric.get_result().current.number_of_rows_with_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of rows with missing values is {value}. " f"The test threshold is {self.get_condition()}."
class TestShareOfRowsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
    """Check a share of rows with a missing value."""

    class Config:
        type_alias = "evidently:test:TestShareOfRowsWithMissingValues"

    name: ClassVar = "The Share of Rows With Missing Values"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference share (``approx``, relative=0.1), or ``== 0`` without one."""
        if reference is not None:
            return TestValueCondition(lte=approx(reference.share_of_rows_with_missing_values, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the share of rows with missing values in the current dataset."""
        return self.metric.get_result().current.share_of_rows_with_missing_values

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line; the value is formatted with ``.3g``."""
        return (
            f"The share of rows with missing values is {value:.3g}. " f"The test threshold is {self.get_condition()}."
        )


class BaseIntegrityColumnMissingValuesTest(ConditionFromReferenceMixin[DatasetMissingValues], ABC):
    """Base class for single-column missing-values tests backed by ``DatasetMissingValuesMetric``."""

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    _metric: DatasetMissingValuesMetric
    column_name: str
    missing_values: Optional[List] = None
    replace: bool = True

    def __init__(
        self,
        column_name: str,
        missing_values: Optional[list] = None,
        replace: bool = True,
        eq: Optional[Numeric] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Store the column and missing-values options, forward the condition arguments, build the metric.

        ``missing_values`` and ``replace`` are passed unchanged to ``DatasetMissingValuesMetric``.
        """
        self.column_name = column_name
        self.missing_values = missing_values
        self.replace = replace
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self._metric = DatasetMissingValuesMetric(missing_values=self.missing_values, replace=self.replace)


class TestColumnNumberOfDifferentMissingValues(BaseIntegrityColumnMissingValuesTest):
    """Check a number of differently encoded missing values in one column."""

    class Config:
        type_alias = "evidently:test:TestColumnNumberOfDifferentMissingValues"

    name: ClassVar = "Different Types of Missing Values in a Column"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference value for the column, or ``== 0`` without a reference.

        Raises:
            ValueError: if the reference has no entry for ``self.column_name``.
        """
        if reference is not None:
            if self.column_name not in reference.number_of_different_missing_values_by_column:
                raise ValueError(
                    f"Cannot define test default conditions: no column '{self.column_name}' in reference dataset."
                )

            ref_value = reference.number_of_different_missing_values_by_column[self.column_name]
            return TestValueCondition(lte=ref_value)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of different missing-value markers in the column of the current dataset."""
        metric_data = self.metric.get_result().current
        return metric_data.number_of_different_missing_values_by_column[self.column_name]

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return (
            f"The number of differently encoded types of missing values in the column **{self.column_name}** "
            f"is {value}. The test threshold is {self.get_condition()}."
        )


@default_renderer(wrap_type=TestColumnNumberOfDifferentMissingValues)
class TestColumnNumberOfDifferentMissingValuesRenderer(BaseTestMissingValuesRenderer):
    """HTML renderer for ``TestColumnNumberOfDifferentMissingValues``."""

    def render_html(self, obj: TestColumnNumberOfDifferentMissingValues) -> TestHtmlInfo:
        """Get a table with a missing value and number of the value in the column"""
        info = super().render_html(obj)
        metric_result = obj.metric.get_result()
        current_missing_values = metric_result.current.different_missing_values_by_column[obj.column_name]

        if metric_result.reference is None:
            reference_missing_values = None

        else:
            reference_missing_values = metric_result.reference.different_missing_values_by_column[obj.column_name]

        return self.get_table_with_number_of_missing_values_by_one_missing_value(
            info,
            current_missing_values,
            reference_missing_values,
            "number_of_different_missing_values",
        )


class TestColumnNumberOfMissingValues(BaseIntegrityColumnMissingValuesTest):
    """Check a number of missing values in one column."""

    class Config:
        type_alias = "evidently:test:TestColumnNumberOfMissingValues"

    name: ClassVar = "The Number of Missing Values in a Column"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference column count scaled to the current row count, or ``== 0``.

        The reference count is multiplied by ``current rows / reference rows`` and wrapped
        in ``approx(..., relative=0.1)``.
        """
        if reference is not None:
            curr_number_of_rows = self.metric.get_result().current.number_of_rows
            ref_number_of_rows = reference.number_of_rows
            # NOTE(review): a reference with zero rows makes this division fail — confirm it cannot reach here.
            mult = curr_number_of_rows / ref_number_of_rows
            # NOTE(review): unlike TestColumnNumberOfDifferentMissingValues, a column absent from the
            # reference surfaces here as a plain lookup error rather than a descriptive ValueError.
            ref_value = reference.number_of_missing_values_by_column[self.column_name]
            return TestValueCondition(lte=approx(ref_value * mult, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of missing values in the column of the current dataset."""
        return self.metric.get_result().current.number_of_missing_values_by_column[self.column_name]

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return (
            f"The number of missing values in the column **{self.column_name}** is {value}. "
            f"The test threshold is {self.get_condition()}."
        )


class TestColumnShareOfMissingValues(BaseIntegrityColumnMissingValuesTest):
    """Check a share of missing values in one column."""

    class Config:
        type_alias = "evidently:test:TestColumnShareOfMissingValues"

    name: ClassVar = "The Share of Missing Values in a Column"

    def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
        """Default condition: ``<=`` the reference column share (``approx``, relative=0.1), or ``== 0``."""
        if reference is not None:
            ref_value = reference.share_of_missing_values_by_column[self.column_name]
            return TestValueCondition(lte=approx(ref_value, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the share of missing values in the column of the current dataset."""
        return self.metric.get_result().current.share_of_missing_values_by_column[self.column_name]

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line; the value is formatted with ``.3g``."""
        return (
            f"The share of missing values in the column **{self.column_name}** is {value:.3g}. "
            f"The test threshold is {self.get_condition()}."
        )

    def get_parameters(self):
        """Return the condition, the stored value and the column name as ``ColumnCheckValueParameters``."""
        return ColumnCheckValueParameters(
            condition=self.get_condition(), value=self._value, column_name=self.column_name
        )


class TestAllColumnsShareOfMissingValues(BaseGenerator):
    """Generator that creates one ``TestColumnShareOfMissingValues`` per column."""

    columns: Optional[List[str]]

    def __init__(self, columns: Optional[List[str]] = None, is_critical: bool = True):
        """Remember the column subset (``None`` means all columns) and the criticality flag."""
        self.is_critical = is_critical
        self.columns = columns

    def generate(self, data_definition: DataDefinition) -> List[TestColumnShareOfMissingValues]:
        """Create the tests for ``self.columns``, or for every column of ``data_definition`` when it is None."""
        if self.columns is None:
            columns = [column.column_name for column in data_definition.get_columns()]

        else:
            columns = self.columns

        return [
            TestColumnShareOfMissingValues(
                column_name=column,
                is_critical=self.is_critical,
            )
            for column in columns
        ]


class TestNumberOfConstantColumns(BaseIntegrityValueTest):
    """Number of columns containing only one unique value."""

    class Config:
        type_alias = "evidently:test:TestNumberOfConstantColumns"

    name: ClassVar = "Number of Constant Columns"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: ``<=`` the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            value = reference.number_of_constant_columns
            return TestValueCondition(lte=value)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of constant columns in the current dataset."""
        return self.metric.get_result().current.number_of_constant_columns

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of constant columns is {value}. The test threshold is {self.get_condition()}."
@default_renderer(wrap_type=TestNumberOfConstantColumns)
class TestNumberOfConstantColumnsRenderer(TestRenderer):
    """HTML renderer for ``TestNumberOfConstantColumns``: adds a table of unique-value counts."""

    def render_html(self, obj: TestNumberOfConstantColumns) -> TestHtmlInfo:
        """Extend the base HTML info with current (and, if present, reference) unique counts per column."""
        info = super().render_html(obj)
        columns = ["column name", "current nunique"]
        dict_curr = obj.metric.get_result().current.number_uniques_by_columns
        dict_ref = {}
        reference_stats = obj.metric.get_result().reference

        if reference_stats is not None:
            # the reference adds one more table column
            dict_ref = reference_stats.number_uniques_by_columns
            columns = columns + ["reference nunique"]

        additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_constant_cols", "curr", True)
        info.details = additional_plots
        return info


class TestNumberOfEmptyRows(BaseIntegrityValueTest):
    """Number of rows containing only NaN values."""

    class Config:
        type_alias = "evidently:test:TestNumberOfEmptyRows"

    name: ClassVar = "Number of Empty Rows"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: approximately the reference count scaled to the current row count, or ``== 0``.

        The reference count is multiplied by ``current rows / reference rows`` and wrapped in ``approx``
        with ``0.1`` as its second positional argument.
        """
        if reference is not None:
            ref_number_of_empty_rows = reference.number_of_empty_rows
            curr_number_of_rows = self.metric.get_result().current.number_of_rows
            ref_number_of_rows = reference.number_of_rows
            # NOTE(review): a reference with zero rows makes this division fail — confirm it cannot reach here.
            mult = curr_number_of_rows / ref_number_of_rows
            return TestValueCondition(eq=approx(ref_number_of_empty_rows * mult, 0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of empty rows in the current dataset."""
        return self.metric.get_result().current.number_of_empty_rows

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"Number of Empty Rows is {value}. The test threshold is {self.get_condition()}."
class TestNumberOfEmptyColumns(BaseIntegrityValueTest):
    """Number of columns containing only NaN values."""

    class Config:
        type_alias = "evidently:test:TestNumberOfEmptyColumns"

    name: ClassVar = "Number of Empty Columns"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: ``<=`` the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            return TestValueCondition(lte=reference.number_of_empty_columns)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of empty columns in the current dataset."""
        return self.metric.get_result().current.number_of_empty_columns

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"Number of Empty Columns is {value}. The test threshold is {self.get_condition()}."


@default_renderer(wrap_type=TestNumberOfEmptyColumns)
class TestNumberOfEmptyColumnsRenderer(TestRenderer):
    """HTML renderer for ``TestNumberOfEmptyColumns``: adds a table of NaN counts."""

    def render_html(self, obj: TestNumberOfEmptyColumns) -> TestHtmlInfo:
        """Extend the base HTML info with current (and, if present, reference) NaN counts per column."""
        info = super().render_html(obj)
        columns = ["column name", "current number of NaNs"]
        dict_curr = obj.metric.get_result().current.nans_by_columns
        dict_ref = {}
        reference_stats = obj.metric.get_result().reference

        if reference_stats is not None:
            # the reference adds one more table column
            dict_ref = reference_stats.nans_by_columns
            columns = columns + ["reference number of NaNs"]

        additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_empty_columns")
        info.details = additional_plots
        return info


class TestNumberOfDuplicatedRows(BaseIntegrityValueTest):
    """How many rows have duplicates in the dataset."""

    class Config:
        type_alias = "evidently:test:TestNumberOfDuplicatedRows"

    name: ClassVar = "Number of Duplicate Rows"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: approximately the reference count scaled to the current row count, or ``== 0``.

        The reference count is multiplied by ``current rows / reference rows`` and wrapped in ``approx``
        with ``0.1`` as its second positional argument.
        """
        if reference is not None:
            ref_num_of_duplicates = reference.number_of_duplicated_rows
            curr_number_of_rows = self.metric.get_result().current.number_of_rows
            ref_number_of_rows = reference.number_of_rows
            # NOTE(review): a reference with zero rows makes this division fail — confirm it cannot reach here.
            mult = curr_number_of_rows / ref_number_of_rows
            return TestValueCondition(eq=approx(ref_num_of_duplicates * mult, 0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of duplicated rows in the current dataset."""
        return self.metric.get_result().current.number_of_duplicated_rows

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of duplicate rows is {value}. The test threshold is {self.get_condition()}."


class TestNumberOfDuplicatedColumns(BaseIntegrityValueTest):
    """How many columns have duplicates in the dataset."""

    class Config:
        type_alias = "evidently:test:TestNumberOfDuplicatedColumns"

    name: ClassVar = "Number of Duplicate Columns"

    def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
        """Default condition: ``<=`` the reference value, or ``== 0`` without a reference."""
        if reference is not None:
            value = reference.number_of_duplicated_columns
            return TestValueCondition(lte=value)

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Numeric:
        """Return the number of duplicated columns in the current dataset."""
        return self.metric.get_result().current.number_of_duplicated_columns

    def get_description(self, value: Numeric) -> str:
        """Build the human-readable result line for the given value."""
        return f"The number of duplicate columns is {value}. The test threshold is {self.get_condition()}."
class BaseIntegrityByColumnsConditionTest(BaseCheckValueTest, ABC):
    """Base for value-condition integrity tests that are bound to one column.

    Stores the column as a ``ColumnName`` and creates a ``ColumnSummaryMetric`` for it.
    """

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    _data_integrity_metric: ColumnSummaryMetric
    column_name: ColumnName

    def __init__(
        self,
        column_name: Union[str, ColumnName],
        eq: Optional[Numeric] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Forward the comparison arguments to the base class, then bind the column.

        Args:
            column_name: column to test, as a plain name or a ``ColumnName``.
            eq, gt, gte, is_in, lt, lte, not_eq, not_in: condition arguments passed
                through unchanged to ``BaseCheckValueTest.__init__``.
            is_critical: passed through unchanged to the base class.
        """
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self.column_name = ColumnName.from_any(column_name)
        # The metric receives the argument as given, not the normalized ``ColumnName``.
        self._data_integrity_metric = ColumnSummaryMetric(column_name=column_name)

    def groups(self) -> Dict[str, str]:
        """Return the by-feature grouping for the column, or an empty dict when it is None."""
        if self.column_name is not None:
            return {GroupingTypes.ByFeature.id: self.column_name.display_name}
        return {}


class BaseIntegrityOneColumnTest(Test, ABC):
    """Base for integrity tests that evaluate one column through a ``ColumnSummaryMetric``."""

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    _metric: ColumnSummaryMetric
    column_name: ColumnName

    def __init__(self, column_name: Union[str, ColumnName], is_critical: bool = True):
        """Normalize ``column_name`` to a ``ColumnName`` and create the summary metric for it."""
        self.column_name = ColumnName.from_any(column_name)
        super().__init__(is_critical=is_critical)
        self._metric = ColumnSummaryMetric(self.column_name)

    @property
    def metric(self):
        """The ``ColumnSummaryMetric`` created in ``__init__``."""
        return self._metric

    def groups(self) -> Dict[str, str]:
        """Return the by-feature grouping keyed on the column's display name."""
        return {GroupingTypes.ByFeature.id: self.column_name.display_name}


class TestColumnAllConstantValues(BaseIntegrityOneColumnTest):
    """Check whether a column is constant: the test fails when it has at most one unique value."""

    class Config:
        type_alias = "evidently:test:TestColumnAllConstantValues"

    name: ClassVar = "All Constant Values in a Column"
    _metric: ColumnSummaryMetric

    def check(self):
        """Return FAIL when the current data has one or zero unique values, SUCCESS otherwise."""
        # Fetch the metric result once; both figures come from the same characteristics object.
        characteristics = self.metric.get_result().current_characteristics
        uniques_in_column = characteristics.unique

        description = (
            f"The number of the unique values in the column **{self.column_name}** "
            f"is {uniques_in_column} out of {characteristics.number_of_rows}"
        )
        status = TestStatus.FAIL if uniques_in_column <= 1 else TestStatus.SUCCESS

        return TestResult(
            name=self.name, description=description, status=status, groups=self.groups(), group=self.group
        )


@default_renderer(wrap_type=TestColumnAllConstantValues)
class TestColumnAllConstantValuesRenderer(TestRenderer):
    """HTML renderer for ``TestColumnAllConstantValues``."""

    def render_html(self, obj: TestColumnAllConstantValues) -> TestHtmlInfo:
        """Render the base HTML and, when value counts are available, set them as details."""
        info = super().render_html(obj)
        counts_data = obj.metric.get_result().plot_data.counts_of_values
        if counts_data is not None:
            # The reference counts are optional; ``None`` is passed on when they are absent.
            info.details = plot_value_counts_tables_ref_curr(
                obj.column_name, counts_data["current"], counts_data.get("reference"), "AllConstantValues"
            )
        return info


class TestColumnAllUniqueValues(BaseIntegrityOneColumnTest):
    """Test that there are only unique values in a column."""

    class Config:
        type_alias = "evidently:test:TestColumnAllUniqueValues"

    name: ClassVar = "All Unique Values in a Column"

    def check(self):
        """Return SUCCESS when the unique count equals the row count minus the missing count."""
        # Fetch the metric result once; all three figures come from the same characteristics object.
        characteristics = self.metric.get_result().current_characteristics
        uniques_in_column = characteristics.unique
        number_of_rows = characteristics.number_of_rows

        description = (
            f"The number of the unique values in the column **{self.column_name}** "
            f"is {uniques_in_column} out of {number_of_rows}"
        )
        # Rows counted as missing are excluded from the expected number of unique values.
        all_unique = uniques_in_column == number_of_rows - characteristics.missing
        status = TestStatus.SUCCESS if all_unique else TestStatus.FAIL

        return TestResult(
            name=self.name, description=description, status=status, groups=self.groups(), group=self.group
        )


@default_renderer(wrap_type=TestColumnAllUniqueValues)
class TestColumnAllUniqueValuesRenderer(TestRenderer):
    """HTML renderer for ``TestColumnAllUniqueValues``."""

    def render_html(self, obj: TestColumnAllUniqueValues) -> TestHtmlInfo:
        """Render the base HTML and, when value counts are available, set them as details."""
        info = super().render_html(obj)
        counts_data = obj.metric.get_result().plot_data.counts_of_values
        if counts_data is not None:
            # The reference counts are optional; ``None`` is passed on when they are absent.
            info.details = plot_value_counts_tables_ref_curr(
                obj.column_name, counts_data["current"], counts_data.get("reference"), "AllUniqueValues"
            )
        return info


class ColumnTypeParameter(TestParameters):
    """Actual and expected type names recorded for one column by ``TestColumnsType``."""

    class Config:
        type_alias = "evidently:test_parameters:ColumnTypeParameter"

    actual_type: str
    column_name: str
    expected_type: str


class ColumnTypesParameter(TestParameters):
    """Collection of per-column type records attached to a ``TestColumnsType`` result."""

    class Config:
        type_alias = "evidently:test_parameters:ColumnTypesParameter"

    columns: List[ColumnTypeParameter]


def _resolve_column_type(type_object):
    """Return the type class used to compare a column type.

    ``numpy.dtypes.DateTime64DType`` instances are resolved through their ``type``
    attribute; every other object is passed to pandas' ``infer_dtype_from_object``.
    """
    if isinstance(type_object, numpy.dtypes.DateTime64DType):
        return type_object.type
    return infer_dtype_from_object(type_object)


class TestColumnsType(Test):
    """This test compares column types against the specified ones or a reference dataframe."""

    class Config:
        type_alias = "evidently:test:TestColumnsType"

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    name: ClassVar = "Column Types"
    columns_type: Optional[dict]
    _metric: DatasetSummaryMetric

    def __init__(self, columns_type: Optional[dict] = None, is_critical: bool = True):
        """Store the expected types (column name -> type) and create the dataset summary metric."""
        self.columns_type = columns_type
        self._metric = DatasetSummaryMetric()
        super().__init__(is_critical=is_critical)

    @property
    def metric(self):
        """The ``DatasetSummaryMetric`` created in ``__init__``."""
        return self._metric

    def check(self):
        """Compare the current column types with the expected ones.

        Expected types come from ``self.columns_type`` when it is set, otherwise from the
        reference summary. The result is ERROR when neither source is available, when the
        explicit mapping is empty, or when an expected column is absent from the current
        data; FAIL when at least one column type mismatches; SUCCESS otherwise.
        """
        result = self.metric.get_result()
        data_columns_type = result.current.columns_type

        if self.columns_type is not None:
            columns_type = self.columns_type
            if not columns_type:
                return TestResult(
                    name=self.name,
                    description="Columns type condition is empty",
                    status=TestStatus.ERROR,
                    group=self.group,
                )
        elif result.reference is None:
            return TestResult(
                name=self.name,
                description="Cannot compare column types without conditions or a reference",
                status=TestStatus.ERROR,
                group=self.group,
            )
        else:
            # No explicit expectation: take the types from the reference.
            columns_type = result.reference.columns_type

        invalid_types_count = 0
        columns = []

        for column_name, expected_type_object in columns_type.items():
            real_column_type_object = data_columns_type.get(column_name)

            if real_column_type_object is None:
                return TestResult(
                    name=self.name,
                    description=f"No column '{column_name}' in the metrics data",
                    status=TestStatus.ERROR,
                    group=self.group,
                )

            expected_type = _resolve_column_type(expected_type_object)
            real_column_type = _resolve_column_type(real_column_type_object)
            columns.append(
                ColumnTypeParameter(
                    actual_type=real_column_type.__name__,
                    expected_type=expected_type.__name__,
                    column_name=column_name,
                )
            )

            # A column matches when the types are equal or the expected type is a parent class.
            if not (expected_type == real_column_type or issubclass(real_column_type, expected_type)):
                invalid_types_count += 1

        status = TestStatus.FAIL if invalid_types_count else TestStatus.SUCCESS

        return TestResult(
            name=self.name,
            description=f"The number of columns with a type "
            f"mismatch is {invalid_types_count} out of {len(columns_type)}.",
            status=status,
            parameters=ColumnTypesParameter(columns=columns),
            group=self.group,
        )

    def groups(self) -> Dict[str, str]:
        """Return an empty mapping: this test defines no grouping."""
        return {}


@default_renderer(wrap_type=TestColumnsType)
class TestColumnsTypeRenderer(TestRenderer):
    """HTML renderer for ``TestColumnsType``."""

    def render_html(self, obj: TestColumnsType) -> TestHtmlInfo:
        """Render the base HTML plus a table of column name, actual type and expected type."""
        info = super().render_html(obj)

        parameters = obj.get_result().parameters
        assert isinstance(parameters, ColumnTypesParameter)
        info.details = [
            DetailsInfo(
                title="",
                info=BaseWidgetInfo(
                    title="",
                    type="table",
                    params={
                        "header": ["Column Name", "Actual Type", "Expected Type"],
                        "data": [[c.column_name, c.actual_type, c.expected_type] for c in parameters.columns],
                    },
                    size=2,
                ),
            ),
        ]
        return info


class TestColumnRegExp(BaseCheckValueTest, ABC):
    """Check the number of values in a column that do not match a regular expression."""

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    name: ClassVar = "RegExp Match"
    _metric: ColumnRegExpMetric
    column_name: str
    reg_exp: str

    def __init__(
        self,
        column_name: str,
        reg_exp: str,
        eq: Optional[Numeric] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Store the column and pattern, forward the conditions, and create the regexp metric.

        Args:
            column_name: name of the column to test.
            reg_exp: regular expression handed to ``ColumnRegExpMetric``.
            eq, gt, gte, is_in, lt, lte, not_eq, not_in: condition arguments passed
                through unchanged to ``BaseCheckValueTest.__init__``.
            is_critical: passed through unchanged to the base class.
        """
        self.column_name = column_name
        self.reg_exp = reg_exp
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self._metric = ColumnRegExpMetric(column_name=self.column_name, reg_exp=self.reg_exp)

    @property
    def metric(self):
        """The ``ColumnRegExpMetric`` created in ``__init__``."""
        return self._metric

    def groups(self) -> Dict[str, str]:
        """Return the by-feature grouping for the column, or an empty dict when it is None."""
        if self.column_name is not None:
            return {GroupingTypes.ByFeature.id: self.column_name}
        return {}

    def get_condition(self) -> TestValueCondition:
        """Return the condition to test against.

        An explicitly configured condition wins. Otherwise, when a reference with at least
        one row exists, the expected value is the reference's ``number_of_not_matched``
        scaled by the current/reference row ratio, with a relative tolerance of 0.1.
        In every other case the value must equal 0.
        """
        if self.condition.has_condition():
            return self.condition

        metric_result = self.metric.get_result()
        reference = metric_result.reference

        # A reference without rows gives no ratio to scale by (and would divide by zero),
        # so it is handled like a missing reference.
        if reference and reference.number_of_rows:
            mult = metric_result.current.number_of_rows / reference.number_of_rows
            return TestValueCondition(eq=approx(reference.number_of_not_matched * mult, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Optional[Numeric]:
        """Return ``number_of_not_matched`` of the current data from the metric result."""
        return self.metric.get_result().current.number_of_not_matched

    def get_description(self, value: Numeric) -> str:
        """Return a sentence stating the mismatch count and the condition from ``get_condition()``."""
        return (
            f"The number of the mismatched values in the column **{self.column_name}** is {value}. "
            f"The test threshold is {self.get_condition()}."
        )


@default_renderer(wrap_type=TestColumnRegExp)
class TestColumnRegExpRenderer(TestRenderer):
    """HTML renderer for ``TestColumnRegExp``."""

    def render_html(self, obj: TestColumnRegExp) -> TestHtmlInfo:
        """Render the base HTML and set value-count tables of the not-matched values as details."""
        info = super().render_html(obj)
        column_name = obj.column_name
        metric_result = obj.metric.get_result()

        if metric_result.current.table_of_not_matched:
            curr_df = pd.DataFrame(metric_result.current.table_of_not_matched.items())
            curr_df.columns = pd.Index(["x", "count"])

        else:
            # Nothing mismatched: pass an empty frame with the same two columns.
            curr_df = pd.DataFrame(columns=["x", "count"])

        ref_df = None

        if metric_result.reference is not None and metric_result.reference.table_of_not_matched:
            ref_df = pd.DataFrame(metric_result.reference.table_of_not_matched.items())
            ref_df.columns = pd.Index(["x", "count"])

        additional_plots = plot_value_counts_tables_ref_curr(
            column_name, curr_df, ref_df, f"{column_name}_ColumnValueRegExp"
        )
        info.details = additional_plots
        return info