Cradicle Explorer

/ src / evidently / legacy / tests / data_integrity_tests.py
data_integrity_tests.py
   1  from abc import ABC
   2  from typing import ClassVar
   3  from typing import Dict
   4  from typing import List
   5  from typing import Optional
   6  from typing import Union
   7  
   8  import numpy
   9  import numpy as np
  10  import pandas as pd
  11  from pandas.core.dtypes.common import infer_dtype_from_object  # type: ignore[attr-defined]
  12  
  13  from evidently.legacy.base_metric import ColumnName
  14  from evidently.legacy.metrics import ColumnRegExpMetric
  15  from evidently.legacy.metrics import ColumnSummaryMetric
  16  from evidently.legacy.metrics import DatasetMissingValuesMetric
  17  from evidently.legacy.metrics import DatasetSummaryMetric
  18  from evidently.legacy.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValues
  19  from evidently.legacy.metrics.data_integrity.dataset_missing_values_metric import DatasetMissingValuesMetricResult
  20  from evidently.legacy.metrics.data_integrity.dataset_summary_metric import DatasetSummary
  21  from evidently.legacy.model.widget import BaseWidgetInfo
  22  from evidently.legacy.renderers.base_renderer import DetailsInfo
  23  from evidently.legacy.renderers.base_renderer import TestHtmlInfo
  24  from evidently.legacy.renderers.base_renderer import TestRenderer
  25  from evidently.legacy.renderers.base_renderer import default_renderer
  26  from evidently.legacy.tests.base_test import BaseCheckValueTest
  27  from evidently.legacy.tests.base_test import ColumnCheckValueParameters
  28  from evidently.legacy.tests.base_test import ConditionFromReferenceMixin
  29  from evidently.legacy.tests.base_test import GroupData
  30  from evidently.legacy.tests.base_test import GroupingTypes
  31  from evidently.legacy.tests.base_test import Test
  32  from evidently.legacy.tests.base_test import TestParameters
  33  from evidently.legacy.tests.base_test import TestResult
  34  from evidently.legacy.tests.base_test import TestStatus
  35  from evidently.legacy.tests.base_test import TestValueCondition
  36  from evidently.legacy.tests.utils import approx
  37  from evidently.legacy.tests.utils import dataframes_to_table
  38  from evidently.legacy.tests.utils import plot_dicts_to_table
  39  from evidently.legacy.tests.utils import plot_value_counts_tables_ref_curr
  40  from evidently.legacy.utils.data_preprocessing import DataDefinition
  41  from evidently.legacy.utils.generators import BaseGenerator
  42  from evidently.legacy.utils.types import Numeric
  43  from evidently.legacy.utils.types import NumericApprox
  44  
# Group descriptor for the tests in this module; the test classes below use its ``id``
# as their ``group`` attribute. It is registered on GroupingTypes.TestGroup at import time.
DATA_INTEGRITY_GROUP = GroupData(id="data_integrity", title="Data Integrity", description="")
GroupingTypes.TestGroup.add_value(DATA_INTEGRITY_GROUP)
  47  
  48  
  49  class BaseIntegrityValueTest(ConditionFromReferenceMixin[DatasetSummary], ABC):
  50      group: ClassVar = DATA_INTEGRITY_GROUP.id
  51      _metric: DatasetSummaryMetric
  52  
  53      def __init__(
  54          self,
  55          eq: Optional[NumericApprox] = None,
  56          gt: Optional[Numeric] = None,
  57          gte: Optional[Numeric] = None,
  58          is_in: Optional[List[Union[Numeric, str, bool]]] = None,
  59          lt: Optional[Numeric] = None,
  60          lte: Optional[Numeric] = None,
  61          not_eq: Optional[Numeric] = None,
  62          not_in: Optional[List[Union[Numeric, str, bool]]] = None,
  63          is_critical: bool = True,
  64      ):
  65          super().__init__(
  66              eq=eq,
  67              gt=gt,
  68              gte=gte,
  69              is_in=is_in,
  70              lt=lt,
  71              lte=lte,
  72              not_eq=not_eq,
  73              not_in=not_in,
  74              is_critical=is_critical,
  75          )
  76          self._metric = DatasetSummaryMetric()
  77  
  78  
  79  class TestNumberOfColumns(BaseIntegrityValueTest):
  80      class Config:
  81          type_alias = "evidently:test:TestNumberOfColumns"
  82  
  83      """Number of all columns in the data, including utility columns (id/index, datetime, target, predictions)"""
  84  
  85      name: ClassVar = "Number of Columns"
  86  
  87      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
  88          if reference is not None:
  89              return TestValueCondition(eq=reference.number_of_columns)
  90          return TestValueCondition(gt=0)
  91  
  92      def calculate_value_for_test(self) -> Numeric:
  93          return self.metric.get_result().current.number_of_columns
  94  
  95      def get_description(self, value: Numeric) -> str:
  96          return f"The number of columns is {value}. The test threshold is {self.get_condition()}."
  97  
  98  
  99  @default_renderer(wrap_type=TestNumberOfColumns)
 100  class TestNumberOfColumnsRenderer(TestRenderer):
 101      def render_html(self, obj: TestNumberOfColumns) -> TestHtmlInfo:
 102          info = super().render_html(obj)
 103          columns = ["column name", "current dtype"]
 104          dict_curr = obj.metric.get_result().current.columns_type
 105          dict_ref = None
 106          reference_stats = obj.metric.get_result().reference
 107  
 108          if reference_stats is not None:
 109              dict_ref = reference_stats.columns_type
 110              columns = columns + ["reference dtype"]
 111  
 112          additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_column", "diff")
 113          info.details = additional_plots
 114          return info
 115  
 116  
 117  class TestNumberOfRows(BaseIntegrityValueTest):
 118      class Config:
 119          type_alias = "evidently:test:TestNumberOfRows"
 120  
 121      """Number of rows in the data"""
 122  
 123      name: ClassVar = "Number of Rows"
 124  
 125      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 126          if reference is not None:
 127              return TestValueCondition(eq=approx(reference.number_of_rows, relative=0.1))
 128  
 129          return TestValueCondition(gt=30)
 130  
 131      def calculate_value_for_test(self) -> Numeric:
 132          return self.metric.get_result().current.number_of_rows
 133  
 134      def get_description(self, value: Numeric) -> str:
 135          return f"The number of rows is {value}. The test threshold is {self.get_condition()}."
 136  
 137  
 138  class BaseIntegrityMissingValuesValuesTest(ConditionFromReferenceMixin[DatasetMissingValues], ABC):
 139      group: ClassVar = DATA_INTEGRITY_GROUP.id
 140      _metric: DatasetMissingValuesMetric
 141      missing_values: Optional[list] = None
 142      replace: bool = True
 143  
 144      def __init__(
 145          self,
 146          missing_values: Optional[list] = None,
 147          replace: bool = True,
 148          eq: Optional[Numeric] = None,
 149          gt: Optional[Numeric] = None,
 150          gte: Optional[Numeric] = None,
 151          is_in: Optional[List[Union[Numeric, str, bool]]] = None,
 152          lt: Optional[Numeric] = None,
 153          lte: Optional[Numeric] = None,
 154          not_eq: Optional[Numeric] = None,
 155          not_in: Optional[List[Union[Numeric, str, bool]]] = None,
 156          is_critical: bool = True,
 157      ):
 158          self.missing_values = missing_values
 159          self.replace = replace
 160          super().__init__(
 161              eq=eq,
 162              gt=gt,
 163              gte=gte,
 164              is_in=is_in,
 165              lt=lt,
 166              lte=lte,
 167              not_eq=not_eq,
 168              not_in=not_in,
 169              is_critical=is_critical,
 170          )
 171          self._metric = DatasetMissingValuesMetric(missing_values=self.missing_values, replace=self.replace)
 172  
 173  
 174  class BaseTestMissingValuesRenderer(TestRenderer):
 175      """Common class for tests of missing values.
 176      Some tests have the same details visualizations.
 177      """
 178  
 179      MISSING_VALUES_NAMING_MAPPING = {
 180          None: "Pandas nulls (None, NAN, etc.)",
 181          "": '"" (empty string)',
 182          np.inf: 'Numpy "inf" value',
 183          -np.inf: 'Numpy "-inf" value',
 184      }
 185  
 186      @staticmethod
 187      def _get_number_and_percents_of_missing_values(missing_values_info: DatasetMissingValues) -> pd.DataFrame:
 188          """Get a string with missing values numbers and percents from info for results table"""
 189          result = {}
 190  
 191          for columns_name in missing_values_info.number_of_missing_values_by_column:
 192              missing_values_count = missing_values_info.number_of_missing_values_by_column[columns_name]
 193              percent_count = missing_values_info.share_of_missing_values_by_column[columns_name] * 100
 194              result[columns_name] = f"{missing_values_count} ({percent_count:.2f}%)"
 195  
 196          return pd.DataFrame.from_dict(
 197              {
 198                  name: dict(
 199                      value=missing_values_info.number_of_missing_values_by_column[name],
 200                      display=f"{missing_values_info.number_of_missing_values_by_column[name]}"
 201                      f" ({missing_values_info.share_of_missing_values_by_column[name] * 100:.2f}%)",
 202                  )
 203                  for name in missing_values_info.number_of_missing_values_by_column.keys()
 204              },
 205              orient="index",
 206              columns=["value", "display"],
 207          )
 208  
 209      def get_table_with_missing_values_and_percents_by_column(
 210          self, info: TestHtmlInfo, metric_result: DatasetMissingValuesMetricResult, name: str
 211      ) -> TestHtmlInfo:
 212          """Get a table with missing values number and percents"""
 213          columns = ["column name", "current number of missing values"]
 214          dict_curr = self._get_number_and_percents_of_missing_values(metric_result.current)
 215          dict_ref = None
 216          reference_stats = metric_result.reference
 217  
 218          if reference_stats is not None:
 219              # add one more column and values for reference data
 220              columns.append("reference number of missing values")
 221              dict_ref = self._get_number_and_percents_of_missing_values(reference_stats)
 222  
 223          additional_plots = dataframes_to_table(dict_curr, dict_ref, columns, name)
 224          info.details = additional_plots
 225          return info
 226  
 227      def _replace_missing_values_to_description(self, values: dict) -> dict:
 228          """Replace missing values in the dict keys to human-readable string"""
 229          return {self.MISSING_VALUES_NAMING_MAPPING.get(k, k): v for k, v in values.items()}
 230  
 231      def get_table_with_number_of_missing_values_by_one_missing_value(
 232          self, info: TestHtmlInfo, current_missing_values: dict, reference_missing_values: Optional[dict], name: str
 233      ) -> TestHtmlInfo:
 234          columns = ["missing value", "current number of missing values"]
 235          dict_curr = self._replace_missing_values_to_description(current_missing_values)
 236          dict_ref: Optional[dict] = None
 237  
 238          if reference_missing_values is not None:
 239              # add one more column and values for reference data
 240              columns.append("reference number of missing values")
 241              # cast keys to str because None could be in keys, and it is not processed correctly in visual tables
 242              dict_ref = self._replace_missing_values_to_description(reference_missing_values)
 243  
 244          additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, name)
 245          info.details = additional_plots
 246          return info
 247  
 248  
 249  class TestNumberOfDifferentMissingValues(BaseIntegrityMissingValuesValuesTest):
 250      class Config:
 251          type_alias = "evidently:test:TestNumberOfDifferentMissingValues"
 252  
 253      """Check a number of different encoded missing values."""
 254  
 255      name: ClassVar = "Different Types of Missing Values"
 256  
 257      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 258          if reference is not None:
 259              return TestValueCondition(eq=reference.number_of_different_missing_values)
 260  
 261          return TestValueCondition(eq=0)
 262  
 263      def calculate_value_for_test(self) -> Numeric:
 264          return self.metric.get_result().current.number_of_different_missing_values
 265  
 266      def get_description(self, value: Numeric) -> str:
 267          return (
 268              f"The number of differently encoded types of missing values is {value}. "
 269              f"The test threshold is {self.get_condition()}."
 270          )
 271  
 272  
 273  @default_renderer(wrap_type=TestNumberOfDifferentMissingValues)
 274  class TestNumberOfDifferentMissingValuesRenderer(BaseTestMissingValuesRenderer):
 275      def render_html(self, obj: TestNumberOfDifferentMissingValues) -> TestHtmlInfo:
 276          """Get a table with a missing value and number of the value in the dataset"""
 277          info = super().render_html(obj)
 278          metric_result = obj.metric.get_result()
 279          current_missing_values = metric_result.current.different_missing_values
 280  
 281          if metric_result.reference is None:
 282              reference_missing_values = None
 283  
 284          else:
 285              reference_missing_values = metric_result.reference.different_missing_values
 286  
 287          return self.get_table_with_number_of_missing_values_by_one_missing_value(
 288              info,
 289              current_missing_values,
 290              reference_missing_values,
 291              "number_of_different_missing_values",
 292          )
 293  
 294  
 295  class TestNumberOfMissingValues(BaseIntegrityMissingValuesValuesTest):
 296      class Config:
 297          type_alias = "evidently:test:TestNumberOfMissingValues"
 298  
 299      """Check a number of missing values."""
 300  
 301      name: ClassVar = "The Number of Missing Values"
 302  
 303      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 304          if reference is not None:
 305              curr_number_of_rows = self.metric.get_result().current.number_of_rows
 306              ref_number_of_rows = reference.number_of_rows
 307              mult = curr_number_of_rows / ref_number_of_rows
 308              return TestValueCondition(
 309                  lte=approx(
 310                      reference.number_of_missing_values * mult,
 311                      relative=0.1,
 312                  ),
 313              )
 314  
 315          return TestValueCondition(eq=0)
 316  
 317      def calculate_value_for_test(self) -> Numeric:
 318          return self.metric.get_result().current.number_of_missing_values
 319  
 320      def get_description(self, value: Numeric) -> str:
 321          return f"The number of missing values is {value}. The test threshold is {self.get_condition()}."
 322  
 323  
 324  @default_renderer(wrap_type=TestNumberOfMissingValues)
 325  class TestNumberOfMissingValuesRenderer(BaseTestMissingValuesRenderer):
 326      def render_html(self, obj: TestNumberOfMissingValues) -> TestHtmlInfo:
 327          info = super().render_html(obj)
 328          metric_result = obj.metric.get_result()
 329          return self.get_table_with_missing_values_and_percents_by_column(
 330              info, metric_result, "number_of_missing_values"
 331          )
 332  
 333  
 334  class TestShareOfMissingValues(BaseIntegrityMissingValuesValuesTest):
 335      class Config:
 336          type_alias = "evidently:test:TestShareOfMissingValues"
 337  
 338      """Check a share of missing values."""
 339  
 340      name: ClassVar = "Share of Missing Values"
 341  
 342      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 343          if reference is not None:
 344              return TestValueCondition(lte=approx(reference.share_of_missing_values, relative=0.1))
 345  
 346          return TestValueCondition(eq=0)
 347  
 348      def calculate_value_for_test(self) -> Numeric:
 349          return self.metric.get_result().current.share_of_missing_values
 350  
 351      def get_description(self, value: Numeric) -> str:
 352          return f"The share of missing values is {value:.3g}. The test threshold is {self.get_condition()}."
 353  
 354  
 355  @default_renderer(wrap_type=TestShareOfMissingValues)
 356  class TestShareOfMissingValuesRenderer(BaseTestMissingValuesRenderer):
 357      def render_html(self, obj: TestNumberOfMissingValues) -> TestHtmlInfo:
 358          info = super().render_html(obj)
 359          metric_result = obj.metric.get_result()
 360          return self.get_table_with_missing_values_and_percents_by_column(info, metric_result, "share_of_missing_values")
 361  
 362  
 363  class TestNumberOfColumnsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
 364      class Config:
 365          type_alias = "evidently:test:TestNumberOfColumnsWithMissingValues"
 366  
 367      """Check a number of columns with a missing value."""
 368  
 369      name: ClassVar = "The Number of Columns With Missing Values"
 370  
 371      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 372          if reference is not None:
 373              return TestValueCondition(lte=reference.number_of_columns_with_missing_values)
 374  
 375          return TestValueCondition(eq=0)
 376  
 377      def calculate_value_for_test(self) -> Numeric:
 378          return self.metric.get_result().current.number_of_columns_with_missing_values
 379  
 380      def get_description(self, value: Numeric) -> str:
 381          return (
 382              f"The number of columns with missing values is {value}. " f"The test threshold is {self.get_condition()}."
 383          )
 384  
 385  
 386  @default_renderer(wrap_type=TestNumberOfColumnsWithMissingValues)
 387  class TestNumberOfColumnsWithMissingValuesRenderer(BaseTestMissingValuesRenderer):
 388      def render_html(self, obj: TestNumberOfMissingValues) -> TestHtmlInfo:
 389          info = super().render_html(obj)
 390          metric_result = obj.metric.get_result()
 391          return self.get_table_with_missing_values_and_percents_by_column(
 392              info, metric_result, "number_of_columns_with_missing_values"
 393          )
 394  
 395  
 396  class TestShareOfColumnsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
 397      class Config:
 398          type_alias = "evidently:test:TestShareOfColumnsWithMissingValues"
 399  
 400      """Check a share of columns with a missing value."""
 401  
 402      name: ClassVar = "The Share of Columns With Missing Values"
 403  
 404      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 405          if reference is not None:
 406              return TestValueCondition(lte=reference.share_of_columns_with_missing_values)
 407  
 408          return TestValueCondition(eq=0)
 409  
 410      def calculate_value_for_test(self) -> Numeric:
 411          return self.metric.get_result().current.share_of_columns_with_missing_values
 412  
 413      def get_description(self, value: Numeric) -> str:
 414          return (
 415              f"The share of columns with missing values is {value:.3g}. "
 416              f"The test threshold is {self.get_condition()}."
 417          )
 418  
 419  
 420  @default_renderer(wrap_type=TestShareOfColumnsWithMissingValues)
 421  class TestShareOfColumnsWithMissingValuesRenderer(BaseTestMissingValuesRenderer):
 422      def render_html(self, obj: TestNumberOfMissingValues) -> TestHtmlInfo:
 423          info = super().render_html(obj)
 424          metric_result = obj.metric.get_result()
 425          return self.get_table_with_missing_values_and_percents_by_column(
 426              info, metric_result, "share_of_columns_with_missing_values"
 427          )
 428  
 429  
 430  class TestNumberOfRowsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
 431      class Config:
 432          type_alias = "evidently:test:TestNumberOfRowsWithMissingValues"
 433  
 434      """Check a number of rows with a missing value."""
 435  
 436      name: ClassVar = "The Number Of Rows With Missing Values"
 437  
 438      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 439          if reference is not None:
 440              curr_number_of_rows = self.metric.get_result().current.number_of_rows
 441              ref_number_of_rows = reference.number_of_rows
 442              mult = curr_number_of_rows / ref_number_of_rows
 443              return TestValueCondition(
 444                  lte=approx(reference.number_of_rows_with_missing_values * mult, relative=0.1),
 445              )
 446  
 447          return TestValueCondition(eq=0)
 448  
 449      def calculate_value_for_test(self) -> Numeric:
 450          return self.metric.get_result().current.number_of_rows_with_missing_values
 451  
 452      def get_description(self, value: Numeric) -> str:
 453          return f"The number of rows with missing values is {value}. " f"The test threshold is {self.get_condition()}."
 454  
 455  
 456  class TestShareOfRowsWithMissingValues(BaseIntegrityMissingValuesValuesTest):
 457      class Config:
 458          type_alias = "evidently:test:TestShareOfRowsWithMissingValues"
 459  
 460      """Check a share of rows with a missing value."""
 461  
 462      name: ClassVar = "The Share of Rows With Missing Values"
 463  
 464      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 465          if reference is not None:
 466              return TestValueCondition(lte=approx(reference.share_of_rows_with_missing_values, relative=0.1))
 467  
 468          return TestValueCondition(eq=0)
 469  
 470      def calculate_value_for_test(self) -> Numeric:
 471          return self.metric.get_result().current.share_of_rows_with_missing_values
 472  
 473      def get_description(self, value: Numeric) -> str:
 474          return (
 475              f"The share of rows with missing values is {value:.3g}. " f"The test threshold is {self.get_condition()}."
 476          )
 477  
 478  
 479  class BaseIntegrityColumnMissingValuesTest(ConditionFromReferenceMixin[DatasetMissingValues], ABC):
 480      group: ClassVar = DATA_INTEGRITY_GROUP.id
 481      _metric: DatasetMissingValuesMetric
 482      column_name: str
 483      missing_values: Optional[List] = None
 484      replace: bool = True
 485  
 486      def __init__(
 487          self,
 488          column_name: str,
 489          missing_values: Optional[list] = None,
 490          replace: bool = True,
 491          eq: Optional[Numeric] = None,
 492          gt: Optional[Numeric] = None,
 493          gte: Optional[Numeric] = None,
 494          is_in: Optional[List[Union[Numeric, str, bool]]] = None,
 495          lt: Optional[Numeric] = None,
 496          lte: Optional[Numeric] = None,
 497          not_eq: Optional[Numeric] = None,
 498          not_in: Optional[List[Union[Numeric, str, bool]]] = None,
 499          is_critical: bool = True,
 500      ):
 501          self.column_name = column_name
 502          self.missing_values = missing_values
 503          self.replace = replace
 504          super().__init__(
 505              eq=eq,
 506              gt=gt,
 507              gte=gte,
 508              is_in=is_in,
 509              lt=lt,
 510              lte=lte,
 511              not_eq=not_eq,
 512              not_in=not_in,
 513              is_critical=is_critical,
 514          )
 515          self._metric = DatasetMissingValuesMetric(missing_values=self.missing_values, replace=self.replace)
 516  
 517  
 518  class TestColumnNumberOfDifferentMissingValues(BaseIntegrityColumnMissingValuesTest):
 519      class Config:
 520          type_alias = "evidently:test:TestColumnNumberOfDifferentMissingValues"
 521  
 522      """Check a number of differently encoded missing values in one column."""
 523  
 524      name: ClassVar = "Different Types of Missing Values in a Column"
 525  
 526      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 527          if reference is not None:
 528              if self.column_name not in reference.number_of_different_missing_values_by_column:
 529                  raise ValueError(
 530                      f"Cannot define test default conditions: no column '{self.column_name}' in reference dataset."
 531                  )
 532  
 533              ref_value = reference.number_of_different_missing_values_by_column[self.column_name]
 534              return TestValueCondition(lte=ref_value)
 535  
 536          return TestValueCondition(eq=0)
 537  
 538      def calculate_value_for_test(self) -> Numeric:
 539          metric_data = self.metric.get_result().current
 540          return metric_data.number_of_different_missing_values_by_column[self.column_name]
 541  
 542      def get_description(self, value: Numeric) -> str:
 543          return (
 544              f"The number of differently encoded types of missing values in the column **{self.column_name}** "
 545              f"is {value}. The test threshold is {self.get_condition()}."
 546          )
 547  
 548  
 549  @default_renderer(wrap_type=TestColumnNumberOfDifferentMissingValues)
 550  class TestColumnNumberOfDifferentMissingValuesRenderer(BaseTestMissingValuesRenderer):
 551      def render_html(self, obj: TestColumnNumberOfDifferentMissingValues) -> TestHtmlInfo:
 552          """Get a table with a missing value and number of the value in the dataset"""
 553          info = super().render_html(obj)
 554          metric_result = obj.metric.get_result()
 555          current_missing_values = metric_result.current.different_missing_values_by_column[obj.column_name]
 556  
 557          if metric_result.reference is None:
 558              reference_missing_values = None
 559  
 560          else:
 561              reference_missing_values = metric_result.reference.different_missing_values_by_column[obj.column_name]
 562  
 563          return self.get_table_with_number_of_missing_values_by_one_missing_value(
 564              info,
 565              current_missing_values,
 566              reference_missing_values,
 567              "number_of_different_missing_values",
 568          )
 569  
 570  
 571  class TestColumnNumberOfMissingValues(BaseIntegrityColumnMissingValuesTest):
 572      class Config:
 573          type_alias = "evidently:test:TestColumnNumberOfMissingValues"
 574  
 575      """Check a number of missing values in one column."""
 576  
 577      name: ClassVar = "The Number of Missing Values in a Column"
 578  
 579      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 580          if reference is not None:
 581              curr_number_of_rows = self.metric.get_result().current.number_of_rows
 582              ref_number_of_rows = reference.number_of_rows
 583              mult = curr_number_of_rows / ref_number_of_rows
 584              ref_value = reference.number_of_missing_values_by_column[self.column_name]
 585              return TestValueCondition(lte=approx(ref_value * mult, relative=0.1))
 586  
 587          return TestValueCondition(eq=0)
 588  
 589      def calculate_value_for_test(self) -> Numeric:
 590          return self.metric.get_result().current.number_of_missing_values_by_column[self.column_name]
 591  
 592      def get_description(self, value: Numeric) -> str:
 593          return (
 594              f"The number of missing values in the column **{self.column_name}** is {value}. "
 595              f"The test threshold is {self.get_condition()}."
 596          )
 597  
 598  
 599  class TestColumnShareOfMissingValues(BaseIntegrityColumnMissingValuesTest):
 600      class Config:
 601          type_alias = "evidently:test:TestColumnShareOfMissingValues"
 602  
 603      """Check a share of missing values in one column."""
 604  
 605      name: ClassVar = "The Share of Missing Values in a Column"
 606  
 607      def get_condition_from_reference(self, reference: Optional[DatasetMissingValues]):
 608          if reference is not None:
 609              ref_value = reference.share_of_missing_values_by_column[self.column_name]
 610              return TestValueCondition(lte=approx(ref_value, relative=0.1))
 611  
 612          return TestValueCondition(eq=0)
 613  
 614      def calculate_value_for_test(self) -> Numeric:
 615          return self.metric.get_result().current.share_of_missing_values_by_column[self.column_name]
 616  
 617      def get_description(self, value: Numeric) -> str:
 618          return (
 619              f"The share of missing values in the column **{self.column_name}** is {value:.3g}. "
 620              f"The test threshold is {self.get_condition()}."
 621          )
 622  
 623      def get_parameters(self):
 624          return ColumnCheckValueParameters(
 625              condition=self.get_condition(), value=self._value, column_name=self.column_name
 626          )
 627  
 628  
 629  class TestAllColumnsShareOfMissingValues(BaseGenerator):
 630      columns: Optional[List[str]]
 631  
 632      def __init__(self, columns: Optional[List[str]] = None, is_critical: bool = True):
 633          self.is_critical = is_critical
 634          self.columns = columns
 635  
 636      def generate(self, data_definition: DataDefinition) -> List[TestColumnShareOfMissingValues]:
 637          if self.columns is None:
 638              columns = [column.column_name for column in data_definition.get_columns()]
 639  
 640          else:
 641              columns = self.columns
 642  
 643          return [
 644              TestColumnShareOfMissingValues(
 645                  column_name=column,
 646                  is_critical=self.is_critical,
 647              )
 648              for column in columns
 649          ]
 650  
 651  
 652  class TestNumberOfConstantColumns(BaseIntegrityValueTest):
 653      class Config:
 654          type_alias = "evidently:test:TestNumberOfConstantColumns"
 655  
 656      """Number of columns contained only one unique value"""
 657  
 658      name: ClassVar = "Number of Constant Columns"
 659  
 660      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 661          if reference is not None:
 662              value = reference.number_of_constant_columns
 663              return TestValueCondition(lte=value)
 664  
 665          return TestValueCondition(eq=0)
 666  
 667      def calculate_value_for_test(self) -> Numeric:
 668          return self.metric.get_result().current.number_of_constant_columns
 669  
 670      def get_description(self, value: Numeric) -> str:
 671          return f"The number of constant columns is {value}. The test threshold is {self.get_condition()}."
 672  
 673  
 674  @default_renderer(wrap_type=TestNumberOfConstantColumns)
 675  class TestNumberOfConstantColumnsRenderer(TestRenderer):
 676      def render_html(self, obj: TestNumberOfConstantColumns) -> TestHtmlInfo:
 677          info = super().render_html(obj)
 678          columns = ["column name", "current nunique"]
 679          dict_curr = obj.metric.get_result().current.number_uniques_by_columns
 680          dict_ref = {}
 681          reference_stats = obj.metric.get_result().reference
 682  
 683          if reference_stats is not None:
 684              dict_ref = reference_stats.number_uniques_by_columns
 685              columns = columns + ["reference nunique"]
 686  
 687          additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_constant_cols", "curr", True)
 688          info.details = additional_plots
 689          return info
 690  
 691  
 692  class TestNumberOfEmptyRows(BaseIntegrityValueTest):
 693      class Config:
 694          type_alias = "evidently:test:TestNumberOfEmptyRows"
 695  
 696      """Number of rows contained all NAN values"""
 697  
 698      name: ClassVar = "Number of Empty Rows"
 699  
 700      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 701          if reference is not None:
 702              ref_number_of_empty_rows = reference.number_of_empty_rows
 703              curr_number_of_rows = self.metric.get_result().current.number_of_rows
 704              ref_number_of_rows = reference.number_of_rows
 705              mult = curr_number_of_rows / ref_number_of_rows
 706              return TestValueCondition(eq=approx(ref_number_of_empty_rows * mult, 0.1))
 707  
 708          return TestValueCondition(eq=0)
 709  
 710      def calculate_value_for_test(self) -> Numeric:
 711          return self.metric.get_result().current.number_of_empty_rows
 712  
 713      def get_description(self, value: Numeric) -> str:
 714          return f"Number of Empty Rows is {value}. The test threshold is {self.get_condition()}."
 715  
 716  
 717  class TestNumberOfEmptyColumns(BaseIntegrityValueTest):
 718      class Config:
 719          type_alias = "evidently:test:TestNumberOfEmptyColumns"
 720  
 721      """Number of columns contained all NAN values"""
 722  
 723      name: ClassVar = "Number of Empty Columns"
 724  
 725      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 726          if reference is not None:
 727              return TestValueCondition(lte=reference.number_of_empty_columns)
 728  
 729          return TestValueCondition(eq=0)
 730  
 731      def calculate_value_for_test(self) -> Numeric:
 732          return self.metric.get_result().current.number_of_empty_columns
 733  
 734      def get_description(self, value: Numeric) -> str:
 735          return f"Number of Empty Columns is {value}. The test threshold is {self.get_condition()}."
 736  
 737  
 738  @default_renderer(wrap_type=TestNumberOfEmptyColumns)
 739  class TestNumberOfEmptyColumnsRenderer(TestRenderer):
 740      def render_html(self, obj: TestNumberOfEmptyColumns) -> TestHtmlInfo:
 741          info = super().render_html(obj)
 742          columns = ["column name", "current number of NaNs"]
 743          dict_curr = obj.metric.get_result().current.nans_by_columns
 744          dict_ref = {}
 745          reference_stats = obj.metric.get_result().reference
 746  
 747          if reference_stats is not None:
 748              dict_ref = reference_stats.nans_by_columns
 749              columns = columns + ["reference number of NaNs"]
 750  
 751          additional_plots = plot_dicts_to_table(dict_curr, dict_ref, columns, "number_of_empty_columns")
 752          info.details = additional_plots
 753          return info
 754  
 755  
 756  class TestNumberOfDuplicatedRows(BaseIntegrityValueTest):
 757      class Config:
 758          type_alias = "evidently:test:TestNumberOfDuplicatedRows"
 759  
 760      """How many rows have duplicates in the dataset"""
 761  
 762      name: ClassVar = "Number of Duplicate Rows"
 763  
 764      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 765          if reference is not None:
 766              ref_num_of_duplicates = reference.number_of_duplicated_rows
 767              curr_number_of_rows = self.metric.get_result().current.number_of_rows
 768              ref_number_of_rows = reference.number_of_rows
 769              mult = curr_number_of_rows / ref_number_of_rows
 770              return TestValueCondition(eq=approx(ref_num_of_duplicates * mult, 0.1))
 771  
 772          return TestValueCondition(eq=0)
 773  
 774      def calculate_value_for_test(self) -> Numeric:
 775          return self.metric.get_result().current.number_of_duplicated_rows
 776  
 777      def get_description(self, value: Numeric) -> str:
 778          return f"The number of duplicate rows is {value}. The test threshold is {self.get_condition()}."
 779  
 780  
 781  class TestNumberOfDuplicatedColumns(BaseIntegrityValueTest):
 782      class Config:
 783          type_alias = "evidently:test:TestNumberOfDuplicatedColumns"
 784  
 785      """How many columns have duplicates in the dataset"""
 786  
 787      name: ClassVar = "Number of Duplicate Columns"
 788  
 789      def get_condition_from_reference(self, reference: Optional[DatasetSummary]):
 790          if reference is not None:
 791              value = reference.number_of_duplicated_columns
 792              return TestValueCondition(lte=value)
 793  
 794          return TestValueCondition(eq=0)
 795  
 796      def calculate_value_for_test(self) -> Numeric:
 797          return self.metric.get_result().current.number_of_duplicated_columns
 798  
 799      def get_description(self, value: Numeric) -> str:
 800          return f"The number of duplicate columns is {value}. The test threshold is {self.get_condition()}."
 801  
 802  
 803  class BaseIntegrityByColumnsConditionTest(BaseCheckValueTest, ABC):
 804      group: ClassVar = DATA_INTEGRITY_GROUP.id
 805      _data_integrity_metric: ColumnSummaryMetric
 806      column_name: ColumnName
 807  
 808      def __init__(
 809          self,
 810          column_name: Union[str, ColumnName],
 811          eq: Optional[Numeric] = None,
 812          gt: Optional[Numeric] = None,
 813          gte: Optional[Numeric] = None,
 814          is_in: Optional[List[Union[Numeric, str, bool]]] = None,
 815          lt: Optional[Numeric] = None,
 816          lte: Optional[Numeric] = None,
 817          not_eq: Optional[Numeric] = None,
 818          not_in: Optional[List[Union[Numeric, str, bool]]] = None,
 819          is_critical: bool = True,
 820      ):
 821          super().__init__(
 822              eq=eq,
 823              gt=gt,
 824              gte=gte,
 825              is_in=is_in,
 826              lt=lt,
 827              lte=lte,
 828              not_eq=not_eq,
 829              not_in=not_in,
 830              is_critical=is_critical,
 831          )
 832          self.column_name = ColumnName.from_any(column_name)
 833          self._data_integrity_metric = ColumnSummaryMetric(column_name=column_name)
 834  
 835      def groups(self) -> Dict[str, str]:
 836          if self.column_name is not None:
 837              return {GroupingTypes.ByFeature.id: self.column_name.display_name}
 838          return {}
 839  
 840  
 841  class BaseIntegrityOneColumnTest(Test, ABC):
 842      group: ClassVar = DATA_INTEGRITY_GROUP.id
 843      _metric: ColumnSummaryMetric
 844      column_name: ColumnName
 845  
 846      def __init__(self, column_name: Union[str, ColumnName], is_critical: bool = True):
 847          self.column_name = ColumnName.from_any(column_name)
 848          super().__init__(is_critical=is_critical)
 849          self._metric = ColumnSummaryMetric(self.column_name)
 850  
 851      @property
 852      def metric(self):
 853          return self._metric
 854  
 855      def groups(self) -> Dict[str, str]:
 856          return {GroupingTypes.ByFeature.id: self.column_name.display_name}
 857  
 858  
 859  class TestColumnAllConstantValues(BaseIntegrityOneColumnTest):
 860      class Config:
 861          type_alias = "evidently:test:TestColumnAllConstantValues"
 862  
 863      """Test that there is only one unique value in a column"""
 864  
 865      name: ClassVar = "All Constant Values in a Column"
 866      _metric: ColumnSummaryMetric
 867  
 868      def check(self):
 869          uniques_in_column = self.metric.get_result().current_characteristics.unique
 870          number_of_rows = self.metric.get_result().current_characteristics.number_of_rows
 871          column_name = self.column_name
 872  
 873          description = (
 874              f"The number of the unique values in the column **{column_name}** "
 875              f"is {uniques_in_column} out of {number_of_rows}"
 876          )
 877  
 878          if uniques_in_column <= 1:
 879              status = TestStatus.FAIL
 880  
 881          else:
 882              status = TestStatus.SUCCESS
 883  
 884          return TestResult(
 885              name=self.name, description=description, status=status, groups=self.groups(), group=self.group
 886          )
 887  
 888  
 889  @default_renderer(wrap_type=TestColumnAllConstantValues)
 890  class TestColumnAllConstantValuesRenderer(TestRenderer):
 891      def render_html(self, obj: TestColumnAllConstantValues) -> TestHtmlInfo:
 892          info = super().render_html(obj)
 893          column_name = obj.column_name
 894          counts_data = obj.metric.get_result().plot_data.counts_of_values
 895          if counts_data is not None:
 896              curr_df = counts_data["current"]
 897              ref_df = None
 898              if "reference" in counts_data.keys():
 899                  ref_df = counts_data["reference"]
 900              additional_plots = plot_value_counts_tables_ref_curr(column_name, curr_df, ref_df, "AllConstantValues")
 901              info.details = additional_plots
 902          return info
 903  
 904  
 905  class TestColumnAllUniqueValues(BaseIntegrityOneColumnTest):
 906      class Config:
 907          type_alias = "evidently:test:TestColumnAllUniqueValues"
 908  
 909      """Test that there is only uniques values in a column"""
 910  
 911      name: ClassVar = "All Unique Values in a Column"
 912  
 913      def check(self):
 914          uniques_in_column = self.metric.get_result().current_characteristics.unique
 915          number_of_rows = self.metric.get_result().current_characteristics.number_of_rows
 916          nans_in_column = self.metric.get_result().current_characteristics.missing
 917          column_name = self.column_name
 918  
 919          description = (
 920              f"The number of the unique values in the column **{column_name}** "
 921              f"is {uniques_in_column}  out of {number_of_rows}"
 922          )
 923  
 924          if uniques_in_column != number_of_rows - nans_in_column:
 925              status = TestStatus.FAIL
 926  
 927          else:
 928              status = TestStatus.SUCCESS
 929  
 930          return TestResult(
 931              name=self.name, description=description, status=status, groups=self.groups(), group=self.group
 932          )
 933  
 934  
 935  @default_renderer(wrap_type=TestColumnAllUniqueValues)
 936  class TestColumnAllUniqueValuesRenderer(TestRenderer):
 937      def render_html(self, obj: TestColumnAllUniqueValues) -> TestHtmlInfo:
 938          info = super().render_html(obj)
 939          column_name = obj.column_name
 940          counts_data = obj.metric.get_result().plot_data.counts_of_values
 941          if counts_data is not None:
 942              curr_df = counts_data["current"]
 943              ref_df = None
 944              if "reference" in counts_data.keys():
 945                  ref_df = counts_data["reference"]
 946              additional_plots = plot_value_counts_tables_ref_curr(column_name, curr_df, ref_df, "AllUniqueValues")
 947              info.details = additional_plots
 948          return info
 949  
 950  
class ColumnTypeParameter(TestParameters):
    """Type-comparison record for one column, as produced by ``TestColumnsType.check``."""

    class Config:
        type_alias = "evidently:test_parameters:ColumnTypeParameter"

    # ``__name__`` of the type resolved for the column in the current data.
    actual_type: str
    # Name of the column the record describes.
    column_name: str
    # ``__name__`` of the type the column was expected to have.
    expected_type: str
 958  
 959  
class ColumnTypesParameter(TestParameters):
    """Test parameters holding the per-column type records of a ``TestColumnsType`` run."""

    class Config:
        type_alias = "evidently:test_parameters:ColumnTypesParameter"

    # One record per column that was compared.
    columns: List[ColumnTypeParameter]
 965  
 966  
 967  class TestColumnsType(Test):
 968      class Config:
 969          type_alias = "evidently:test:TestColumnsType"
 970  
 971      """This test compares columns type against the specified ones or a reference dataframe"""
 972  
 973      group: ClassVar = DATA_INTEGRITY_GROUP.id
 974      name: ClassVar = "Column Types"
 975      columns_type: Optional[dict]
 976      _metric: DatasetSummaryMetric
 977  
 978      def __init__(self, columns_type: Optional[dict] = None, is_critical: bool = True):
 979          self.columns_type = columns_type
 980          self._metric = DatasetSummaryMetric()
 981          super().__init__(is_critical=is_critical)
 982  
 983      @property
 984      def metric(self):
 985          return self._metric
 986  
 987      def check(self):
 988          status = TestStatus.SUCCESS
 989          data_columns_type = self.metric.get_result().current.columns_type
 990  
 991          if self.columns_type is None:
 992              if self.metric.get_result().reference is None:
 993                  status = TestStatus.ERROR
 994                  description = "Cannot compare column types without conditions or a reference"
 995                  return TestResult(name=self.name, description=description, status=status, group=self.group)
 996  
 997              # get types from reference
 998              columns_type = self.metric.get_result().reference.columns_type
 999  
1000          else:
1001              columns_type = self.columns_type
1002  
1003              if not columns_type:
1004                  status = TestStatus.ERROR
1005                  description = "Columns type condition is empty"
1006                  return TestResult(name=self.name, description=description, status=status, group=self.group)
1007  
1008          invalid_types_count = 0
1009          columns = []
1010  
1011          for column_name, expected_type_object in columns_type.items():
1012              real_column_type_object = data_columns_type.get(column_name)
1013  
1014              if real_column_type_object is None:
1015                  status = TestStatus.ERROR
1016                  description = f"No column '{column_name}' in the metrics data"
1017                  return TestResult(name=self.name, description=description, status=status, group=self.group)
1018  
1019              if isinstance(expected_type_object, numpy.dtypes.DateTime64DType):
1020                  expected_type = expected_type_object.type
1021              else:
1022                  expected_type = infer_dtype_from_object(expected_type_object)
1023  
1024              if isinstance(real_column_type_object, numpy.dtypes.DateTime64DType):
1025                  real_column_type = real_column_type_object.type
1026              else:
1027                  real_column_type = infer_dtype_from_object(real_column_type_object)
1028              columns.append(
1029                  ColumnTypeParameter(
1030                      actual_type=real_column_type.__name__, expected_type=expected_type.__name__, column_name=column_name
1031                  )
1032              )
1033  
1034              if expected_type == real_column_type or issubclass(real_column_type, expected_type):
1035                  # types are matched or expected type is a parent
1036                  continue
1037  
1038              status = TestStatus.FAIL
1039              invalid_types_count += 1
1040  
1041          return TestResult(
1042              name=self.name,
1043              description=f"The number of columns with a type "
1044              f"mismatch is {invalid_types_count} out of {len(columns_type)}.",
1045              status=status,
1046              parameters=ColumnTypesParameter(columns=columns),
1047              group=self.group,
1048          )
1049  
1050      def groups(self) -> Dict[str, str]:
1051          return {}
1052  
1053  
@default_renderer(wrap_type=TestColumnsType)
class TestColumnsTypeRenderer(TestRenderer):
    """HTML renderer for ``TestColumnsType``."""

    def render_html(self, obj: TestColumnsType) -> TestHtmlInfo:
        """Render the base test info and attach a table of column name, actual type and expected type."""
        info = super().render_html(obj)

        parameters = obj.get_result().parameters
        assert isinstance(parameters, ColumnTypesParameter)

        rows = [[column.column_name, column.actual_type, column.expected_type] for column in parameters.columns]
        table_widget = BaseWidgetInfo(
            title="",
            type="table",
            params={
                "header": ["Column Name", "Actual Type", "Expected Type"],
                "data": rows,
            },
            size=2,
        )
        info.details = [DetailsInfo(title="", info=table_widget)]
        return info
1076  
1077  
class TestColumnRegExp(BaseCheckValueTest, ABC):
    """Check the number of values in a column that do not match a regular expression.

    The tested value is ``number_of_not_matched`` from the current part of the
    ``ColumnRegExpMetric`` result.
    """

    group: ClassVar = DATA_INTEGRITY_GROUP.id
    name: ClassVar = "RegExp Match"
    _metric: ColumnRegExpMetric
    column_name: str
    reg_exp: str

    def __init__(
        self,
        column_name: str,
        reg_exp: str,
        eq: Optional[Numeric] = None,
        gt: Optional[Numeric] = None,
        gte: Optional[Numeric] = None,
        is_in: Optional[List[Union[Numeric, str, bool]]] = None,
        lt: Optional[Numeric] = None,
        lte: Optional[Numeric] = None,
        not_eq: Optional[Numeric] = None,
        not_in: Optional[List[Union[Numeric, str, bool]]] = None,
        is_critical: bool = True,
    ):
        """Store the column and pattern, initialise the base test, then create the metric."""
        self.column_name = column_name
        self.reg_exp = reg_exp
        super().__init__(
            eq=eq,
            gt=gt,
            gte=gte,
            is_in=is_in,
            lt=lt,
            lte=lte,
            not_eq=not_eq,
            not_in=not_in,
            is_critical=is_critical,
        )
        self._metric = ColumnRegExpMetric(column_name=self.column_name, reg_exp=self.reg_exp)

    @property
    def metric(self):
        """The ``ColumnRegExpMetric`` created for this test."""
        return self._metric

    def groups(self) -> Dict[str, str]:
        """Return the by-feature grouping for the column, or an empty mapping if no column is set."""
        if self.column_name is not None:
            return {GroupingTypes.ByFeature.id: self.column_name}
        return {}

    def get_condition(self) -> TestValueCondition:
        """Return the condition the tested value is compared against.

        An explicitly configured condition wins. Otherwise, with a non-empty
        reference, the expected value is the reference number of mismatches
        scaled by ``current rows / reference rows`` and wrapped in
        ``approx(..., relative=0.1)``. Without a reference, or when the
        reference has no rows (scaling would divide by zero), the condition is
        ``eq=0``.
        """
        if self.condition.has_condition():
            return self.condition

        metric_result = self.metric.get_result()
        reference = metric_result.reference

        # Guard against a zero-row reference, which would raise ZeroDivisionError.
        if reference is not None and reference.number_of_rows:
            mult = metric_result.current.number_of_rows / reference.number_of_rows
            return TestValueCondition(eq=approx(reference.number_of_not_matched * mult, relative=0.1))

        return TestValueCondition(eq=0)

    def calculate_value_for_test(self) -> Optional[Numeric]:
        """Return the number of non-matching values in the current data."""
        return self.metric.get_result().current.number_of_not_matched

    def get_description(self, value: Numeric) -> str:
        """Format the human-readable result line for the given value."""
        return (
            f"The number of the mismatched values in the column **{self.column_name}** is {value}. "
            f"The test threshold is {self.get_condition()}."
        )
1146  
1147  
@default_renderer(wrap_type=TestColumnRegExp)
class TestColumnRegExpRenderer(TestRenderer):
    """HTML renderer for ``TestColumnRegExp``."""

    def render_html(self, obj: TestColumnRegExp) -> TestHtmlInfo:
        """Render the base test info and attach value-count tables of the non-matching values.

        The current table falls back to an empty ``x``/``count`` frame when
        there are no mismatches; the reference table stays ``None`` when there
        is no reference or it has no mismatches.
        """

        def to_counts_frame(not_matched):
            # One row per (value, count) pair, with the two columns named x / count.
            frame = pd.DataFrame(not_matched.items())
            frame.columns = pd.Index(["x", "count"])
            return frame

        info = super().render_html(obj)
        column_name = obj.column_name
        metric_result = obj.metric.get_result()

        current_not_matched = metric_result.current.table_of_not_matched
        if current_not_matched:
            curr_df = to_counts_frame(current_not_matched)
        else:
            curr_df = pd.DataFrame(columns=["x", "count"])

        reference = metric_result.reference
        ref_df = None
        if reference is not None and reference.table_of_not_matched:
            ref_df = to_counts_frame(reference.table_of_not_matched)

        info.details = plot_value_counts_tables_ref_curr(
            column_name, curr_df, ref_df, f"{column_name}_ColumnValueRegExp"
        )
        return info