"""Tests for the legacy evidently data-quality test presets.

Each test builds a small pandas DataFrame, runs a TestSuite with one
data-quality test, and checks the pass/fail outcome and (for *_json_render
tests) the exact serialized test result.
"""

import json

import pandas as pd
import pytest
from pytest import approx as pytest_approx

from evidently.legacy.pipeline.column_mapping import ColumnMapping
from evidently.legacy.test_suite import TestSuite
from evidently.legacy.tests import TestCategoryCount
from evidently.legacy.tests import TestColumnQuantile
from evidently.legacy.tests import TestColumnValueMax
from evidently.legacy.tests import TestColumnValueMean
from evidently.legacy.tests import TestColumnValueMedian
from evidently.legacy.tests import TestColumnValueMin
from evidently.legacy.tests import TestColumnValueStd
from evidently.legacy.tests import TestConflictPrediction
from evidently.legacy.tests import TestConflictTarget
from evidently.legacy.tests import TestHighlyCorrelatedColumns
from evidently.legacy.tests import TestMeanInNSigmas
from evidently.legacy.tests import TestMostCommonValueShare
from evidently.legacy.tests import TestNumberOfOutListValues
from evidently.legacy.tests import TestNumberOfOutRangeValues
from evidently.legacy.tests import TestNumberOfUniqueValues
from evidently.legacy.tests import TestShareOfOutListValues
from evidently.legacy.tests import TestShareOfOutRangeValues
from evidently.legacy.tests import TestTargetFeaturesCorrelations
from evidently.legacy.tests import TestTargetPredictionCorrelation
from evidently.legacy.tests import TestUniqueValuesShare
from evidently.legacy.tests import TestValueList
from evidently.legacy.tests import TestValueRange
from evidently.legacy.tests.base_test import TestStatus
from evidently.legacy.tests.utils import approx


@pytest.mark.parametrize(
    "test_dataset, reference_dataset, test_object, expected_success",
    (
        (
            pd.DataFrame(
                {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature", gte=10),
            False,
        ),
        (
            pd.DataFrame(
                {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature", eq=0),
            True,
        ),
        (
            pd.DataFrame(
                {
                    "category_feature": ["n", "d", "p", "n"],
                    "numerical_feature": [0.4, 0.1, -1.45, 5],
                    "target": [0, 0, 0, 1],
                }
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature", eq=approx(-1, absolute=0.5)),
            True,
        ),
        (
            pd.DataFrame(
                {
                    "category_feature": ["n", "d", "p", "n"],
                    "numerical_feature": [10, 7, 5.1, 4.9],
                    "target": [0, 0, 0, 1],
                }
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature", lt=approx(10, relative=0.5)),
            True,
        ),
        (
            pd.DataFrame(
                {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [10, 7, 5.1, 5], "target": [0, 0, 0, 1]}
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature", lt=approx(10, relative=0.5)),
            False,
        ),
    ),
)
def test_data_quality_test_min(
    test_dataset: pd.DataFrame, reference_dataset: pd.DataFrame, test_object: TestColumnValueMin, expected_success: bool
) -> None:
    suite = TestSuite(tests=[test_object])
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=mapping)
    if expected_success:
        # surface the underlying error instead of a bare False suite on unexpected failure
        suite._inner_suite.raise_for_error()
    assert bool(suite) is expected_success


@pytest.mark.parametrize(
    "test_dataset, reference_dataset, test_object, expected_success",
    (
        (
            pd.DataFrame(
                {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
            ),
            None,
            TestColumnValueMin(column_name="numerical_feature"),
            False,
        ),
    ),
)
def test_data_quality_test_min_exception(
    test_dataset: pd.DataFrame, reference_dataset: pd.DataFrame, test_object: TestColumnValueMin, expected_success: bool
) -> None:
    # a min test without a condition and without reference data must report ERROR
    suite = TestSuite(tests=[test_object])
    suite.run(current_data=test_dataset, reference_data=reference_dataset)
    assert suite.as_dict()["tests"][0]["status"] == TestStatus.ERROR.value


def test_data_quality_test_min_render():
    test_dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    suite = TestSuite(tests=[TestColumnValueMin(column_name="numerical_feature", eq=0)])
    column_mapping = ColumnMapping(numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=column_mapping)
    assert suite.show()
    assert suite.json()

    suite = TestSuite(tests=[TestColumnValueMin(column_name="numerical_feature")])
    mapping = ColumnMapping(numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()


def test_data_quality_test_max() -> None:
    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", gt=10)])
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", eq=5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite


def test_data_quality_test_max_render():
    test_dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", eq=0)])
    mapping = ColumnMapping(numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite.show()
    assert suite.json()

    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature")])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()


def test_data_quality_test_mean() -> None:
    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=5)])
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", gt=0, lt=10)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite

    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=2)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite


def test_data_quality_test_mean_render():
    test_dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    mapping = ColumnMapping(numerical_features=["numerical_feature"])
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=0)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite.show()
    assert suite.json()

    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature")])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()


def test_data_quality_test_conflict_target() -> None:
    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "n", "p", "n"], "numerical_feature": [0, 0, 2, 5], "target": [0, 1, 0, 1]}
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite = TestSuite(tests=[TestConflictTarget()])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictTarget()])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    suite._inner_suite.raise_for_error()
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_conflict_prediction() -> None:
    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "n", "p", "n"], "numerical_feature": [0, 0, 2, 5], "prediction": [0, 1, 0, 1]}
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite = TestSuite(tests=[TestConflictPrediction()])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    suite._inner_suite.raise_for_error()
    assert not suite

    test_dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "prediction": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictPrediction()])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_target_prediction_correlation() -> None:
    test_dataset = pd.DataFrame(
        {
            "category_feature": ["n", "d", "p", "n"],
            "numerical_feature": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite = TestSuite(tests=[TestTargetPredictionCorrelation(gt=0.5, method="cramer_v")])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_median() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite = TestSuite(tests=[TestColumnValueMedian(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite
    suite = TestSuite(tests=[TestColumnValueMedian(column_name="feature1", eq=1.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_std() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestColumnValueStd(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite
    suite = TestSuite(tests=[TestColumnValueStd(column_name="feature1", lt=2)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite
    suite = TestSuite(tests=[TestColumnValueStd(column_name="feature1", gt=2, lt=3)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_unique_number() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="no_existing_feature", eq=4)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite
    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="feature1", lt=2)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite
    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="feature1", eq=4)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_unique_share() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite
    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="feature1", lt=0.5)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite
    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="feature1", eq=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_most_common_value_share() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1")])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="no_existing_feature", eq=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", lt=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", eq=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_most_common_value_share_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 5],
        }
    )
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", eq=0.5)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": (
            "The most common value in the column **feature1** is 1. Its share is 0.5. The test threshold is eq=0.5."
        ),
        "group": "data_quality",
        "name": "Share of the Most Common Value",
        "parameters": {"column_name": "feature1", "condition": {"eq": 0.5}, "value": 0.5},
        "status": "SUCCESS",
    }


def test_data_quality_test_value_in_n_sigmas() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1")])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="not_exist_feature", n_sigmas=3)])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1", n_sigmas=4)])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_value_in_n_sigmas_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 0],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1", n_sigmas=5)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": "The mean value of the column **feature1** is 0.5. The expected range is from -2.4 to 3.4",
        "group": "data_quality",
        "name": "Mean Value Stability",
        "parameters": {
            "column_name": "feature1",
            "current_mean": 0.5,
            "n_sigmas": 5,
            "reference_mean": 0.5,
            "reference_std": 0.58,
        },
        "status": "SUCCESS",
    }


def test_data_quality_test_value_in_range() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3, 4, 20],
            "target": [0, 0, 0, 1, 0, 1],
            "prediction": [0, 0, 1, 1, 0, 1],
        }
    )
    suite = TestSuite(tests=[TestValueRange(column_name="feature1", left=0, right=10)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestValueRange(column_name="feature1", left=0, right=100)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    suite._inner_suite.raise_for_error()
    assert suite

    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 2, 4, 5],
            "target": [0, 0, 0, 1, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestValueRange(column_name="feature1")])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestValueRange(column_name="feature1", right=100)])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_number_of_values_not_in_range() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 2, 3, 4, 15],
            "target": [0, 0, 2, 3, 4, 5, 1],
        }
    )
    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", left=0, right=10, lt=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", left=0, right=10, lte=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert suite

    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 4, 5, 6, 7],
            "target": [0, 0, 0, 1, 0, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", lt=1)])
    suite.run(
        current_data=test_dataset,
        reference_data=reference_dataset,
        column_mapping=ColumnMapping(
            prediction=None,
            numerical_features=["feature1"],
        ),
    )
    assert not suite

    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", lte=1)])
    suite.run(
        current_data=test_dataset,
        reference_data=reference_dataset,
        column_mapping=ColumnMapping(
            prediction=None,
            numerical_features=["feature1"],
        ),
    )
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_share_of_values_not_in_range() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 2, 3, 4, 15],
            "target": [0, 0, 2, 3, 4, 5, 1],
        }
    )
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, lt=0.1)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, lt=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert suite

    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 4, 5, 6, 7],
            "target": [0, 0, 0, 1, 0, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", lt=0.1)])
    suite.run(
        current_data=test_dataset,
        reference_data=reference_dataset,
        column_mapping=ColumnMapping(
            prediction=None,
            numerical_features=["feature1"],
        ),
    )
    assert not suite

    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", lte=0.5)])
    suite.run(
        current_data=test_dataset,
        reference_data=reference_dataset,
        column_mapping=ColumnMapping(
            prediction=None,
            numerical_features=["feature1"],
        ),
    )
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_share_of_values_not_in_range_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 0, 24, 2, 3, 4],
        }
    )
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, gt=0.2)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is False
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": (
            "The share of values out of range in the column **feature1** is 0.125 (1 out of 8)."
            " The test threshold is gt=0.2."
        ),
        "group": "data_quality",
        "name": "Share of Out-of-Range Values",
        "parameters": {"condition": {"gt": 0.2}, "left": 0, "right": 10, "value": 0.125},
        "status": "FAIL",
    }


def test_data_quality_test_value_in_list() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3, 4, 20],
            "target": [0, 0, 0, 1, 0, 1],
            "prediction": [0, 0, 1, 1, 0, 1],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 2, 4, 5],
            "target": [0, 0, 0, 1, 0, 1, 1],
            "prediction": [0, 0, 1, 2, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestValueList(column_name="feature1")])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestValueList(column_name="prediction", values=[0, 1])])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite

    suite = TestSuite(tests=[TestValueList(column_name="target")])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_value_in_list_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "target": [0, 0, 1, 1],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "target": [0, 0, 0, 1],
        }
    )
    suite = TestSuite(tests=[TestValueList(column_name="target")])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": "All values in the column **target** are in the list.",
        "group": "data_quality",
        "name": "Out-of-List Values",
        "parameters": {"column_name": "target", "value": 0, "values": None},
        "status": "SUCCESS",
    }


def test_data_quality_test_number_of_values_not_in_list() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [2, 4, 4, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "feature1": [2, 4, 4, 2],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestNumberOfOutListValues(column_name="feature1", gt=10)])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestNumberOfOutListValues(column_name="feature1", lt=2)])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()


def test_data_quality_test_share_of_values_not_in_list() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1", values=[0], lt=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1", values=[0, 1], lt=0.5)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert suite


def test_data_quality_test_share_of_values_not_in_list_json_render() -> None:
    current_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 10, 20],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
        }
    )

    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1")])
    suite.run(current_data=current_dataset, reference_data=reference_dataset)
    assert not suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is False
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": (
            "The share of values out of list in the column **feature1** is 0.25 (1 out of 4)."
            " The test threshold is eq=0 ± 1e-12."
        ),
        "group": "data_quality",
        "name": "Share of Out-of-List Values",
        "parameters": {
            "condition": {"eq": {"absolute": 1e-12, "relative": 1e-06, "value": 0}},
            "value": 0.25,
            "values": None,
        },
        "status": "FAIL",
    }


def test_data_quality_test_value_quantile() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    suite = TestSuite(tests=[TestColumnQuantile(column_name="feature1", quantile=0.7, lt=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestColumnQuantile(column_name="feature1", quantile=0.2, lt=0.7)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=ColumnMapping())
    suite._inner_suite.raise_for_error()
    assert suite
    assert suite.show()
    assert suite.json()


@pytest.mark.skip("require proper tests case")
def test_data_quality_test_highly_correlated_features() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "feature2": [0, 0, 0, 1],
            "feature3": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestHighlyCorrelatedColumns()])
    suite.run(current_data=test_dataset, reference_data=test_dataset)
    assert suite

    suite = TestSuite(tests=[TestHighlyCorrelatedColumns(gt=1)])
    suite.run(current_data=test_dataset, reference_data=None)
    assert not suite

    suite = TestSuite(tests=[TestHighlyCorrelatedColumns(lt=1)])
    suite.run(current_data=test_dataset, reference_data=None)
    assert suite
    assert suite.show()
    assert suite.json()


@pytest.mark.skip("require proper tests case")
def test_data_quality_test_highly_correlated_features_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "feature2": [0, 2, 3, 4],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestHighlyCorrelatedColumns()])
    suite.run(current_data=test_dataset, reference_data=test_dataset)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": "The maximum correlation is 0.983. The test threshold is eq=0.983 ± 0.0983.",
        "group": "data_quality",
        "name": "Highly Correlated Columns",
        "parameters": {
            "value": 0.983,
            "condition": {"eq": {"absolute": 1e-12, "relative": 0.1, "value": 0.9827076298239908}},
        },
        "status": "SUCCESS",
    }


@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0, 0, 0, 1],
        }
    )
    column_mapping = ColumnMapping(task="regression")

    suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=column_mapping)
    assert suite

    suite = TestSuite(tests=[TestTargetFeaturesCorrelations(gt=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=column_mapping)
    assert not suite

    suite = TestSuite(tests=[TestTargetFeaturesCorrelations(lt=1)])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=column_mapping)
    assert suite
    assert suite.show()
    assert suite.json()


@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation_errors() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "prediction": [0, 0, 0, 1],
        }
    )
    suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    suite.run(current_data=test_dataset, reference_data=test_dataset)
    assert not suite

    assert suite.as_dict()["tests"][0] == {
        "description": "No target in the current dataset",
        "group": "data_quality",
        "name": "Correlation between Target and Features",
        "parameters": {"value": None, "condition": {"lt": 0.9}},
        "status": "ERROR",
    }


@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation_json_render() -> None:
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0.0, 0.0, 0.0, 1.0],
            "prediction": [0.0, 0.0, 0.0, 1.0],
        }
    )
    column_mapping = ColumnMapping(task="regression")
    suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=column_mapping)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    assert test_info == {
        "description": "The maximum correlation is 0.775. The test threshold is eq=0.775 ± 0.0775.",
        "group": "data_quality",
        "name": "Correlation between Target and Features",
        "parameters": {
            "abs_max_target_features_correlation": 0.775,
            "condition": {"eq": {"absolute": 1e-12, "relative": 0.1, "value": pytest_approx(0.775, rel=0.1)}},
        },
        "status": "SUCCESS",
    }


def test_category_count_binary_column():
    # boolean category values must render as "True"/"False" in the description
    df = pd.DataFrame({"a": [True, False]})
    test = TestCategoryCount(column_name="a", category=False, lte=0)
    data_quality = TestSuite(
        tests=[
            test,
        ]
    )

    data_quality.run(reference_data=None, current_data=df)

    assert "False" in test.get_description(0)