# tests/test_data_quality_tests.py
  1  import json
  2  
  3  import pandas as pd
  4  import pytest
  5  from pytest import approx as pytest_approx
  6  
  7  from evidently.legacy.pipeline.column_mapping import ColumnMapping
  8  from evidently.legacy.test_suite import TestSuite
  9  from evidently.legacy.tests import TestCategoryCount
 10  from evidently.legacy.tests import TestColumnQuantile
 11  from evidently.legacy.tests import TestColumnValueMax
 12  from evidently.legacy.tests import TestColumnValueMean
 13  from evidently.legacy.tests import TestColumnValueMedian
 14  from evidently.legacy.tests import TestColumnValueMin
 15  from evidently.legacy.tests import TestColumnValueStd
 16  from evidently.legacy.tests import TestConflictPrediction
 17  from evidently.legacy.tests import TestConflictTarget
 18  from evidently.legacy.tests import TestHighlyCorrelatedColumns
 19  from evidently.legacy.tests import TestMeanInNSigmas
 20  from evidently.legacy.tests import TestMostCommonValueShare
 21  from evidently.legacy.tests import TestNumberOfOutListValues
 22  from evidently.legacy.tests import TestNumberOfOutRangeValues
 23  from evidently.legacy.tests import TestNumberOfUniqueValues
 24  from evidently.legacy.tests import TestShareOfOutListValues
 25  from evidently.legacy.tests import TestShareOfOutRangeValues
 26  from evidently.legacy.tests import TestTargetFeaturesCorrelations
 27  from evidently.legacy.tests import TestTargetPredictionCorrelation
 28  from evidently.legacy.tests import TestUniqueValuesShare
 29  from evidently.legacy.tests import TestValueList
 30  from evidently.legacy.tests import TestValueRange
 31  from evidently.legacy.tests.base_test import TestStatus
 32  from evidently.legacy.tests.utils import approx
 33  
 34  
 35  @pytest.mark.parametrize(
 36      "test_dataset, reference_dataset, test_object, expected_success",
 37      (
 38          (
 39              pd.DataFrame(
 40                  {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
 41              ),
 42              None,
 43              TestColumnValueMin(column_name="numerical_feature", gte=10),
 44              False,
 45          ),
 46          (
 47              pd.DataFrame(
 48                  {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
 49              ),
 50              None,
 51              TestColumnValueMin(column_name="numerical_feature", eq=0),
 52              True,
 53          ),
 54          (
 55              pd.DataFrame(
 56                  {
 57                      "category_feature": ["n", "d", "p", "n"],
 58                      "numerical_feature": [0.4, 0.1, -1.45, 5],
 59                      "target": [0, 0, 0, 1],
 60                  }
 61              ),
 62              None,
 63              TestColumnValueMin(column_name="numerical_feature", eq=approx(-1, absolute=0.5)),
 64              True,
 65          ),
 66          (
 67              pd.DataFrame(
 68                  {
 69                      "category_feature": ["n", "d", "p", "n"],
 70                      "numerical_feature": [10, 7, 5.1, 4.9],
 71                      "target": [0, 0, 0, 1],
 72                  }
 73              ),
 74              None,
 75              TestColumnValueMin(column_name="numerical_feature", lt=approx(10, relative=0.5)),
 76              True,
 77          ),
 78          (
 79              pd.DataFrame(
 80                  {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [10, 7, 5.1, 5], "target": [0, 0, 0, 1]}
 81              ),
 82              None,
 83              TestColumnValueMin(column_name="numerical_feature", lt=approx(10, relative=0.5)),
 84              False,
 85          ),
 86      ),
 87  )
 88  def test_data_quality_test_min(
 89      test_dataset: pd.DataFrame, reference_dataset: pd.DataFrame, test_object: TestColumnValueMin, expected_success: bool
 90  ) -> None:
 91      suite = TestSuite(tests=[test_object])
 92      mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
 93      suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=mapping)
 94      if expected_success:
 95          suite._inner_suite.raise_for_error()
 96      assert bool(suite) is expected_success
 97  
 98  
 99  @pytest.mark.parametrize(
100      "test_dataset, reference_dataset, test_object, expected_success",
101      (
102          (
103              pd.DataFrame(
104                  {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
105              ),
106              None,
107              TestColumnValueMin(column_name="numerical_feature"),
108              False,
109          ),
110      ),
111  )
112  def test_data_quality_test_min_exception(
113      test_dataset: pd.DataFrame, reference_dataset: pd.DataFrame, test_object: TestColumnValueMin, expected_success: bool
114  ) -> None:
115      suite = TestSuite(tests=[test_object])
116      suite.run(current_data=test_dataset, reference_data=reference_dataset)
117      assert suite.as_dict()["tests"][0]["status"] == TestStatus.ERROR.value
118  
119  
def test_data_quality_test_min_render():
    """Smoke-check HTML and JSON rendering of TestColumnValueMin."""
    dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    mapping = ColumnMapping(numerical_features=["numerical_feature"])

    # explicit condition, no reference data
    suite = TestSuite(tests=[TestColumnValueMin(column_name="numerical_feature", eq=0)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite.show()
    assert suite.json()

    # no explicit condition; reference data supplied instead
    suite = TestSuite(tests=[TestColumnValueMin(column_name="numerical_feature")])
    suite.run(current_data=dataset, reference_data=dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()
133  
134  
def test_data_quality_test_max() -> None:
    """TestColumnValueMax: gt=10 fails (max is 5), eq=5 passes."""
    dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])

    failing = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", gt=10)])
    failing.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not failing

    passing = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", eq=5)])
    passing.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert passing
147  
148  
def test_data_quality_test_max_render():
    """Smoke-check HTML and JSON rendering of TestColumnValueMax."""
    dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    mapping = ColumnMapping(numerical_features=["numerical_feature"])

    # explicit condition, no reference data
    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature", eq=0)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite.show()
    assert suite.json()

    # no explicit condition; reference data supplied instead
    suite = TestSuite(tests=[TestColumnValueMax(column_name="numerical_feature")])
    suite.run(current_data=dataset, reference_data=dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()
161  
162  
def test_data_quality_test_mean() -> None:
    """TestColumnValueMean against the mean of [0, 1, 2, 5], which is 2."""
    dataset = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])

    # mean is 2, not 5
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    # 0 < 2 < 10
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", gt=0, lt=10)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite

    # exact match
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=2)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
179  
180  
def test_data_quality_test_mean_render():
    """Smoke-check HTML and JSON rendering of TestColumnValueMean."""
    dataset = pd.DataFrame({"numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]})
    mapping = ColumnMapping(numerical_features=["numerical_feature"])

    # explicit condition, no reference data
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature", eq=0)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite.show()
    assert suite.json()

    # no explicit condition; reference data supplied instead
    suite = TestSuite(tests=[TestColumnValueMean(column_name="numerical_feature")])
    suite.run(current_data=dataset, reference_data=dataset, column_mapping=mapping)
    assert suite.show()
    assert suite.json()
193  
194  
def test_data_quality_test_conflict_target() -> None:
    """TestConflictTarget: fails when identical feature rows carry different targets."""
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])

    # rows 0 and 1 share feature values ("n", 0) but disagree on target
    conflicting = pd.DataFrame(
        {"category_feature": ["n", "n", "p", "n"], "numerical_feature": [0, 0, 2, 5], "target": [0, 1, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictTarget()])
    suite.run(current_data=conflicting, reference_data=None, column_mapping=mapping)
    assert not suite

    # all feature rows are distinct, so no conflict is possible
    clean = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "target": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictTarget()])
    suite.run(current_data=clean, reference_data=None, column_mapping=mapping)
    suite._inner_suite.raise_for_error()
    assert suite
    assert suite.show()
    assert suite.json()
213  
214  
def test_data_quality_test_conflict_prediction() -> None:
    """TestConflictPrediction: fails when identical feature rows carry different predictions."""
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])

    # rows 0 and 1 share feature values ("n", 0) but disagree on prediction
    conflicting = pd.DataFrame(
        {"category_feature": ["n", "n", "p", "n"], "numerical_feature": [0, 0, 2, 5], "prediction": [0, 1, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictPrediction()])
    suite.run(current_data=conflicting, reference_data=None, column_mapping=mapping)
    suite._inner_suite.raise_for_error()
    assert not suite

    # all feature rows are distinct, so no conflict is possible
    clean = pd.DataFrame(
        {"category_feature": ["n", "d", "p", "n"], "numerical_feature": [0, 1, 2, 5], "prediction": [0, 0, 0, 1]}
    )
    suite = TestSuite(tests=[TestConflictPrediction()])
    suite.run(current_data=clean, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
233  
234  
def test_data_quality_test_target_prediction_correlation() -> None:
    """TestTargetPredictionCorrelation (cramer_v) passes with gt=0.5 on this sample."""
    dataset = pd.DataFrame(
        {
            "category_feature": ["n", "d", "p", "n"],
            "numerical_feature": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(categorical_features=["category_feature"], numerical_features=["numerical_feature"])
    suite = TestSuite(tests=[TestTargetPredictionCorrelation(gt=0.5, method="cramer_v")])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
250  
251  
def test_data_quality_test_median() -> None:
    """TestColumnValueMedian: missing column fails; median of [0, 1, 2, 5] is 1.5."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(numerical_features=["feature1"])

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestColumnValueMedian(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestColumnValueMedian(column_name="feature1", eq=1.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
269  
270  
def test_data_quality_test_std() -> None:
    """TestColumnValueStd: the std of feature1 lies strictly between 2 and 3."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestColumnValueStd(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    mapping = ColumnMapping(numerical_features=["feature1"])

    # std exceeds 2, so lt=2 fails
    suite = TestSuite(tests=[TestColumnValueStd(column_name="feature1", lt=2)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    # std falls inside (2, 3)
    suite = TestSuite(tests=[TestColumnValueStd(column_name="feature1", gt=2, lt=3)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
291  
292  
def test_data_quality_test_unique_number() -> None:
    """TestNumberOfUniqueValues: feature1 has exactly 4 distinct values."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="no_existing_feature", eq=4)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    mapping = ColumnMapping(numerical_features=["feature1"])

    # 4 unique values, so lt=2 fails
    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="feature1", lt=2)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestNumberOfUniqueValues(column_name="feature1", eq=4)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
313  
314  
def test_data_quality_test_unique_share() -> None:
    """TestUniqueValuesShare: all 4 values of feature1 are distinct, so the share is 1."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="no_existing_feature", eq=1.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    mapping = ColumnMapping(numerical_features=["feature1"])

    # share is 1, so lt=0.5 fails
    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="feature1", lt=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestUniqueValuesShare(column_name="feature1", eq=1)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
335  
336  
def test_data_quality_test_most_common_value_share() -> None:
    """TestMostCommonValueShare: the most common value of [0, 1, 1, 5] has share 0.5."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 5],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(numerical_features=["feature1"])

    # default condition with reference data passes (current equals reference)
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1")])
    suite.run(current_data=dataset, reference_data=dataset, column_mapping=mapping)
    assert suite

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="no_existing_feature", eq=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    # share is exactly 0.5, so strict lt=0.5 fails
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", lt=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", eq=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
360  
361  
def test_data_quality_test_most_common_value_share_json_render() -> None:
    """Pin the exact JSON payload produced for a passing TestMostCommonValueShare."""
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 5],
        }
    )
    suite = TestSuite(tests=[TestMostCommonValueShare(column_name="feature1", eq=0.5)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    # full structural match on description, group, name, parameters and status
    assert test_info == {
        "description": (
            "The most common value in the column **feature1** is 1. Its share is 0.5. The test threshold is eq=0.5."
        ),
        "group": "data_quality",
        "name": "Share of the Most Common Value",
        "parameters": {"column_name": "feature1", "condition": {"eq": 0.5}, "value": 0.5},
        "status": "SUCCESS",
    }
385  
386  
def test_data_quality_test_value_in_n_sigmas() -> None:
    """TestMeanInNSigmas: the shifted current mean only passes with a wider n_sigmas band."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    reference = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    mapping = ColumnMapping(numerical_features=["feature1"])

    # default band is too narrow for the shifted current mean
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1")])
    suite.run(current_data=current, reference_data=reference, column_mapping=mapping)
    assert not suite

    # unknown column -> suite is not successful
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="not_exist_feature", n_sigmas=3)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert not suite

    # 4 sigmas is wide enough
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1", n_sigmas=4)])
    suite.run(current_data=current, reference_data=reference, column_mapping=mapping)
    assert suite
    assert suite.show()
    assert suite.json()
416  
417  
def test_data_quality_test_value_in_n_sigmas_json_render() -> None:
    """Pin the exact JSON payload produced for a passing TestMeanInNSigmas."""
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 0],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    # current == reference, so the mean trivially stays within 5 sigmas
    suite = TestSuite(tests=[TestMeanInNSigmas(column_name="feature1", n_sigmas=5)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=test_dataset, column_mapping=mapping)
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    # full structural match, including the rounded reference std (0.58)
    assert test_info == {
        "description": "The mean value of the column **feature1** is 0.5. The expected range is from -2.4 to 3.4",
        "group": "data_quality",
        "name": "Mean Value Stability",
        "parameters": {
            "column_name": "feature1",
            "current_mean": 0.5,
            "n_sigmas": 5,
            "reference_mean": 0.5,
            "reference_std": 0.58,
        },
        "status": "SUCCESS",
    }
447  
448  
def test_data_quality_test_value_in_range() -> None:
    """TestValueRange with explicit bounds and with bounds taken from reference data."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3, 4, 20],
            "target": [0, 0, 0, 1, 0, 1],
            "prediction": [0, 0, 1, 1, 0, 1],
        }
    )

    # 20 falls outside [0, 10]
    suite = TestSuite(tests=[TestValueRange(column_name="feature1", left=0, right=10)])
    suite.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    # all values fit into [0, 100]; raise_for_error surfaces any execution error
    suite = TestSuite(tests=[TestValueRange(column_name="feature1", left=0, right=100)])
    suite.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    suite._inner_suite.raise_for_error()
    assert suite

    reference = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 2, 4, 5],
            "target": [0, 0, 0, 1, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 1, 1],
        }
    )

    # bounds derived from reference (max 5) do not cover the current value 20
    suite = TestSuite(tests=[TestValueRange(column_name="feature1")])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert not suite

    # widening the right bound to 100 makes the test pass
    suite = TestSuite(tests=[TestValueRange(column_name="feature1", right=100)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()
482  
483  
def test_data_quality_test_number_of_values_not_in_range() -> None:
    """TestNumberOfOutRangeValues: exactly one current value (15) is out of range."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 2, 3, 4, 15],
            "target": [0, 0, 2, 3, 4, 5, 1],
        }
    )

    # one out-of-range value, so strict lt=1 fails but lte=1 passes
    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", left=0, right=10, lt=1)])
    suite.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", left=0, right=10, lte=1)])
    suite.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    assert suite

    reference = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 4, 5, 6, 7],
            "target": [0, 0, 0, 1, 0, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 0, 1, 1],
        }
    )
    # same mapping for both reference-based runs
    ref_mapping = ColumnMapping(
        prediction=None,
        numerical_features=["feature1"],
    )

    # range taken from reference data: 15 is still the only outlier
    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", lt=1)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ref_mapping)
    assert not suite

    suite = TestSuite(tests=[TestNumberOfOutRangeValues(column_name="feature1", lte=1)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ref_mapping)
    assert suite
    assert suite.show()
    assert suite.json()
529  
530  
def test_data_quality_test_share_of_values_not_in_range() -> None:
    """TestShareOfOutRangeValues: 1 of 7 current values (15) is out of range."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 2, 3, 4, 15],
            "target": [0, 0, 2, 3, 4, 5, 1],
        }
    )
    mapping = ColumnMapping(numerical_features=["feature1"])

    # share is 1/7, which is above 0.1 but below 0.5
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, lt=0.1)])
    suite.run(current_data=current, reference_data=None, column_mapping=mapping)
    assert not suite

    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, lt=0.5)])
    suite.run(current_data=current, reference_data=None, column_mapping=mapping)
    assert suite

    reference = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 4, 5, 6, 7],
            "target": [0, 0, 0, 1, 0, 0, 1, 1],
            "prediction": [0, 0, 1, 1, 0, 0, 1, 1],
        }
    )
    # same mapping for both reference-based runs
    ref_mapping = ColumnMapping(
        prediction=None,
        numerical_features=["feature1"],
    )

    # range taken from reference data: 15 is still out of range
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", lt=0.1)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ref_mapping)
    assert not suite

    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", lte=0.5)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ref_mapping)
    assert suite
    assert suite.show()
    assert suite.json()
577  
578  
def test_data_quality_test_share_of_values_not_in_range_json_render() -> None:
    """Pin the exact JSON payload produced for a failing TestShareOfOutRangeValues."""
    test_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 0, 24, 2, 3, 4],
        }
    )
    # only 24 is outside [0, 10]: share 1/8 = 0.125, so gt=0.2 fails
    suite = TestSuite(tests=[TestShareOfOutRangeValues(column_name="feature1", left=0, right=10, gt=0.2)])
    mapping = ColumnMapping(numerical_features=["feature1"])
    suite.run(current_data=test_dataset, reference_data=None, column_mapping=mapping)
    assert not suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is False
    test_info = result_from_json["tests"][0]
    # full structural match on the failing test's serialized form
    assert test_info == {
        "description": (
            "The share of values out of range in the column **feature1** is 0.125 (1 out of 8)."
            "  The test threshold is gt=0.2."
        ),
        "group": "data_quality",
        "name": "Share of Out-of-Range Values",
        "parameters": {"condition": {"gt": 0.2}, "left": 0, "right": 10, "value": 0.125},
        "status": "FAIL",
    }
603  
604  
def test_data_quality_test_value_in_list() -> None:
    """TestValueList with an explicit value list and with lists taken from reference data."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3, 4, 20],
            "target": [0, 0, 0, 1, 0, 1],
            "prediction": [0, 0, 1, 1, 0, 1],
        }
    )
    reference = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 3, 2, 4, 5],
            "target": [0, 0, 0, 1, 0, 1, 1],
            "prediction": [0, 0, 1, 2, 0, 1, 1],
        }
    )

    # current feature1 contains 20, which the reference column never has
    suite = TestSuite(tests=[TestValueList(column_name="feature1")])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert not suite

    # explicit list covers every current prediction value
    suite = TestSuite(tests=[TestValueList(column_name="prediction", values=[0, 1])])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert suite

    # current target values all occur in the reference target column
    suite = TestSuite(tests=[TestValueList(column_name="target")])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()
633  
634  
def test_data_quality_test_value_in_list_json_render() -> None:
    """Pin the exact JSON payload produced for a passing TestValueList."""
    test_dataset = pd.DataFrame(
        {
            "target": [0, 0, 1, 1],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "target": [0, 0, 0, 1],
        }
    )
    # current target values {0, 1} all occur in the reference target column
    suite = TestSuite(tests=[TestValueList(column_name="target")])
    suite.run(current_data=test_dataset, reference_data=reference_dataset, column_mapping=ColumnMapping())
    assert suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is True
    test_info = result_from_json["tests"][0]
    # full structural match; "values": None means the list came from reference data
    assert test_info == {
        "description": "All values in the column **target** are in the list.",
        "group": "data_quality",
        "name": "Out-of-List Values",
        "parameters": {"column_name": "target", "value": 0, "values": None},
        "status": "SUCCESS",
    }
660  
661  
def test_data_quality_test_number_of_values_not_in_list() -> None:
    """TestNumberOfOutListValues: one current value (20) is absent from the reference list."""
    current = pd.DataFrame(
        {
            "feature1": [2, 4, 4, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    reference = pd.DataFrame(
        {
            "feature1": [2, 4, 4, 2],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # only one out-of-list value, so gt=10 fails
    suite = TestSuite(tests=[TestNumberOfOutListValues(column_name="feature1", gt=10)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert not suite

    # one out-of-list value satisfies lt=2
    suite = TestSuite(tests=[TestNumberOfOutListValues(column_name="feature1", lt=2)])
    suite.run(current_data=current, reference_data=reference, column_mapping=ColumnMapping())
    assert suite
    assert suite.show()
    assert suite.json()
686  
687  
def test_data_quality_test_share_of_values_not_in_list() -> None:
    """TestShareOfOutListValues with explicit value lists against [0, 1, 1, 20]."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # values=[0]: 3 of 4 entries are out of list, so lt=0.5 fails
    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1", values=[0], lt=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert not suite

    # values=[0, 1]: only 20 is out of list (share 0.25), so lt=0.5 passes
    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1", values=[0, 1], lt=0.5)])
    suite.run(current_data=dataset, reference_data=None, column_mapping=ColumnMapping())
    assert suite
704  
705  
def test_data_quality_test_share_of_values_not_in_list_json_render() -> None:
    """Pin the exact JSON payload produced for a failing TestShareOfOutListValues."""
    current_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 10, 20],
        }
    )
    reference_dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 1, 20],
        }
    )

    # 10 is absent from the reference column; default condition (eq=0 with tolerance) fails
    suite = TestSuite(tests=[TestShareOfOutListValues(column_name="feature1")])
    suite.run(current_data=current_dataset, reference_data=reference_dataset)
    assert not suite

    result_from_json = json.loads(suite.json())
    assert result_from_json["summary"]["all_passed"] is False
    test_info = result_from_json["tests"][0]
    # full structural match, including the serialized approx condition (eq=0 ± 1e-12)
    assert test_info == {
        "description": (
            "The share of values out of list in the column **feature1** is 0.25 (1 out of 4)."
            " The test threshold is eq=0 ± 1e-12."
        ),
        "group": "data_quality",
        "name": "Share of Out-of-List Values",
        "parameters": {
            "condition": {"eq": {"absolute": 1e-12, "relative": 1e-06, "value": 0}},
            "value": 0.25,
            "values": None,
        },
        "status": "FAIL",
    }
739  
740  
def test_data_quality_test_value_quantile() -> None:
    """Quantile test: compares a column quantile against an upper bound."""
    current = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )

    # High quantile of [0, 1, 2, 3] exceeds the lt=1 bound -> suite fails.
    failing = TestSuite(tests=[TestColumnQuantile(column_name="feature1", quantile=0.7, lt=1)])
    failing.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    assert not failing

    # Low quantile stays under lt=0.7 -> suite passes and renders without errors.
    passing = TestSuite(tests=[TestColumnQuantile(column_name="feature1", quantile=0.2, lt=0.7)])
    passing.run(current_data=current, reference_data=None, column_mapping=ColumnMapping())
    passing._inner_suite.raise_for_error()  # surface any swallowed execution errors
    assert passing
    assert passing.show()
    assert passing.json()
760  
761  
@pytest.mark.skip("require proper tests case")
def test_data_quality_test_highly_correlated_features() -> None:
    """TestHighlyCorrelatedColumns: default condition plus explicit gt/lt thresholds."""
    features = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "feature2": [0, 0, 0, 1],
            "feature3": [0, 0, 1, 1],
        }
    )

    # Default condition is derived from the reference data -> passes.
    default_suite = TestSuite(tests=[TestHighlyCorrelatedColumns()])
    default_suite.run(current_data=features, reference_data=features)
    assert default_suite

    # No correlation can exceed 1 -> gt=1 always fails.
    gt_suite = TestSuite(tests=[TestHighlyCorrelatedColumns(gt=1)])
    gt_suite.run(current_data=features, reference_data=None)
    assert not gt_suite

    lt_suite = TestSuite(tests=[TestHighlyCorrelatedColumns(lt=1)])
    lt_suite.run(current_data=features, reference_data=None)
    assert lt_suite
    assert lt_suite.show()
    assert lt_suite.json()
784  
785  
@pytest.mark.skip("require proper tests case")
def test_data_quality_test_highly_correlated_features_json_render() -> None:
    """JSON rendering of TestHighlyCorrelatedColumns with a reference-derived condition."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "feature2": [0, 2, 3, 4],
            "target": [0, 0, 0, 1],
            "prediction": [0, 0, 1, 1],
        }
    )
    suite = TestSuite(tests=[TestHighlyCorrelatedColumns()])
    suite.run(current_data=dataset, reference_data=dataset)
    assert suite

    payload = json.loads(suite.json())
    assert payload["summary"]["all_passed"] is True

    # Check the serialized record field-for-field.
    assert payload["tests"][0] == {
        "description": "The maximum correlation is 0.983. The test threshold is eq=0.983 ± 0.0983.",
        "group": "data_quality",
        "name": "Highly Correlated Columns",
        "parameters": {
            "value": 0.983,
            "condition": {"eq": {"absolute": 1e-12, "relative": 0.1, "value": 0.9827076298239908}},
        },
        "status": "SUCCESS",
    }
813  
814  
@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation() -> None:
    """TestTargetFeaturesCorrelations on a regression task: default, gt and lt conditions."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0, 0, 0, 1],
        }
    )
    mapping = ColumnMapping(task="regression")

    # Default condition derived from the reference -> passes.
    default_suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    default_suite.run(current_data=dataset, reference_data=dataset, column_mapping=mapping)
    assert default_suite

    # No correlation can exceed 1 -> gt=1 always fails.
    gt_suite = TestSuite(tests=[TestTargetFeaturesCorrelations(gt=1)])
    gt_suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert not gt_suite

    lt_suite = TestSuite(tests=[TestTargetFeaturesCorrelations(lt=1)])
    lt_suite.run(current_data=dataset, reference_data=None, column_mapping=mapping)
    assert lt_suite
    assert lt_suite.show()
    assert lt_suite.json()
838  
839  
@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation_errors() -> None:
    """Without a target column the correlation test must report ERROR, not FAIL."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "prediction": [0, 0, 0, 1],
        }
    )
    suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    suite.run(current_data=dataset, reference_data=dataset)
    assert not suite

    # The missing target surfaces as an ERROR status with an explanatory description.
    first_result = suite.as_dict()["tests"][0]
    assert first_result == {
        "description": "No target in the current dataset",
        "group": "data_quality",
        "name": "Correlation between Target and Features",
        "parameters": {"value": None, "condition": {"lt": 0.9}},
        "status": "ERROR",
    }
859  
860  
@pytest.mark.skip("require proper tests case")
def test_data_quality_test_target_features_correlation_json_render() -> None:
    """JSON rendering of TestTargetFeaturesCorrelations for a regression task."""
    dataset = pd.DataFrame(
        {
            "feature1": [0, 1, 2, 3],
            "target": [0.0, 0.0, 0.0, 1.0],
            "prediction": [0.0, 0.0, 0.0, 1.0],
        }
    )
    suite = TestSuite(tests=[TestTargetFeaturesCorrelations()])
    suite.run(current_data=dataset, reference_data=dataset, column_mapping=ColumnMapping(task="regression"))
    assert suite

    payload = json.loads(suite.json())
    assert payload["summary"]["all_passed"] is True

    # Compare the serialized record; the correlation value is float-approximate.
    assert payload["tests"][0] == {
        "description": "The maximum correlation is 0.775. The test threshold is eq=0.775 ± 0.0775.",
        "group": "data_quality",
        "name": "Correlation between Target and Features",
        "parameters": {
            "abs_max_target_features_correlation": 0.775,
            "condition": {"eq": {"absolute": 1e-12, "relative": 0.1, "value": pytest_approx(0.775, rel=0.1)}},
        },
        "status": "SUCCESS",
    }
888  
889  
def test_category_count_binary_column() -> None:
    """The rendered description for a boolean category must contain its string form ("False")."""
    df = pd.DataFrame({"a": [True, False]})
    # lte=0: at most zero occurrences of the False category are allowed.
    test = TestCategoryCount(column_name="a", category=False, lte=0)
    data_quality = TestSuite(
        tests=[
            test,
        ]
    )

    data_quality.run(reference_data=None, current_data=df)

    # The boolean category value should appear verbatim in the description text.
    assert "False" in test.get_description(0)