Feat: support where filter (#176)
* added support for the where method and tests

* corrected the method

* added tests and changed the flow structure

* corrected a failing test

---------

Co-authored-by: andrzej.nescior <[email protected]>
andreimek and andrzej.nescior authored Dec 6, 2023
1 parent e74e974 commit 0c0e1e4
Showing 2 changed files with 49 additions and 0 deletions.
8 changes: 8 additions & 0 deletions pydeequ/checks.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
from enum import Enum

from py4j.protocol import Py4JError
from pyspark.sql import SparkSession

from pydeequ.check_functions import is_one
@@ -116,6 +117,13 @@ def addConstraint(self, constraint):
self.constraints.append(constraint)
self._Check = constraint._Check

def where(self, filter: str):
try:
self._Check = self._Check.where(filter)
except Py4JError:
raise TypeError(f"Method doesn't exist in {self._Check.getClass()}, class has to be filterable")
return self

def addFilterableContstraint(self, creationFunc):
"""Adds a constraint that can subsequently be replaced with a filtered version
:param creationFunc:
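For readers skimming the diff, here is a minimal usage sketch of the new where() method (not part of the commit). It assumes an existing SparkSession named spark, a DataFrame df, and a hypothetical column status; the Check and VerificationSuite calls are the ones already exposed by this library.

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

# Filter a single constraint: hasSize() counts only the rows matching the predicate.
check = (
    Check(spark, CheckLevel.Warning, "filtered size check")
    .hasSize(lambda size: size == 2)   # assertion on the filtered row count
    .where("status = 'active'")        # raises TypeError if the preceding constraint is not filterable
)

result = VerificationSuite(spark).onData(df).addCheck(check).run()
VerificationResult.checkResultsAsDataFrame(spark, result).show()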
41 changes: 41 additions & 0 deletions tests/test_checks.py
@@ -467,6 +467,12 @@ def hasNumberOfDistinctValues(self, column, assertion, binningUdf, maxBins, hint
df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
return df.select("constraint_status").collect()

def where(self, assertion, filter, hint=None):
check = Check(self.spark, CheckLevel.Warning, "test where")
result = VerificationSuite(self.spark).onData(self.df).addCheck(check.hasSize(assertion, hint).where(filter)).run()
df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
return df.select("constraint_status").collect()

def test_hasSize(self):
self.assertEqual(self.hasSize(lambda x: x == 3.0), [Row(constraint_status="Success")])
self.assertEqual(
@@ -1245,6 +1251,41 @@ def test_fail_isGreaterThanOrEqualTo(self):
)
self.assertEqual(self.isGreaterThanOrEqualTo("h", "f", lambda x: x == 1), [Row(constraint_status="Success")])

def test_where(self):
self.assertEqual(self.where(lambda x: x == 2.0, "boolean='true'", "column 'boolean' has two values true"),
[Row(constraint_status="Success")])
self.assertEqual(
self.where(lambda x: x == 3.0, "d=5", "column 'd' has three values 3"),
[Row(constraint_status="Success")],
)
self.assertEqual(
self.where(lambda x: x == 2.0, "ssn='000-00-0000'", "column 'ssn' has one value 000-00-0000"),
[Row(constraint_status="Failure")],
)
check = Check(self.spark, CheckLevel.Warning, "test where").hasMin("f", lambda x: x == 2, "The f has min value 2 because of the additional filter").where('f>=2')
result = VerificationSuite(self.spark).onData(self.df).addCheck(check.isGreaterThan("e", "h", lambda x: x == 1, "Column H is not smaller than Column E")).run()
df = VerificationResult.checkResultsAsDataFrame(self.spark, result)
self.assertEqual(
df.select("constraint_status").collect(),
[Row(constraint_status="Success"), Row(constraint_status="Failure")],
)
with self.assertRaises(TypeError):
Check(self.spark, CheckLevel.Warning, "test where").kllSketchSatisfies(
"b", lambda x: x.parameters().apply(0) == 1.0, KLLParameters(self.spark, 2, 0.64, 2)
).where("d=5")

@pytest.mark.xfail(reason="@unittest.expectedFailure")
def test_fail_where(self):
self.assertEqual(self.where(lambda x: x == 2.0, "boolean='false'", "column 'boolean' has one value false"),
[Row(constraint_status="Success")])
self.assertEqual(
self.where(lambda x: x == 3.0, "a='bar'", "column 'a' has one value 'bar'"),
[Row(constraint_status="Success")],
)
self.assertEqual(
self.where(lambda x: x == 1.0, "f=1", "column 'f' has one value 1"),
[Row(constraint_status="Failure")],
)
# def test_hasNumberOfDistinctValues(self):
# #Todo: test binningUDf
# self.assertEqual(self.hasNumberOfDistinctValues('b', lambda x: x == 3, None, 3, "Column B has 3 distinct values"),
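As the hasMin/isGreaterThan assertions in test_where illustrate, where() scopes the filter to the constraint added immediately before it; constraints added afterwards still evaluate over all rows. A hedged sketch of that behaviour (again assuming a hypothetical spark and df, with the column names "f", "e", "h" borrowed from the test fixture):

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

check = (
    Check(spark, CheckLevel.Warning, "scoped filter")
    .hasMin("f", lambda x: x == 2, "min of f over the filtered rows")
    .where("f >= 2")                            # applies to hasMin only
    .isGreaterThan("e", "h", lambda x: x == 1)  # evaluated on all rows
)

result = VerificationSuite(spark).onData(df).addCheck(check).run()
VerificationResult.checkResultsAsDataFrame(spark, result).show()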
