Skip to content

Commit

Permalink
feat: add how="any","all" to df.dropna, dropinf, etc (#2104)
Browse files Browse the repository at this point in the history
* Make df.dropna and friends safer

* Support how="any","all" in df.dropna, dropinf, etc

See #2084

* Fixup grammar

Co-authored-by: Jovan Veljanoski <[email protected]>

* Fixup: Apply suggestions from review

* Use array_factory in dropna tests

Now we test against numpy, arrow, and chunked arrow arrays.

Co-authored-by: Jovan Veljanoski <[email protected]>
  • Loading branch information
NickCrews and JovanVeljanoski authored Jun 27, 2022
1 parent 6d2b774 commit 35c250d
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 21 deletions.
41 changes: 30 additions & 11 deletions packages/vaex-core/vaex/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5036,42 +5036,61 @@ def create(current):
return selections.SelectionDropNa(drop_nan, drop_masked, column_names, current, mode)
self._selection(create, name)

def dropmissing(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using ismissing.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are missing.
        If "all", then drop rows where all of the columns are missing.
    :raises ValueError: If `how` is neither "any" nor "all".
    :rtype: DataFrame
    """
    # Column defaulting and validation of `how` are handled by _filter_all.
    return self._filter_all(self.func.ismissing, column_names, how=how)

def dropnan(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isnan.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are nan.
        If "all", then drop rows where all of the columns are nan.
    :raises ValueError: If `how` is neither "any" nor "all".
    :rtype: DataFrame
    """
    # Column defaulting and validation of `how` are handled by _filter_all.
    return self._filter_all(self.func.isnan, column_names, how=how)

def dropna(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isna.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are na.
        If "all", then drop rows where all of the columns are na.
    :raises ValueError: If `how` is neither "any" nor "all".
    :rtype: DataFrame
    """
    # Column defaulting and validation of `how` are handled by _filter_all.
    return self._filter_all(self.func.isna, column_names, how=how)

def dropinf(self, column_names=None, how="any"):
    """Create a shallow copy of a DataFrame, with filtering set using isinf.

    :param column_names: The columns to consider, default: all (real, non-virtual) columns
    :param str how: One of ("any", "all").
        If "any", then drop rows where any of the columns are inf.
        If "all", then drop rows where all of the columns are inf.
    :raises ValueError: If `how` is neither "any" nor "all".
    :rtype: DataFrame
    """
    # Column defaulting and validation of `how` are handled by _filter_all.
    return self._filter_all(self.func.isinf, column_names, how=how)

def _filter_all(self, f, column_names=None):
column_names = column_names or self.get_column_names(virtual=False)
def _filter_all(self, f, column_names=None, how="any"):
if column_names is None:
column_names = self.get_column_names(virtual=False)
if how not in ("any", "all"):
raise ValueError("`how` must be either 'any' or 'all'")
expression = f(self[column_names[0]])
for column in column_names[1:]:
expression = expression | f(self[column])
if how == "any":
expression = expression | f(self[column])
else:
expression = expression & f(self[column])
return self.filter(~expression, mode='and')

def select_nothing(self, name="default"):
Expand Down
32 changes: 22 additions & 10 deletions tests/dropna_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,28 @@ def test_dropna():
assert (df.s.dropna().tolist() == ["aap", "noot", "mies"])
assert (df.o.dropna().tolist() == ["aap", "noot"])


def test_dropna_all_columns():
    """dropna() with no arguments considers every column of the frame."""
    df = vaex.from_arrays(
        x=[1, 2, 3, 4, 5],
        y=['dog', 'dog', None, 'cat', None],
    )

    # Rows 3 and 5 carry a missing `y`, so only rows 1, 2 and 4 survive.
    cleaned = df.dropna()
    assert cleaned.x.tolist() == [1, 2, 4]
    assert cleaned.y.tolist() == ['dog', 'dog', 'cat']

@pytest.fixture
def df_with_missings(array_factory1, array_factory2):
    """Two-column DataFrame: a float column `nan` with NaNs and a string column `na` with Nones."""
    # Each column comes from its own factory, so the frame may mix
    # numpy-backed and arrow-backed arrays.
    float_column = array_factory1([1.1, np.nan, np.nan, 4.4, 5.5])
    string_column = array_factory2(['dog', 'dog', None, 'cat', None])
    return vaex.from_arrays(nan=float_column, na=string_column)

def test_dropna_all_columns(df_with_missings):
    """Check dropna() over all columns for the default, how="any", how="all" and an invalid `how`."""
    source = df_with_missings

    # The default must behave exactly like an explicit how="any".
    default_result = source.dropna()
    explicit_any = source.dropna(how="any")
    for result in (default_result, explicit_any):
        assert result.nan.tolist() == [1.1, 4.4]
        assert result.na.tolist() == ['dog', 'cat']

    # how="all" removes only the row where both columns are na (the third one).
    # fillna(99) replaces the remaining NaN so the list can be compared with ==.
    fully_na_removed = source.dropna(how="all")
    assert fully_na_removed.nan.fillna(99).tolist() == [1.1, 99, 4.4, 5.5]
    assert fully_na_removed.na.tolist() == ['dog', 'dog', 'cat', None]

    # Anything other than "any"/"all" is rejected.
    with pytest.raises(ValueError):
        source.dropna(how="invalid")

def test_dropna_string_columns():
data_dict = {'10': [1, 2, np.nan],
Expand Down

0 comments on commit 35c250d

Please sign in to comment.