Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get encoded columns from original via Dataset and gradually adapt feature_fix #101

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions src/trousse/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pandas as pd
from joblib import Parallel, delayed

from . import feature_operations as fop
from .exceptions import MultipleObjectsInFileError, NotShelveFileError
from .operations_list import OperationsList
from .settings import CATEG_COL_THRESHOLD
Expand All @@ -22,6 +23,7 @@
if typing.TYPE_CHECKING: # pragma: no cover
from .feature_operations import FeatureOperation


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -490,6 +492,33 @@ def operations_history(self) -> OperationsList:
"""
return self._operations_history

def encoded_columns_from_original(self, column: str) -> List[str]:
    """Return the names of the columns that encode ``column``.

    Only ``OrdinalEncoder`` and ``OneHotEncoder`` operations applied on
    ``column`` are considered. For encodings performed in place (i.e. with
    no derived columns) the operation's own columns are reported, because
    those are the ones holding the encoded values.

    Parameters
    ----------
    column : str
        Name of the original column.

    Returns
    -------
    List[str]
        Names of the encoded columns generated from ``column``.
    """
    encoding_ops = self._operations_history.operations_from_original_column(
        column, [fop.OrdinalEncoder, fop.OneHotEncoder]
    )

    result: List[str] = []
    for op in encoding_ops:
        # In-place encodings (derived_columns is None) overwrite the
        # operation's own columns, so those are the encoded ones.
        target = op.columns if op.derived_columns is None else op.derived_columns
        result.extend(target)

    return result

def _get_categorical_cols(self, col_list: Tuple[str]) -> Set[str]:
"""
Identify every categorical column in dataset.
Expand Down
83 changes: 28 additions & 55 deletions src/trousse/feature_fix.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from typing import Any, Tuple

import numpy as np
import pandas as pd

from .dataset import Dataset, copy_dataset_with_new_df
from .dataset import Dataset
from .feature_operations import FeatureOperation, OneHotEncoder, OrdinalEncoder

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -187,10 +186,9 @@ def combine_categorical_columns_to_one(


def _one_hot_encode_column(
    dataset: Dataset,
    column: str,
    drop_one_new_column: bool = True,
):
    """One-hot encode ``column`` of ``dataset``.

    Parameters
    ----------
    dataset : Dataset
        Dataset containing the column to encode.
    column : str
        Name of the column to one-hot encode.
    drop_one_new_column : bool, optional
        Kept for backward compatibility with existing callers; currently
        unused by this implementation. Default is True.

    Returns
    -------
    Dataset
        New Dataset instance with the one-hot encoded columns added.
    List[str]
        Sorted names of the newly created (derived) columns.
    """
    one_hot_encoder = OneHotEncoder(columns=[column], derived_column_suffix="_enc")

    encoded_dataset = one_hot_encoder(dataset)

    # The derived columns are exactly those present after encoding
    # but absent before it.
    derived_columns = sorted(
        set(encoded_dataset.data.columns) - set(dataset.data.columns)
    )
    return encoded_dataset, derived_columns


def _ordinal_encode_column(
    dataset: Dataset,
    column: str,
):
    """Ordinal encode ``column`` of ``dataset``.

    The encoded values are stored in a new column named ``"<column>_enc"``.

    Parameters
    ----------
    dataset : Dataset
        Dataset containing the column to encode.
    column : str
        Name of the column to ordinal encode.

    Returns
    -------
    Dataset
        New Dataset instance with the encoded column added.
    List[str]
        Single-element list with the name of the derived column.
    """
    derived_column = f"{column}_enc"
    ordinal_encoder = OrdinalEncoder(columns=[column], derived_columns=[derived_column])

    encoded_dataset = ordinal_encoder(dataset)
    return encoded_dataset, [derived_column]


def encode_single_categorical_column(
dataset: Dataset,
col_name: str,
encoding: Any = "EncodingFunctions.ORDINAL",
drop_one_new_column: bool = True,
drop_old_column: bool = False,
force: bool = False,
case_sensitive: bool = False,
):
Expand Down Expand Up @@ -279,44 +275,42 @@ def encode_single_categorical_column(
"""
# If the column has already been encoded and the new column has already been
# created, return dataset
enc_column = dataset.get_enc_column_from_original(column_name=col_name)
enc_column = dataset.encoded_columns_from_original(column=col_name)

# Check if encoding operation is required
if not force:
if enc_column is not None:
if len(enc_column) > 0:
logging.warning(
f"The column {col_name} has already been encoded "
f'as "{enc_column}". No further operations are performed '
)
return dataset
elif dataset[col_name].dtype.kind in "biufc":
logging.warning(
f"The column {col_name} is already numeric. No further operations "
"are performed "
f'as "{enc_column}". No further operations are performed.'
)
return dataset
# elif dataset[col_name].dtype.kind in "biufc":
# logging.warning(
# f"The column {col_name} is already numeric. No further operations
# are performed "
# )
# return dataset

dataset_to_encode = dataset.copy()

df_to_encode = dataset.data.copy()
# Find index of rows with NaN and convert it to a fixed value so the corresponding
# encoded col will be dropped
nan_serie_map = df_to_encode[col_name].isna()
nan_serie_map = nan_serie_map.index[nan_serie_map].tolist()
df_to_encode.loc[nan_serie_map][col_name] = NAN_CATEGORY.title()
# Set to 'title' case so str with different capitalization are interpreted as equal
if not case_sensitive:
df_to_encode.loc[:, col_name] = df_to_encode[col_name].astype(str).str.title()
dataset_to_encode.data.loc[:, col_name] = (
dataset_to_encode.data[col_name].astype(str).str.title()
)

# Encoding using the selected function
if encoding == "ORDINAL":
df_encoded, encoder, new_columns = _ordinal_encode_column(
df_to_encode, column=col_name, drop_old_column=drop_old_column
dataset_encoded, _ = _ordinal_encode_column(
dataset_to_encode,
column=col_name,
)
elif encoding == "ONEHOT":
df_encoded, encoder, new_columns = _one_hot_encode_column(
df_to_encode,
dataset_encoded, _ = _one_hot_encode_column(
dataset_to_encode,
column=col_name,
drop_one_new_column=drop_one_new_column,
drop_old_column=drop_old_column,
)
else:
logging.error(
Expand All @@ -325,27 +319,6 @@ def encode_single_categorical_column(
)
return None

# Set the rows with missing values originally to NaN
df_encoded.loc[nan_serie_map, col_name] = pd.NA
df_encoded.loc[nan_serie_map, new_columns] = np.nan

# Generate encoded values map
encoded_values_map = {}
for val_id, val in enumerate(encoder.categories_[0]):
encoded_values_map[val_id] = val

dataset_encoded = copy_dataset_with_new_df(dataset, df_encoded)

dataset_encoded.track_history(
FeatureOperation(
original_columns=col_name,
operation_type="CATEGORICAL_ENCODING",
encoder=encoder,
encoded_values_map=encoded_values_map,
derived_columns=tuple(new_columns),
)
)

return dataset_encoded


Expand Down
1 change: 0 additions & 1 deletion src/trousse/feature_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,6 @@ def _remove_nan_category(
self, encoded_categories: List[str], columns_enc: pd.DataFrame
) -> Tuple[List[str], pd.DataFrame]:
"""Remove the NaN category from the encoded categories and corresponding column.

If the NaN category is not present,``encoded_categories`` and
``columns_enc`` will be returned without modification.

Expand Down
26 changes: 21 additions & 5 deletions src/trousse/operations_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import collections
import typing
from typing import Any, List, Union
from typing import Any, List, Optional, Union

if typing.TYPE_CHECKING: # pragma: no cover
from .feature_operations import FeatureOperation
Expand Down Expand Up @@ -59,26 +59,42 @@ def operations_from_derived_column(
)

def operations_from_original_column(
    self, original_column: str, operation_types: Optional[List[type]] = None
) -> List["FeatureOperation"]:
    """Return the FeatureOperations applied on ``original_column``.

    If ``operation_types`` is not None, only the operations whose type is
    one of ``operation_types`` are returned.

    Parameters
    ----------
    original_column : str
        The column on which the FeatureOperation has been applied on.
    operation_types : List[type], optional
        FeatureOperation subclass types used to filter the returned
        operations. Default is None, meaning that operations of every
        type are returned.

    Returns
    -------
    List[FeatureOperation]
        FeatureOperations applied on ``original_column`` whose type is
        compatible with ``operation_types``.
    """
    operations = [
        op for op in self[original_column] if original_column in op.columns
    ]
    if operation_types is not None:
        # isinstance accepts a tuple of types, so one call per operation
        # replaces the any()-over-list check.
        allowed_types = tuple(operation_types)
        operations = [op for op in operations if isinstance(op, allowed_types)]
    return operations

def original_columns_from_derived_column(self, derived_column: str) -> List[str]:
"""Return the name of the columns from which ``derived_column`` is generated.
Expand Down
12 changes: 5 additions & 7 deletions tests/integration/test_feature_fix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pandas as pd
import pytest
import sklearn.preprocessing as sk_preproc

import trousse.feature_fix as ffx
from trousse.dataset import Dataset
Expand All @@ -24,9 +23,9 @@ def test_ordinal_encode_column(csv, column, derived_column, expected_csv):
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, _, new_cols = ffx._ordinal_encode_column(dataset.data, column, False)
encoded_dataset, new_cols = ffx._ordinal_encode_column(dataset, column)

pd.testing.assert_frame_equal(encoded_df, expected_df)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df)
assert derived_column == new_cols


Expand Down Expand Up @@ -55,10 +54,9 @@ def test_one_hot_encode_column(
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, encoder, new_cols = ffx._one_hot_encode_column(
dataset.data, column, drop_one_new_column
encoded_dataset, new_cols = ffx._one_hot_encode_column(
dataset, column, drop_one_new_column
)

assert expected_new_cols == new_cols
pd.testing.assert_frame_equal(encoded_df, expected_df, check_dtype=False)
assert isinstance(encoder, sk_preproc.OneHotEncoder)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df, check_dtype=False)
53 changes: 52 additions & 1 deletion tests/unit/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,6 @@ def it_knows_how_to_track_history(
self, request, metadata_cols, derived_columns, expected_metadata_cols
):
operations_list_iadd_ = method_mock(request, OperationsList, "__iadd__")

expected_df = DataFrameMock.df_generic(10)
get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
get_df_from_csv_.return_value = expected_df
Expand All @@ -257,6 +256,58 @@ def it_knows_how_to_track_history(
assert dataset.metadata_cols == expected_metadata_cols
operations_list_iadd_.assert_called_once_with(ANY, feat_op)

@pytest.mark.parametrize(
    "op_from_original_column_ret_value, expected_columns",
    [
        # One encoder with a derived column plus one in-place encoder:
        # both the derived name and the original name are reported.
        (
            [
                fop.OrdinalEncoder(
                    columns=["col"], derived_columns=["encoded_col"]
                ),
                fop.OrdinalEncoder(columns=["col"], derived_columns=None),
            ],
            ["encoded_col", "col"],
        ),
        # Multiple encoders: results are accumulated in operation order.
        (
            [
                fop.OrdinalEncoder(
                    columns=["col"], derived_columns=["encoded_col"]
                ),
                fop.OrdinalEncoder(columns=["col"], derived_columns=None),
                fop.OrdinalEncoder(
                    columns=["col"], derived_columns=["encoded_col2"]
                ),
            ],
            ["encoded_col", "col", "encoded_col2"],
        ),
        # No encoding operations recorded -> empty result.
        (
            [],
            [],
        ),
    ],
)
def it_knows_how_to_get_encoded_columns_from_original(
    self, request, op_from_original_column_ret_value, expected_columns
):
    # Stub OperationsList.operations_from_original_column so the test
    # controls exactly which encoding operations the Dataset sees.
    op_list__op_from_original_column_ = method_mock(
        request, OperationsList, "operations_from_original_column"
    )
    op_list__op_from_original_column_.return_value = (
        op_from_original_column_ret_value
    )
    expected_df = DataFrameMock.df_generic(10)
    # Avoid real file I/O when constructing the Dataset.
    get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv")
    get_df_from_csv_.return_value = expected_df
    dataset = Dataset(data_file="fake/path")

    columns = dataset.encoded_columns_from_original("col")

    assert type(columns) == list
    assert columns == expected_columns
    # The Dataset must query only OrdinalEncoder and OneHotEncoder operations.
    op_list__op_from_original_column_.assert_called_once_with(
        dataset.operations_history, "col", [fop.OrdinalEncoder, fop.OneHotEncoder]
    )


class DescribeColumnListByType:
def it_knows_its_str(self, request):
Expand Down
Loading