diff --git a/src/trousse/dataset.py b/src/trousse/dataset.py
index 7c8491a..4f388f5 100644
--- a/src/trousse/dataset.py
+++ b/src/trousse/dataset.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from joblib import Parallel, delayed
 
+from . import feature_operations as fop
 from .exceptions import MultipleObjectsInFileError, NotShelveFileError
 from .operations_list import OperationsList
 from .settings import CATEG_COL_THRESHOLD
@@ -22,6 +23,7 @@
 if typing.TYPE_CHECKING:  # pragma: no cover
     from .feature_operations import FeatureOperation
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -490,6 +492,35 @@ def operations_history(self) -> OperationsList:
         """
         return self._operations_history
 
+    def encoded_columns_from_original(self, column: str) -> List[str]:
+        """Collect the names of the columns holding an encoding of ``column``.
+
+        The operations history is queried for the ordinal and one-hot encoders
+        applied to ``column``. Each of them contributes its derived columns,
+        or its own ``columns`` when ``derived_columns`` is None.
+
+        Parameters
+        ----------
+        column : str
+            Name of the original column.
+
+        Returns
+        -------
+        List[str]
+            Names of the encoded columns, following the order of the encoders
+            returned by the operations history. Empty when no encoder is found.
+        """
+        encoders = self._operations_history.operations_from_original_column(
+            column, [fop.OrdinalEncoder, fop.OneHotEncoder]
+        )
+
+        encoded_names = []
+        for encoder in encoders:
+            derived = encoder.derived_columns
+            # Encoders without derived columns contribute their input columns
+            encoded_names.extend(encoder.columns if derived is None else derived)
+        return encoded_names
+
     def _get_categorical_cols(self, col_list: Tuple[str]) -> Set[str]:
         """
         Identify every categorical column in dataset.
diff --git a/src/trousse/feature_fix.py b/src/trousse/feature_fix.py index 748e9a8..666d891 100644 --- a/src/trousse/feature_fix.py +++ b/src/trousse/feature_fix.py @@ -5,9 +5,8 @@ from typing import Any, Tuple import numpy as np -import pandas as pd -from .dataset import Dataset, copy_dataset_with_new_df +from .dataset import Dataset from .feature_operations import FeatureOperation, OneHotEncoder, OrdinalEncoder logger = logging.getLogger(__name__) @@ -187,10 +186,9 @@ def combine_categorical_columns_to_one( def _one_hot_encode_column( - df: pd.DataFrame, + dataset: Dataset, column: str, drop_one_new_column: bool = True, - drop_old_column: bool = False, ): """ OneHotEncoding of 'column' in df @@ -200,43 +198,42 @@ def _one_hot_encode_column( df column drop_one_new_column - drop_old_column Returns ------- """ - dataset = Dataset(df_object=df) one_hot_encoder = OneHotEncoder(columns=[column], derived_column_suffix="_enc") encoded_dataset = one_hot_encoder(dataset) - new_columns = sorted( + derived_columns = sorted( list(set(encoded_dataset.data.columns) - set(dataset.data.columns)) ) - return encoded_dataset.data, one_hot_encoder.encoder, new_columns + return encoded_dataset, derived_columns -def _ordinal_encode_column(df, column, drop_old_column: bool = False): +def _ordinal_encode_column( + dataset: Dataset, + column: str, +): """ Parameters ---------- df column - drop_old_column Returns ------- """ - dataset = Dataset(df_object=df) derived_column = f"{column}_enc" ordinal_encoder = OrdinalEncoder(columns=[column], derived_columns=[derived_column]) encoded_dataset = ordinal_encoder(dataset) - return encoded_dataset.data, ordinal_encoder.encoder, [derived_column] + return encoded_dataset, [derived_column] def encode_single_categorical_column( @@ -244,7 +241,6 @@ def encode_single_categorical_column( col_name: str, encoding: Any = "EncodingFunctions.ORDINAL", drop_one_new_column: bool = True, - drop_old_column: bool = False, force: bool = False, case_sensitive: bool 
= False, ): @@ -279,44 +275,42 @@ def encode_single_categorical_column( """ # If the column has already been encoded and the new column has already been # created, return dataset - enc_column = dataset.get_enc_column_from_original(column_name=col_name) + enc_column = dataset.encoded_columns_from_original(column=col_name) # Check if encoding operation is required if not force: - if enc_column is not None: + if len(enc_column) > 0: logging.warning( f"The column {col_name} has already been encoded " - f'as "{enc_column}". No further operations are performed ' - ) - return dataset - elif dataset[col_name].dtype.kind in "biufc": - logging.warning( - f"The column {col_name} is already numeric. No further operations " - "are performed " + f'as "{enc_column}". No further operations are performed.' ) return dataset + # elif dataset[col_name].dtype.kind in "biufc": + # logging.warning( + # f"The column {col_name} is already numeric. No further operations + # are performed " + # ) + # return dataset + + dataset_to_encode = dataset.copy() - df_to_encode = dataset.data.copy() - # Find index of rows with NaN and convert it to a fixed value so the corresponding - # encoded col will be dropped - nan_serie_map = df_to_encode[col_name].isna() - nan_serie_map = nan_serie_map.index[nan_serie_map].tolist() - df_to_encode.loc[nan_serie_map][col_name] = NAN_CATEGORY.title() # Set to 'title' case so str with different capitalization are interpreted as equal if not case_sensitive: - df_to_encode.loc[:, col_name] = df_to_encode[col_name].astype(str).str.title() + dataset_to_encode.data.loc[:, col_name] = ( + dataset_to_encode.data[col_name].astype(str).str.title() + ) # Encoding using the selected function if encoding == "ORDINAL": - df_encoded, encoder, new_columns = _ordinal_encode_column( - df_to_encode, column=col_name, drop_old_column=drop_old_column + dataset_encoded, _ = _ordinal_encode_column( + dataset_to_encode, + column=col_name, ) elif encoding == "ONEHOT": - df_encoded, 
encoder, new_columns = _one_hot_encode_column( - df_to_encode, + dataset_encoded, _ = _one_hot_encode_column( + dataset_to_encode, column=col_name, drop_one_new_column=drop_one_new_column, - drop_old_column=drop_old_column, ) else: logging.error( @@ -325,27 +319,6 @@ def encode_single_categorical_column( ) return None - # Set the rows with missing values originally to NaN - df_encoded.loc[nan_serie_map, col_name] = pd.NA - df_encoded.loc[nan_serie_map, new_columns] = np.nan - - # Generate encoded values map - encoded_values_map = {} - for val_id, val in enumerate(encoder.categories_[0]): - encoded_values_map[val_id] = val - - dataset_encoded = copy_dataset_with_new_df(dataset, df_encoded) - - dataset_encoded.track_history( - FeatureOperation( - original_columns=col_name, - operation_type="CATEGORICAL_ENCODING", - encoder=encoder, - encoded_values_map=encoded_values_map, - derived_columns=tuple(new_columns), - ) - ) - return dataset_encoded diff --git a/src/trousse/feature_operations.py b/src/trousse/feature_operations.py index 3ea322f..42d2394 100644 --- a/src/trousse/feature_operations.py +++ b/src/trousse/feature_operations.py @@ -580,7 +580,6 @@ def _remove_nan_category( self, encoded_categories: List[str], columns_enc: pd.DataFrame ) -> Tuple[List[str], pd.DataFrame]: """Remove the NaN category from the encoded categories and corresponding column. - If the NaN category is not present,``encoded_categories`` and ``columns_enc`` will be returned without modification. 
diff --git a/src/trousse/operations_list.py b/src/trousse/operations_list.py
index ff0f6e6..36dc870 100644
--- a/src/trousse/operations_list.py
+++ b/src/trousse/operations_list.py
@@ -2,7 +2,7 @@
 
 import collections
 import typing
-from typing import Any, List, Union
+from typing import Any, List, Optional, Union
 
 if typing.TYPE_CHECKING:  # pragma: no cover
     from .feature_operations import FeatureOperation
@@ -59,26 +59,39 @@ def operations_from_derived_column(
         )
 
     def operations_from_original_column(
-        self, original_column: str
+        self, original_column: str, operation_types: Optional[List[type]] = None
     ) -> List["FeatureOperation"]:
-        """Return the FeatureOperations applied on ``original_column``
+        """Return the FeatureOperations applied on ``original_column``.
+
+        If ``operation_types`` is not None, return only the FeatureOperations that
+        are instances of at least one of the given types.
 
         Parameters
         ----------
         original_column : str
             The column on which the FeatureOperation has been applied on.
+        operation_types : List[type], optional
+            FeatureOperation subclasses used to filter the FeatureOperations to
+            return. Default is None, meaning that operations of every type are
+            returned.
 
         Returns
         -------
         List[FeatureOperation]
-            FeatureOperations applied on ``original_column``
+            FeatureOperations applied on ``original_column`` that are instances of
+            one of ``operation_types`` (all of them when it is None)
         """
-        return list(
+        operations = list(
             filter(
                 lambda op: original_column in op.columns,
                 self[original_column],
             )
         )
+        if operation_types is not None:
+            # isinstance takes a tuple of classes: build it once, outside the loop
+            allowed_types = tuple(operation_types)
+            operations = [op for op in operations if isinstance(op, allowed_types)]
+        return operations
 
     def original_columns_from_derived_column(self, derived_column: str) -> List[str]:
         """Return the name of the columns from which ``derived_column`` is generated.
diff --git a/tests/integration/test_feature_fix.py b/tests/integration/test_feature_fix.py index 0321184..91492cb 100644 --- a/tests/integration/test_feature_fix.py +++ b/tests/integration/test_feature_fix.py @@ -1,6 +1,5 @@ import pandas as pd import pytest -import sklearn.preprocessing as sk_preproc import trousse.feature_fix as ffx from trousse.dataset import Dataset @@ -24,9 +23,9 @@ def test_ordinal_encode_column(csv, column, derived_column, expected_csv): dataset = Dataset(data_file=csv) expected_df = load_expectation(expected_csv, type_="csv") - encoded_df, _, new_cols = ffx._ordinal_encode_column(dataset.data, column, False) + encoded_dataset, new_cols = ffx._ordinal_encode_column(dataset, column) - pd.testing.assert_frame_equal(encoded_df, expected_df) + pd.testing.assert_frame_equal(encoded_dataset.data, expected_df) assert derived_column == new_cols @@ -55,10 +54,9 @@ def test_one_hot_encode_column( dataset = Dataset(data_file=csv) expected_df = load_expectation(expected_csv, type_="csv") - encoded_df, encoder, new_cols = ffx._one_hot_encode_column( - dataset.data, column, drop_one_new_column + encoded_dataset, new_cols = ffx._one_hot_encode_column( + dataset, column, drop_one_new_column ) assert expected_new_cols == new_cols - pd.testing.assert_frame_equal(encoded_df, expected_df, check_dtype=False) - assert isinstance(encoder, sk_preproc.OneHotEncoder) + pd.testing.assert_frame_equal(encoded_dataset.data, expected_df, check_dtype=False) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 0758cd9..5f291bd 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -243,7 +243,6 @@ def it_knows_how_to_track_history( self, request, metadata_cols, derived_columns, expected_metadata_cols ): operations_list_iadd_ = method_mock(request, OperationsList, "__iadd__") - expected_df = DataFrameMock.df_generic(10) get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv") get_df_from_csv_.return_value = 
expected_df @@ -257,6 +256,58 @@ def it_knows_how_to_track_history( assert dataset.metadata_cols == expected_metadata_cols operations_list_iadd_.assert_called_once_with(ANY, feat_op) + @pytest.mark.parametrize( + "op_from_original_column_ret_value, expected_columns", + [ + ( + [ + fop.OrdinalEncoder( + columns=["col"], derived_columns=["encoded_col"] + ), + fop.OrdinalEncoder(columns=["col"], derived_columns=None), + ], + ["encoded_col", "col"], + ), + ( + [ + fop.OrdinalEncoder( + columns=["col"], derived_columns=["encoded_col"] + ), + fop.OrdinalEncoder(columns=["col"], derived_columns=None), + fop.OrdinalEncoder( + columns=["col"], derived_columns=["encoded_col2"] + ), + ], + ["encoded_col", "col", "encoded_col2"], + ), + ( + [], + [], + ), + ], + ) + def it_knows_how_to_get_encoded_columns_from_original( + self, request, op_from_original_column_ret_value, expected_columns + ): + op_list__op_from_original_column_ = method_mock( + request, OperationsList, "operations_from_original_column" + ) + op_list__op_from_original_column_.return_value = ( + op_from_original_column_ret_value + ) + expected_df = DataFrameMock.df_generic(10) + get_df_from_csv_ = function_mock(request, "trousse.dataset.get_df_from_csv") + get_df_from_csv_.return_value = expected_df + dataset = Dataset(data_file="fake/path") + + columns = dataset.encoded_columns_from_original("col") + + assert type(columns) == list + assert columns == expected_columns + op_list__op_from_original_column_.assert_called_once_with( + dataset.operations_history, "col", [fop.OrdinalEncoder, fop.OneHotEncoder] + ) + class DescribeColumnListByType: def it_knows_its_str(self, request): diff --git a/tests/unit/test_operations_list.py b/tests/unit/test_operations_list.py index a47ebb8..a9659c0 100644 --- a/tests/unit/test_operations_list.py +++ b/tests/unit/test_operations_list.py @@ -139,18 +139,118 @@ def it_can_get_operations_from_derived_column(self, request): assert type(operations) == list assert operations == [fop1] 
- def it_can_get_operations_from_original_column(self, request): + @pytest.mark.parametrize( + "getitem_return, operations_types, expected_operations", + [ + ( + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + ], + None, + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + ], + ), + ( + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + None, + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + ), + ( + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + [fop.FillNA], + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + ], + ), + ( + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + [fop.ReplaceSubstrings], + [ 
+ fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + ), + ( + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + [fop.ReplaceSubstrings, fop.FillNA], + [ + fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0), + fop.FillNA(columns=["col4"], derived_columns=None, value=0), + fop.ReplaceSubstrings( + columns=["col4"], + derived_columns=["replaced_col4"], + replacement_map={"a": "b", "c": "d"}, + ), + ], + ), + ], + ) + def it_can_get_operations_from_original_column( + self, request, getitem_return, operations_types, expected_operations + ): op_list = OperationsList() getitem_ = method_mock(request, OperationsList, "__getitem__") - fop0 = fop.FillNA(columns=["col4"], derived_columns=["col1"], value=0) - fop1 = fop.FillNA(columns=["col1"], derived_columns=["col4"], value=0) - fop2 = fop.FillNA(columns=["col4"], derived_columns=None, value=0) - getitem_.return_value = [fop0, fop1, fop2] - operations = op_list.operations_from_original_column("col4") + getitem_.return_value = getitem_return + + operations = op_list.operations_from_original_column( + "col4", operation_types=operations_types + ) assert type(operations) == list - assert operations == [fop0, fop2] + assert operations == expected_operations def it_can_get_original_columns_from_derived_column(self, request): op_list = OperationsList()