Gradually adapt encoding functions in feature_fix to new FeatureOperation usage
alessiamarcolini committed Feb 3, 2021
1 parent 546bef8 commit ff0adac
Showing 4 changed files with 35 additions and 64 deletions.
83 changes: 28 additions & 55 deletions src/trousse/feature_fix.py
@@ -5,9 +5,8 @@
from typing import Any, Tuple

import numpy as np
import pandas as pd

from .dataset import Dataset, copy_dataset_with_new_df
from .dataset import Dataset
from .feature_operations import FeatureOperation, OneHotEncoder, OrdinalEncoder

logger = logging.getLogger(__name__)
@@ -187,10 +186,9 @@ def combine_categorical_columns_to_one(


def _one_hot_encode_column(
df: pd.DataFrame,
dataset: Dataset,
column: str,
drop_one_new_column: bool = True,
drop_old_column: bool = False,
):
"""
OneHotEncoding of 'column' in df
@@ -200,51 +198,49 @@ def _one_hot_encode_column(
df
column
drop_one_new_column
drop_old_column
Returns
-------
"""
dataset = Dataset(df_object=df)
one_hot_encoder = OneHotEncoder(columns=[column], derived_column_suffix="_enc")

encoded_dataset = one_hot_encoder(dataset)

new_columns = sorted(
derived_columns = sorted(
list(set(encoded_dataset.data.columns) - set(dataset.data.columns))
)
return encoded_dataset.data, one_hot_encoder.encoder, new_columns
return encoded_dataset, derived_columns


def _ordinal_encode_column(df, column, drop_old_column: bool = False):
def _ordinal_encode_column(
dataset: Dataset,
column: str,
):
"""
Parameters
----------
df
column
drop_old_column
Returns
-------
"""

dataset = Dataset(df_object=df)
derived_column = f"{column}_enc"
ordinal_encoder = OrdinalEncoder(columns=[column], derived_columns=[derived_column])

encoded_dataset = ordinal_encoder(dataset)
return encoded_dataset.data, ordinal_encoder.encoder, [derived_column]
return encoded_dataset, [derived_column]


def encode_single_categorical_column(
dataset: Dataset,
col_name: str,
encoding: Any = "EncodingFunctions.ORDINAL",
drop_one_new_column: bool = True,
drop_old_column: bool = False,
force: bool = False,
case_sensitive: bool = False,
):
@@ -279,44 +275,42 @@ def encode_single_categorical_column(
"""
# If the column has already been encoded and the new column has already been
# created, return dataset
enc_column = dataset.get_enc_column_from_original(column_name=col_name)
enc_column = dataset.encoded_columns_from_original(column=col_name)

# Check if encoding operation is required
if not force:
if enc_column is not None:
if len(enc_column) > 0:
logging.warning(
f"The column {col_name} has already been encoded "
f'as "{enc_column}". No further operations are performed '
)
return dataset
elif dataset[col_name].dtype.kind in "biufc":
logging.warning(
f"The column {col_name} is already numeric. No further operations "
"are performed "
f'as "{enc_column}". No further operations are performed.'
)
return dataset
# elif dataset[col_name].dtype.kind in "biufc":
# logging.warning(
# f"The column {col_name} is already numeric. No further operations
# are performed "
# )
# return dataset

dataset_to_encode = dataset.copy()

df_to_encode = dataset.data.copy()
# Find index of rows with NaN and convert it to a fixed value so the corresponding
# encoded col will be dropped
nan_serie_map = df_to_encode[col_name].isna()
nan_serie_map = nan_serie_map.index[nan_serie_map].tolist()
df_to_encode.loc[nan_serie_map][col_name] = NAN_CATEGORY.title()
# Set to 'title' case so str with different capitalization are interpreted as equal
if not case_sensitive:
df_to_encode.loc[:, col_name] = df_to_encode[col_name].astype(str).str.title()
dataset_to_encode.data.loc[:, col_name] = (
dataset_to_encode.data[col_name].astype(str).str.title()
)

# Encoding using the selected function
if encoding == "ORDINAL":
df_encoded, encoder, new_columns = _ordinal_encode_column(
df_to_encode, column=col_name, drop_old_column=drop_old_column
dataset_encoded, _ = _ordinal_encode_column(
dataset_to_encode,
column=col_name,
)
elif encoding == "ONEHOT":
df_encoded, encoder, new_columns = _one_hot_encode_column(
df_to_encode,
dataset_encoded, _ = _one_hot_encode_column(
dataset_to_encode,
column=col_name,
drop_one_new_column=drop_one_new_column,
drop_old_column=drop_old_column,
)
else:
logging.error(
@@ -325,27 +319,6 @@
)
return None

# Set the rows with missing values originally to NaN
df_encoded.loc[nan_serie_map, col_name] = pd.NA
df_encoded.loc[nan_serie_map, new_columns] = np.nan

# Generate encoded values map
encoded_values_map = {}
for val_id, val in enumerate(encoder.categories_[0]):
encoded_values_map[val_id] = val

dataset_encoded = copy_dataset_with_new_df(dataset, df_encoded)

dataset_encoded.track_history(
FeatureOperation(
original_columns=col_name,
operation_type="CATEGORICAL_ENCODING",
encoder=encoder,
encoded_values_map=encoded_values_map,
derived_columns=tuple(new_columns),
)
)

return dataset_encoded


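For context, a minimal sketch of the FeatureOperation-style usage this file now relies on, mirroring the calls visible in the diff above. The CSV path and column names are hypothetical, used only for illustration.

from trousse.dataset import Dataset
from trousse.feature_operations import OneHotEncoder, OrdinalEncoder

# Hypothetical CSV path and column name.
dataset = Dataset(data_file="data/sample.csv")

# FeatureOperations are callables: they take a Dataset and return a new, encoded Dataset.
ordinal_encoder = OrdinalEncoder(columns=["category"], derived_columns=["category_enc"])
ordinal_dataset = ordinal_encoder(dataset)

# OneHotEncoder derives one column per category value, suffixed with "_enc";
# the derived columns are those present in the result but not in the input.
one_hot_encoder = OneHotEncoder(columns=["category"], derived_column_suffix="_enc")
onehot_dataset = one_hot_encoder(dataset)
derived_columns = sorted(set(onehot_dataset.data.columns) - set(dataset.data.columns))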
2 changes: 1 addition & 1 deletion src/trousse/feature_operations.py
@@ -2,7 +2,7 @@

import copy
from abc import ABC, abstractmethod
from typing import Any, List, Mapping, Optional, Tuple
from typing import Any, List, Mapping, Tuple

import numpy as np
import pandas as pd
12 changes: 5 additions & 7 deletions tests/integration/test_feature_fix.py
@@ -1,6 +1,5 @@
import pandas as pd
import pytest
import sklearn.preprocessing as sk_preproc

import trousse.feature_fix as ffx
from trousse.dataset import Dataset
@@ -24,9 +23,9 @@ def test_ordinal_encode_column(csv, column, derived_column, expected_csv):
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, _, new_cols = ffx._ordinal_encode_column(dataset.data, column, False)
encoded_dataset, new_cols = ffx._ordinal_encode_column(dataset, column)

pd.testing.assert_frame_equal(encoded_df, expected_df)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df)
assert derived_column == new_cols


@@ -55,10 +54,9 @@ def test_one_hot_encode_column(
dataset = Dataset(data_file=csv)
expected_df = load_expectation(expected_csv, type_="csv")

encoded_df, encoder, new_cols = ffx._one_hot_encode_column(
dataset.data, column, drop_one_new_column
encoded_dataset, new_cols = ffx._one_hot_encode_column(
dataset, column, drop_one_new_column
)

assert expected_new_cols == new_cols
pd.testing.assert_frame_equal(encoded_df, expected_df, check_dtype=False)
assert isinstance(encoder, sk_preproc.OneHotEncoder)
pd.testing.assert_frame_equal(encoded_dataset.data, expected_df, check_dtype=False)
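These tests exercise the private helpers directly; the public entry point wraps them. A minimal usage sketch under the same assumptions (hypothetical CSV path and column name), selecting the encoder via the "ORDINAL" / "ONEHOT" strings checked in encode_single_categorical_column:

import trousse.feature_fix as ffx
from trousse.dataset import Dataset

# Hypothetical CSV path and column name.
dataset = Dataset(data_file="data/sample.csv")

# Returns a new Dataset with the encoded column(s); with case_sensitive=False
# the values are title-cased so differently capitalized strings compare equal.
encoded_dataset = ffx.encode_single_categorical_column(
    dataset,
    col_name="category",
    encoding="ORDINAL",
)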
2 changes: 1 addition & 1 deletion tests/unit/test_feature_operations.py
@@ -6,7 +6,7 @@
from trousse import feature_operations as fop
from trousse.dataset import Dataset

from ..dataset_util import DataFrameMock, SeriesMock
from ..dataset_util import DataFrameMock
from ..unitutil import ANY, function_mock, initializer_mock, instance_mock, method_mock


