Merge pull request #157 from gketronDS/main
Adding SKLearn OrdinalEncoder as a Transformer
nickotto authored Jan 21, 2025
2 parents 6ab464a + 9276e8b commit 9ca7800
Showing 7 changed files with 149 additions and 13 deletions.
Binary file added ImputerExperiments/data/r/.DS_Store
1 change: 0 additions & 1 deletion setup.py
@@ -3,7 +3,6 @@
#TODO update this
from setuptools import setup, find_packages


def calculate_version():
initpy = open('tpot2/_version.py').read().split('\n')
version = list(filter(lambda x: '__version__' in x, initpy))[0].split('\'')[1]
2 changes: 1 addition & 1 deletion tpot2/builtin_modules/__init__.py
@@ -1,6 +1,6 @@
from .feature_set_selector import FeatureSetSelector
from .zero_count import ZeroCount
from .column_one_hot_encoder import ColumnOneHotEncoder
from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder
from .arithmetictransformer import ArithmeticTransformer
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from .passthrough import Passthrough, SkipTransformer
137 changes: 136 additions & 1 deletion tpot2/builtin_modules/column_one_hot_encoder.py
@@ -37,7 +37,7 @@

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import sklearn

import pandas as pd
@@ -203,3 +203,138 @@ def transform(self, X):
return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)
else:
return np.hstack((X_not_sel, X_sel))

class ColumnOrdinalEncoder(BaseEstimator, TransformerMixin):


def __init__(self, columns='auto', handle_unknown='error', unknown_value=-1, encoded_missing_value=np.nan, min_frequency=None, max_categories=None):
'''
Parameters
----------
columns : str or list, default='auto'
Determines which columns to ordinal encode with sklearn.preprocessing.OrdinalEncoder.
- 'auto' : Automatically select categorical features based on columns with fewer than 10 unique values
- 'categorical' : Automatically select categorical features
- 'numeric' : Automatically select numeric features
- 'all' : Select all features
- list : A list of columns to select
handle_unknown, unknown_value, encoded_missing_value, min_frequency, max_categories : see sklearn.preprocessing.OrdinalEncoder
'''

self.columns = columns
self.handle_unknown = handle_unknown
self.unknown_value = unknown_value
self.encoded_missing_value = encoded_missing_value
self.min_frequency = min_frequency
self.max_categories = max_categories



def fit(self, X, y=None):
"""Fit OneHotEncoder to X, then transform X.
Equivalent to self.fit(X).transform(X), but more convenient and more
efficient. See fit for the parameters, transform for the return value.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
Dense array or sparse matrix.
y: array-like {n_samples,} (Optional, ignored)
Feature labels
"""

if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame):
raise ValueError(f"Invalid value for columns: {self.columns}. "
"Only 'all' or <list> is supported for np arrays")

if self.columns == "categorical":
self.columns_ = list(X.select_dtypes(exclude='number').columns)
elif self.columns == "numeric":
self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
elif self.columns == "auto":
self.columns_ = auto_select_categorical_features(X)
elif self.columns == "all":
if isinstance(X, pd.DataFrame):
self.columns_ = X.columns
else:
self.columns_ = list(range(X.shape[1]))
elif isinstance(self.columns, list):
self.columns_ = self.columns
else:
raise ValueError(f"Invalid value for columns: {self.columns}")

if len(self.columns_) == 0:
return self

self.enc = sklearn.preprocessing.OrdinalEncoder(categories='auto',
handle_unknown = self.handle_unknown,
unknown_value = self.unknown_value,
encoded_missing_value = self.encoded_missing_value,
min_frequency = self.min_frequency,
max_categories = self.max_categories)
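# Note (editorial comment): scikit-learn's OrdinalEncoder accepts a non-None unknown_value
# only when handle_unknown='use_encoded_value', and the min_frequency / max_categories
# parameters require scikit-learn >= 1.3.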
#TODO make this more consistent with sklearn baseimputer/baseencoder
'''
if isinstance(X, pd.DataFrame):
self.enc.set_output(transform="pandas")
for col in X.columns:
# check if the column name is not a string
if not isinstance(col, str):
# if it's not a string, rename the column with "X" prefix
X.rename(columns={col: f"X{col}"}, inplace=True)
'''

if len(self.columns_) == X.shape[1]:
X_sel = self.enc.fit(X)
else:
X_sel, X_not_sel = _X_selected(X, self.columns_)
X_sel = self.enc.fit(X_sel)

return self

def transform(self, X):
"""Transform X using one-hot encoding.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
Dense array or sparse matrix.
Returns
-------
X_out : sparse matrix if sparse=True else a 2-d array, dtype=int
Transformed input.
"""


if len(self.columns_) == 0:
return X

#TODO make this more consistent with sklearn baseimputer/baseencoder
'''
if isinstance(X, pd.DataFrame):
for col in X.columns:
# check if the column name is not a string
if not isinstance(col, str):
# if it's not a string, rename the column with "X" prefix
X.rename(columns={col: f"X{col}"}, inplace=True)
'''

if len(self.columns_) == X.shape[1]:
return self.enc.transform(X)
else:

X_sel, X_not_sel = _X_selected(X, self.columns_)
X_sel = self.enc.transform(X_sel)

#If X is dataframe
if isinstance(X, pd.DataFrame):

X_sel = pd.DataFrame(X_sel, columns=self.enc.get_feature_names_out())
return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1)
else:
return np.hstack((X_not_sel, X_sel))
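For context, a minimal usage sketch of the new transformer follows. The DataFrame, values, and parameter choices are illustrative assumptions rather than part of the commit; it assumes scikit-learn >= 1.3 (the encoder forwards min_frequency/max_categories), and handle_unknown='use_encoded_value' is chosen so that sklearn accepts the unknown_value setting.

import pandas as pd
from tpot2.builtin_modules import ColumnOrdinalEncoder

# Toy data: one categorical column and one numeric column (hypothetical values).
X = pd.DataFrame({"color": ["red", "green", "red", "blue"], "height": [1.2, 3.4, 2.2, 5.1]})

# 'categorical' selects the non-numeric columns; 'height' passes through unchanged.
enc = ColumnOrdinalEncoder(columns="categorical", handle_unknown="use_encoded_value", unknown_value=-1)
X_encoded = enc.fit(X).transform(X)  # 'color' is replaced by ordinal codes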
18 changes: 9 additions & 9 deletions tpot2/config/get_configspace.py
@@ -60,7 +60,7 @@
from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import Passthrough, SkipTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
@@ -86,7 +86,7 @@
import sklearn.calibration


all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
AdaBoostClassifier,MLPRegressor,
GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor,
AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer,
@@ -155,19 +155,17 @@
"selectors": ["SelectFwe", "SelectPercentile", "VarianceThreshold",],
"selectors_classification": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_classification", "SelectFromModel_classification"],
"selectors_regression": ["SelectFwe", "SelectPercentile", "VarianceThreshold", "RFE_regression", "SelectFromModel_regression"],

"classifiers" : ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'],
"regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'XGBRegressor'],


"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"transformers": ["KBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer", "ColumnOneHotEncoder", "ColumnOrdinalEncoder"],

"scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
"all_transformers" : ["transformers", "scalers"],

"arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
"imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
"skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
"genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],

"classifiers_sklearnex" : ["RandomForestClassifier_sklearnex", "LogisticRegression_sklearnex", "KNeighborsClassifier_sklearnex", "SVC_sklearnex","NuSVC_sklearnex"],
"regressors_sklearnex" : ["LinearRegression_sklearnex", "Ridge_sklearnex", "Lasso_sklearnex", "ElasticNet_sklearnex", "SVR_sklearnex", "NuSVR_sklearnex", "RandomForestRegressor_sklearnex", "KNeighborsRegressor_sklearnex"],
"genetic encoders" : ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
@@ -352,8 +350,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
return transformers.get_QuantileTransformer_configspace(n_samples=n_samples, random_state=random_state)
case "RobustScaler":
return transformers.RobustScaler_configspace
case "ColumnOneHotEncoder":
return {}
case "MaxAbsScaler":
return {}
case "PolynomialFeatures":
@@ -364,6 +360,10 @@
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
case "KBinsDiscretizer":
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)
case "ColumnOneHotEncoder":
return {}
case "ColumnOrdinalEncoder":
return {}

#selectors.py
case "SelectFwe":
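For reference, a small sketch of what the new configuration cases return; the import path is assumed from this file's location and may differ from TPOT2's public API.

from tpot2.config.get_configspace import get_configspace  # assumed import path

# Both encoders currently expose an empty hyperparameter space, so they run with their defaults.
print(get_configspace("ColumnOneHotEncoder"))   # {}
print(get_configspace("ColumnOrdinalEncoder"))  # {}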
2 changes: 1 addition & 1 deletion tpot2/config/imputers.py
@@ -51,7 +51,7 @@
# and can cause errors. gk
}
)

#test
def get_IterativeImputer_config_space(n_features, random_state):
space = { 'initial_strategy' : Categorical('initial_strategy',
['mean', 'median',
2 changes: 2 additions & 0 deletions tpot2/config/transformers.py
@@ -62,6 +62,8 @@

OneHotEncoder_configspace = {} #TODO include the parameter for max unique values

OrdinalEncoder_configspace = {} #TODO include the parameter for max unique values

def get_FastICA_configspace(n_features=100, random_state=None):

space = {
