Skip to content

Commit

Permalink
Merge pull request #136 from perib/new_search_space_def
Browse files Browse the repository at this point in the history
New search space def
  • Loading branch information
perib authored May 24, 2024
2 parents 378149f + 8056594 commit fedc90d
Show file tree
Hide file tree
Showing 16 changed files with 244 additions and 65 deletions.
3 changes: 2 additions & 1 deletion tpot2/builtin_modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from .passthrough import Passthrough
from .imputer import ColumnSimpleImputer
from .estimatortransformer import EstimatorTransformer
from .estimatortransformer import EstimatorTransformer
from .passkbinsdiscretizer import PassKBinsDiscretizer
2 changes: 1 addition & 1 deletion tpot2/builtin_modules/column_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _X_selected(X, selected):
class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
def __init__(self, columns='auto', drop=None, handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=None,max_categories=None):
'''
Parameters
Expand Down
43 changes: 43 additions & 0 deletions tpot2/builtin_modules/passkbinsdiscretizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

def select_features(X, min_unique=10):
    """Return the columns of ``X`` that have strictly more than ``min_unique`` distinct values.

    Parameters
    ----------
    X : pandas.DataFrame or 2D numpy array
        Input feature matrix.
    min_unique : int, default=10
        Cardinality threshold; a column is selected only when its number of
        unique values exceeds this.

    Returns
    -------
    list
        Column labels for DataFrame input, integer column indices otherwise.
    """
    if isinstance(X, pd.DataFrame):
        selected = []
        for col in X.columns:
            if X[col].unique().shape[0] > min_unique:
                selected.append(col)
        return selected
    # array-like input: inspect each column by positional index
    n_cols = X.shape[1]
    return [idx for idx in range(n_cols) if np.unique(X[:, idx]).size > min_unique]

class PassKBinsDiscretizer(BaseEstimator, TransformerMixin):
    """
    Same as sklearn.preprocessing.KBinsDiscretizer, but passes through the
    columns that are not discretized instead of dropping them.

    A column is discretized only when it has strictly more than ``min_unique``
    distinct values at fit time; every other column is passed through
    unchanged. (The threshold was previously hard-coded to 10 even though the
    docstring described it as ``n_bins``; it is now an explicit parameter whose
    default preserves the original behavior.)

    Parameters
    ----------
    n_bins, encode, strategy, subsample, random_state
        Forwarded verbatim to ``sklearn.preprocessing.KBinsDiscretizer``.
    min_unique : int, default=10
        Columns with more than this many unique values are discretized.
    """
    def __init__(self, n_bins=5, encode='onehot-dense', strategy='quantile', subsample='warn', random_state=None, min_unique=10):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.subsample = subsample
        self.random_state = random_state
        self.min_unique = min_unique

    def fit(self, X, y=None):
        """Partition columns into discretize/passthrough groups and fit the inner transformer.

        Returns
        -------
        self
        """
        # Identify columns with more than `min_unique` unique values; only
        # these are binned, the rest are passed through untouched.
        self.selected_columns_ = select_features(X, min_unique=self.min_unique)
        if isinstance(X, pd.DataFrame):
            self.not_selected_columns_ = [col for col in X.columns if col not in self.selected_columns_]
        else:
            self.not_selected_columns_ = [i for i in range(X.shape[1]) if i not in self.selected_columns_]

        enc = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy, subsample=self.subsample, random_state=self.random_state)
        self.transformer = ColumnTransformer([
            ('discretizer', enc, self.selected_columns_),
            ('passthrough', 'passthrough', self.not_selected_columns_)
        ])
        self.transformer.fit(X)
        return self

    def transform(self, X):
        """Apply the fitted ColumnTransformer (discretized columns first, then passthrough)."""
        return self.transformer.transform(X)
36 changes: 30 additions & 6 deletions tpot2/config/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,18 @@ def GradientBoostingClassifier_hyperparameter_parser(params):
final_params['n_iter_no_change'] = None
final_params['validation_fraction'] = None
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
#this is required because in crossover, its possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
final_params['validation_fraction'] = None


Expand Down Expand Up @@ -445,16 +453,32 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params):
final_params['random_state'] = params['random_state']



if params['early_stop'] == 'off':
# final_params['n_iter_no_change'] = 0
final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']

#this is required because in crossover, its possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']

final_params['early_stopping'] = True
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']

if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']


final_params['validation_fraction'] = None
final_params['early_stopping'] = True

Expand Down
7 changes: 5 additions & 2 deletions tpot2/config/get_configspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import Passthrough
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
Expand Down Expand Up @@ -55,6 +55,7 @@
DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder,
GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
Passthrough,
PassKBinsDiscretizer,
]


Expand Down Expand Up @@ -117,7 +118,7 @@
"regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],


"transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
"all_transformers" : ["transformers", "scalers"],

Expand Down Expand Up @@ -291,6 +292,8 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
return transformers.PolynomialFeatures_configspace
case "StandardScaler":
return {}
case "PassKBinsDiscretizer":
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)

#selectors.py
case "SelectFwe":
Expand Down
31 changes: 25 additions & 6 deletions tpot2/config/regressors.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,10 +436,20 @@ def GradientBoostingRegressor_hyperparameter_parser(params):
final_params['n_iter_no_change'] = None
final_params['validation_fraction'] = None
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
#this is required because in crossover, its possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None


Expand Down Expand Up @@ -498,11 +508,20 @@ def HistGradientBoostingRegressor_hyperparameter_parser(params):
# final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
final_params['early_stopping'] = True
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None
final_params['early_stopping'] = True

Expand Down
21 changes: 20 additions & 1 deletion tpot2/config/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,22 @@ def get_QuantileTransformer_configspace(random_state=None):
)


def get_passkbinsdiscretizer_configspace(random_state=None):
    """Build the ConfigSpace search space for PassKBinsDiscretizer.

    Parameters
    ----------
    random_state : int or None
        When given, baked into the space as a constant; omitted entirely when
        None because ConfigSpace does not accept None as a value.
    """
    hyperparameters = {
        'n_bins': Integer('n_bins', bounds=(3, 100)),
        'encode': 'onehot-dense',
        'strategy': Categorical('strategy', ['uniform', 'quantile', 'kmeans']),
        # 'subsample': Categorical('subsample', ['auto', 'warn', 'ignore']),
    }

    # ConfigSpace cannot represent None, so only add the key when it is set
    if random_state is not None:
        hyperparameters['random_state'] = random_state

    return ConfigurationSpace(space=hyperparameters)


### ROBUST SCALER

Expand All @@ -133,4 +149,7 @@ def get_QuantileTransformer_configspace(random_state=None):
})

def robust_scaler_hyperparameter_parser(params):
    """Map the sampled ``q_min``/``q_max`` values onto RobustScaler's ``quantile_range`` tuple."""
    lower = params["q_min"]
    upper = params["q_max"]
    return {"quantile_range": (lower, upper)}



5 changes: 3 additions & 2 deletions tpot2/evolvers/base_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ def optimize(self, generations=None):
self._client.close()
self._cluster.close()

tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)

def step(self,):
if self.population_size_list is not None:
Expand Down Expand Up @@ -624,7 +624,7 @@ def evaluate_population_full(self, budget=None):
parallel_timeout = 10

#scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs)
scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs)
scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, scheduled_timeout_time=self.scheduled_timeout_time, **self.objective_kwargs)

self.population.update_column(individuals_to_evaluate, column_names=self.objective_names, data=scores)
if budget is not None:
Expand Down Expand Up @@ -705,6 +705,7 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No
generation = self.generation,
n_expected_columns=len(self.objective_names),
client=self._client,
scheduled_timeout_time=self.scheduled_timeout_time,
**self.objective_kwargs,
)

Expand Down
12 changes: 6 additions & 6 deletions tpot2/evolvers/steady_state_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def optimize(self):
###############################
if self.verbose >= 3:
sign = np.sign(self.objective_function_weights)
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
cur_best_scores = valid_df.max(axis=0)*sign
cur_best_scores = cur_best_scores.to_numpy()
for i, obj in enumerate(self.objective_names):
Expand All @@ -353,7 +353,7 @@ def optimize(self):
#get sign of objective_function_weights
sign = np.sign(self.objective_function_weights)
#get best score for each objective
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
cur_best_scores = valid_df.max(axis=0)
cur_best_scores = cur_best_scores.to_numpy()
#cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best
Expand Down Expand Up @@ -499,7 +499,7 @@ def optimize(self):
elif len(submitted_futures) < self.max_queue_size:

initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3]
invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
invalid_initial_population = initial_population[initial_population[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)]
if len(invalid_initial_population) >= self.initial_population_size*3: #if all individuals in the 3*initial population are invalid
raise Exception("No individuals could be evaluated in the initial population. This may indicate a bug in the configuration, included models, or objective functions. Set verbose>=4 to see the errors that caused individuals to fail.")

Expand Down Expand Up @@ -540,8 +540,8 @@ def optimize(self):
# Step 7: Cleanup
###############################

self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")


#done, cleanup futures
Expand All @@ -556,7 +556,7 @@ def optimize(self):
self._client.close()
self._cluster.close()

tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)



Expand Down
5 changes: 5 additions & 0 deletions tpot2/search_spaces/nodes/genetic_feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ def __init__( self,
else:
self.mask = mask

# check if there are no features selected, if so select one
if sum(self.mask) == 0:
index = rng.choice(len(self.mask))
self.mask[index] = True

self.mutation_list = [self._mutate_add, self._mutate_remove]
self.crossover_list = [self._crossover_swap]

Expand Down
Loading

0 comments on commit fedc90d

Please sign in to comment.