From f1371fba036710c365de0370402d07978236b814 Mon Sep 17 00:00:00 2001
From: gketron
Date: Wed, 17 Jul 2024 16:01:00 -0700
Subject: [PATCH 1/5] test

---
 test.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 test.txt

diff --git a/test.txt b/test.txt
new file mode 100644
index 00000000..30d74d25
--- /dev/null
+++ b/test.txt
@@ -0,0 +1 @@
+test
\ No newline at end of file

From 4f0fbf73e726f03e76909c5596713f5f4f0bb470 Mon Sep 17 00:00:00 2001
From: gketron
Date: Thu, 18 Jul 2024 17:31:35 -0700
Subject: [PATCH 2/5] Simple, Iterative, and KNNImputers Added

---
 test.txt                        |  1 -
 tpot2/config/get_configspace.py | 25 +++++++++---
 tpot2/config/imputers.py        | 71 +++++++++++++++++++++++++++++++--
 3 files changed, 87 insertions(+), 10 deletions(-)
 delete mode 100644 test.txt

diff --git a/test.txt b/test.txt
deleted file mode 100644
index 30d74d25..00000000
--- a/test.txt
+++ /dev/null
@@ -1 +0,0 @@
-test
\ No newline at end of file
diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py
index 4a5cc997..3c3c2505 100644
--- a/tpot2/config/get_configspace.py
+++ b/tpot2/config/get_configspace.py
@@ -45,7 +45,7 @@
 from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these?
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
-from sklearn.impute import SimpleImputer
+from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

 all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
                AdaBoostClassifier,MLPRegressor,
@@ -56,7 +56,7 @@
 GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
                Passthrough,SkipTransformer,
                PassKBinsDiscretizer,
-               SimpleImputer,
+               SimpleImputer, IterativeImputer, KNNImputer
                ]

@@ -124,7 +124,7 @@
         "all_transformers" : ["transformers", "scalers"],

         "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
-        "imputers": ["SimpleImputer"],
+        "imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
         "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
         "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],

@@ -136,8 +136,6 @@
 def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_state=None):
     match name:
-        case "SimpleImputer":
-            return imputers.simple_imputer_cs

         #autoqtl_builtins.py
         case "FeatureEncodingFrequencySelector":
@@ -352,6 +350,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
             )

         #imputers.py
+        case "SimpleImputer":
+            return imputers.simple_imputer_cs
+        case "IterativeImputer":
+            return imputers.get_IterativeImputer_config_space(n_features=n_features, random_state=random_state)
+        case "KNNImputer":
+            return imputers.get_KNNImputer_config_space(n_samples=n_samples)

         #mdr_configs.py
         case "MDR":
@@ -443,8 +447,17 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None
         sfm_sp = get_configspace(name="SelectFromModel", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
         ext = get_node("ExtraTreesRegressor", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
         return WrapperPipeline(estimator_search_space=ext, method=SelectFromModel, space=sfm_sp)
-
+    # TODO Add IterativeImputer with more estimator methods
+    '''
+    if name == "IterativeImputer_learnedestimators":
+        iterative_sp = get_configspace(name="IterativeImputer", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        regressor_searchspace = get_search_space(["LinearRegression", ..], n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        return WrapperPipeline(estimator_search_space=regressor_searchspace, method=IterativeImputer, space=iterative_sp)
+    '''
     #these are nodes that have special search spaces which require custom parsing of the hyperparameters
+    if name == "IterativeImputer":
+        configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
+        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=imputers.IterativeImputer_hyperparameter_parser)
     if name == "RobustScaler":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
         return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py
index 3499c0aa..5ad6d3a5 100644
--- a/tpot2/config/imputers.py
+++ b/tpot2/config/imputers.py
@@ -1,9 +1,74 @@
+import sklearn.ensemble
+import sklearn.linear_model
+import sklearn.neighbors
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
+import sklearn

 simple_imputer_cs = ConfigurationSpace(
     space = {
-        'strategy' : Categorical('strategy', ['mean','median', 'most_frequent', ]),
-        'add_indicator' : Categorical('add_indicator', [True, False]),
+        'strategy' : Categorical('strategy',
+                                 ['mean','median', 'most_frequent', 'constant']
+                                 ),
+        #'add_indicator' : Categorical('add_indicator', [True, False]),
+        #Removed add_indicator, it appends a mask next to the rest of the data
+        # and can cause errors. gk
     }
-)
\ No newline at end of file
+)
+
+def get_IterativeImputer_config_space(n_features, random_state):
+    space = {
+        'estimator' : Categorical('estimator',
+                                  ['Bayesian', 'RFR', 'Ridge',
+                                   'KNN', 'RandomForest']),
+        'sample_posterior' : Categorical('sample_posterior', [True, False]),
+        'initial_strategy' : Categorical('initial_strategy',
+                                         ['mean', 'median',
+                                          'most_frequent', 'constant']),
+        'n_nearest_features' : Integer('n_nearest_features',
+                                       bounds=(1, n_features)),
+        'imputation_order' : Categorical('imputation_order',
+                                         ['ascending', 'descending',
+                                          'roman', 'arabic', 'random']),
+    }
+    if random_state is not None:
+        #This is required because configspace doesn't allow None as a value
+        space['random_state'] = random_state
+
+    return ConfigurationSpace(
+        space = space
+    )
+
+def get_KNNImputer_config_space(n_samples):
+    space = {
+        'n_neighbors': Integer('n_neighbors', bounds=(1, max(n_samples,100))),
+        'weights': Categorical('weights', ['uniform', 'distance'])
+    }
+    return ConfigurationSpace(
+        space=space
+    )
+
+def IterativeImputer_hyperparameter_parser(params):
+    est = params['estimator']
+    match est:
+        case 'Bayesian':
+            estimator = sklearn.linear_model.BayesianRidge()
+        case 'RFR':
+            estimator = sklearn.ensemble.RandomForestRegressor()
+        case 'Ridge':
+            estimator = sklearn.linear_model.Ridge()
+        case 'KNN':
+            estimator = sklearn.neighbors.KNeighborsRegressor()
+
+    final_params = {
+        'estimator' : estimator,
+        'sample_posterior' : params['sample_posterior'],
+        'initial_strategy' : params['initial_strategy'],
+        'n_nearest_features' : params['n_nearest_features'],
+        'imputation_order' : params['imputation_order'],
+    }
+
+    if "random_state" in params:
+        final_params['random_state'] = params['random_state']
+
+    return final_params
\ No newline at end of file
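The three new search spaces above are resolved by name through get_configspace(). As a quick smoke test, a sampled configuration maps directly onto the corresponding scikit-learn constructor. A minimal sketch, assuming the patched module is importable as tpot2.config.imputers and that ConfigSpace Configurations convert to plain dicts (older ConfigSpace releases expose .get_dictionary() instead):

from sklearn.impute import KNNImputer
from tpot2.config import imputers  # assumed import path for the module added above

knn_cs = imputers.get_KNNImputer_config_space(n_samples=200)
params = dict(knn_cs.sample_configuration())   # e.g. {'n_neighbors': 37, 'weights': 'distance'}
imputer = KNNImputer(**params)                 # n_neighbors and weights map 1:1 onto KNNImputer arguments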
From d1b2a319209f193f6158c0caf792d91035346b6c Mon Sep 17 00:00:00 2001
From: gketron
Date: Fri, 19 Jul 2024 11:02:59 -0700
Subject: [PATCH 3/5] Debugging

---
 tpot2/config/get_configspace.py | 1 +
 tpot2/config/imputers.py        | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py
index 3c3c2505..d142508c 100644
--- a/tpot2/config/get_configspace.py
+++ b/tpot2/config/get_configspace.py
@@ -45,6 +45,7 @@
 from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these?
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
+from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

 all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py
index 5ad6d3a5..4c896060 100644
--- a/tpot2/config/imputers.py
+++ b/tpot2/config/imputers.py
@@ -1,9 +1,10 @@
+import sklearn
 import sklearn.ensemble
 import sklearn.linear_model
 import sklearn.neighbors
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
-import sklearn
+

 simple_imputer_cs = ConfigurationSpace(
     space = {
@@ -20,7 +21,7 @@ def get_IterativeImputer_config_space(n_features, random_state):
     space = {
         'estimator' : Categorical('estimator',
                                   ['Bayesian', 'RFR', 'Ridge',
-                                   'KNN', 'RandomForest']),
+                                   'KNN']),
         'sample_posterior' : Categorical('sample_posterior', [True, False]),
         'initial_strategy' : Categorical('initial_strategy',
                                          ['mean', 'median',
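With the enable_iterative_imputer experimental flag now imported, the IterativeImputer space and its parser can be exercised end to end. A minimal sketch under the same assumptions as the previous snippet:

from sklearn.experimental import enable_iterative_imputer  # noqa: F401  must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from tpot2.config import imputers

it_cs = imputers.get_IterativeImputer_config_space(n_features=20, random_state=42)
params = dict(it_cs.sample_configuration())
imputer = IterativeImputer(**imputers.IterativeImputer_hyperparameter_parser(params))

At this stage sample_posterior can still be sampled as True for any estimator, but scikit-learn only supports it when the estimator's predict accepts return_std (BayesianRidge does; Ridge, KNeighborsRegressor, and RandomForestRegressor do not), which is what the next two patches address.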
From 6c2341d534d0d906019bf86caf8ee61b9c0efdf4 Mon Sep 17 00:00:00 2001
From: gketron
Date: Thu, 25 Jul 2024 09:25:39 -0700
Subject: [PATCH 4/5] sample_posterior parameter made conditional

---
 tpot2/config/imputers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py
index 4c896060..3355f6dc 100644
--- a/tpot2/config/imputers.py
+++ b/tpot2/config/imputers.py
@@ -54,16 +54,20 @@ def IterativeImputer_hyperparameter_parser(params):
     match est:
         case 'Bayesian':
             estimator = sklearn.linear_model.BayesianRidge()
+            posterior = params['sample_posterior']
         case 'RFR':
             estimator = sklearn.ensemble.RandomForestRegressor()
+            posterior = False
         case 'Ridge':
             estimator = sklearn.linear_model.Ridge()
+            posterior = False
         case 'KNN':
             estimator = sklearn.neighbors.KNeighborsRegressor()
+            posterior = False

     final_params = {
         'estimator' : estimator,
-        'sample_posterior' : params['sample_posterior'],
+        'sample_posterior' : posterior,
         'initial_strategy' : params['initial_strategy'],
         'n_nearest_features' : params['n_nearest_features'],
         'imputation_order' : params['imputation_order'],
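The parser now pins sample_posterior to False unless the sampled estimator is 'Bayesian' (BayesianRidge), since only that estimator's predict supports return_std. A small worked example of the behaviour, with hypothetical input values and an assumed import path:

from tpot2.config.imputers import IterativeImputer_hyperparameter_parser

params = {'estimator': 'Ridge', 'sample_posterior': True, 'initial_strategy': 'mean',
          'n_nearest_features': 5, 'imputation_order': 'ascending'}
final = IterativeImputer_hyperparameter_parser(params)
# final['estimator'] is a Ridge() instance and final['sample_posterior'] is False;
# with 'estimator': 'Bayesian' the sampled sample_posterior value is passed through unchanged.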
From 90498c7c1bfa1389b2d4109029347e48dacce8d3 Mon Sep 17 00:00:00 2001
From: gketron
Date: Mon, 29 Jul 2024 14:04:54 -0700
Subject: [PATCH 5/5] Conditional Sample Posterior Added for Iterative Imputer

---
 tpot2/config/imputers.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py
index 3355f6dc..2c33629f 100644
--- a/tpot2/config/imputers.py
+++ b/tpot2/config/imputers.py
@@ -4,6 +4,7 @@
 import sklearn.neighbors
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
+from ConfigSpace import EqualsCondition


 simple_imputer_cs = ConfigurationSpace(
@@ -18,33 +19,35 @@
 )

 def get_IterativeImputer_config_space(n_features, random_state):
-    space = {
-        'estimator' : Categorical('estimator',
-                                  ['Bayesian', 'RFR', 'Ridge',
-                                   'KNN']),
-        'sample_posterior' : Categorical('sample_posterior', [True, False]),
-        'initial_strategy' : Categorical('initial_strategy',
+    space = { 'initial_strategy' : Categorical('initial_strategy',
                                          ['mean', 'median',
                                           'most_frequent', 'constant']),
-        'n_nearest_features' : Integer('n_nearest_features',
+              'n_nearest_features' : Integer('n_nearest_features',
                                        bounds=(1, n_features)),
-        'imputation_order' : Categorical('imputation_order',
+              'imputation_order' : Categorical('imputation_order',
                                          ['ascending', 'descending',
                                           'roman', 'arabic', 'random']),
     }
+
+    estimator = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN'])
+    sample_posterior = Categorical('sample_posterior', [True, False])
+    sampling_condition = EqualsCondition(sample_posterior, estimator, 'Bayesian')
+
     if random_state is not None:
         #This is required because configspace doesn't allow None as a value
         space['random_state'] = random_state

-    return ConfigurationSpace(
-        space = space
-    )
+    cs = ConfigurationSpace(space=space)
+    cs.add_hyperparameters([estimator, sample_posterior])
+    cs.add_conditions([sampling_condition])
+    return cs

 def get_KNNImputer_config_space(n_samples):
     space = {
         'n_neighbors': Integer('n_neighbors', bounds=(1, max(n_samples,100))),
         'weights': Categorical('weights', ['uniform', 'distance'])
     }
+
     return ConfigurationSpace(
         space=space
     )
@@ -54,26 +57,24 @@ def IterativeImputer_hyperparameter_parser(params):
     match est:
         case 'Bayesian':
             estimator = sklearn.linear_model.BayesianRidge()
-            posterior = params['sample_posterior']
         case 'RFR':
             estimator = sklearn.ensemble.RandomForestRegressor()
-            posterior = False
         case 'Ridge':
             estimator = sklearn.linear_model.Ridge()
-            posterior = False
         case 'KNN':
             estimator = sklearn.neighbors.KNeighborsRegressor()
-            posterior = False
-
+
     final_params = {
         'estimator' : estimator,
-        'sample_posterior' : posterior,
         'initial_strategy' : params['initial_strategy'],
         'n_nearest_features' : params['n_nearest_features'],
         'imputation_order' : params['imputation_order'],
     }

-    if "random_state" in params:
-        final_params['random_state'] = params['random_state']
+    if 'sample_posterior' in params:
+        final_params['sample_posterior'] = params['sample_posterior']
+    if 'random_state' in params:
+        final_params['random_state'] = params['random_state']
+
     return final_params
\ No newline at end of file
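Moving the constraint into the ConfigurationSpace itself means invalid combinations are never sampled: sample_posterior is only active when estimator == 'Bayesian', and inactive hyperparameters are simply absent from a sampled configuration, which is why the parser now guards with "if 'sample_posterior' in params". A standalone sketch of the same pattern, using the same ConfigSpace calls the patch uses (dict() conversion of a Configuration assumed, as before):

from ConfigSpace import ConfigurationSpace, Categorical, EqualsCondition

cs = ConfigurationSpace()
estimator = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN'])
sample_posterior = Categorical('sample_posterior', [True, False])
cs.add_hyperparameters([estimator, sample_posterior])       # hyperparameters must exist before conditions reference them
cs.add_conditions([EqualsCondition(sample_posterior, estimator, 'Bayesian')])

config = dict(cs.sample_configuration())
# 'sample_posterior' appears in config only when config['estimator'] == 'Bayesian'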