diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 4a5cc997..d142508c 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -45,7 +45,8 @@ from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these? from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier -from sklearn.impute import SimpleImputer +from sklearn.experimental import enable_iterative_imputer +from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, AdaBoostClassifier,MLPRegressor, @@ -56,7 +57,7 @@ GaussianProcessClassifier, BaggingClassifier,LGBMRegressor, Passthrough,SkipTransformer, PassKBinsDiscretizer, - SimpleImputer, + SimpleImputer, IterativeImputer, KNNImputer ] @@ -124,7 +125,7 @@ "all_transformers" : ["transformers", "scalers"], "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], - "imputers": ["SimpleImputer"], + "imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"], "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"], "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"], @@ -136,8 +137,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_state=None): match name: - case "SimpleImputer": - return imputers.simple_imputer_cs #autoqtl_builtins.py case "FeatureEncodingFrequencySelector": @@ -352,6 +351,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st ) #imputers.py + case "SimpleImputer": + return imputers.simple_imputer_cs + case "IterativeImputer": + return imputers.get_IterativeImputer_config_space(n_features=n_features, random_state=random_state) + case "KNNImputer": + return imputers.get_KNNImputer_config_space(n_samples=n_samples) #mdr_configs.py case "MDR": @@ -443,8 +448,17 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None sfm_sp = get_configspace(name="SelectFromModel", n_classes=n_classes, n_samples=n_samples, random_state=random_state) ext = get_node("ExtraTreesRegressor", n_classes=n_classes, n_samples=n_samples, random_state=random_state) return WrapperPipeline(estimator_search_space=ext, method=SelectFromModel, space=sfm_sp) - + # TODO Add IterativeImputer with more estimator methods + ''' + if name == "IterativeImputer_learnedestimators": + iteative_sp = get_configspace(name="IterativeImputer", n_classes=n_classes, n_samples=n_samples, random_state=random_state) + regessor_searchspace = get_search_space(["LinearRegression", ..], n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return WrapperPipeline(estimator_search_space=regressor_searchspace, method=ItartiveImputer, space=iteative_sp) + ''' #these are nodes that have special search spaces which require custom parsing of the hyperparameters + if name == "IterativeImputer": + configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) + return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=imputers.IterativeImputer_hyperparameter_parser) if name == "RobustScaler": configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state) return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser) diff --git a/tpot2/config/imputers.py b/tpot2/config/imputers.py index 3499c0aa..2c33629f 100644 --- a/tpot2/config/imputers.py +++ b/tpot2/config/imputers.py @@ -1,9 +1,80 @@ +import sklearn +import sklearn.ensemble +import sklearn.linear_model +import sklearn.neighbors from ConfigSpace import ConfigurationSpace from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal +from ConfigSpace import EqualsCondition + simple_imputer_cs = ConfigurationSpace( space = { - 'strategy' : Categorical('strategy', ['mean','median', 'most_frequent', ]), - 'add_indicator' : Categorical('add_indicator', [True, False]), + 'strategy' : Categorical('strategy', + ['mean','median', 'most_frequent', 'constant'] + ), + #'add_indicator' : Categorical('add_indicator', [True, False]), + #Removed add_indicator, it appends a mask next to the rest of the data + # and can cause errors. gk + } +) + +def get_IterativeImputer_config_space(n_features, random_state): + space = { 'initial_strategy' : Categorical('initial_strategy', + ['mean', 'median', + 'most_frequent', 'constant']), + 'n_nearest_features' : Integer('n_nearest_features', + bounds=(1, n_features)), + 'imputation_order' : Categorical('imputation_order', + ['ascending', 'descending', + 'roman', 'arabic', 'random']), } -) \ No newline at end of file + + estimator = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN']) + sample_posterior = Categorical('sample_posterior', [True, False]) + sampling_condition = EqualsCondition(sample_posterior, estimator, 'Bayesian') + + if random_state is not None: + #This is required because configspace doesn't allow None as a value + space['random_state'] = random_state + + cs = ConfigurationSpace(space=space) + cs.add_hyperparameters([estimator, sample_posterior]) + cs.add_conditions([sampling_condition]) + return cs + +def get_KNNImputer_config_space(n_samples): + space = { + 'n_neighbors': Integer('n_neighbors', bounds=(1, max(n_samples,100))), + 'weights': Categorical('weights', ['uniform', 'distance']) + } + + return ConfigurationSpace( + space=space + ) + +def IterativeImputer_hyperparameter_parser(params): + est = params['estimator'] + match est: + case 'Bayesian': + estimator = sklearn.linear_model.BayesianRidge() + case 'RFR': + estimator = sklearn.ensemble.RandomForestRegressor() + case 'Ridge': + estimator = sklearn.linear_model.Ridge() + case 'KNN': + estimator = sklearn.neighbors.KNeighborsRegressor() + + final_params = { + 'estimator' : estimator, + 'initial_strategy' : params['initial_strategy'], + 'n_nearest_features' : params['n_nearest_features'], + 'imputation_order' : params['imputation_order'], + } + + if 'sample_posterior' in params: + final_params['sample_posterior'] = params['sample_posterior'] + + if 'random_state' in params: + final_params['random_state'] = params['random_state'] + + return final_params \ No newline at end of file