From 2f224417736a579636348f96e76e4b86d0e23d76 Mon Sep 17 00:00:00 2001
From: peiyanpan <1065112771@qq.com>
Date: Tue, 3 Dec 2024 00:24:19 +0800
Subject: [PATCH 1/6] Add the new feature of customized initial population

---
 tpot2/tests/test_customized_iniPop.py | 62 +++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 tpot2/tests/test_customized_iniPop.py

diff --git a/tpot2/tests/test_customized_iniPop.py b/tpot2/tests/test_customized_iniPop.py
new file mode 100644
index 00000000..ab55a8f6
--- /dev/null
+++ b/tpot2/tests/test_customized_iniPop.py
@@ -0,0 +1,62 @@
+"""Smoke test: fit a TPOTClassifier whose initial population is supplied by the user."""
+
+
+# Plain function on purpose: pytest does not collect @pytest.fixture functions as tests.
+def test_customized_iniPop():
+    import tpot2
+    import sklearn
+    import sklearn.datasets
+
+    scorer = sklearn.metrics.get_scorer('roc_auc_ovo')
+
+    X, y = sklearn.datasets.load_iris(return_X_y=True)
+
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
+
+    from tpot2.config.get_configspace import set_node
+    from tpot2.search_spaces.pipelines.union import UnionPipeline
+    from tpot2.search_spaces.pipelines.choice import ChoicePipeline
+    from tpot2.search_spaces.pipelines.sequential import SequentialPipeline
+    from tpot2.config.get_configspace import get_search_space
+
+    scalers = set_node("MinMaxScaler", {})
+    selectors = set_node("SelectFwe", {'alpha': 0.0002381268562})
+    transformers_layer =UnionPipeline([
+        ChoicePipeline([
+            set_node("SkipTransformer", {})
+        ]),
+        get_search_space("Passthrough",)
+        ]
+    )
+
+    inner_estimators_layer = UnionPipeline([
+        get_search_space("Passthrough",)]
+    )
+    estimators = set_node("HistGradientBoostingClassifier",
+                          {'early_stop': 'valid',
+                           'l2_regularization': 0.0011074158219,
+                           'learning_rate': 0.0050792320068,
+                           'max_depth': None,
+                           'max_features': 0.3430178535213,
+                           'max_leaf_nodes': 237,
+                           'min_samples_leaf': 63,
+                           'tol': 0.0001,
+                           'n_iter_no_change': 14,
+                           'validation_fraction': 0.2343285974496})
+
+    pipeline = SequentialPipeline(search_spaces=[
+        scalers,
+        selectors,
+        transformers_layer,
+        inner_estimators_layer,
+        estimators,
+    ])
+    ind = pipeline.generate()  # one individual generated from the hand-built search space
+
+    est = tpot2.TPOTClassifier(search_space="linear", n_jobs=40, verbose=5, generations=1, population_size=5, customized_initial_population=[ind])  # seed the run with it
+
+    est.fit(X_train, y_train)
+
+    print(str(est.fitted_pipeline_))
+
+    print(scorer(est, X_test, y_test))
\ No newline at end of file
From 4878b065a750bd268830200551a48ab5b4b7c299 Mon Sep 17 00:00:00 2001 From: peiyanpan <1065112771@qq.com> Date: Tue, 3 Dec 2024 00:24:38 +0800 Subject: [PATCH 2/6] Add the new feature of customized initial population --- PULL_REQUEST_TEMPLATE.md | 102 +++++++++++++++++++++++++++--- tpot2/config/get_configspace.py | 4 ++ tpot2/evolvers/base_evolver.py | 11 ++++ tpot2/tpot_estimator/estimator.py | 5 ++ 4 files changed, 112 insertions(+), 10 deletions(-) diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md index 365ff3e5..07a9da06 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -1,30 +1,112 @@ -[please review the [Contribution Guidelines](http://epistasislab.github.io/tpot/contributing/) prior to submitting your pull request. go ahead and delete this line if you've already reviewed said guidelines.] - ## What does this PR do? - +Add the new feature of allowing users to specify customized initial pipeline population for TPOT2. ## Where should the reviewer start? - +- tpot2/tests/test_customized_iniPop.py +- tpot2/config/get_configspace.py +- tpot2/evolvers/base_evolver.py +- tpot2/tpot_estimator/estimator.py ## How should this PR be tested? 
+The test code is at tpot2/tests/test_customized_iniPop.py: + +**pytest test_customized_iniPop.py** + +``` +import pytest + + +@pytest.fixture +def test_customized_iniPop(): + import tpot2 + import sklearn + import sklearn.datasets + + scorer = sklearn.metrics.get_scorer('roc_auc_ovo') + + X, y = sklearn.datasets.load_iris(return_X_y=True) + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25) + + from tpot2.config.get_configspace import set_node + from tpot2.search_spaces.pipelines.union import UnionPipeline + from tpot2.search_spaces.pipelines.choice import ChoicePipeline + from tpot2.search_spaces.pipelines.sequential import SequentialPipeline + from tpot2.config.get_configspace import get_search_space + scalers = set_node("MinMaxScaler", {}) + selectors = set_node("SelectFwe", {'alpha': 0.0002381268562}) + transformers_layer =UnionPipeline([ + ChoicePipeline([ + set_node("SkipTransformer", {}) + ]), + get_search_space("Passthrough",) + ] + ) + + inner_estimators_layer = UnionPipeline([ + get_search_space("Passthrough",)] + ) + estimators = set_node("HistGradientBoostingClassifier", + {'early_stop': 'valid', + 'l2_regularization': 0.0011074158219, + 'learning_rate': 0.0050792320068, + 'max_depth': None, + 'max_features': 0.3430178535213, + 'max_leaf_nodes': 237, + 'min_samples_leaf': 63, + 'tol': 0.0001, + 'n_iter_no_change': 14, + 'validation_fraction': 0.2343285974496}) + + pipeline = SequentialPipeline(search_spaces=[ + scalers, + selectors, + transformers_layer, + inner_estimators_layer, + estimators, + ]) + ind = pipeline.generate() + + est = tpot2.TPOTClassifier(search_space="linear", n_jobs=40, verbose=5, generations=1, population_size=5, customized_initial_population=[ind]) + + est.fit(X_train, y_train) + + print(str(est.fitted_pipeline_)) + + print(scorer(est, X_test, y_test)) +``` ## Any background context you want to provide? 
+Under this version, users can specify well-defined initial pipeline population in SequentialPipeline type pipeline. This update has the potential to enhance the algorithm's performance and reduce evolutionary time. +Several Tips: -## What are the relevant issues? +1. These SequentialPipeline pipelines can be obtained: + +Referencing the examples in customized_initial_population.py and modifying them according to TPOT2's config_dict. -[you can link directly to issues by entering # then the number of the issue] +2. We consider the relationship between #customized initial pipelines and #population as follows: -## Screenshots (if appropriate) +``` +init_population_size = len(customized_initial_population) +if self.cur_population_size <= init_population_size: + initial_population = customized_initial_population[:self.cur_population_size] +else: + initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size - init_population_size)] + initial_population = customized_initial_population + initial_population +``` +3. The current version is only applicable to solve the problem where search_spaces is linear and the initialized pipeline is of type SequentialPipeline. We will continue to refine the scenario where search_spaces is graph and the pipeline is of type GraphPipeline in the near future if you think our approach is appropriate. +## What are the relevant issues? + +[issue-61](https://github.com/EpistasisLab/tpot2/issues/61) -## Questions: +## Main Contributors -- Do the docs need to be updated? -- Does this PR add new (Python) dependencies? 
+@peiyanpan @t-harden \ No newline at end of file diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 72eac9a6..6d2a5921 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -546,6 +546,10 @@ def get_search_space(name, n_classes=3, n_samples=1000, n_features=100, random_s return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node, n_jobs=n_jobs) +def set_node(name, params): + node = get_node(name) + node.space = params + return node def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode, n_jobs=1): """ diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py index 8e4958aa..f50cf7d1 100644 --- a/tpot2/evolvers/base_evolver.py +++ b/tpot2/evolvers/base_evolver.py @@ -144,6 +144,8 @@ def __init__( self, callback = None, rng=None, + customized_initial_population=[] + ) -> None: """ Uses mutation, crossover, and optimization functions to evolve a population of individuals towards the given objective functions. 
@@ -433,6 +435,15 @@ def __init__( self, init_names = init_names + ["Budget"] if self.population is None: self.population = tpot2.Population(column_names=init_names) + init_population_size = len(customized_initial_population) + if self.cur_population_size <= init_population_size: + initial_population = customized_initial_population[:self.cur_population_size] + else: + initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size - init_population_size)] + initial_population = customized_initial_population + initial_population + # initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] + # for individual in initial_population: + # print(individual.unique_id()) initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] self.population.add_to_population(initial_population, self.rng) self.population.update_column(self.population.population, column_names="Generation", data=self.generation) diff --git a/tpot2/tpot_estimator/estimator.py b/tpot2/tpot_estimator/estimator.py index f7848f09..cb259fd4 100644 --- a/tpot2/tpot_estimator/estimator.py +++ b/tpot2/tpot_estimator/estimator.py @@ -135,6 +135,8 @@ def __init__(self, # random seed for random number generator (rng) random_state = None, + customized_initial_population=[] + ): ''' @@ -508,6 +510,7 @@ def __init__(self, self.label_encoder_ = None + self.customized_initial_population = customized_initial_population set_dask_settings() @@ -757,6 +760,8 @@ def ind_generator(rng): crossover_then_mutate_probability= self.crossover_then_mutate_probability, rng=self.rng, + + customized_initial_population=self.customized_initial_population, ) From 1a74df36b6595203b119899c59ce51fc46b95d29 Mon Sep 17 00:00:00 2001 From: gygb Date: Tue, 3 Dec 2024 11:25:48 +0100 Subject: [PATCH 3/6] some changes --- PULL_REQUEST_TEMPLATE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PULL_REQUEST_TEMPLATE.md 
b/PULL_REQUEST_TEMPLATE.md index 07a9da06..37c756fa 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -82,7 +82,7 @@ def test_customized_iniPop(): ## Any background context you want to provide? -Under this version, users can specify well-defined initial pipeline population in SequentialPipeline type pipeline. This update has the potential to enhance the algorithm's performance and reduce evolutionary time. +In this version, users can specify a well-defined initial pipeline population, currently limited to the *SequentialPipeline* type. This update has the potential to improve algorithm performance and reduce evolutionary time. Several Tips: @@ -90,7 +90,7 @@ Several Tips: Referencing the examples in customized_initial_population.py and modifying them according to TPOT2's config_dict. -2. We consider the relationship between #customized initial pipelines and #population as follows: +2. We consider the relationship between #customized initial pipelines and #population_size as follows: ``` init_population_size = len(customized_initial_population) From c983018e2e38d83a73e84aaa04b92af9531beb02 Mon Sep 17 00:00:00 2001 From: peiyanpan <1065112771@qq.com> Date: Tue, 3 Dec 2024 19:15:27 +0800 Subject: [PATCH 4/6] Add some detail --- PULL_REQUEST_TEMPLATE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md index 37c756fa..6dde84eb 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -5,9 +5,13 @@ Add the new feature of allowing users to specify customized initial pipeline pop ## Where should the reviewer start? - tpot2/tests/test_customized_iniPop.py +Contains the SequentialPipeline initialization method, which consists of scalers, selectors, transformers_layer, inner_estimators_layer, estimators and a sample of initializing this TPOTClassifier in a customized_initial_population parameter. 
- tpot2/config/get_configspace.py +Adds a new set_node() function that builds a pipeline node with user-specified hyperparameters. - tpot2/evolvers/base_evolver.py +Adds logic that compares the number of user-supplied initial individuals with the population size and fills any remaining slots from the individual generator. - tpot2/tpot_estimator/estimator.py +Passes the customized_initial_population parameter through to the evolver. ## How should this PR be tested? From 0e3f4bab4c56561bf3dd2cdb95a2840cb3ccef23 Mon Sep 17 00:00:00 2001 From: peiyanpan <1065112771@qq.com> Date: Tue, 3 Dec 2024 23:43:30 +0800 Subject: [PATCH 5/6] fix some bugs --- tpot2/config/get_configspace.py | 4 ---- tpot2/evolvers/base_evolver.py | 4 ++-- tpot2/tests/test_customized_iniPop.py | 10 +++++----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 6d2a5921..72eac9a6 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -546,10 +546,6 @@ def get_search_space(name, n_classes=3, n_samples=1000, n_features=100, random_s return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node, n_jobs=n_jobs) -def set_node(name, params): - node = get_node(name) - node.space = params - return node def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode, n_jobs=1): """ diff --git a/tpot2/evolvers/base_evolver.py b/tpot2/evolvers/base_evolver.py index f50cf7d1..310341cd 100644 --- a/tpot2/evolvers/base_evolver.py +++ b/tpot2/evolvers/base_evolver.py @@ -437,14 +437,14 @@ def __init__( self, self.population = tpot2.Population(column_names=init_names) init_population_size = len(customized_initial_population) if self.cur_population_size <= init_population_size: - initial_population = customized_initial_population[:self.cur_population_size] + initial_population = customized_initial_population + # 
initial_population = customized_initial_population[:self.cur_population_size] else: initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size - init_population_size)] initial_population = customized_initial_population + initial_population # initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] # for individual in initial_population: # print(individual.unique_id()) - initial_population = [next(self.individual_generator) for _ in range(self.cur_population_size)] self.population.add_to_population(initial_population, self.rng) self.population.update_column(self.population.population, column_names="Generation", data=self.generation) diff --git a/tpot2/tests/test_customized_iniPop.py b/tpot2/tests/test_customized_iniPop.py index ab55a8f6..cc12845d 100644 --- a/tpot2/tests/test_customized_iniPop.py +++ b/tpot2/tests/test_customized_iniPop.py @@ -13,17 +13,17 @@ def test_customized_iniPop(): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25) - from tpot2.config.get_configspace import set_node + from tpot2.search_spaces.nodes.estimator_node import EstimatorNodeIndividual from tpot2.search_spaces.pipelines.union import UnionPipeline from tpot2.search_spaces.pipelines.choice import ChoicePipeline from tpot2.search_spaces.pipelines.sequential import SequentialPipeline from tpot2.config.get_configspace import get_search_space - scalers = set_node("MinMaxScaler", {}) - selectors = set_node("SelectFwe", {'alpha': 0.0002381268562}) + scalers = EstimatorNodeIndividual("MinMaxScaler", {}) + selectors = EstimatorNodeIndividual("SelectFwe", {'alpha': 0.0002381268562}) transformers_layer =UnionPipeline([ ChoicePipeline([ - set_node("SkipTransformer", {}) + EstimatorNodeIndividual("SkipTransformer", {}) ]), get_search_space("Passthrough",) ] @@ -32,7 +32,7 @@ def test_customized_iniPop(): inner_estimators_layer = UnionPipeline([ 
get_search_space("Passthrough",)] ) - estimators = set_node("HistGradientBoostingClassifier", + estimators = EstimatorNodeIndividual("HistGradientBoostingClassifier", {'early_stop': 'valid', 'l2_regularization': 0.0011074158219, 'learning_rate': 0.0050792320068, From 9ca211617b0d85c7bdc711da93ebef0d6aeb53e7 Mon Sep 17 00:00:00 2001 From: peiyanpan <1065112771@qq.com> Date: Sat, 7 Dec 2024 16:52:14 +0800 Subject: [PATCH 6/6] enhance set_node --- Tutorial/1_Using_TPOT.ipynb | 212 ++++++++++++++++++++++++++++++-- tpot2/config/get_configspace.py | 12 ++ 2 files changed, 216 insertions(+), 8 deletions(-) diff --git a/Tutorial/1_Using_TPOT.ipynb b/Tutorial/1_Using_TPOT.ipynb index 92821ca1..f520926a 100644 --- a/Tutorial/1_Using_TPOT.ipynb +++ b/Tutorial/1_Using_TPOT.ipynb @@ -23,6 +23,207 @@ "This is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives you ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you might have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such as grid search or bayesian optimization." ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/estimator.py:457: UserWarning: Both generations and max_time_mins are set. TPOT will terminate when the first condition is met.\n", + " warnings.warn(\"Both generations and max_time_mins are set. 
TPOT will terminate when the first condition is met.\")\n", + "Generation: 0%| | 0/1 [00:00 \n", + " X has 13 features, but BernoulliNB is expecting 11 features as input. \n", + " Traceback (most recent call last):\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/utils/eval_utils.py\", line 87, in objective_nan_wrapper\n", + " value = func_timeout.func_timeout(timeout, objective_function, args=[individual], kwargs=objective_kwargs)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/func_timeout/dafunc.py\", line 108, in func_timeout\n", + " raise_exception(exception)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/func_timeout/py3_raise.py\", line 7, in raise_exception\n", + " raise exception[0] from None\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/estimator.py\", line 644, in objective_function\n", + " return objective_function_generator(\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/estimator_utils.py\", line 162, in objective_function_generator\n", + " cv_obj_scores = cross_val_score_objective(sklearn.base.clone(pipeline),x,y,scorers=scorers, cv=cv , fold=step)\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/cross_val_utils.py\", line 93, in cross_val_score_objective\n", + " this_fold_scores = [sklearn.metrics.get_scorer(scorer)(this_fold_estimator, X_test, y_test) for scorer in scorers]\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/cross_val_utils.py\", line 93, in \n", + " this_fold_scores = [sklearn.metrics.get_scorer(scorer)(this_fold_estimator, X_test, y_test) for scorer in scorers]\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 279, in __call__\n", + " return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 
371, in _score\n", + " y_pred = method_caller(\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 89, in _cached_call\n", + " result, _ = _get_response_values(\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/utils/_response.py\", line 211, in _get_response_values\n", + " y_pred = prediction_method(X)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/pipeline.py\", line 721, in predict_proba\n", + " return self.steps[-1][1].predict_proba(Xt, **params)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/naive_bayes.py\", line 144, in predict_proba\n", + " return np.exp(self.predict_log_proba(X))\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/naive_bayes.py\", line 122, in predict_log_proba\n", + " X = self._check_X(X)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/naive_bayes.py\", line 1178, in _check_X\n", + " X = super()._check_X(X)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/naive_bayes.py\", line 574, in _check_X\n", + " return self._validate_data(X, accept_sparse=\"csr\", reset=False)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/base.py\", line 654, in _validate_data\n", + " self._check_n_features(X, reset=reset)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/base.py\", line 443, in _check_n_features\n", + " raise ValueError(\n", + "ValueError: X has 13 features, but BernoulliNB is expecting 11 features as input.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generation: 100%|██████████| 1/1 [00:02<00:00, 2.56s/it]\n", + "2024-12-07 16:43:46,992 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:60611' caused the 
cluster to lose scattered data, which can't be recovered: {'ndarray-0597151fa3bea6002bd2596e80d2f9ac', 'ndarray-6fcb6e5015d408ff90b146c00ff2c93b'} (stimulus_id='handle-worker-cleanup-1733561026.9929442')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING THIS INDIVIDUAL CAUSED AND EXCEPTION \n", + " \n", + " X has 18 features, but QuadraticDiscriminantAnalysis is expecting 14 features as input. \n", + " Traceback (most recent call last):\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/utils/eval_utils.py\", line 87, in objective_nan_wrapper\n", + " value = func_timeout.func_timeout(timeout, objective_function, args=[individual], kwargs=objective_kwargs)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/func_timeout/dafunc.py\", line 108, in func_timeout\n", + " raise_exception(exception)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/func_timeout/py3_raise.py\", line 7, in raise_exception\n", + " raise exception[0] from None\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/estimator.py\", line 644, in objective_function\n", + " return objective_function_generator(\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/estimator_utils.py\", line 162, in objective_function_generator\n", + " cv_obj_scores = cross_val_score_objective(sklearn.base.clone(pipeline),x,y,scorers=scorers, cv=cv , fold=step)\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/cross_val_utils.py\", line 93, in cross_val_score_objective\n", + " this_fold_scores = [sklearn.metrics.get_scorer(scorer)(this_fold_estimator, X_test, y_test) for scorer in scorers]\n", + " File \"/Users/pyp/Desktop/code/tpot2/tpot2/tpot_estimator/cross_val_utils.py\", line 93, in \n", + " this_fold_scores = [sklearn.metrics.get_scorer(scorer)(this_fold_estimator, X_test, y_test) for scorer in scorers]\n", + " File 
\"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 279, in __call__\n", + " return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 371, in _score\n", + " y_pred = method_caller(\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/metrics/_scorer.py\", line 89, in _cached_call\n", + " result, _ = _get_response_values(\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/utils/_response.py\", line 211, in _get_response_values\n", + " y_pred = prediction_method(X)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/pipeline.py\", line 721, in predict_proba\n", + " return self.steps[-1][1].predict_proba(Xt, **params)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/discriminant_analysis.py\", line 1037, in predict_proba\n", + " values = self._decision_function(X)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/discriminant_analysis.py\", line 966, in _decision_function\n", + " X = self._validate_data(X, reset=False)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/base.py\", line 654, in _validate_data\n", + " self._check_n_features(X, reset=reset)\n", + " File \"/Users/pyp/miniconda3/envs/tpot2env3/lib/python3.10/site-packages/sklearn/base.py\", line 443, in _check_n_features\n", + " raise ValueError(\n", + "ValueError: X has 18 features, but QuadraticDiscriminantAnalysis is expecting 14 features as input.\n", + "\n", + "Generation: 1\n", + "Best roc_auc_score score: 0.9898313492063491\n", + "Pipeline(steps=[('normalizer', Normalizer(norm='l1')),\n", + " ('passthrough', Passthrough()),\n", + " ('featureunion-1',\n", + " 
FeatureUnion(transformer_list=[('featureunion',\n", + " FeatureUnion(transformer_list=[('quantiletransformer',\n", + " QuantileTransformer(n_quantiles=65,\n", + " output_distribution='normal')),\n", + " ('pca',\n", + " PCA(n_components=0.748157093073))])),\n", + " ('passthrough',\n", + " Passthrough())])),\n", + " ('featureunion-2',\n", + " FeatureUnion(transformer_list=[('skiptransformer',\n", + " SkipTransformer()),\n", + " ('passthrough',\n", + " Passthrough())])),\n", + " ('decisiontreeclassifier',\n", + " DecisionTreeClassifier(class_weight='balanced', max_depth=4,\n", + " max_features='log2', min_samples_leaf=4,\n", + " min_samples_split=12))])\n", + "0.9976851851851851\n" + ] + } + ], + "source": [ + "import tpot2\n", + "import sklearn\n", + "import sklearn.datasets\n", + "\n", + "scorer = sklearn.metrics.get_scorer('roc_auc_ovo')\n", + "\n", + "X, y = sklearn.datasets.load_iris(return_X_y=True)\n", + "\n", + "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", + "\n", + "from tpot2.search_spaces.nodes.estimator_node import EstimatorNode\n", + "from tpot2.search_spaces.pipelines.union import UnionPipeline\n", + "from tpot2.search_spaces.pipelines.choice import ChoicePipeline\n", + "from tpot2.search_spaces.pipelines.sequential import SequentialPipeline\n", + "from tpot2.config.get_configspace import get_search_space, set_node, get_node\n", + "\n", + "scalers = set_node(\"MinMaxScaler\", {})\n", + "selectors = set_node(\"SelectFwe\", {'alpha': 0.0002381268562})\n", + "transformers_layer =UnionPipeline([\n", + " ChoicePipeline([\n", + " set_node(\"SkipTransformer\", {})\n", + " ]),\n", + " get_search_space(\"Passthrough\",)\n", + " ]\n", + " )\n", + "\n", + "inner_estimators_layer = UnionPipeline([\n", + " get_search_space(\"Passthrough\",)]\n", + " )\n", + "estimators = set_node(\"HistGradientBoostingClassifier\", \n", + " {'early_stop': 'valid', \n", + " 'l2_regularization': 0.0011074158219, 
\n", + " 'learning_rate': 0.0050792320068, \n", + " 'max_depth': None, \n", + " 'max_features': 0.3430178535213, \n", + " 'max_leaf_nodes': 237, \n", + " 'min_samples_leaf': 63, \n", + " 'tol': 0.0001, \n", + " 'n_iter_no_change': 14, \n", + " 'validation_fraction': 0.2343285974496})\n", + "\n", + "pipeline = SequentialPipeline(search_spaces=[\n", + " scalers,\n", + " selectors, \n", + " transformers_layer,\n", + " inner_estimators_layer,\n", + " estimators,\n", + " ])\n", + "ind = pipeline.generate()\n", + "\n", + "est = tpot2.TPOTClassifier(search_space=\"linear\", n_jobs=40, verbose=5, generations=1, population_size=5, customized_initial_population=[ind])\n", + "# for ind in est.evaluated_individuals.iterrows():\n", + "# print(ind[1]['Instance'])\n", + "est.fit(X_train, y_train)\n", + "\n", + "print(str(est.fitted_pipeline_))\n", + "\n", + "print(scorer(est, X_test, y_test))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2414,7 +2615,7 @@ ], "metadata": { "kernelspec": { - "display_name": "tpot_dev", + "display_name": "tpot2env3", "language": "python", "name": "python3" }, @@ -2428,14 +2629,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.15" }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "7fe1fe9ef32cd5efd76326a08046147513534f0dd2318301a1a96ae9071c1c4e" - } - } + "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 
diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py
index 72eac9a6..8943ac5f 100644
--- a/tpot2/config/get_configspace.py
+++ b/tpot2/config/get_configspace.py
@@ -546,6 +546,38 @@ def get_search_space(name, n_classes=3, n_samples=1000, n_features=100, random_s
 
     return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node, n_jobs=n_jobs)
 
+def set_node(name, params, rng=None):
+    """
+    Build the node for `name` and pin the hyperparameters given in `params`.
+
+    Parameters
+    ----------
+    name : str
+        Method name, passed unchanged to `get_node`.
+    params : dict
+        Hyperparameter values to force on the node.
+    rng : int, numpy.random.Generator, or None, default=None
+        Seed or generator used to sample the hyperparameters that `params`
+        does not specify. Only used when the node's space is not a plain dict.
+
+    Returns
+    -------
+    The node returned by `get_node`, with its `space` replaced by a fixed dict.
+    """
+    node = get_node(name)
+    if isinstance(node.space, dict):
+        # The space is already a fixed dict: `params` replaces it wholesale.
+        node.space = params
+    else:
+        # `rng` used to be read here before it was ever bound, which raised
+        # UnboundLocalError for every node whose space is not a plain dict.
+        rng = np.random.default_rng(rng)
+        node.space.seed(rng.integers(0, 2**32))
+        # Start from one sampled configuration, then let `params` override it.
+        node.hyperparameters = dict(node.space.sample_configuration())
+        node.hyperparameters.update(params)
+        node.space = node.hyperparameters
+    return node
 
 def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode, n_jobs=1):
     """