Skip to content

Commit

Permalink
2nd committee meeting setup + best hyperparams range
Browse files Browse the repository at this point in the history
  • Loading branch information
adamovanja committed May 17, 2024
1 parent 9ab31be commit b605fea
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 3 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

Longitudinal modeling approaches accounting for high-dimensional, sparse and compositional nature of microbial time-series.

## Why q2-ritme?
* q2-ritme jointly optimizes feature engineering and modelling methods: the optimal hyperparameters (e.g. regularization strength) usually depend on the feature transformation that is performed, so q2-ritme searches for the best feature transformation and the best model in one go.


## Setup
To install the required dependencies for this package run (note: running `conda activate` before `make dev` is a mandatory step to ensure that coral_pytorch is also installed):
```shell
Expand Down
132 changes: 132 additions & 0 deletions experiments/import_mlflow.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import experiments from former MLflow UI\n",
"\n",
"\n",
"This is a workaround for when the MLflow UI becomes unresponsive because there are too many experiments on Euler."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import mlflow\n",
"import pandas as pd\n",
"\n",
"# Read the CSV file\n",
"df = pd.read_csv(\"models/intermediate_rtrac5c_r0r3_le2y_cpu_t10.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# df.columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = \"euler_imported3\"\n",
"mlflow.create_experiment(experiment_name)\n",
"\n",
"# Get the experiment ID\n",
"experiment = mlflow.get_experiment_by_name(experiment_name)\n",
"experiment_id = experiment.experiment_id\n",
"\n",
"# Define the custom order for the models\n",
"model_order = {\n",
" \"linreg\": 1,\n",
" \"trac\": 2,\n",
" \"xgb\": 3,\n",
" \"nn_reg\": 4,\n",
" \"nn_class\": 5,\n",
" \"nn_corn\": 6,\n",
" \"rf\": 7,\n",
"}\n",
"\n",
"# Iterate over the rows of the DataFrame and log each run\n",
"for _, row in df.iterrows():\n",
" with mlflow.start_run(experiment_id=experiment_id):\n",
" # Log the metrics, parameters, and tags from the CSV\n",
" for key, value in row.items():\n",
" if key.startswith(\"metrics.\"):\n",
" metric_name = key.split(\".\", 1)[1]\n",
" mlflow.log_metric(metric_name, value)\n",
" elif key.startswith(\"params.\"):\n",
" param_name = key.split(\".\", 1)[1]\n",
" mlflow.log_param(param_name, value)\n",
" elif key == \"model\":\n",
" mlflow.log_param(\"model\", value)\n",
" # Log the model order as a separate parameter\n",
" mlflow.log_param(\"model_order\", model_order.get(value, 999))\n",
" elif key == \"time_total_s\":\n",
" mlflow.log_metric(\"time_total_s\", value)\n",
" elif key.startswith(\"tags.\"):\n",
" tag_name = key.split(\".\", 1)[1]\n",
" mlflow.set_tag(tag_name, value)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-05-05 13:41:53 +0200] [35348] [INFO] Starting gunicorn 21.2.0\n",
"[2024-05-05 13:41:53 +0200] [35348] [INFO] Listening at: http://127.0.0.1:5004 (35348)\n",
"[2024-05-05 13:41:53 +0200] [35348] [INFO] Using worker: sync\n",
"[2024-05-05 13:41:53 +0200] [35350] [INFO] Booting worker with pid: 35350\n",
"[2024-05-05 13:41:53 +0200] [35352] [INFO] Booting worker with pid: 35352\n",
"[2024-05-05 13:41:53 +0200] [35353] [INFO] Booting worker with pid: 35353\n",
"[2024-05-05 13:41:53 +0200] [35354] [INFO] Booting worker with pid: 35354\n",
"^C\n",
"[2024-05-05 13:44:47 +0200] [35348] [INFO] Handling signal: int\n",
"[2024-05-05 13:44:47 +0200] [35352] [INFO] Worker exiting (pid: 35352)\n",
"[2024-05-05 13:44:47 +0200] [35350] [INFO] Worker exiting (pid: 35350)\n",
"[2024-05-05 13:44:47 +0200] [35354] [INFO] Worker exiting (pid: 35354)\n",
"[2024-05-05 13:44:47 +0200] [35353] [INFO] Worker exiting (pid: 35353)\n"
]
}
],
"source": [
"! mlflow ui --port 5004"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "ritme_wclasso_f",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
6 changes: 3 additions & 3 deletions q2_ritme/model_space/_static_searchspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_rf_space(train_val):
model="rf",
**data_eng_space,
**{
"n_estimators": tune.randint(100, 500),
"n_estimators": tune.randint(50, 300),
"max_depth": tune.randint(2, 32),
"min_samples_split": tune.choice([0.001, 0.01, 0.1]),
"min_samples_leaf": tune.choice([0.0001, 0.001]),
Expand Down Expand Up @@ -113,8 +113,8 @@ def get_trac_space(train_val):
# 'one-cv_one_stddev-error' = select simplest model (largest lambda
# value) in CV whose CV score is within 1 stddev of best score
"cv_one_stddev": tune.choice([True, False]),
"lambdas_num_searched": tune.choice([80, 100, 120]),
"lambda_min": tune.choice([0.00001, 0.0001, 0.001, 0.01]),
"lambdas_num_searched": tune.choice([60, 80, 100]),
"lambda_min": tune.choice([0.0001, 0.001, 0.01]),
# logscale when going from lambda_min to 1
"lambda_logscale_search": tune.choice([True, False]),
},
Expand Down

0 comments on commit b605fea

Please sign in to comment.