2nd committee meeting setup + best hyperparams range

adamovanja · May 17, 2024 · b605fea · b605fea
1 parent 9ab31be
commit b605fea
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,10 @@
 
 Longitudinal modeling approaches accounting for high-dimensional, sparse and compositional nature of microbial time-series.
 
+## Why q2-ritme?
+* q2-ritme enables the optimized application of various feature engineering and modelling methods: the optimal hyperparameters (e.g. regularization strength) usually depend on the feature transformation that is performed. q2-ritme can infer the feature transformation and the optimal model in one go.
+
+
 ## Setup
 To install the required dependencies for this package, run the following (note: running `conda activate` before `make dev` is mandatory to ensure that coral_pytorch is also installed):
 ```shell

diff --git a/experiments/import_mlflow.ipynb b/experiments/import_mlflow.ipynb
@@ -0,0 +1,132 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Import experiments from former MLflow UI\n",
+        "\n",
+        "\n",
+        "This is a workaround for cases where the MLflow UI becomes unresponsive because there are too many experiments on Euler."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import mlflow\n",
+        "import pandas as pd\n",
+        "\n",
+        "# Read the CSV file\n",
+        "df = pd.read_csv(\"models/intermediate_rtrac5c_r0r3_le2y_cpu_t10.csv\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# df.columns.tolist()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "experiment_name = \"euler_imported3\"\n",
+        "mlflow.create_experiment(experiment_name)\n",
+        "\n",
+        "# Get the experiment ID\n",
+        "experiment = mlflow.get_experiment_by_name(experiment_name)\n",
+        "experiment_id = experiment.experiment_id\n",
+        "\n",
+        "# Define the custom order for the models\n",
+        "model_order = {\n",
+        "    \"linreg\": 1,\n",
+        "    \"trac\": 2,\n",
+        "    \"xgb\": 3,\n",
+        "    \"nn_reg\": 4,\n",
+        "    \"nn_class\": 5,\n",
+        "    \"nn_corn\": 6,\n",
+        "    \"rf\": 7,\n",
+        "}\n",
+        "\n",
+        "# Iterate over the rows of the DataFrame and log each run\n",
+        "for _, row in df.iterrows():\n",
+        "    with mlflow.start_run(experiment_id=experiment_id):\n",
+        "        # Log the metrics, parameters, and tags from the CSV\n",
+        "        for key, value in row.items():\n",
+        "            if key.startswith(\"metrics.\"):\n",
+        "                metric_name = key.split(\".\", 1)[1]\n",
+        "                mlflow.log_metric(metric_name, value)\n",
+        "            elif key.startswith(\"params.\"):\n",
+        "                param_name = key.split(\".\", 1)[1]\n",
+        "                mlflow.log_param(param_name, value)\n",
+        "            elif key == \"model\":\n",
+        "                mlflow.log_param(\"model\", value)\n",
+        "                # Log the model order as a separate parameter\n",
+        "                mlflow.log_param(\"model_order\", model_order.get(value, 999))\n",
+        "            elif key == \"time_total_s\":\n",
+        "                mlflow.log_metric(\"time_total_s\", value)\n",
+        "            elif key.startswith(\"tags.\"):\n",
+        "                tag_name = key.split(\".\", 1)[1]\n",
+        "                mlflow.set_tag(tag_name, value)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "[2024-05-05 13:41:53 +0200] [35348] [INFO] Starting gunicorn 21.2.0\n",
+            "[2024-05-05 13:41:53 +0200] [35348] [INFO] Listening at: http://127.0.0.1:5004 (35348)\n",
+            "[2024-05-05 13:41:53 +0200] [35348] [INFO] Using worker: sync\n",
+            "[2024-05-05 13:41:53 +0200] [35350] [INFO] Booting worker with pid: 35350\n",
+            "[2024-05-05 13:41:53 +0200] [35352] [INFO] Booting worker with pid: 35352\n",
+            "[2024-05-05 13:41:53 +0200] [35353] [INFO] Booting worker with pid: 35353\n",
+            "[2024-05-05 13:41:53 +0200] [35354] [INFO] Booting worker with pid: 35354\n",
+            "^C\n",
+            "[2024-05-05 13:44:47 +0200] [35348] [INFO] Handling signal: int\n",
+            "[2024-05-05 13:44:47 +0200] [35352] [INFO] Worker exiting (pid: 35352)\n",
+            "[2024-05-05 13:44:47 +0200] [35350] [INFO] Worker exiting (pid: 35350)\n",
+            "[2024-05-05 13:44:47 +0200] [35354] [INFO] Worker exiting (pid: 35354)\n",
+            "[2024-05-05 13:44:47 +0200] [35353] [INFO] Worker exiting (pid: 35353)\n"
+          ]
+        }
+      ],
+      "source": [
+        "! mlflow ui --port 5004"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "ritme_wclasso_f",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+}
diff --git a/q2_ritme/model_space/_static_searchspace.py b/q2_ritme/model_space/_static_searchspace.py
@@ -56,7 +56,7 @@ def get_rf_space(train_val):
         model="rf",
         **data_eng_space,
         **{
-            "n_estimators": tune.randint(100, 500),
+            "n_estimators": tune.randint(50, 300),
             "max_depth": tune.randint(2, 32),
             "min_samples_split": tune.choice([0.001, 0.01, 0.1]),
             "min_samples_leaf": tune.choice([0.0001, 0.001]),
@@ -113,8 +113,8 @@ def get_trac_space(train_val):
             # 'one-cv_one_stddev-error' = select simplest model (largest lambda
             # value) in CV whose CV score is within 1 stddev of best score
             "cv_one_stddev": tune.choice([True, False]),
-            "lambdas_num_searched": tune.choice([80, 100, 120]),
-            "lambda_min": tune.choice([0.00001, 0.0001, 0.001, 0.01]),
+            "lambdas_num_searched": tune.choice([60, 80, 100]),
+            "lambda_min": tune.choice([0.0001, 0.001, 0.01]),
             # logscale when going from lambda_min to 1
             "lambda_logscale_search": tune.choice([True, False]),
         },