add codebase

igorsieradzki · Jun 6, 2019 · 0e19a24 · 0e19a24
1 parent 47c25b0
commit 0e19a24
Show file tree

Hide file tree

Showing 17 changed files with 2,743 additions and 0 deletions.
diff --git a/chembl_outliers.ipynb b/chembl_outliers.ipynb
diff --git a/img/large_CHEMBL214.png b/img/large_CHEMBL214.png
diff --git a/img/large_CHEMBL216.png b/img/large_CHEMBL216.png
diff --git a/img/large_CHEMBL217.png b/img/large_CHEMBL217.png
diff --git a/img/large_CHEMBL224.png b/img/large_CHEMBL224.png
diff --git a/img/large_CHEMBL225.png b/img/large_CHEMBL225.png
diff --git a/img/large_CHEMBL226.png b/img/large_CHEMBL226.png
diff --git a/img/large_CHEMBL251.png b/img/large_CHEMBL251.png
diff --git a/img/large_CHEMBL264.png b/img/large_CHEMBL264.png
diff --git a/img/large_CHEMBL3155.png b/img/large_CHEMBL3155.png
diff --git a/img/large_CHEMBL3371.png b/img/large_CHEMBL3371.png
diff --git a/metrics_and_plots.ipynb b/metrics_and_plots.ipynb
diff --git a/rankings.ipynb b/rankings.ipynb
diff --git a/suspects.txt b/suspects.txt
diff --git a/train.ipynb b/train.ipynb
@@ -0,0 +1,103 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = '2'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import deepchem as dc\n",
+    "import numpy as np\n",
+    "from itertools import product\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_data(target, split, i):\n",
+    "    \n",
+    "    d = np.load(os.path.join(\"./data\", str(target), str(split), str(i)+\".npz\"))\n",
+    "    \n",
+    "    train_dataset = dc.data.NumpyDataset(X=d[\"x_tr\"], y=d[\"y_tr\"].reshape(-1,1))\n",
+    "    test_dataset = dc.data.NumpyDataset(X=d[\"x_te\"], y=d[\"y_te\"].reshape(-1,1))\n",
+    "    sim = d[\"sim\"]\n",
+    "    \n",
+    "    return train_dataset, test_dataset, sim\n",
+    "\n",
+    "\n",
+    "def train_single(train_dataset, test_dataset, save_path):\n",
+    "    \n",
+    "    reg = dc.models.MultitaskRegressor(n_tasks=1, n_features=1024, layer_sizes=[500, 500, 200], uncertainty=True)\n",
+    "    reg.fit(train_dataset, nb_epoch=200)\n",
+    "    \n",
+    "    y_pred_real = reg.predict(test_dataset)\n",
+    "    y_pred_dropout, y_std = reg.predict_uncertainty(test_dataset)\n",
+    "    \n",
+    "    np.savez(file=save_path, \n",
+    "             y_pred=y_pred_real.flatten(),\n",
+    "             y_drop_pred=y_pred_dropout.flatten(), \n",
+    "             unc=y_std.flatten())\n",
+    "\n",
+    "def train(data_dir):\n",
+    "    \n",
+    "    targets = os.listdir(data_dir)\n",
+    "    splits = ['bac', 'cv']\n",
+    "    split_ids = list(range(5))\n",
+    "    \n",
+    "    runs = list(product(targets, splits, split_ids))\n",
+    "    \n",
+    "    for target, split, split_id in tqdm(runs):\n",
+    "        \n",
+    "        save_path = os.path.join(data_dir, target, split, f\"large_result_{split_id}.npz\")\n",
+    "        train_dataset, test_dataset, _ = load_data(target=target, split=split, i=split_id)\n",
+    "   \n",
+    "        train_single(train_dataset=train_dataset, test_dataset=test_dataset, save_path=save_path)\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train(data_dir='./data')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [conda env:chem]",
+   "language": "python",
+   "name": "conda-env-chem-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/train.py b/train.py
@@ -0,0 +1,50 @@
+import deepchem as dc
+import numpy as np
+from itertools import product
+from tqdm import tqdm
+import os
+
+
+def load_data(target, split, i):
+
+    d = np.load(os.path.join("./data", str(target), str(split), str(i)+".npz"))
+
+    train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1,1))
+    test_dataset = dc.data.NumpyDataset(X=d["x_te"],  y=d["y_te"].reshape(-1,1))
+    sim = d["sim"]
+
+    return train_dataset, test_dataset, sim
+
+
+def train_single(train_dataset, test_dataset, save_path):
+
+    reg = dc.models.MultitaskRegressor(n_tasks=1, n_features=1024, layer_sizes=[500, 500, 200], uncertainty=True)
+    reg.fit(train_dataset, nb_epoch=200)
+
+    y_pred_real = reg.predict(test_dataset)
+    y_pred_dropout, y_std = reg.predict_uncertainty(test_dataset)
+
+    np.savez(file=save_path, 
+             y_pred=y_pred_real.flatten(),
+             y_drop_pred=y_pred_dropout.flatten(), 
+             unc=y_std.flatten())
+
+def train(data_dir):
+
+    targets = os.listdir(data_dir)
+    splits = ['bac', 'cv']
+    split_ids = list(range(5))
+
+    runs = list(product(targets, splits, split_ids))
+
+    for target, split, split_id in tqdm(runs):
+
+        save_path = os.path.join(data_dir, target, split, f"large_result_{split_id}.npz")
+        train_dataset, test_dataset, _ = load_data(target=target, split=split, i=split_id)
+
+        train_single(train_dataset=train_dataset, test_dataset=test_dataset, save_path=save_path)
+
+
+if __name__ == "__main__":
+
+    # Script entry point: run the full training sweep over the ./data tree.
+    train('./data')
diff --git a/utils.py b/utils.py
@@ -0,0 +1,68 @@
+import os
+import numpy as np
+import pandas as pd
+import deepchem as dc
+
+import matplotlib.pyplot as plt
+
+
+def load_data(target, split, i):
+
+    d = np.load(os.path.join("./data", str(target), str(split), str(i)+".npz"))
+
+    train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1,1))
+    test_dataset = dc.data.NumpyDataset(X=d["x_te"], y=d["y_te"].reshape(-1,1))
+    sim = d["sim"]
+
+    return train_dataset, test_dataset, sim
+
+
+def load_new_data(target, split, i):
+
+    d = np.load(os.path.join("./new_data", str(target), str(split), str(i)+".npz"))
+
+    train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1,1))
+    test_dataset = dc.data.NumpyDataset(X=d["x_te"], y=d["y_te"].reshape(-1,1))
+    sim = d["sim"]
+    train_uids = d['uid_tr']
+    test_uids = d['uid_te']
+    sim_uids = d['sim_uids']
+
+    return train_dataset, test_dataset, sim, train_uids, test_uids, sim_uids
+
+
+def get_value(string):
+
+    return float(string.split('±')[0].strip())
+
+
+def highlight_min(data, color='yellow'):
+
+    attr = 'background-color: {}'.format(color)
+    #remove % and cast to float
+    data = data.apply(get_value).astype(float)
+
+    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
+        sub_data = data[filter(lambda x: 'unc' not in x, data.index)]
+        is_max = data == sub_data.min()
+        return [attr if v  else '' for v in is_max]
+    else:  # from .apply(axis=None)
+        is_max = data == data.min().min()
+        return pd.DataFrame(np.where(is_max, attr, ''),
+                            index=data.index, columns=data.columns)
+
+
+def highlight_max(data, color='yellow'):
+    '''
+    highlight the maximum in a Series or DataFrame
+    '''
+    attr = 'background-color: {}'.format(color)
+    #remove % and cast to float
+    data = data.apply(get_value).astype(float)
+    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
+        is_max = data == data.max()
+        return [attr if v else '' for v in is_max]
+    else:  # from .apply(axis=None)
+        is_max = data == data.max().max()
+        return pd.DataFrame(np.where(is_max, attr, ''),
+                            index=data.index, columns=data.columns)