Skip to content

Commit

Permalink
add codebase
Browse files Browse the repository at this point in the history
  • Loading branch information
igorsieradzki committed Jun 6, 2019
1 parent 47c25b0 commit 0e19a24
Show file tree
Hide file tree
Showing 17 changed files with 2,743 additions and 0 deletions.
701 changes: 701 additions & 0 deletions chembl_outliers.ipynb

Large diffs are not rendered by default.

Binary file added img/large_CHEMBL214.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL216.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL217.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL224.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL225.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL226.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL251.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL264.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL3155.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/large_CHEMBL3371.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
730 changes: 730 additions & 0 deletions metrics_and_plots.ipynb

Large diffs are not rendered by default.

581 changes: 581 additions & 0 deletions rankings.ipynb

Large diffs are not rendered by default.

510 changes: 510 additions & 0 deletions suspects.txt

Large diffs are not rendered by default.

103 changes: 103 additions & 0 deletions train.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = '2'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import deepchem as dc\n",
"import numpy as np\n",
"from itertools import product\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_data(target, split, i):\n",
" \n",
" d = np.load(os.path.join(\"./data\", str(target), str(split), str(i)+\".npz\"))\n",
" \n",
" train_dataset = dc.data.NumpyDataset(X=d[\"x_tr\"], y=d[\"y_tr\"].reshape(-1,1))\n",
" test_dataset = dc.data.NumpyDataset(X=d[\"x_te\"], y=d[\"y_te\"].reshape(-1,1))\n",
" sim = d[\"sim\"]\n",
" \n",
" return train_dataset, test_dataset, sim\n",
"\n",
"\n",
"def train_single(train_dataset, test_dataset, save_path):\n",
" \n",
" reg = dc.models.MultitaskRegressor(n_tasks=1, n_features=1024, layer_sizes=[500, 500, 200], uncertainty=True)\n",
" reg.fit(train_dataset, nb_epoch=200)\n",
" \n",
" y_pred_real = reg.predict(test_dataset)\n",
" y_pred_dropout, y_std = reg.predict_uncertainty(test_dataset)\n",
" \n",
" np.savez(file=save_path, \n",
" y_pred=y_pred_real.flatten(),\n",
" y_drop_pred=y_pred_dropout.flatten(), \n",
" unc=y_std.flatten())\n",
"\n",
"def train(data_dir):\n",
" \n",
" targets = os.listdir(data_dir)\n",
" splits = ['bac', 'cv']\n",
" split_ids = list(range(5))\n",
" \n",
" runs = list(product(targets, splits, split_ids))\n",
" \n",
" for target, split, split_id in tqdm(runs):\n",
" \n",
" save_path = os.path.join(data_dir, target, split, f\"large_result_{split_id}.npz\")\n",
" train_dataset, test_dataset, _ = load_data(target=target, split=split, i=split_id)\n",
" \n",
" train_single(train_dataset=train_dataset, test_dataset=test_dataset, save_path=save_path)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train(data_dir='./data')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:chem]",
"language": "python",
"name": "conda-env-chem-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
50 changes: 50 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import deepchem as dc
import numpy as np
from itertools import product
from tqdm import tqdm
import os


def load_data(target, split, i, data_dir="./data"):
    """Load one pre-computed train/test fold from an ``.npz`` archive.

    The archive is read from ``<data_dir>/<target>/<split>/<i>.npz`` and is
    expected to contain the arrays ``x_tr``, ``y_tr``, ``x_te``, ``y_te`` and
    ``sim``.

    Args:
        target: Name of the target sub-directory (converted with ``str``).
        split: Name of the split sub-directory (converted with ``str``).
        i: Fold identifier, used as the archive's file name stem.
        data_dir: Root directory holding the per-target folders. Defaults to
            ``"./data"``, the location that used to be hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, sim)`` tuple: two
        ``dc.data.NumpyDataset`` objects whose labels are reshaped to a single
        column, and the ``sim`` array exactly as stored in the archive.
    """
    path = os.path.join(data_dir, str(target), str(split), str(i) + ".npz")

    # np.load returns an NpzFile that holds the file open until it is closed.
    # Reading every array inside the ``with`` block closes it deterministically
    # instead of relying on garbage collection when many folds are loaded.
    with np.load(path) as d:
        train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1, 1))
        test_dataset = dc.data.NumpyDataset(X=d["x_te"], y=d["y_te"].reshape(-1, 1))
        sim = d["sim"]

    return train_dataset, test_dataset, sim


def train_single(train_dataset, test_dataset, save_path):
    """Fit one regressor on a fold and save its test-set outputs.

    A ``dc.models.MultitaskRegressor`` (one task, 1024 input features, hidden
    layers of 500/500/200 units, ``uncertainty=True``) is trained for 200
    epochs on ``train_dataset`` and then evaluated on ``test_dataset``.

    Args:
        train_dataset: Dataset passed to ``fit``.
        test_dataset: Dataset passed to ``predict`` and ``predict_uncertainty``.
        save_path: Destination handed to ``np.savez``.

    The saved archive holds three flattened arrays:
        ``y_pred``       output of ``predict``;
        ``y_drop_pred``  first value returned by ``predict_uncertainty``;
        ``unc``          second value returned by ``predict_uncertainty``.
    """
    model = dc.models.MultitaskRegressor(
        n_tasks=1,
        n_features=1024,
        layer_sizes=[500, 500, 200],
        uncertainty=True,
    )
    model.fit(train_dataset, nb_epoch=200)

    # Keep the call order: plain prediction first, uncertainty pass second.
    point_pred = model.predict(test_dataset)
    dropout_pred, pred_std = model.predict_uncertainty(test_dataset)

    results = {
        "y_pred": point_pred.flatten(),
        "y_drop_pred": dropout_pred.flatten(),
        "unc": pred_std.flatten(),
    }
    np.savez(save_path, **results)

def train(data_dir):
    """Train and save results for every (target, split, fold) combination.

    Every sub-directory of ``data_dir`` is treated as a target. For each
    target, both splits (``'bac'`` and ``'cv'``) and fold ids 0-4 are
    processed, and the outputs of ``train_single`` are written to
    ``<data_dir>/<target>/<split>/large_result_<fold>.npz``.

    Args:
        data_dir: Directory whose sub-directories are the targets; also the
            root under which results are saved.

    NOTE(review): ``load_data`` is called without a data directory, so the
    inputs are read from "./data" regardless of ``data_dir``. The two only
    agree when ``data_dir`` points at that same folder - confirm before
    calling this with any other path.
    """
    # os.listdir also returns plain files; only directories can be targets,
    # and a stray file would otherwise abort the sweep part-way through.
    # Sorting makes the run order reproducible across machines.
    targets = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )
    splits = ['bac', 'cv']
    split_ids = range(5)

    # Materialise the product so tqdm knows the total number of runs.
    runs = list(product(targets, splits, split_ids))

    for target, split, split_id in tqdm(runs):
        save_path = os.path.join(data_dir, target, split, f"large_result_{split_id}.npz")
        train_dataset, test_dataset, _ = load_data(target=target, split=split, i=split_id)

        train_single(train_dataset=train_dataset, test_dataset=test_dataset, save_path=save_path)


# Script entry point: run the full training sweep over the default data folder.
if __name__ == "__main__":
    train(data_dir='./data')
68 changes: 68 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import numpy as np
import pandas as pd
import deepchem as dc

import matplotlib.pyplot as plt


def load_data(target, split, i, data_dir="./data"):
    """Load one pre-computed train/test fold from an ``.npz`` archive.

    The archive is read from ``<data_dir>/<target>/<split>/<i>.npz`` and is
    expected to contain the arrays ``x_tr``, ``y_tr``, ``x_te``, ``y_te`` and
    ``sim``.

    Args:
        target: Name of the target sub-directory (converted with ``str``).
        split: Name of the split sub-directory (converted with ``str``).
        i: Fold identifier, used as the archive's file name stem.
        data_dir: Root directory holding the per-target folders. Defaults to
            ``"./data"``, the location that used to be hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, sim)`` tuple: two
        ``dc.data.NumpyDataset`` objects whose labels are reshaped to a single
        column, and the ``sim`` array exactly as stored in the archive.
    """
    path = os.path.join(data_dir, str(target), str(split), str(i) + ".npz")

    # np.load returns an NpzFile that holds the file open until it is closed.
    # Reading every array inside the ``with`` block closes it deterministically
    # instead of relying on garbage collection.
    with np.load(path) as d:
        train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1, 1))
        test_dataset = dc.data.NumpyDataset(X=d["x_te"], y=d["y_te"].reshape(-1, 1))
        sim = d["sim"]

    return train_dataset, test_dataset, sim


def load_new_data(target, split, i, data_dir="./new_data"):
    """Load one fold, including sample identifiers, from an ``.npz`` archive.

    The archive is read from ``<data_dir>/<target>/<split>/<i>.npz`` and is
    expected to contain ``x_tr``, ``y_tr``, ``x_te``, ``y_te``, ``sim``,
    ``uid_tr``, ``uid_te`` and ``sim_uids``.

    Args:
        target: Name of the target sub-directory (converted with ``str``).
        split: Name of the split sub-directory (converted with ``str``).
        i: Fold identifier, used as the archive's file name stem.
        data_dir: Root directory holding the per-target folders. Defaults to
            ``"./new_data"``, the location that used to be hard-coded.

    Returns:
        A tuple ``(train_dataset, test_dataset, sim, train_uids, test_uids,
        sim_uids)``. The two datasets are ``dc.data.NumpyDataset`` objects
        whose labels are reshaped to a single column; the remaining items are
        the arrays exactly as stored in the archive.
    """
    path = os.path.join(data_dir, str(target), str(split), str(i) + ".npz")

    # Read everything inside the ``with`` block so the NpzFile handle is
    # closed deterministically rather than left to garbage collection.
    with np.load(path) as d:
        train_dataset = dc.data.NumpyDataset(X=d["x_tr"], y=d["y_tr"].reshape(-1, 1))
        test_dataset = dc.data.NumpyDataset(X=d["x_te"], y=d["y_te"].reshape(-1, 1))
        sim = d["sim"]
        train_uids = d['uid_tr']
        test_uids = d['uid_te']
        sim_uids = d['sim_uids']

    return train_dataset, test_dataset, sim, train_uids, test_uids, sim_uids


def get_value(string):
    """Return the number written before the first '±' in ``string``.

    Surrounding whitespace is ignored. A string without '±' is parsed as a
    whole; text that is not a valid float raises ``ValueError``.
    """
    central, _, _ = string.partition('±')
    return float(central.strip())


def highlight_min(data, color='yellow'):
    """Build background-colour styles that mark the minimum value(s).

    Cells are expected to hold ``"value ± error"`` strings; only the value
    before '±' (as parsed by ``get_value``) is compared. The return shapes
    match what pandas ``Styler.apply`` expects - presumably the intended
    caller.

    Args:
        data: A Series (one row or column) or a whole DataFrame of strings.
        color: CSS colour used for the highlighted cells.

    Returns:
        For a Series: a list with one CSS string per element. Entries whose
        index label contains ``'unc'`` are neither considered when looking
        for the minimum nor highlighted.
        For a DataFrame: a DataFrame of CSS strings with the same index and
        columns, marking every cell equal to the global minimum.
    """
    attr = 'background-color: {}'.format(color)

    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        # Drop the "± error" part and compare on the numeric value only.
        values = data.apply(get_value).astype(float)
        # str() keeps the label test working for non-string index labels.
        keep = np.array(['unc' not in str(label) for label in values.index], dtype=bool)
        # Restrict both the minimum and the highlighting to non-'unc' entries,
        # so an 'unc' cell that happens to equal the minimum is left alone.
        is_min = keep & (values == values[keep].min()).values
        return [attr if flag else '' for flag in is_min]

    # DataFrame from .apply(axis=None). DataFrame.apply passes whole columns
    # to the function, so get_value has to be mapped over each column's cells.
    values = data.apply(lambda col: col.map(get_value)).astype(float)
    is_min = values == values.min().min()
    return pd.DataFrame(np.where(is_min, attr, ''),
                        index=data.index, columns=data.columns)


def highlight_max(data, color='yellow'):
    '''
    Build background-colour styles that mark the maximum value(s).

    Cells are expected to hold "value ± error" strings; only the value before
    '±' (as parsed by ``get_value``) is compared. The return shapes match what
    pandas ``Styler.apply`` expects - presumably the intended caller.

    Args:
        data: A Series (one row or column) or a whole DataFrame of strings.
        color: CSS colour used for the highlighted cells.

    Returns:
        For a Series: a list with one CSS string per element, non-empty for
        every element equal to the maximum.
        For a DataFrame: a DataFrame of CSS strings with the same index and
        columns, marking every cell equal to the global maximum.
    '''
    attr = 'background-color: {}'.format(color)

    if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
        # Drop the "± error" part and compare on the numeric value only.
        values = data.apply(get_value).astype(float)
        is_max = values == values.max()
        return [attr if flag else '' for flag in is_max]

    # DataFrame from .apply(axis=None). DataFrame.apply passes whole columns
    # to the function, so get_value has to be mapped over each column's cells.
    values = data.apply(lambda col: col.map(get_value)).astype(float)
    is_max = values == values.max().max()
    return pd.DataFrame(np.where(is_max, attr, ''),
                        index=data.index, columns=data.columns)

0 comments on commit 0e19a24

Please sign in to comment.