-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNeuronClassifier.py
296 lines (226 loc) · 12 KB
/
NeuronClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 18 22:25:36 2018
@author: nicknobad
"""
# Inputs
# Path of the zip archive that is read with label 'Dead' further down in this script.
DeadPath = r'D:\stuff\00_Dead.zip'
# Path of the zip archive that is read with label 'Alive' further down in this script.
# NOTE(review): 'Aliive' has a doubled 'i' - confirm it matches the real file name on disk.
AlivePath = r'D:\stuff\00_Aliive.zip'
from PIL import Image; import io, zipfile, numpy as np, pandas as pd, time
from sklearn.linear_model import LogisticRegression; from sklearn.model_selection import train_test_split
# to plot ML models' accuracy
import matplotlib.pyplot as plt; import seaborn as sns; from sklearn import metrics
'import data'
def read_TIFs_from_archive_to_np( zip_path, label ):
    """Read every image stored in a zip archive into numpy arrays.

    Parameters
    ----------
    zip_path
        Path (or file-like object) handed to ``zipfile.ZipFile``.
    label
        'Dead' or 'Alive'. Every image of the archive gets the status
        0 (Dead) or 1 (Alive) accordingly.

    Returns
    -------
    list
        ``[ NeuronID, NeuronStatus, Neuron_numpyArrayToAdd ]`` - three parallel
        lists holding, per image, the file name inside the archive, the
        0 / 1 status, and the image converted with ``np.array``.

    Raises
    ------
    ValueError
        If ``label`` is neither 'Dead' nor 'Alive'. It is raised before the
        archive is opened, instead of failing on the first image with an
        unbound status variable.
    """
    status_by_label = { 'Dead': 0, 'Alive': 1 }
    if label not in status_by_label:
        raise ValueError( "label must be 'Dead' or 'Alive', got %r" % ( label, ) )
    NStatus = status_by_label[label]

    # the three parallel lists that make up the result
    Neuron_numpyArrayToAdd = []; NeuronID = []; NeuronStatus = []

    # the context manager closes the archive even if an image fails to decode
    with zipfile.ZipFile( zip_path, 'r' ) as archive:
        for entry in archive.infolist():
            # directory entries carry no image data; skip them
            if entry.is_dir():
                continue
            # decode the archived bytes in memory, without extracting to disk
            fh = io.BytesIO( archive.read( entry ) )
            with Image.open( fh ) as img:
                imgArray = np.array( img )
            Neuron_numpyArrayToAdd.append( imgArray )
            NeuronID.append( entry.filename )
            NeuronStatus.append( NStatus )

    return [ NeuronID, NeuronStatus, Neuron_numpyArrayToAdd ]
# Each list returned by read_TIFs_from_archive_to_np holds three parallel lists:
#   [0] file name of every Neuron image inside the archive
#   [1] status of every Neuron: Alive = 1, Dead = 0
#   [2] numpy array of every Neuron image
# The image arrays in [2] are what the ML models are built on; they are expected
# to be 40 x 40 pixels (the script checks for 1600 values per image further down).
# Dead Neurons
listDeadNeurons = read_TIFs_from_archive_to_np( DeadPath, 'Dead' )
# Alive Neurons
listAliveNeurons = read_TIFs_from_archive_to_np( AlivePath, 'Alive' )
'Check Image importing'
# 1. Check that the images were imported correctly. If they weren't, we wouldn't have
# good data for training the ML models.
# 2. Check that all channels (the arrays along the 3rd dimension) of each image-array are the same;
# if set( list ) contains only True then there is no problem - all channels of each image-array are the same
## %%timeit
# For every image compare channel 0 with channel 1 and then with channel 2.
# np.unique returns the sorted unique values of the element-wise comparison, so
# element [0] is False as soon as a single pixel differs and True only when
# the two channels are identical. Two flags are stored per image.
# Alive
list_checkAlive_np_array = []
for alive_img in listAliveNeurons[2]:
    first_channel = alive_img[:, :, 0]
    for other_channel in ( 1, 2 ):
        checkcheck = np.unique( first_channel == alive_img[:, :, other_channel] )[0]
        list_checkAlive_np_array.append( checkcheck )
# if len( list_checkAlive_np_array ) / len( listAliveNeurons[2] ) != 2 then there's a problem
set( list_checkAlive_np_array )
# Dead
list_checkDead_np_array = []
for dead_img in listDeadNeurons[2]:
    first_channel = dead_img[:, :, 0]
    for other_channel in ( 1, 2 ):
        checkcheck = np.unique( first_channel == dead_img[:, :, other_channel] )[0]
        list_checkDead_np_array.append( checkcheck )
# if len( list_checkDead_np_array ) / len( listDeadNeurons[2] ) != 2 then there's a problem
set( list_checkDead_np_array )
'/Check Image importing'
'check if len is 1600 in Dead and in Alive sets'
# Channel 0 of every image must hold exactly 1600 values.
# Report the index of every image that does not, and print the "all ... have the
# len of 1600" message only when no mismatch was found in that dataset.
for dataset_name, dataset_images in ( ( 'Dead', listDeadNeurons[2] ), ( 'Alive', listAliveNeurons[2] ) ):
    wrong_len = [ i for i, img in enumerate( dataset_images ) if img[:, :, 0].size != 1600 ]
    for i in wrong_len:
        # the message now has conversion specifiers, so the index is actually printed
        print( " numpy array in the %s dataset that doesn't have the len of 1600: %d" % ( dataset_name, i ) )
    if not wrong_len:
        print( " all numpy arrays in the %s dataset have the len of 1600 " % dataset_name )
'/check if len is 1600'
'/import data'
'prepare data'
# Flatten channel 0 of every image into one row, giving one 2D array per dataset
# (one row per Neuron image). The rows are stacked in a single call instead of
# growing the array with np.append on every iteration, which re-copies the whole
# array each time. Each dataset derives its row width from its own images.
# Dead
nparrayFlatDeadNeurons = np.stack( [ img[:, :, 0].flatten() for img in listDeadNeurons[2] ] )
# Alive
nparrayFlatAliveNeurons = np.stack( [ img[:, :, 0].flatten() for img in listAliveNeurons[2] ] )
# combine Neuron ID (column 0), Neuron status (column 1) and the flattened pixels (columns 2+).
# NOTE: the IDs are strings, so numpy stores the whole combined array as strings;
# the numeric columns are converted back with astype where they are used later on.
# Dead
npArrayAll_Dead = np.column_stack( ( listDeadNeurons[0], listDeadNeurons[1], nparrayFlatDeadNeurons ) )
# Alive
npArrayAll_Alive = np.column_stack( ( listAliveNeurons[0], listAliveNeurons[1], nparrayFlatAliveNeurons ) )
# combine npArrayAll_Dead and npArrayAll_Alive in one np array: all Dead rows first, then all Alive rows
npArrayAll = np.vstack( ( npArrayAll_Dead, npArrayAll_Alive ) )
# np.savetxt( 'test', nparrayFlatDeadNeurons[0:1, :], newline = ' ', delimiter = '', fmt = '%d')
'/prepare data'
'build the ML models'
''' compare classifiers
https://www.digitalocean.com/community/tutorials/how-to-build-a-machine-learning-classifier-in-python-with-scikit-learn
https://www.datacamp.com/community/blog/scikit-learn-cheat-sheet
https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
https://www.kaggle.com/nirajvermafcb/comparing-various-ml-models-roc-curve-comparison
http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
'''
'/build the ML models'
'Check performance of ML models'
# -----------------------------------------------------------------------------
'benchmark different ML models'
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
'prepare models'
# ( short name, estimator with default parameters ) pairs to benchmark
models = [
    ( 'LR', LogisticRegression() ),
    ( 'LDA', LinearDiscriminantAnalysis() ),
    ( 'KNN', KNeighborsClassifier() ),
    ( 'CART', DecisionTreeClassifier() ),
    ( 'GNB', GaussianNB() ),
    ( 'SVM', SVC() ),
]
# features: the pixel columns (column 2 onwards); target: the Neuron status (column 1)
X = npArrayAll[:, 2:].astype( np.uint8 )
y = npArrayAll[:, 1].astype( np.uint8 )
# hold out 33% of the rows as test set; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.33, random_state = 42 )
#----------------------------------------
# store metrics in a df
# all scoring metrics available here: http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
# List of scoring metrics:
# ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
list_metrics = [ 'accuracy', 'r2', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted', 'precision', 'recall' ]
# Cross-validate every model and keep the mean score per ( metric, model ) pair.
t0 = time.time() # time the cross-validation
# The rows of X / y are ordered by class (all Dead rows first, then all Alive rows),
# so an unshuffled KFold yields folds that contain a single class and distorts
# f1 / precision / recall. Shuffle and stratify instead; the fixed seed keeps the
# folds identical for every model.
# NOTE(review): StratifiedKFold needs at least n_splits samples per class - confirm for this data.
kfold = model_selection.StratifiedKFold( n_splits = 13, shuffle = True, random_state = 42 )
mean_scores = {}
for name, model in models:
    # a single cross_validate call scores all metrics from the same fits,
    # instead of refitting the model once per metric
    cv_result = model_selection.cross_validate( model, X, y, cv = kfold, scoring = list_metrics )
    for metric in list_metrics:
        mean_scores[ ( metric, name ) ] = cv_result[ 'test_' + metric ].mean()
# flatten into three parallel lists, metric by metric and model by model
sscore_result = []; mmetric = []; mmodel = []
for metric in list_metrics:
    for name, model in models:
        mmodel.append( name )
        mmetric.append( metric )
        sscore_result.append( mean_scores[ ( metric, name ) ] )
t1 = time.time()
print( '\n CV of models ran in %.4fs or %.4fm ' % ( (t1 - t0), ((t1 - t0) / 60) ) )
# long DF: one row per ( model, metric ) pair
scoresLongDF = pd.DataFrame( { 'model' : mmodel, 'metric' : mmetric, 'value' : sscore_result } )
# convert the long DF to a wide DF: one row per model, one column per metric
scoresWideDF = scoresLongDF.pivot( index = 'model', columns = 'metric', values = 'value' )
# turn the 'model' index into the first regular column
scoresWideDF = scoresWideDF.reset_index()
# sort by accuracy descending to get the most accurate model first
scoresWideDF = scoresWideDF.sort_values( 'accuracy', ascending = False )
'/benchmark different ML models'
'tune the chosen model'
# http://scikit-learn.org/stable/modules/svm.html
# https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0
# https://www.pyimagesearch.com/2016/08/15/how-to-tune-hyperparameters-with-python-and-scikit-learn/
''' Function plots the Confusion Matrix for the train and test sets.
It also returns a list with elements: 0. trainPreds, 1. trainScore, 2. train confusion matrix,
3. train_AUC_ROC, 4. testPreds, 5. testScore, 6. test confusion matrix, 7. test_AUC_ROC '''
def makePredsScores( model, model_name ):
    """Score a model on the train and test splits and plot both confusion matrices.

    The model must already be fitted: only ``predict`` and ``score`` are called.
    The data comes from the module-level X_train, y_train, X_test and y_test.

    Parameters
    ----------
    model : estimator providing ``predict`` and ``score``
    model_name : text put at the start of both plot titles

    Returns
    -------
    list
        [ trainPreds, trainScore, trainCM, train_AUC_ROC,
          testPreds, testScore, testCM, test_AUC_ROC ]
    """
    def evaluate_split( features, labels, split_tag ):
        # predictions and the value of the model's own score method
        preds = model.predict( features )
        score = model.score( features, labels )
        # confusion matrix and ROC AUC, both computed from the predicted labels
        conf_matrix = metrics.confusion_matrix( labels, preds )
        auc_roc = metrics.roc_auc_score( labels, preds )
        # confusion matrix drawn as a Seaborn heatmap on a new figure
        plt.figure( figsize = (9, 9) )
        sns.heatmap( conf_matrix, annot = True, fmt = '.0f', linewidths = .5, square = True, cmap = 'Blues_r' )
        plt.ylabel( 'Actual label' )
        plt.xlabel( 'Predicted label' )
        plt.title( model_name + split_tag + 'Accuracy Score: {0}'.format( round( score, 6 ) ), size = 15 )
        return [ preds, score, conf_matrix, auc_roc ]

    # train split first, then test split; the two 4-element results are concatenated
    return evaluate_split( X_train, y_train, ' Train. ' ) + evaluate_split( X_test, y_test, ' Test. ' )
# fit an SVC with default parameters on the train split (fit returns the estimator itself),
# then score it and plot its train / test confusion matrices
SVM = SVC().fit( X_train, y_train )
makePredsScores( SVM, 'SVM' )
'/tune the chosen model'
'/Check performance of ML models'