import numpy as np
from mnist_prepare import parse_data
from med_5 import import_5_med
def sigmoid_activation(neuron_layer):
"""
Activates the neurons layer with a sigmoid or logistics function.
Uses the sigmoid function 1 / (1 + e^z) to activate the neuron layer.
Args:
neuron_layer: Layer of neurons to be activated.
Returns:
The activated neurons layer.D
"""
return (1.0 / (1.0 + np.exp(-neuron_layer)))
def sigmoid_derivative(neuron_layer):
"""
Computes the derivative of the sigmoid activation function for a given
layer.
The derivative of the sigmoid function is given by:
sigmoid(z) * (1 - sigmoid(z))
    Args:
        neuron_layer: Layer of pre-activation neuron values.
    Returns:
        The element-wise derivative of the sigmoid evaluated on the layer.
"""
activated_inputs = sigmoid_activation(neuron_layer)
return activated_inputs * (1 - activated_inputs)
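
# Illustrative check (not in the original module): sigmoid(0) = 0.5 and the
# sigmoid derivative peaks at 0.25 when z = 0, which is easy to verify here:
#   sigmoid_activation(np.array([0.0]))   # -> array([0.5])
#   sigmoid_derivative(np.array([0.0]))   # -> array([0.25])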
class ShallowNetwork:
"""
Creates a feedforward neural network model with one hidden layer.
"""
def __init__(self, architecture, activation_func=sigmoid_activation):
"""
        Initializes the network.
        The network is a shallow feedforward network with a single hidden
        layer.
        The biases are generated randomly from a gaussian distribution with
        standard deviation 1, and the weights are generated from this same
        distribution but divided by the square root of the number of inputs.
        This trick sharpens the distribution around 0 and makes the neurons
        less likely to saturate, which would otherwise slow down the learning
        process.
        Args:
            architecture: Architecture of the network, as a tuple. The
                first value of the tuple gives the number of input
                nodes, the second the number of hidden neurons, and
                the third the number of output neurons.
            activation_func: Activation function to use for neuron
                activation.
"""
self.architecture = architecture
        # Generates the biases, drawn from a gaussian
        # distribution with standard deviation = 1.
self.biases = [np.random.randn(y, 1)
for y in self.architecture[1:]]
        # Generates the weights, drawn from a gaussian distribution with
        # standard deviation = 1 and divided by the square root of the number
        # of inputs. This reduces the risk of the neurons saturating.
self.weights = [np.random.randn(y, x)/np.sqrt(x)
for x, y in zip(self.architecture[:-1],
self.architecture[1:])]
self.activation_function = activation_func
        # Neuron values, stored for the backpropagation computation.
self.z_values = []
self.activations = []
self.training_error = []
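        # Illustrative note (not in the original code): with 784 input nodes,
        # the hidden-layer weights get a standard deviation of
        # 1 / sqrt(784) ~= 0.036, which keeps the weighted sums small and the
        # sigmoid away from its flat, saturated regions early in training.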
def feedforward_computation(self, input_vector):
"""
        Computes the output of the network.
Uses the feedforward algorithm to compute the value of the output
for the given weights and biases. Computes the weighted sum of the
neurons of the previous layer and uses the activation function to
activate it, for each neuron of the current layer.
Args:
input_vector: Values of the input as an array or list.
Returns:
The activated neurons values of the output layer.
Updates the 'activations' attribute of the network.
"""
# Initialisation.
self.z_values = []
self.activations = [input_vector]
next_layer = input_vector
# Iteration over each layer.
for bias_layer, weight_layer in zip(self.biases, self.weights):
# Get the neuron value.
z = np.dot(weight_layer, next_layer) + bias_layer
            self.z_values.append(z)  # Stored for the backward pass.
# Activates the neuron.
next_layer = self.activation_function(z)
            self.activations.append(next_layer)  # Stored for the backward pass.
# Returns the output layer activations.
return (next_layer)
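        # Shape note (illustrative): for an architecture of [784, 30, 10],
        # input_vector is expected as a (784, 1) column; weights[0] is
        # (30, 784) and biases[0] is (30, 1), so the hidden layer z is
        # (30, 1), and the returned output activation is (10, 1).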
def error_cost(self, output, label):
"""
        Computes the error of the output layer.
This function uses the cross-entropy as a cost function to
assess the error on the output. Cross entropy is given by:
C = -Sum(y * ln(a) + (1 - y) * ln(1 - a)).
This cost function gives better results than the MSE by avoiding
a learning slowdown that is encountered by using the quadratic cost
function.
Args:
output: Values of the activated output layer.
label: Expected output of the network given the input.
Returns:
The error on the predicted output.
"""
        # nan_to_num avoids the NaNs that can be obtained when the predicted
        # and desired output have a 1 in the same position, and replaces them
        # with the value 0.0.
return (np.sum(np.nan_to_num(-label * np.log(output) -
(1 - label) * np.log(1 - output))))
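        # Worked example (illustrative): for an output a = [0.9, 0.1] and a
        # one-hot label y = [1, 0], C = -(ln(0.9) + ln(0.9)) ~= 0.211; a
        # confident correct prediction yields a small cost, while a confident
        # wrong one makes it blow up.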
def error_rate(self, output, label):
"""
        Computes the output error (output - label), used as the starting
        point of the partial derivatives for the gradient descent.
Args:
output: Values of the output layer.
label: Expected output of the network given the input.
Returns:
The error rate of the network.
"""
return (output - label)
def backpropagation(self, input_vector, label):
"""
Applies backpropagation to adapt the weights and biases for the
network.
Backpropagation uses the error on the output compared to the expected
output to compute new biases and weights for the network, allowing the
model to learn. To do so, we need:
            1) Compute the error on the output nodes. It is denoted
               nabla_a C, a vector that represents the rate of change of
               the cost function with respect to the activations of the
               output neurons. Using the cross-entropy cost function, as
               here, the error on the output is given by
                   delta^L = (a^L - y),
               with a^L the activations on the output layer, and y the
               expected output.
            2) Propagate the output error to the previous layers. This is
               done using the general formula for the previous layer l:
                   delta^l = ((w^(l+1))^T * delta^(l+1)) . sigma'(z^l),
               with delta^l the error on layer l, delta^(l+1) the error on
               layer l + 1, '.' the element-wise product, and the sigma'
               term the derivative of the activation function on the
               neurons of layer l. (w^(l+1))^T is the transpose of the
               weights of layer l + 1.
            3) Compute the new biases and weights based on these errors. We
               compute them as the derivatives of the cost function with
               respect to the biases and weights respectively.
               For the biases:
                   der(C)/der(b) = delta^l
               For the weights:
                   der(C)/der(w) = a^(l-1) * delta^l
Args:
input_vector: Values of the input as an array or list.
label: Expected output of the network given the input.
Returns:
Gradients of the cost function with respect to the biases and
weights for each layer.
"""
# Forward pass.
self.feedforward_computation(input_vector)
# Computation for the output layer;
# Computes the error rate = delta^L.
delta_l = self.error_rate(self.activations[-1], label)
# Initialize the gradients for each layer.
gradient_biases = [np.zeros(bias.shape)
for bias in self.biases]
gradient_weights = [np.zeros(weight.shape)
for weight in self.weights]
# Gradients for the last layer.
gradient_weights[-1] = np.dot(delta_l, self.activations[-2].T)
gradient_biases[-1] = delta_l
# Computation for previous layers.
for layer in range(2, len(self.architecture)):
# delta^l computation
delta_l = np.dot(
self.weights[-layer + 1].T, delta_l) *\
sigmoid_derivative(self.z_values[-layer])
# Gradients computation.
gradient_biases[-layer] = delta_l
gradient_weights[-layer] =\
np.dot(delta_l, self.activations[-layer - 1].T)
return (gradient_biases, gradient_weights)
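
    def gradient_check(self, input_vector, label, eps=1e-5):
        """
        Illustrative sanity check, not part of the original module: compares
        the analytic gradient returned by backpropagation with a central
        finite-difference estimate for one randomly chosen weight. The
        difference should be tiny if the backpropagation equations above are
        implemented correctly.
        """
        _, gradient_weights = self.backpropagation(input_vector, label)
        # Picks one weight at random.
        layer = np.random.randint(len(self.weights))
        row = np.random.randint(self.weights[layer].shape[0])
        col = np.random.randint(self.weights[layer].shape[1])
        original_value = self.weights[layer][row, col]
        # Cost with the weight nudged up, then down.
        self.weights[layer][row, col] = original_value + eps
        cost_plus = self.error_cost(
            self.feedforward_computation(input_vector), label)
        self.weights[layer][row, col] = original_value - eps
        cost_minus = self.error_cost(
            self.feedforward_computation(input_vector), label)
        # Restores the weight and compares the two gradient estimates.
        self.weights[layer][row, col] = original_value
        numerical_gradient = (cost_plus - cost_minus) / (2 * eps)
        return abs(numerical_gradient - gradient_weights[layer][row, col])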
def parameters_update(self, batch, eta, lbda, n):
"""
Executes the weights and biases updates for the backpropagation.
        Computes the errors for each input using the chain rule. The weights
        and biases of the network are updated in the body of the function.
        This step incorporates L2 regularization, in which the weights are
        scaled by a factor (1 - eta * lambda / n), with eta the learning
        rate, lambda the regularization parameter telling how much large
        weights should be penalized, and n the size of the full dataset.
Args:
batch: Mini-batch used to update the weights and biases.
eta: Learning rate, defines 'how fast' the network learns from a
single batch.
lbda: Regularization parameter, defines how much the large weights
should be reduced to avoid overfitting.
n: Size of the full dataset.
"""
# Initializes the weights and biases gradients containers.
gradient_biases = [np.zeros(bias.shape) for bias in self.biases]
gradient_weights = [np.zeros(weight.shape) for weight in self.weights]
# For each input in the mini-batch;
for inp, outp in batch:
# Backpropagates the error.
update_grd_biases, update_grd_weights = self.backpropagation(inp,
outp)
            # Accumulates the gradients for the weights and biases.
gradient_biases = [gd_b + up_b for gd_b, up_b in zip(
gradient_biases, update_grd_biases)]
gradient_weights = [gd_w + up_w for gd_w, up_w in zip(
gradient_weights, update_grd_weights)]
        # Updates the weights and biases for the mini-batch.
self.weights = [(1-eta*(lbda/n)) * weight - (eta / len(batch))
* gradient_w
for weight, gradient_w in zip(self.weights,
gradient_weights)]
self.biases = [bias - (eta / len(batch)) * gradient_b
for bias, gradient_b in zip(self.biases,
gradient_biases)]
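        # Worked example (illustrative): with eta = 0.5, lbda = 5.0 and
        # n = 50000 training examples, the weight-decay factor is
        # 1 - 0.5 * (5.0 / 50000) = 0.99995, so every update first shrinks
        # each weight by 0.005% before the gradient step is applied.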
def evaluation(self, test_data):
"""
Evaluate the accuracy of classification.
        /!\ Adapted to the mnist classification problem (handwritten digit
        recognition), where the output is a vector with a one in the
        position corresponding to the predicted value and zeros everywhere
        else.
        Should be adapted to the expected output of a specific problem. /!\
Args:
            test_data: List of tuples, each containing in first position the
                flattened image as a column matrix, and in second position
                the expected output encoded as a column matrix with a 1 in
                the position corresponding to the expected value, and 0
                everywhere else.
Returns:
The number of correctly predicted elements.
"""
# Gets the predicted and expected values.
test_results = [(np.argmax(self.feedforward_computation(x)),
np.argmax(y)) for (x, y) in test_data]
# Computes the number of correctly predicted values.
return sum(int(x == y) for (x, y) in test_results)
def loo_eval(self, test_input):
"""
        Evaluates the accuracy in the context of leave-one-out.
Args:
test_input: Input to be predicted.
Returns:
            Whether the input is correctly classified (1) or not (0).
"""
result = (np.argmax(self.feedforward_computation(test_input[0])),
np.argmax(test_input[1]))
return (1 if result[0] == result[1] else 0)
def stochastic_gradient_descent(self, dataset, n_epoch,
batch_size, eta, lbda,
test_data=None, loo=None):
"""
Trains the network using a stochastic gradient descent.
        In stochastic gradient descent, we train the network by dividing the
        dataset into mini-batches. Each mini-batch contains a certain number
        of inputs, and each mini-batch is used to update the weights and
        biases. Using mini-batches improves the learning compared to full-
        batch training, and is more computationally efficient than using the
        inputs one by one.
        This process applies L2 (or Ridge) regularisation. This improvement
        scales the weights down and reduces the importance of large weights,
        allowing a better generalization (reduction of overfitting).
        This regularisation is achieved by scaling the weights in the weights
        update step by a new term:
            1 - eta * (lambda / n)
        with lambda the regularization parameter; the larger it is, the more
        we reduce the weights.
Args:
dataset: Set of inputs to use to train the network.
n_epoch: Number of epochs (number of times to revisit the whole
dataset) to train the network.
batch_size: Size of the mini-batches.
eta: Learning rate, determines how fast the network 'learns' from a
batch.
lbda: Regularization parameter, defines how much the large weights
should be reduced to avoid overfitting.
            test_data: Dataset used for testing the network.
            loo: Single (input, label) example used for leave-one-out
                evaluation, if provided.
Returns:
            The leave-one-out score if 'loo' is provided, None otherwise.
"""
loo_score = None
for epoch in range(n_epoch):
n = len(dataset)
# Shuffles the dataset to get random batches.
np.random.shuffle(dataset)
# Divides the dataset in mini-batches.
mini_batches = [dataset[i: i + batch_size]
for i in range(0, len(dataset), batch_size)]
            # For each mini-batch;
for batch in mini_batches:
# Updates the parameters.
self.parameters_update(batch, eta, lbda, n)
# Tests the accuracy on the test set if test data is provided.
if test_data:
n_test = len(test_data)
print("Epoch {0}: {1} / {2}".format(
epoch, self.evaluation(test_data), n_test))
            # Computes the training error over the whole dataset.
            ce = 0
            for elt, res in dataset:
                ce += self.error_cost(self.feedforward_computation(elt), res)
self.training_error.append(ce/len(dataset))
# Leave-one-out evaluation.
if loo:
loo_score = self.loo_eval(loo)
print("One pass terminated; score: {}".format(loo_score))
return (loo_score)
def predict(self, target):
"""
Predicts the value for a single input.
        /!\ Suits the mnist problem encoding; may need to be adapted
        in other situations.
"""
        return np.argmax(self.feedforward_computation(target))
"""
data = parse_data('mnist.pkl', d_set='train')
t_data = parse_data('mnist.pkl', d_set='test')
nn = ShallowNetwork([784, 30, 10], )
nn.stochastic_gradient_descent(data, 30, 10, 0.5, 5.0, t_data)
"""