# LSTMmodel.py
from theano import tensor

from blocks.bricks import Initializable, Tanh
from blocks.bricks.base import application, lazy
from blocks.bricks.recurrent import BaseRecurrent, recurrent
from blocks.roles import add_role, WEIGHT, INITIAL_STATE
from blocks.utils import shared_floatx_nans, shared_floatx_zeros


class LSTM(BaseRecurrent, Initializable):
    u"""Long Short Term Memory.

    Every unit of an LSTM is equipped with input, forget and output gates.
    This implementation is based on code by Mohammad Pezeshki that
    implements the architecture used in [GSS03]_ and [Grav13]_. It aims to
    do as many computations in parallel as possible and expects the last
    dimension of the input to be four times the output dimension.

    Unlike a vanilla LSTM as described in [HS97]_, this model has peephole
    connections from the cells to the gates. The output gates receive
    information about the cells at the current time step, while the other
    gates only receive information about the cells at the previous time
    step. All 'peephole' weight matrices are diagonal.

    .. [GSS03] Gers, Felix A., Nicol N. Schraudolph, and Jürgen
        Schmidhuber, *Learning precise timing with LSTM recurrent
        networks*, Journal of Machine Learning Research 3 (2003),
        pp. 115-143.
    .. [Grav13] Graves, Alex, *Generating sequences with recurrent neural
        networks*, arXiv preprint arXiv:1308.0850 (2013).
    .. [HS97] Sepp Hochreiter, and Jürgen Schmidhuber, *Long Short-Term
        Memory*, Neural Computation 9(8) (1997), pp. 1735-1780.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`.Brick`, optional
        The activation function. The default and by far the most popular
        is :class:`.Tanh`.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    """

    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, **kwargs):
        super(LSTM, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        self.children = [activation]

    def get_dim(self, name):
        if name == 'inputs':
            return self.dim * 4
        if name in ['states', 'cells']:
            return self.dim
        if name == 'mask':
            return 0
        return super(LSTM, self).get_dim(name)

    def _allocate(self):
        self.W_state = shared_floatx_nans((self.dim, 4*self.dim),
                                          name='W_state')
        # The peephole weight matrices are diagonal (see the class
        # docstring), so they are stored as vectors of length `dim`.
        self.W_cell_to_in = shared_floatx_nans((self.dim,),
                                               name='W_cell_to_in')
        self.W_cell_to_forget = shared_floatx_nans((self.dim,),
                                                   name='W_cell_to_forget')
        self.W_cell_to_out = shared_floatx_nans((self.dim,),
                                                name='W_cell_to_out')
        # The underscore is required to prevent collision with
        # the `initial_state` application method
        self.initial_state_ = shared_floatx_zeros((self.dim,),
                                                  name="initial_state")
        self.initial_cells = shared_floatx_zeros((self.dim,),
                                                 name="initial_cells")
        add_role(self.W_state, WEIGHT)
        add_role(self.W_cell_to_in, WEIGHT)
        add_role(self.W_cell_to_forget, WEIGHT)
        add_role(self.W_cell_to_out, WEIGHT)
        add_role(self.initial_state_, INITIAL_STATE)
        add_role(self.initial_cells, INITIAL_STATE)

        self.parameters = [
            self.W_state, self.W_cell_to_in, self.W_cell_to_forget,
            self.W_cell_to_out, self.initial_state_, self.initial_cells]

    def _initialize(self):
        # Only the weight matrices and peephole weights (the first four
        # parameters) are drawn from `weights_init`; the initial state and
        # cells keep the zero values set in `_allocate`.
        for weights in self.parameters[:4]:
            self.weights_init.initialize(weights, self.rng)

    @recurrent(sequences=['inputs', 'mask'], states=['states', 'cells'],
               contexts=[], outputs=['states', 'cells'])
    def apply(self, inputs, states, cells, mask=None):
        """Apply the Long Short Term Memory transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, features). Required for `one_step` usage.
        cells : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current cells in the shape
            (batch_size, features). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            features * 4). The `inputs` must be four times the dimension
            of the LSTM brick to ensure that each of the four gates
            receives a different transformation of the input. See
            [Grav13]_ equations 7 to 10 for more details.
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be all ones if not given.

        .. [Grav13] Graves, Alex, *Generating sequences with recurrent
            neural networks*, arXiv preprint arXiv:1308.0850 (2013).

        Returns
        -------
        states : :class:`~tensor.TensorVariable`
            Next states of the network.
        cells : :class:`~tensor.TensorVariable`
            Next cell activations of the network.

        """
        def slice_last(x, no):
            return x[:, no*self.dim: (no+1)*self.dim]

        nonlinearity = self.children[0].apply

        activation = tensor.dot(states, self.W_state) + inputs
        in_gate = tensor.nnet.sigmoid(slice_last(activation, 0) +
                                      cells * self.W_cell_to_in)
        forget_gate = tensor.nnet.sigmoid(slice_last(activation, 1) +
                                          cells * self.W_cell_to_forget)
        next_cells = (forget_gate * cells +
                      in_gate * nonlinearity(slice_last(activation, 2)))
        out_gate = tensor.nnet.sigmoid(slice_last(activation, 3) +
                                       next_cells * self.W_cell_to_out)
        next_states = out_gate * nonlinearity(next_cells)

        # Only update the states and cells of examples for which the mask
        # is 1; avoid relying on the truthiness of a symbolic variable.
        if mask is not None:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
            next_cells = (mask[:, None] * next_cells +
                          (1 - mask[:, None]) * cells)

        return next_states, next_cells

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.initial_state_[None, :], batch_size, 0),
                tensor.repeat(self.initial_cells[None, :], batch_size, 0)]
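

if __name__ == '__main__':
    # Minimal usage sketch of the brick above. It assumes the Blocks `Linear`
    # brick and the `IsotropicGaussian`/`Constant` initializers; the
    # dimensions (8 input features, hidden size 16) are purely illustrative.
    # The Linear brick supplies the `4 * dim`-wide input that the class
    # docstring requires.
    import numpy
    import theano
    from blocks.bricks import Linear
    from blocks.initialization import Constant, IsotropicGaussian

    hidden_dim = 16
    x = tensor.tensor3('x')  # (time, batch, features)
    x_to_lstm = Linear(name='x_to_lstm', input_dim=8,
                       output_dim=4 * hidden_dim,
                       weights_init=IsotropicGaussian(0.01),
                       biases_init=Constant(0))
    lstm = LSTM(dim=hidden_dim, weights_init=IsotropicGaussian(0.01))
    x_to_lstm.initialize()
    lstm.initialize()

    # `apply` scans over the time axis and returns the full sequences of
    # hidden states and cells.
    states, cells = lstm.apply(x_to_lstm.apply(x))

    f = theano.function([x], states)
    sample = numpy.ones((5, 3, 8), dtype=theano.config.floatX)
    print(f(sample).shape)  # expected: (5, 3, 16)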