activation.py
import torch
import torch.nn as nn


# Softmax activation used in the attention mechanism and the final projection;
# it turns the given logits/tensor into a probability distribution.
class Softmax(nn.Module):
    def __init__(self, axis, keepdim=True):
        """
        Softmax:
            The softmax function computes a new vector where each element is transformed
            such that all values are positive and sum to 1.
            In other words, it maps the input vector to probabilities between 0 and 1
            according to the relative magnitude of each element; the output sums to 1
            along the given axis.
        """
        super(Softmax, self).__init__()
        self.axis = axis  # axis along which the softmax is applied
        self.keepdims = keepdim  # keep the reduced axis as size 1 (needed for the subtraction below to broadcast) or squeeze it

    def forward(self, x):
        """
        Input:
            x: attention scores / logits
        Output:
            probability distribution along the given axis
        """
        # logsumexp is used for numerical stability: it avoids overflow when exponentiating
        # large values (a plain exp-then-normalize would also work, but can overflow or underflow).
        logsumexp = torch.logsumexp(x, dim=self.axis, keepdim=self.keepdims)
        prob = torch.exp(x - logsumexp)  # element-wise subtraction in log space, then exponentiate
        return prob  # output probabilities
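

# A minimal usage sketch (not part of the original file): apply Softmax along the
# last axis of a random score matrix. Each row should sum to 1 and agree with
# torch.softmax, which computes the same function.
if __name__ == "__main__":
    scores = torch.randn(2, 4)  # e.g. raw attention scores
    softmax = Softmax(axis=-1)
    probs = softmax(scores)
    print(probs)
    print(probs.sum(dim=-1))  # each entry should be 1.0 (up to floating-point error)
    print(torch.allclose(probs, torch.softmax(scores, dim=-1)))  # expected: True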