#!/usr/bin/env python3
# Copyright 2022 Lucky Wong
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import torch


class Pcen(torch.nn.Module):
    """Trainable per-channel energy normalization (PCEN).

    This applies a fixed or learnable normalization by an exponential moving
    average smoother, and a compression.
    See https://arxiv.org/abs/1607.05666 for more details.
    """
    def __init__(self,
                 feat_dim: int,
                 alpha: float = 0.96,
                 smooth_coef: float = 0.04,
                 delta: float = 2.0,
                 root: float = 2.0,
                 floor: float = 1e-6):
        """PCEN constructor.

        Args:
            feat_dim: int, feature dimension
            alpha: float, exponent of the EMA smoother
            smooth_coef: float, smoothing coefficient of the EMA
            delta: float, bias added before compression
            root: float, one over the compression exponent (r in the paper)
            floor: float, offset added to the EMA smoother
        """
        super().__init__()
        self.floor = floor
        self.smooth = torch.nn.Parameter(torch.Tensor(feat_dim))
        torch.nn.init.constant_(self.smooth, smooth_coef)
        # The AGC (gain normalization) strength is controlled by the
        # parameter alpha ∈ [0, 1].
        self.alpha = torch.nn.Parameter(torch.Tensor(feat_dim))
        torch.nn.init.constant_(self.alpha, alpha)
        # Stabilized root compression further reduces the dynamic range via
        # the offset delta and the exponent 1 / root.
        self.delta = torch.nn.Parameter(torch.Tensor(feat_dim))
        torch.nn.init.constant_(self.delta, delta)
        self.root = torch.nn.Parameter(torch.Tensor(feat_dim))
        torch.nn.init.constant_(self.root, root)

    def apply_iir(self, x):
        """First-order IIR (infinite impulse response) forward filter,
        initialized with the first frame of the input.

        :param x (torch.Tensor): batch of (mel-)spectrograms,
            shape: [..., Frequency, Time]
        :return M: low-pass filtered version of the input spectrograms
        """
        s = torch.clamp(self.smooth, min=0.0, max=1.0)
        # M[0] = x[0]; M[t] = (1 - s) * M[t - 1] + s * x[t]
        M = [x[..., 0]]
        for t in range(1, x.size(-1)):
            m = (1. - s) * M[-1] + s * x[..., t]
            M.append(m)
        M = torch.stack(M, dim=-1)
        return M

    def forward(self,
                xs: torch.Tensor,
                xs_mask: torch.Tensor):
        """
        :param xs: input tensor (#batch, time, feat_dim)
        :param xs_mask: boolean input mask (#batch, 1, time)
        :return: PCEN-normalized tensor (#batch, time, feat_dim)
        """
        # Keep the learnable exponents in range: alpha <= 1 and root >= 1.
        alpha = torch.min(self.alpha, torch.ones(
            self.alpha.size(), device=self.alpha.device))
        root = torch.max(self.root, torch.ones(
            self.root.size(), device=self.root.device))
        # exchange the temporal dimension and the feature dimension
        xs = xs.transpose(1, 2)
        # mask batch padding
        if xs_mask is not None:
            xs.masked_fill_(~xs_mask, 0.0)
        ema_smoother = self.apply_iir(xs)
        # mask batch padding
        if xs_mask is not None:
            ema_smoother.masked_fill_(~xs_mask, 0.0)
        # back to (#batch, time, feat_dim) so the per-channel parameters
        # broadcast over the last dimension
        xs = xs.transpose(1, 2)
        ema_smoother = ema_smoother.transpose(1, 2)
        one_over_root = 1. / root
        xs = ((xs / (self.floor + ema_smoother) ** alpha + self.delta) ** one_over_root
              - self.delta ** one_over_root)
        return xs
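

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): runs a random batch of
    # 80-dim log-mel-like features through PCEN. The shapes and the
    # all-True mask (i.e. no padded frames) are assumptions, not part of
    # the original module.
    batch, time, feat_dim = 2, 100, 80
    pcen = Pcen(feat_dim)
    xs = torch.rand(batch, time, feat_dim)
    xs_mask = torch.ones(batch, 1, time, dtype=torch.bool)
    ys = pcen(xs, xs_mask)
    print(ys.shape)  # expected: torch.Size([2, 100, 80])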