# MIR_Assignment_2.py
from datetime import datetime
from pathlib import Path
from typing import List, Tuple, Union, Callable
import torch
import torch.nn as nn
import torchaudio
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd
from MIR_Assignment_2_predefined import MTATDataset, SpecModel, AudioModel, get_roc_auc, get_tpr_fpr
class OnTheFlyDataset(MTATDataset):
def __init__(self, dir_path:str, split:str='train', num_max_data:int=4000, sr:int=16000):
super().__init__(dir_path, split, num_max_data, sr)
def __getitem__(self, idx):
        '''
        __getitem__ returns the idx-th data sample of the dataset.
        For a music-tagging dataset, it has to return (audio_sample, label) of the idx-th data sample.
        OnTheFlyDataset loads the audio file from disk every time __getitem__ is called.
        In this function, you have to implement the following:
        1) Get the file path of the idx-th data sample (use self.labels['mp3_path'])
        2) Load the audio from that file path
        3) Resample the audio to a sampling rate of self.sr (you can use torchaudio.functional.resample)
        4) Return the resampled audio sample and the label (tag data) of the data sample
        Output
        audio_sample (torch.FloatTensor): The resampled audio waveform
        label (torch.FloatTensor): A 50-dimensional tensor. Each dimension has a value of either 0 or 1.
        If the n-th dimension's value is 1, the n-th tag is True for this data sample.
        TODO: Complete this function
        '''
audio_sample = None
label = None
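        # A minimal sketch that replaces the placeholders above, not the official
        # solution. It assumes MTATDataset stores the dataset root as self.dir_path,
        # and that self.labels is a pandas DataFrame whose 'mp3_path' column holds
        # relative paths and whose remaining columns are the 50 binary tags.
        row = self.labels.iloc[idx]
        audio, file_sr = torchaudio.load(str(Path(self.dir_path) / row['mp3_path']))
        audio_sample = torchaudio.functional.resample(audio, file_sr, self.sr).mean(dim=0)
        label = torch.from_numpy(row.drop('mp3_path').values.astype('float32'))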
return audio_sample, label
class PreProcessDataset(MTATDataset):
def __init__(self, dir_path:str, split:str='train', num_max_data:int=4000, sr:int=16000):
super().__init__(dir_path, split, num_max_data, sr)
self.pre_process_and_save_data()
def pre_process_and_save_data(self):
        '''
        pre_process_and_save_data loads every audio sample in the dataset, resamples it, and saves it to a .pt file.
        In this function, you have to implement the following:
        1) For every data sample in the dataset, check whether pre-processed data already exists
        - You can get the data sample path from self.labels['mp3_path'].values
        - The path of the pre-processed data can be in the same directory, but with a different suffix
        - You can make it with Path(mp3_path).with_suffix('.pt')
        2) If it doesn't exist, do the following:
        a) Load the audio file
        b) Resample the audio file to a sampling rate of self.sr
        c) Get the label of this audio file
        d) Save {'audio': audio_tensor, 'label': label_tensor} with torch.save
        Output
        None
        TODO: Complete this function
        '''
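        # A minimal sketch under the same assumptions about MTATDataset as in
        # OnTheFlyDataset.__getitem__ above (self.dir_path root, 'mp3_path'
        # column, remaining columns of self.labels holding the binary tags).
        for i, mp3_path in enumerate(self.labels['mp3_path'].values):
            pt_path = (Path(self.dir_path) / mp3_path).with_suffix('.pt')
            if pt_path.exists():
                continue  # this sample is already pre-processed
            audio, file_sr = torchaudio.load(str(Path(self.dir_path) / mp3_path))
            audio = torchaudio.functional.resample(audio, file_sr, self.sr).mean(dim=0)
            label = torch.from_numpy(
                self.labels.iloc[i].drop('mp3_path').values.astype('float32'))
            torch.save({'audio': audio, 'label': label}, pt_path)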
def __getitem__(self, idx):
        '''
        __getitem__ returns the idx-th data sample of the dataset.
        For a music-tagging dataset, it has to return (audio_sample, label) of the idx-th data sample.
        PreProcessDataset loads the pre-processed .pt file every time __getitem__ is called.
        In this function, you have to implement the following:
        1) Get the .pt file path of the idx-th data sample (use self.labels)
        2) Load the pre-processed data from that file path (use torch.load)
        3) Return the audio sample and the label (tag data) of the data sample
        TODO: Complete this function
        '''
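        # A minimal sketch under the same path assumptions as above.
        pt_path = (Path(self.dir_path) / self.labels.iloc[idx]['mp3_path']).with_suffix('.pt')
        data = torch.load(pt_path)
        audio_sample, label = data['audio'], data['label']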
return audio_sample, label
class OnMemoryDataset(MTATDataset):
def __init__(self, dir_path:str, split:str='train', num_max_data:int=4000, sr:int=16000):
super().__init__(dir_path, split, num_max_data, sr)
self.loaded_audios = self.load_audio()
def load_audio(self):
        '''
        In this function, you have to load every audio file in the dataset, resample it,
        and store the data in memory as a Python variable.
        For each data sample in the dataset:
        a) Load the audio
        b) Resample it to self.sr
        c) Append it to total_audio_datas
        Output:
        total_audio_datas (list): A list of torch.FloatTensor. The i-th item of the list corresponds to the audio sample of the i-th data sample.
        Each item is an audio sample in torch.FloatTensor with a sampling rate of self.sr.
        '''
total_audio_datas = []
### Write your code from here
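        # A minimal sketch under the same assumptions about MTATDataset as above.
        for mp3_path in tqdm(self.labels['mp3_path'].values, desc='Loading audio'):
            audio, file_sr = torchaudio.load(str(Path(self.dir_path) / mp3_path))
            audio = torchaudio.functional.resample(audio, file_sr, self.sr).mean(dim=0)
            total_audio_datas.append(audio)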
return total_audio_datas
def __getitem__(self, idx):
        '''
        __getitem__ returns the idx-th data sample of the dataset.
        For a music-tagging dataset, it has to return (audio_sample, label) of the idx-th data sample.
        OnMemoryDataset returns the pre-loaded audio data saved in self.loaded_audios whenever __getitem__ is called.
        In this function, you have to implement the following:
        1) Get the pre-loaded audio data from self.loaded_audios
        2) Return the audio sample and the label (tag data) of the data sample
        TODO: Complete this function
        '''
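        # A minimal sketch: reuse the pre-loaded audio and read the tags from
        # self.labels (same DataFrame layout assumption as above).
        audio_sample = self.loaded_audios[idx]
        label = torch.from_numpy(
            self.labels.iloc[idx].drop('mp3_path').values.astype('float32'))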
return audio_sample, label
class YourModel(AudioModel):
def __init__(self, sr, n_fft, hop_length, n_mels, hidden_size, num_output):
super().__init__(sr, n_fft, hop_length, n_mels, hidden_size, num_output)
# TODO: Implement your own model
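        # A minimal sketch of one possible architecture, not a prescribed one.
        # It assumes the parent AudioModel does not already define these layers
        # (if it does, adapt the names accordingly).
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
        self.conv = nn.Sequential(
            nn.Conv1d(n_mels, hidden_size, kernel_size=3, padding=1),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.classifier = nn.Linear(hidden_size, num_output)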
def forward(self, x):
# TODO: Implement your own forward pass
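        # A minimal sketch matching the layers defined above:
        # log-mel spectrogram -> 1-D conv encoder -> pooled embedding.
        mel = torch.log(self.mel_spec(x) + 1e-8)
        feat = self.conv(mel).squeeze(-1)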
        return torch.sigmoid(self.classifier(feat))
def get_conv1d_output_with_linear(atensor, conv1d_linear, kernel_size):
batch_size, in_channels, sequence_length = atensor.shape
# TODO: Implement the forward pass
# Assume stride=1 and padding=0 for simplicity
# To match with the result of nn.Conv1d, flatten the input tensor without changing dimension order
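    # A minimal sketch: gather sliding windows with Tensor.unfold, flatten each
    # window in (channel, kernel position) order so it lines up with nn.Conv1d's
    # flattened weight layout of (out_channels, in_channels * kernel_size), then
    # apply the Linear layer to every window at once.
    windows = atensor.unfold(2, kernel_size, 1)    # (batch, in_ch, n_win, k)
    windows = windows.permute(0, 2, 1, 3).reshape(
        batch_size, -1, in_channels * kernel_size)  # (batch, n_win, in_ch * k)
    out = conv1d_linear(windows).permute(0, 2, 1)   # (batch, out_ch, n_win)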
    return out
def get_conv2d_output_with_linear(atensor, conv2d_linear, kernel_size):
batch_size, in_channels, height, width = atensor.shape
# TODO: Implement the forward pass
# Assume stride=1 and padding=0 for simplicity
    # To match with the result of nn.Conv2d, flatten the input tensor without changing dimension order
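    # A minimal sketch mirroring the 1-D case: unfold height then width so each
    # patch flattens in (channel, kernel row, kernel col) order, matching
    # nn.Conv2d's flattened weight layout of (out_channels, in_ch * k * k).
    patches = atensor.unfold(2, kernel_size, 1).unfold(3, kernel_size, 1)
    out_h, out_w = patches.shape[2], patches.shape[3]  # patches: (batch, C, oh, ow, k, k)
    patches = patches.permute(0, 2, 3, 1, 4, 5).reshape(
        batch_size, out_h * out_w, in_channels * kernel_size * kernel_size)
    out = conv2d_linear(patches).permute(0, 2, 1).reshape(
        batch_size, -1, out_h, out_w)                  # (batch, out_ch, oh, ow)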
    return out
def get_binary_cross_entropy(pred:torch.Tensor, target:torch.Tensor, eps=1e-8):
    '''
    pred (torch.Tensor): predicted value of a neural network model for a given input (assume the value is the output of a sigmoid function)
    target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
    output (torch.Tensor): mean binary cross-entropy loss over every sample
    '''
# TODO: Complete this function
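    # A minimal sketch: elementwise binary cross-entropy with eps guarding
    # against log(0), averaged over every element of the batch.
    bce = -(target * torch.log(pred + eps) + (1 - target) * torch.log(1 - pred + eps))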
    return bce.mean()
def get_precision_and_recall(pred:torch.Tensor, target:torch.Tensor, threshold:float):
'''
    This function calculates the precision and recall for a given (prediction, target, threshold)
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
output
precision (torch.Tensor): (Number of true positive)/(Number of total positive predictions)
recall (torch.Tensor): (Number of true positive)/(Number of total positive ground-truth)
IMPORTANT:
If there is no positive prediction, precision has to be 1
If there is no positive ground-truth, recall has to be 1
TODO: Complete this function
'''
# Write your code here
precision, recall = None, None
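    # A minimal sketch that replaces the placeholders above: binarize the
    # predictions at the threshold and count true positives over all elements
    # (i.e., micro-averaged over every tag of every sample -- an assumption,
    # since the docstring does not specify the averaging scheme).
    pred_pos = pred >= threshold
    target_pos = target >= 0.5
    true_positive = (pred_pos & target_pos).sum().float()
    num_pred_pos = pred_pos.sum()
    num_target_pos = target_pos.sum()
    # Docstring convention: an empty denominator yields 1, not nan
    precision = true_positive / num_pred_pos if num_pred_pos > 0 else torch.tensor(1.0)
    recall = true_positive / num_target_pos if num_target_pos > 0 else torch.tensor(1.0)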
    # Be careful not to return nan because of division by zero
assert not (torch.isnan(precision) or torch.isnan(recall))
return precision, recall
def get_precision_recall_auc(pred:torch.Tensor, target:torch.Tensor, num_grid=500):
'''
    This function returns the PR-AUC value for a given prediction and target.
Assume pred.shape == target.shape
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
output (torch.Tensor): Area Under Curve value for Precision-Recall Curve, using rectangle method
TODO: Complete this function using get_precision_and_recall
'''
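    # A minimal sketch using the rectangle method: sweep thresholds from high
    # to low (so recall is non-decreasing) and accumulate precision * delta-recall.
    auc = torch.tensor(0.0)
    prev_recall = torch.tensor(0.0)
    for thresh in reversed(torch.linspace(0, 1, num_grid)):
        precision, recall = get_precision_and_recall(pred, target, threshold=thresh)
        auc = auc + precision * (recall - prev_recall)
        prev_recall = recall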
    return auc
def draw_pr_auc_curve(pred:torch.Tensor, target:torch.Tensor, num_grid=500):
'''
    This function draws the PR curve for a given prediction and target and returns its (recall, precision) points.
Assume pred.shape == target.shape
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
'''
pr_curve = []
for thresh in reversed(torch.linspace(0,1,num_grid)):
precision, recall = get_precision_and_recall(pred, target, threshold=thresh)
pr_curve.append((recall, precision))
pr_curve = torch.tensor(pr_curve)
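    # Since the docstring says "draws", also render the curve with matplotlib
    # (axis labels are our choice; the assignment does not specify them).
    plt.plot(pr_curve[:, 0], pr_curve[:, 1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')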
return pr_curve
def get_f1_score(pred:torch.Tensor, target:torch.Tensor, threshold:float):
'''
    This function calculates the F1 score for a given (prediction, target, threshold)
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
output
f1_score (torch.Tensor): 2 * (precision * recall) / (precision + recall)
IMPORTANT:
If there is no positive prediction, precision has to be 1
If there is no positive ground-truth, recall has to be 1
'''
# Write your code here
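    # A minimal sketch reusing get_precision_and_recall; guard the case where
    # precision + recall == 0 so the harmonic mean does not divide by zero.
    precision, recall = get_precision_and_recall(pred, target, threshold)
    denom = precision + recall
    f1_score = 2 * precision * recall / denom if denom > 0 else torch.tensor(0.0)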
    return f1_score
def find_best_threshold_for_each_class(pred:torch.Tensor, target:torch.Tensor, num_grid=100):
'''
This function finds the best threshold for each class to maximize F1 score
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
output
best_thresholds (torch.Tensor): A tensor of best threshold for each class
'''
# Write your code here
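    # A minimal sketch: for each class (column), sweep num_grid thresholds in
    # [0, 1] and keep the one that maximizes that class's F1 score.
    thresholds = torch.linspace(0, 1, num_grid)
    best_thresholds = torch.zeros(pred.shape[1])
    for c in range(pred.shape[1]):
        f1s = torch.tensor([get_f1_score(pred[:, c], target[:, c], t) for t in thresholds])
        best_thresholds[c] = thresholds[f1s.argmax()]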
    return best_thresholds
def get_f1_score_for_each_class(pred, target, best_thresholds):
'''
This function calculates F1 score for each class
pred (torch.Tensor): predicted value of a neural network model for a given input
target (torch.Tensor): ground-truth label for a given input, given in multi-hot encoding
best_thresholds (torch.Tensor): A tensor of best threshold for each class
output
f1_scores (torch.Tensor): A tensor of F1 score for each class
'''
# Write your code here
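    # A minimal sketch: apply each class's own threshold to its column.
    f1_scores = torch.tensor([
        get_f1_score(pred[:, c], target[:, c], best_thresholds[c])
        for c in range(pred.shape[1])
    ])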
    return f1_scores
def collect_every_pred_and_label(model:nn.Module, data_loader:DataLoader, device='cuda'):
'''
This function collects every prediction and label of a given model and data_loader
model (nn.Module): A neural network model
data_loader (DataLoader): A DataLoader object that has test data
output
    every_pred (torch.Tensor): A tensor of every prediction of the model; device has to be 'cpu'
    every_label (torch.Tensor): A tensor of every ground-truth label from the data_loader; device has to be 'cpu'
'''
# Write your code here
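    # A minimal sketch. Assumes each batch from data_loader is (audio, label)
    # and that the model outputs per-tag probabilities.
    model.eval()
    model.to(device)
    every_pred, every_label = [], []
    with torch.no_grad():
        for audio, label in tqdm(data_loader, desc='Collecting predictions'):
            every_pred.append(model(audio.to(device)).cpu())
            every_label.append(label.cpu())
    every_pred = torch.cat(every_pred, dim=0)
    every_label = torch.cat(every_label, dim=0)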
    return every_pred, every_label
def get_audio_prediction(audio_path:str, model:nn.Module, best_thresholds:torch.Tensor, target_sr=16000):
'''
    This function takes an audio path, a model, per-class best thresholds, and a target sampling rate,
and returns the prediction of the model for the audio file.
audio_path (str): A path of audio file
model (nn.Module): A neural network model
best_thresholds (torch.Tensor): A tensor of best threshold for each class
    target_sr (int): The sampling rate to resample the audio to
    output
    audio (torch.Tensor): A tensor containing the loaded (and resampled) audio
    pred (list of str): A list of tags that are predicted to be True
CAUTION: Do not use external variable to get tag names. Use model.vocab to get tag names
'''
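    # A minimal sketch. As instructed, tag names come from model.vocab, which
    # is assumed to be an ordered sequence of the 50 tag names; the model is
    # assumed to accept a (1, num_samples) batch.
    audio, file_sr = torchaudio.load(audio_path)
    audio = torchaudio.functional.resample(audio, file_sr, target_sr).mean(dim=0)
    model.eval()
    with torch.no_grad():
        prob = model(audio.unsqueeze(0)).squeeze(0).cpu()
    pred = [tag for tag, p, t in zip(model.vocab, prob, best_thresholds) if p >= t]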
    return audio, pred
if __name__ == '__main__':
your_model = YourModel(sr=16000, n_fft=1024, hop_length=512, n_mels=48, num_output=50, hidden_size=32)
    your_model.load_state_dict(torch.load('your_model_best.pt', map_location='cpu'))
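    # Sanity check: a Linear layer applied to unfolded sliding windows should
    # reproduce nn.Conv1d exactly when given the same (reshaped) weights.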
in_channels = 10
out_channels = 2
kernel_size = 4
dummy_input = torch.randn(5, in_channels, 23)
conv1d = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=0)
output = conv1d(dummy_input)
print(output.shape)
conv1d_linear = nn.Linear(in_channels * kernel_size, out_channels)
conv1d_linear.weight.data = conv1d.weight.data.view(out_channels, -1).clone()
conv1d_linear.bias.data = conv1d.bias.data.clone()
linear_output = get_conv1d_output_with_linear(dummy_input, conv1d_linear, kernel_size)
assert linear_output.shape == output.shape, "Output tensors have different shapes"
assert torch.allclose(output, linear_output, atol=1e-6), "Output tensors are different"
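    # The same check for the 2-D case against nn.Conv2d.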
in_channels = 10
out_channels = 2
kernel_size = 4
dummy_input = torch.randn(5, in_channels, 13, 17)
conv2d = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1, padding=0)
output = conv2d(dummy_input)
conv2d_linear = nn.Linear(in_channels * kernel_size * kernel_size, out_channels)
conv2d_linear.weight.data = conv2d.weight.data.view(out_channels, -1).clone()
conv2d_linear.bias.data = conv2d.bias.data.clone()
linear_output = get_conv2d_output_with_linear(dummy_input, conv2d_linear, kernel_size)
assert linear_output.shape == output.shape, "Output tensors have different shapes"
assert torch.allclose(output, linear_output, atol=1e-6), "Output tensors are different"
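    # If we reach this point, both equivalence checks passed.
    print('Conv1d and Conv2d linear-equivalence checks passed.')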