Commit 77164ef: (Almost?) fully working interface
pie3636 committed Jan 22, 2023
1 parent 6949260
Showing 160 changed files with 19,341 additions and 106,153 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -2,7 +2,10 @@
/Code/.ipynb_checkpoints
/Code/I.mp3
/Code/tmp_process*
models/rule_based.joblib
Flask_VT/venv
Flask_VT/.idea
models/rule_based.joblib
Flask_VT/*.wav
Flask_VT/__pycache__
Flask_VT/models/rule_based.joblib

10 changes: 5 additions & 5 deletions Code/Demo.ipynb
@@ -38,13 +38,13 @@
" super(CNNRegressor, self).__init__()\n",
" self.cnn_layer1 = nn.Sequential(nn.Conv2d(1, 16, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.1), nn.BatchNorm2d(16), nn.MaxPool2d(kernel_size=2))\n",
" self.cnn_layer2 = nn.Sequential(nn.Conv2d(16, 32, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.2), nn.BatchNorm2d(32), nn.MaxPool2d(kernel_size=2))\n",
" self.linear_layer1 = nn.Linear(32*30*6 + 8, 64)\n",
" self.linear_layer1 = nn.Linear(32*30*6 + 8, 128)\n",
" self.dropout1 = nn.Dropout(0.5)\n",
" self.activ1 = nn.ReLU()\n",
" self.linear_layer_p = nn.Linear(64, 32)\n",
" self.linear_layer_p = nn.Linear(128, 64)\n",
" self.dropout_p = nn.Dropout(0.5)\n",
" self.activ_p = nn.ReLU()\n",
" self.linear_layer2 = nn.Linear(32, 2)\n",
" self.linear_layer2 = nn.Linear(64, 2)\n",
" self.activ2 = nn.Sigmoid()\n",
" \n",
" def forward(self, images, features):\n",
@@ -62,10 +62,10 @@
" self.linear_layer1 = nn.Linear(32*30*3 + 7, 64)\n",
" self.dropout1 = nn.Dropout(0.5)\n",
" self.activ1 = nn.ReLU()\n",
" self.linear_layer_p = nn.Linear(64, 32)\n",
" self.linear_layer_p = nn.Linear(128, 64)\n",
" self.dropout_p = nn.Dropout(0.5)\n",
" self.activ_p = nn.ReLU()\n",
" self.linear_layer2 = nn.Linear(32, num_classes)\n",
" self.linear_layer2 = nn.Linear(64, num_classes)\n",
" \n",
" def forward(self, images, features):\n",
" cnn2 = self.cnn_layer2(self.cnn_layer1(images.unsqueeze(1)))\n",
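The widened layers still consume a 32*30*6-dimensional CNN output. As a minimal sanity check of that flatten size (a sketch assuming 128-mel spectrograms resized to width 31, the max_w value used in Flask_VT/app.py below):

import torch
import torch.nn as nn

# Conv/pool stack mirroring cnn_layer1/cnn_layer2; dropout and batch norm are
# omitted here because they do not change spatial dimensions.
cnn = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=3, padding='valid'), nn.MaxPool2d(2),
    nn.Conv2d(16, 32, kernel_size=3, padding='valid'), nn.MaxPool2d(2),
)
print(cnn(torch.zeros(1, 1, 128, 31)).shape)  # torch.Size([1, 32, 30, 6]) -> 32*30*6 features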
167 changes: 80 additions & 87 deletions Code/audio_crop.ipynb


162 changes: 53 additions & 109 deletions Code/neural_network.ipynb


184 changes: 87 additions & 97 deletions Code/rule_based.ipynb


253 changes: 191 additions & 62 deletions Flask_VT/app.py
@@ -1,89 +1,218 @@
from flask import Flask, render_template, request, jsonify
from joblib import load
import base64
import json
import math

import librosa
import numpy as np
import wave
import contextlib
from processing import extract_formant
import parselmouth
import torch
import torch.nn as nn
from flask import Flask, render_template, request, jsonify
from joblib import load
from parselmouth import praat
from pydub import AudioSegment
from pydub.silence import detect_leading_silence
from scipy.io import wavfile
from skimage.transform import resize

from feedback import vowel_feedback, pron_hack

app = Flask(__name__)


@app.route('/upload', methods=[ 'POST'])
# The neural regression definition
class CNNRegressor(nn.Module):
def __init__(self):
super(CNNRegressor, self).__init__()
self.cnn_layer1 = nn.Sequential(nn.Conv2d(1, 16, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.1),
nn.BatchNorm2d(16), nn.MaxPool2d(kernel_size=2))
self.cnn_layer2 = nn.Sequential(nn.Conv2d(16, 32, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.2),
nn.BatchNorm2d(32), nn.MaxPool2d(kernel_size=2))
self.linear_layer1 = nn.Linear(32 * 30 * 6 + 8, 128)
self.dropout1 = nn.Dropout(0.5)
self.activ1 = nn.ReLU()
self.linear_layer_p = nn.Linear(128, 64)
self.dropout_p = nn.Dropout(0.5)
self.activ_p = nn.ReLU()
self.linear_layer2 = nn.Linear(64, 2)
self.activ2 = nn.Sigmoid()

def forward(self, images, features):
cnn2 = self.cnn_layer2(self.cnn_layer1(images.unsqueeze(1)))
cnn_vec = cnn2.reshape(cnn2.shape[0], -1)
out = self.dropout1(self.activ1(self.linear_layer1(torch.cat((cnn_vec, features), dim=1))))
return self.activ2(self.linear_layer2(self.activ_p(self.dropout_p(self.linear_layer_p(out)))))


# The neural classification model
class CNNClassifier(nn.Module):
def __init__(self, num_classes=10):
super(CNNClassifier, self).__init__()
self.cnn_layer1 = nn.Sequential(nn.Conv2d(1, 16, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.1),
nn.BatchNorm2d(16), nn.MaxPool2d(kernel_size=2))
self.cnn_layer2 = nn.Sequential(nn.Conv2d(16, 32, kernel_size=3, padding='valid'), nn.ReLU(), nn.Dropout(0.2),
nn.BatchNorm2d(32), nn.MaxPool2d(kernel_size=2))
self.linear_layer1 = nn.Linear(32 * 30 * 3 + 7, 64)
self.dropout1 = nn.Dropout(0.5)
self.activ1 = nn.ReLU()
self.linear_layer_p = nn.Linear(128, 64)
self.dropout_p = nn.Dropout(0.5)
self.activ_p = nn.ReLU()
self.linear_layer2 = nn.Linear(64, num_classes)

def forward(self, images, features):
cnn2 = self.cnn_layer2(self.cnn_layer1(images.unsqueeze(1)))
cnn_vec = cnn2.reshape(cnn2.shape[0], -1)
out = self.dropout1(self.activ1(self.linear_layer1(torch.cat((cnn_vec, features), dim=1))))
out2 = self.linear_layer2(self.activ_p(self.dropout_p(self.linear_layer_p(out))))
return out2
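
Note (an observation, not part of the diff): as in the Demo.ipynb change above, linear_layer1 here outputs 64 features while linear_layer_p expects 128, so constructing a fresh CNNClassifier and calling it would raise a shape error. Loading still works below because torch.load restores a fully pickled module together with its saved parameters; these __init__ sizes are only exercised when a new instance is built.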


rule_clf = load('models/rule_based.joblib') # The rule-based classifier
nn_clf = torch.load('models/neural_classifier.pt', map_location=torch.device('cpu')) # The neural classifier
scaler = load('models/scaler.joblib') # The scaler; transforms formants so that they have a mean of 0 and a variance of 1
regressor = torch.load('models/neural_regressor.pt', map_location=torch.device('cpu')) # The vowel detection model

rule_based = True

idx2key = ['2', '9', 'a', 'a~', 'e', 'E', 'i', 'O', 'o', 'o~', 'u', 'U~+', 'y'] # All possible vowels
valid = [0, 1, 2, 4, 5, 6, 7, 8, 10, 12] # Vowels we consider here (depends on the classifier)
all_phonemes = ['l', 'm', 'p', 's', 't', 't1'] # Phonemes that can be before the vowel

tmp_wav = 'tmp_process.wav'
tmp_wav_2 = 'tmp_process_trimmed.wav'
max_w = 31 # Image width to resize to
max_w_2 = max_w if rule_based else 20


@app.route('/upload', methods=['POST'])
def upload():
f = open("file.wav", 'wb')
f.write(request.data)
f.close()
return jsonify(message='Bon Week-end')
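
A hypothetical local test of this endpoint (host, port and file name are assumptions; requires the requests package):

import requests

with open('recording.wav', 'rb') as f:
    r = requests.post('http://127.0.0.1:5000/upload', data=f.read())
print(r.json())  # {'message': 'Bon Week-end'}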




@app.route('/')
def index():
return render_template("index.html")

clf = load('models/rule_based.joblib') # The classifier
scaler = load('models/scaler.joblib') # The scaler; transforms formants so that they have a mean of 0 and a variance of 1

input_file = 'uploads/I.wav'
target_vowel = 'y' # Possible values: 'a', 'e', 'E', 'i', 'o', 'O', 'u', 'y', '2', '9'
speaker_gender = 'm' # Possible values: 'f' or 'm'
previous_phoneme = 'm' # Possible values: 'l', 'm', 'p', 's', 't' or 't1' (last one shouldn't be used)

@app.route('/predict_rule', methods=['GET'])
def predict_rule():
idx2key = ['2', '9', 'a', 'a~', 'e', 'E', 'i', 'O', 'o', 'o~', 'u', 'U~+', 'y'] # All possible vowels
valid = [0, 1, 2, 4, 5, 6, 7, 8, 10, 12] # Vowels we consider here (depends on the classifier)

with contextlib.closing(wave.open(input_file, 'r')) as f: # Open file
frames = f.getnframes()
rate = f.getframerate()
duration = frames / float(rate)

data = json.loads(request.data)
speaker_gender = data['gender']
audio = data['audio'][22:]
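# the slice presumably strips a data-URL prefix ("data:audio/wav;base64," is exactly 22 characters)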
des_vowel = data['des_vowel']
previous_phoneme = data['prev_phoneme']
word_ends_with_r = data['r_word']
input_file = "input.wav"

# TODO remove
#return jsonify(predicted_vowel='a',
# confidence=0.85,
# feedback=vowel_feedback(des_vowel, 'a'),
# add_feedback=pron_hack(des_vowel, 'a'))

audio = base64.b64decode(audio)

with open(input_file, 'wb') as f:
f.write(audio)

# Remove leading and trailing silences
sound = AudioSegment.from_file(input_file)
trim_leading_silence = lambda x: x[detect_leading_silence(x, silence_threshold=-25):]
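# trailing silence is removed by reversing, trimming the now-leading silence, then reversing back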
trimmed = trim_leading_silence(trim_leading_silence(sound).reverse()).reverse()
trimmed.export(tmp_wav, format='wav', bitrate='768k')

# Generate log-melspectrogram
try:
y, sr = librosa.load(tmp_wav)
except ValueError:
print('The file is too silent to analyze!')
return jsonify(error='The file is too silent to analyze!')

mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=512, hop_length=512)
mels = np.log(mels + 1e-9) # add small number to avoid log(0)

# Rescale mel-spectrogram
mels_std = (mels - mels.min()) / (mels.max() - mels.min())
melspec = (mels_std * 255).astype(np.uint8)

melspec = np.flip(melspec, axis=0) # put low frequencies at the bottom of the image
melspec = 255 - melspec

# Feed log-melspectrogram to regression model to predict the start and end of the vowel
melspec2 = resize(melspec, (melspec.shape[0], max_w_2), anti_aliasing=False)
melspec = resize(melspec, (melspec.shape[0], max_w), anti_aliasing=False)
input_tensor = torch.tensor(melspec).float()
input_features = torch.tensor(
[speaker_gender == 'f', not word_ends_with_r, *[previous_phoneme == x for x in all_phonemes]])
pred = regressor(input_tensor.unsqueeze(0), input_features.unsqueeze(0))[0]
vowel_start = pred[0].item()
vowel_end = pred[1].item()
if vowel_start >= vowel_end:
print('The model predicted that the vowel has negative duration!')
return jsonify(error='The model predicted that the vowel has negative duration!')

# Trim file at start and end to only have the vowel
sample_rate, wave_data = wavfile.read(tmp_wav)
duration = len(wave_data) / sample_rate
start_sample = int(duration * vowel_start * sample_rate)
end_sample = int(duration * vowel_end * sample_rate)
wavfile.write(tmp_wav_2, sample_rate, wave_data[start_sample:end_sample])
duration = len(wave_data[start_sample:end_sample]) / sample_rate
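# e.g. (hypothetical numbers) a 1.2 s clip at 22050 Hz with pred = (0.25, 0.60) keeps samples 6615:15876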

if rule_based:
# Extract formants
sound = parselmouth.Sound(tmp_wav_2)
point_process = praat.call(sound, "To PointProcess (periodic, cc)", math.ceil(3 / duration + 0.000001), 300)
formants = praat.call(sound, "To Formant (burg)", 0, 5, 5000, 0.025, 50)
num_points = praat.call(point_process, "Get number of points")
f_lists = [[] for i in range(5)]
for point in range(1, num_points + 1):
t = praat.call(point_process, "Get time from index", point)
for i in range(4):
f_lists[i].append(praat.call(formants, "Get value at time", i + 1, t, 'Hertz', 'Linear'))
f_lists = [[x for x in f_list if not math.isnan(x)] for f_list in f_lists]
# Compute the average of formants
formants = []
try:
formants = extract_formant(input_file, start_time=0, end_time=duration,
f0min=math.ceil(3 / duration + 0.000001), n_formants=4)
for i in range(4):
formants.append(sum(f_lists[i]) / len(f_lists[i]))
except ZeroDivisionError:
print('The file is too short to analyze!')
print('The file is too short/empty to analyze!')
return jsonify(error='The file is too short/empty to analyze!')

# Add additional features (gender, previous phoneme)
features = formants
features.append(speaker_gender == 'f')
for prev in 'lmpst':
features.append(previous_phoneme == prev)
features.append(previous_phoneme == 't1')
input_features = torch.cat([input_features[0:1], input_features[2:]]).cpu()
features = torch.cat((torch.tensor(formants), input_features)).numpy()
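# 4 mean formants + gender flag + 6 previous-phoneme flags = 11 features (the r_word flag is sliced out above)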

# Rescale formants
features[:4] = scaler.transform(np.array(features[:4]).reshape(1, -1))[0]

# Prediction with probabilities
pred = clf.predict_proba([features]) # Probabilities
final_vowel = np.argmax(pred)
final_confidence = pred[0][final_vowel] # Best score
final_vowel = idx2key[valid[final_vowel]] # Actual prediction
pred = rule_clf.predict_proba([features]) # Probabilities
else:
# Neural network
input_tensor = torch.tensor(melspec2).float()
input_features = torch.cat([input_features[0:1], input_features[2:]])
pred = nn.Softmax(dim=1)(nn_clf(input_tensor.unsqueeze(0), input_features.unsqueeze(0))).detach().numpy()
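# softmax is applied here because CNNClassifier.forward returns raw logits (linear_layer2 has no activation)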

print('Vowel ', 'Confidence')
print('-' * 25)
for i in range(len(valid)):
vowel = idx2key[valid[i]]
print(f'{vowel:<6} {pred[0][i]:.3f}', '=' * int(pred[0][i] * 100))
final_vowel = np.argmax(pred)
final_confidence = pred[0][final_vowel] # Best score
final_vowel = idx2key[valid[final_vowel]] # Actual prediction

print(f'Prediction: /{final_vowel}/, confidence: {final_confidence:.3f}')
print('Vowel ', 'Confidence')
print('-' * 25)
for i in range(len(valid)):
vowel = idx2key[valid[i]]
print(f'{vowel:<6} {pred[0][i]:.3f}', '=' * int(pred[0][i] * 100))

n_feedback, r_feedback, o_feedback, f_feedback = vowel_feedback(target_vowel,final_vowel)
fb = [n_feedback, r_feedback, o_feedback, f_feedback]
print(fb)
print(f'Prediction: /{final_vowel}/, confidence: {final_confidence:.3f}')

ph = pron_hack(target_vowel,final_vowel)
print(ph)
return render_template("old_index.html", final_vowel=final_vowel, final_confidence=final_confidence, fb=fb, ph=ph)
return jsonify(predicted_vowel=final_vowel,
confidence=final_confidence,
feedback=vowel_feedback(des_vowel, final_vowel),
add_feedback=pron_hack(des_vowel, final_vowel)
)
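
A hypothetical client call matching the fields this handler reads (field names come from the code above; host, port and file name are assumptions, and note the route is registered for GET even though it reads a JSON body):

import base64
import requests

with open('recording.wav', 'rb') as f:
    audio_url = 'data:audio/wav;base64,' + base64.b64encode(f.read()).decode()
payload = {'gender': 'f', 'des_vowel': 'y', 'prev_phoneme': 'm',
           'r_word': False, 'audio': audio_url}
r = requests.get('http://127.0.0.1:5000/predict_rule', json=payload)
print(r.json())  # keys: predicted_vowel, confidence, feedback, add_feedback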


@app.route('/')
def index():
return render_template("index.html")


def rule_base(input_file,target_vowel,speaker_gender,previous_phoneme):
pass
@app.route('/privacy.html')
def privacy():
return render_template("privacy.html")


if __name__ == '__main__':
14 changes: 9 additions & 5 deletions Flask_VT/feedback.py
@@ -62,17 +62,21 @@ def vowel_feedback(vowel_des, vowel_per):
    elif nasal_per == True != nasal_des:
        n_feedback = feedback_dict['nasality'][1]

-    # returns tuple with four values
-    return n_feedback, r_feedback, o_feedback, f_feedback
+    # Turn the output into a bullet list
+    out_str = '<ul>'
+    for feedback in (n_feedback, r_feedback, o_feedback, f_feedback):
+        if feedback:
+            out_str += '<li>' + feedback + '</li>'
+    return out_str + '</ul>'


def pron_hack(vowel_des, vowel_per):
-    '''
-    Provides more in-depth feedback for some of the vowels.
+    """
+    Provides more in-depth feedback for some vowels.
    Input: desired vowel (string) and perceived vowel (string), for example: 'y' and 'u'
    Output: string, for example "Protrude your lips like in a kiss."
    If no advice exists, the function returns an empty string.
-    '''
+    """

    hack_dict = {
        'x_to_y': "Try saying “tea” in English. Now keep your tongue in the same position but protrude your lips like in a kiss.",
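With this change vowel_feedback returns a single HTML string rather than a 4-tuple, matching the new predict_rule handler in Flask_VT/app.py, which passes the result straight through jsonify. As a hypothetical illustration (feedback text invented), vowel_feedback('y', 'u') would now yield something like '<ul><li>Round your lips more.</li></ul>'.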