#!/usr/bin/env python3
import speech_recognition as sr
import os
import gtts
from pydub import AudioSegment
from pydub.playback import play
import numpy as np
import pickle
from extract_audio_features import extract_audio_features
from record_audio import get_audio

# Constants
DURATION_IEMOCAP = 11  # seconds of audio to transcribe per recording
SAMPLING_RATE = 16000
MER_MODEL = '/content/mser-thesis-app/result_models/lr_mer_iemocap'
# MER_MODEL = '/content/mser-thesis-app/result_models/lsvm_mer_iemocap'
VECTORIZER = '/content/mser-thesis-app/mer_tfidf_iemocap.pkl'
SCALER = '/content/mser-thesis-app/mer_mfcc_mel_chroma_scaler_iemocap.pkl'
EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
LANG = 'en'

# Hyperparameters the TF-IDF vectorizer and logistic-regression model were
# trained with, kept for reference: the fitted objects themselves are loaded
# from the pickle files above.
VECTORIZER_PARAMS = {
    'ngram_range': (1, 2),
    'max_df': 0.95,
    'sublinear_tf': True,
    'min_df': 4,
    'stop_words': 'english',
    'max_features': 2200
}
LR_PARAMS = {
    'solver': 'newton-cg',
    'penalty': 'l2',
    'multi_class': 'auto',
    'max_iter': 800,
    'class_weight': None,
    'C': 0.56
}
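
# Sketch (an assumption, not from this file): how the pickled vectorizer could
# have been produced with the parameters above; `train_transcripts` is a
# hypothetical list of training sentences, and the scaler and model pickles
# would be created analogously.
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vectorizer = TfidfVectorizer(**VECTORIZER_PARAMS).fit(train_transcripts)
#   with open(VECTORIZER, 'wb') as f:
#       pickle.dump(vectorizer, f)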
"""
This function predicts an emotion of the given audio file with corresponding transript.
"""
def predict_emotion(text, filepath, saved_model_filename=MER_MODEL, scaler_filename=SCALER, vect_filename=VECTORIZER):
# Get the features
X_audio = extract_audio_features(filepath, mfcc=True, mel=True, chroma=True).reshape(1, -1)
# Create a normalization object
scaler = MinMaxScaler()
# Load the scaler
with open(scaler_filename, 'rb') as scaler_file:
scaler = pickle.load(scaler_file)
# Normalize audio training data
X_audio = scaler.transform(X_audio)
# Initialize Vectorizer
vectorizer = TfidfVectorizer(**vectorizer_params)
# Load the vectorizer
with open(vect_filename, 'rb') as vect_file:
vectorizer = pickle.load(vect_file)
# Get the tf-idf text features
X_text = vectorizer.transform([text]).toarray()
# X = np.hstack(([text], X_audio[0])).reshape(1, -1)
X = np.concatenate((X_text, X_audio), axis=1)
model = LogisticRegression(**LR_PARAMS)
# Load the model
with open(saved_model_filename, 'rb') as f:
model = pickle.load(f)
return emotions[model.predict(X)[0]]
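
# Example usage of predict_emotion (hypothetical file path; any recording with
# a known transcript works the same way):
#
#   emotion = predict_emotion('I cannot believe you did that',
#                             '/content/mser-thesis-app/samples/utterance.wav')
#   print(emotion)  # one of EMOTIONS, e.g. 'angry'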
"""
This method runs the EmotionRecognition.
"""
def run_emotion_recognizer():
# Record an audio from a microphone
audio, sample_rate, audio_file = get_audio()
# use the audio file as the audio source
r = sr.Recognizer()
with sr.AudioFile(audio_file) as source:
audio = r.record(source, duration=DURATION_IEMOCAP) # read the entire audio file
# Resource: https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py
# Recognize speech using Google Speech Recognition
try:
text = r.recognize_google(audio, language=lang)
print("Google Speech Recognition thinks you said: " + text)
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Speech Recognition service; {0}".format(e))
# Predict an emotion
pred = predict_emotion(text, audio_file)
print('\n' + '-'*40)
print(f'Predicted emotion: {pred}')
# make request to google to get synthesis
tts = gtts.gTTS(f'You are {pred}', lang=LANG)
output_file = 'output_emotion.mp3'
tts.save(output_file)
sound = AudioSegment.from_mp3(output_file)
play(sound)
# Delete the files
os.remove(filepath)
os.remove(output_file)
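

# Minimal entry point: a sketch assuming get_audio() takes no arguments and
# handles microphone setup itself, as its use above suggests.
if __name__ == '__main__':
    run_emotion_recognizer()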