-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path5 Chapter Five Evaluation Plots.py
362 lines (314 loc) · 14.3 KB
/
5 Chapter Five Evaluation Plots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
%reset -f
#----------------------------------------------------
# Chapter Five
#----------------------------------------------------
# 5. Evaluation Plots
# Violin Plot for Prediction Probabilities
# Confusion Matrix Heatmap
# Test Accuracy Over Multiple Runs
# Residual Plot
# Precision-Recall Curve
# ROC Curve
# Cumulative Gain Chart
# Histogram of Prediction Probabilities
# Class Distribution Plot
# Box Plot for Prediction Probabilities
#----------------------------------------------------
import time
import numpy as np
import os
import warnings
from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import (
confusion_matrix, roc_auc_score,
average_precision_score,
balanced_accuracy_score,
precision_score, recall_score, f1_score, accuracy_score
)
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
#----------------------------------------------------
# Record the start time
start_time = time.time()
# Suppress warnings
warnings.filterwarnings("ignore")
# Parse BVH files
def parse_bvh(file_path):
print(f"Parsing file: {file_path}")
with open(file_path, 'r') as file:
lines = file.readlines()
header, motion_data = [], []
capture_data = False
for line in lines:
if "MOTION" in line:
capture_data = True
elif capture_data:
if line.strip().startswith("Frames") or line.strip().startswith("Frame Time"):
continue
motion_data.append(np.fromstring(line, sep=' '))
else:
header.append(line)
print(f"Finished parsing: {file_path}")
return header, np.array(motion_data)
def read_bvh(filename):
header, motion_data = parse_bvh(filename)
return motion_data
def interpolate_frames(motion_data, target_frame_count):
print(f"Interpolating frames to match target frame count: {target_frame_count}")
original_frame_count = len(motion_data)
original_time = np.linspace(0, 1, original_frame_count)
target_time = np.linspace(0, 1, target_frame_count)
interpolated_frames = []
for frame in np.array(motion_data).T:
interpolator = interp1d(original_time, frame.astype(float), kind='linear')
interpolated_frame = interpolator(target_time)
interpolated_frames.append(interpolated_frame)
return np.array(interpolated_frames).T
def find_max_frames(folder_path):
print(f"Finding maximum number of frames in folder: {folder_path}")
max_frames = 0
for filename in os.listdir(folder_path):
if filename.endswith('.bvh'):
motion_data = read_bvh(os.path.join(folder_path, filename))
max_frames = max(max_frames, len(motion_data))
return max_frames
def process_bvh_files(folder_path, max_frames):
print(f"Processing BVH files in folder: {folder_path}")
all_features = []
for filename in os.listdir(folder_path):
if filename.endswith('.bvh'):
full_path = os.path.join(folder_path, filename)
motion_data = read_bvh(full_path)
interpolated_frames = interpolate_frames(motion_data, max_frames)
all_features.append(interpolated_frames)
return np.array(all_features)
# Feature Extraction
def extract_motion_features(motion_data):
print("Extracting motion features...")
channels_per_joint = 3
num_joints = motion_data.shape[1] // channels_per_joint
features = []
for i in range(num_joints):
joint_rotations = motion_data[:, i * channels_per_joint:(i + 1) * channels_per_joint]
angular_velocity = np.diff(joint_rotations, axis=0)
acceleration = np.diff(angular_velocity, axis=0)
jerk = np.diff(acceleration, axis=0)
range_of_motion = np.max(joint_rotations, axis=0) - np.min(joint_rotations, axis=0)
joint_features = np.concatenate([angular_velocity.flatten(), acceleration.flatten(), jerk.flatten(), range_of_motion])
features.append(joint_features)
flattened_features = np.concatenate(features)
return flattened_features
# -------------------------------
# Load and preprocess data
train_folder_path = 'Small Dataset/'
# -------------------------------
max_frames_train = find_max_frames(train_folder_path)
all_features_train = process_bvh_files(train_folder_path, max_frames_train)
X_features = np.array([extract_motion_features(interpolate_frames(motion_data, max_frames_train)) for motion_data in all_features_train])
# Labels
C_Angry = [0] * 42
C_Depressed = [1] * 42
C_Neutral = [2] * 42
C_Proud = [3] * 32
Labels = np.array(C_Angry + C_Depressed + C_Neutral + C_Proud, dtype=np.int32)
# Standardize and normalize the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(all_features_train.reshape(len(all_features_train), -1))
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_standardized)
# Handle NaN values
imputer = SimpleImputer(strategy='mean')
X_cleaned = imputer.fit_transform(X_normalized)
# Final feature set and labels
X = X_cleaned
Y = Labels
# Feature selection
k = 500
selector = SelectKBest(chi2, k=k)
X_selected = selector.fit_transform(X, Y)
selected_indices = selector.get_support(indices=True)
print(f"Selected feature indices: {selected_indices}")
print(f"Shape of X before selection: {X.shape}")
print(f"Shape of X after selection: {X_selected.shape}")
# Train and test split
X_train, X_test, Y_train, Y_test = train_test_split(X_selected, Y, test_size=0.3, random_state=42)
# Train XGBoost Classifier
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, Y_train)
# Predictions
y_proba_XGB = xgb_classifier.predict_proba(X_test)
y_pred_XGB = xgb_classifier.predict(X_test)
# Plots------------------------------------------------------
# 1. Violin Plot for Prediction Probabilities
def plot_violin_with_medians(y_proba, y_true, classifier_name, class_labels, accuracies):
plt.figure(figsize=(8, 6))
sns.violinplot(data=y_proba, inner="quartile", palette="Set2")
for i in range(y_proba.shape[1]):
plt.scatter([i] * len(y_proba[:, i]), y_proba[:, i], color='black', alpha=0.6)
for i in range(y_proba.shape[1]):
median = np.median(y_proba[:, i])
plt.scatter(i, median, color='white', edgecolor='black', s=100, zorder=3)
for i, acc in enumerate(accuracies):
plt.text(i, 1.05, f'Acc: {acc:.2f}', horizontalalignment='center', fontsize=10, color='black')
plt.title(f'Violin Plot of Predicted Probabilities - {classifier_name}')
plt.xlabel("Class")
plt.ylabel("Predicted Probability")
plt.xticks(ticks=np.arange(len(class_labels)), labels=class_labels)
plt.show()
# 2. Confusion Matrix Heatmap
def plot_confusion_matrix_heatmap(y_true, y_pred, class_labels):
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix Heatmap')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
# 3. Test Accuracy Over Multiple Runs
def plot_test_accuracy(accuracies):
runs = range(1, len(accuracies) + 1)
plt.figure(figsize=(8, 6))
plt.bar(runs, accuracies)
plt.title("Test Accuracy Over Multiple Runs")
plt.xlabel("Run")
plt.ylabel("Test Accuracy")
plt.xticks(runs)
plt.ylim([0, 1])
plt.show()
# 4. Residual Plot (for classification)
def plot_residuals(y_true, y_pred, classifier_name):
residuals = y_true - y_pred
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=20)
plt.title(f'Residuals Distribution - {classifier_name}')
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()
# 5. Precision-Recall Curve (for multiclass)
def plot_precision_recall_curve(y_true, y_proba, n_classes, classifier_name):
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve
y_true_bin = label_binarize(y_true, classes=range(n_classes))
plt.figure(figsize=(8, 6))
for i in range(n_classes):
precision, recall, _ = precision_recall_curve(y_true_bin[:, i], y_proba[:, i])
plt.plot(recall, precision, label=f'Class {i}')
plt.title(f'Precision-Recall Curve - {classifier_name}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc="best")
plt.show()
# 6. ROC Curve (for multiclass)
def plot_multiclass_roc_curve(y_true, y_proba, n_classes, classifier_name):
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
y_true_bin = label_binarize(y_true, classes=range(n_classes))
fpr = dict()
tpr = dict()
roc_auc = dict()
plt.figure(figsize=(8, 6))
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_proba[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
plt.plot(fpr[i], tpr[i], label=f'Class {i} (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - {classifier_name}')
plt.legend(loc="lower right")
plt.grid()
plt.show()
# 7. Cumulative Gain Chart
def plot_cumulative_gain_chart(y_true, y_proba, classifier_name):
sorted_indices = np.argsort(-y_proba[:, 1])
sorted_y_true = y_true[sorted_indices]
cumulative_true = np.cumsum(sorted_y_true) / np.sum(sorted_y_true)
cumulative_total = np.cumsum(np.ones_like(sorted_y_true)) / len(sorted_y_true)
plt.figure(figsize=(8, 6))
plt.plot(cumulative_total, cumulative_true, label="Cumulative Gain")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Model")
plt.xlabel('Fraction of Samples')
plt.ylabel('Cumulative Gain')
plt.title(f'Cumulative Gain Chart - {classifier_name}')
plt.legend(loc="best")
plt.grid()
plt.show()
# 8. Histogram of Prediction Probabilities
def plot_prediction_probabilities_histogram(y_proba, classifier_name):
plt.figure(figsize=(8, 6))
plt.hist(y_proba, bins=10, edgecolor="k")
plt.title(f'Histogram of Prediction Probabilities - {classifier_name}')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.show()
# 9. Class Distribution Plot
def plot_class_distribution(Y, class_labels):
plt.figure(figsize=(8, 6))
sns.countplot(x=Y)
plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.xticks(ticks=range(len(class_labels)), labels=class_labels)
plt.show()
# Call the function to plot class distribution
plot_class_distribution(Y, np.array(["Angry", "Depressed", "Neutral", "Proud"]))
# 10. Box Plot for Prediction Probabilities
def plot_boxplot_proba_with_enhancements(y_proba, classifier_name, class_labels):
plt.figure(figsize=(10, 6))
# Create the box plot
sns.boxplot(data=y_proba, showmeans=True, meanline=True)
# Add additional statistical annotations
for i in range(y_proba.shape[1]):
# Calculate summary statistics
median = np.median(y_proba[:, i])
mean = np.mean(y_proba[:, i])
q1 = np.percentile(y_proba[:, i], 25)
q3 = np.percentile(y_proba[:, i], 75)
iqr = q3 - q1
lower_whisker = max(min(y_proba[:, i]), q1 - 1.5 * iqr)
upper_whisker = min(max(y_proba[:, i]), q3 + 1.5 * iqr)
outliers = y_proba[(y_proba[:, i] < lower_whisker) | (y_proba[:, i] > upper_whisker), i]
# Annotate the plot with median, mean, and whiskers
plt.text(i, median, f'Median: {median:.2f}', horizontalalignment='center', fontsize=9, color='black', weight='bold')
plt.text(i, mean, f'Mean: {mean:.2f}', horizontalalignment='center', fontsize=9, color='blue', weight='bold')
plt.text(i, lower_whisker, f'Low: {lower_whisker:.2f}', horizontalalignment='center', fontsize=8, color='green')
plt.text(i, upper_whisker, f'High: {upper_whisker:.2f}', horizontalalignment='center', fontsize=8, color='green')
# Annotate outliers
if len(outliers) > 0:
plt.text(i, upper_whisker + 0.05, f'{len(outliers)} outliers', horizontalalignment='center', fontsize=8, color='red')
# Add titles and labels
plt.title(f'Enhanced Box Plot of Predicted Probabilities - {classifier_name}')
plt.xlabel("Class")
plt.ylabel("Predicted Probability")
plt.xticks(ticks=np.arange(len(class_labels)), labels=class_labels)
plt.show()
# Call the enhanced box plot function
plot_boxplot_proba_with_enhancements(y_proba_XGB, "XGBoost", np.array(["Angry", "Depressed", "Neutral", "Proud"]))
# Call the function for the Violin Plot
plot_violin_with_medians(y_proba_XGB, Y_test, "XGBoost", np.array(["Angry", "Depressed", "Neutral", "Proud"]),
[accuracy_score(Y_test[Y_test == i], y_pred_XGB[Y_test == i]) for i in range(4)])
# Call the function to plot the Confusion Matrix Heatmap
plot_confusion_matrix_heatmap(Y_test, y_pred_XGB, np.array(["Angry", "Depressed", "Neutral", "Proud"]))
# Call the function to plot Test Accuracy Over Multiple Runs
# accuracy scores over multiple runs
XGB_test_accuracies = [0.9, 0.85, 0.87, 0.88] # Dummy accuracy values
plot_test_accuracy(XGB_test_accuracies)
# Call the function to plot Residuals
plot_residuals(Y_test, y_pred_XGB, "XGBoost")
# Call the function to plot the Precision-Recall Curve (for multiclass)
plot_precision_recall_curve(Y_test, y_proba_XGB, 4, "XGBoost")
# Call the function to plot the ROC Curve (for multiclass)
plot_multiclass_roc_curve(Y_test, y_proba_XGB, 4, "XGBoost")
# Call the function for the Cumulative Gain Chart
plot_cumulative_gain_chart(Y_test, y_proba_XGB, "XGBoost")
# Call the function for the Histogram of Prediction Probabilities
plot_prediction_probabilities_histogram(y_proba_XGB, "XGBoost")
# Call the function to plot Class Distribution
plot_class_distribution(Y_test, np.array(["Angry", "Depressed", "Neutral", "Proud"]))