###############################################################################################
# STAT 603 Final Project 1
# Prediction performance of different statistical modeling methods on the MNIST data
# Rafsan Siddiqui
###############################################################################################
# import required packages
import pandas as pd
import numpy as np
# for logistic regression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
# for model selection, cross-validation, and grid search
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
# for decision trees and random forest
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
# for SVM
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
# for LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# for DNN and CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
# NOTE: this wrapper was removed from TensorFlow around release 2.12; on newer
# versions, install scikeras and use `from scikeras.wrappers import KerasClassifier`
# (which takes `model=` instead of `build_fn=`)
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# import the required files
train_binary_df = pd.read_csv("mnist_train_binary.csv")
train_counts_df = pd.read_csv("mnist_train_counts.csv")
test_binary_df = pd.read_csv("mnist_test_binary.csv")
test_counts_df = pd.read_csv("mnist_test_counts.csv")
test_counts_new_df = pd.read_csv("mnist_test_counts_new.csv")
# rename the columns for the compressed datasets
new_column_names = ['label'] + ['x' + str(i) for i in range(1, 50)]
train_counts_df.columns = new_column_names
test_counts_df.columns = new_column_names
test_counts_new_df.columns = new_column_names
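# NOTE: the counts files carry 49 features per image (x1..x49), presumably a
# 7x7 block summary of the original 28x28 MNIST pixel grid.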
# Separate features and labels
X = train_counts_df.drop(columns=['label'])
y = train_counts_df['label']
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
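# Standardization matters here: the L1/L2 penalties treat all coefficients on a
# common scale, and the saga solver converges much faster on standardized features.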
########################################################################
# Multinomial Logistic Regression (with Lasso, Ridge, and Elastic Net) #
########################################################################
# Use LogisticRegressionCV for cross-validation and hyperparameter tuning
# LogisticRegressionCV performs k-fold cross-validation to find the best C
model = LogisticRegressionCV(
    Cs=10,                      # Number of values to try for C
    cv=5,                       # Number of cross-validation folds
    penalty='l1',               # L1 (lasso) regularization
    solver='saga',              # Solver that supports L1 regularization
    multi_class='multinomial',  # Multinomial logistic regression
    max_iter=3000,              # Maximum number of iterations
    scoring='accuracy'          # Scoring metric
)
# Fit the model
model.fit(X_scaled, y)
# Best C value found
best_C = model.C_[0]
print(f'Best C value: {best_C}')
# Mean cross-validation accuracy at the selected C
# (scores_ maps each class to a (n_folds, n_Cs) grid of fold scores)
cv_score = model.scores_[model.classes_[0]].mean(axis=0).max()
print(f'Cross-validation accuracy: {cv_score}')
# Make predictions and evaluate on the same training set
y_pred = model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f'Training set accuracy: {accuracy}')
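# Optional check (not part of the original analysis): L1 regularization drives
# many coefficients exactly to zero, so the fitted model should be sparse.
# coef_ has shape (n_classes, n_features).
print(f'Fraction of zero coefficients: {np.mean(model.coef_ == 0):.2f}')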
##########################
# Ridge (L2) regularization
ridge_model = LogisticRegressionCV(
    Cs=10,                      # Number of values to try for C
    cv=5,                       # Number of cross-validation folds
    penalty='l2',               # L2 (ridge) regularization
    solver='saga',              # Solver that supports L2 regularization
    multi_class='multinomial',  # Multinomial logistic regression
    max_iter=3000,              # Maximum number of iterations
    scoring='accuracy'          # Scoring metric
)
# Fit on the same standardized features as the lasso model, so that the
# evaluations on X_scaled below are consistent with how the model was trained
ridge_model.fit(X_scaled, y)
# Best C value found
best_C = ridge_model.C_[0]
print(f'Best C value: {best_C}')
# Mean cross-validation accuracy at the selected C
cv_score = ridge_model.scores_[ridge_model.classes_[0]].mean(axis=0).max()
print(f'Cross-validation accuracy: {cv_score}')
# Make predictions and evaluate on the same training set
y_pred = ridge_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f'Training set accuracy: {accuracy}')
############################
# Elastic Net regularization
elastic_net_model = LogisticRegressionCV(
    Cs=10,                      # Number of values to try for C
    cv=5,                       # Number of cross-validation folds
    penalty='elasticnet',       # Elastic Net regularization
    solver='saga',              # The only solver that supports Elastic Net
    l1_ratios=[0.5],            # L1 ratio(s) to use in the Elastic Net penalty
    multi_class='multinomial',  # Multinomial logistic regression
    max_iter=3000,              # Maximum number of iterations
    scoring='accuracy'          # Scoring metric
)
# Fit on the same standardized features as the other penalized models
elastic_net_model.fit(X_scaled, y)
# Best C value found
best_C = elastic_net_model.C_[0]
print(f'Best C value: {best_C}')
# Mean cross-validation accuracy at the selected C (and l1_ratio)
cv_score = elastic_net_model.scores_[elastic_net_model.classes_[0]].mean(axis=0).max()
print(f'Cross-validation accuracy: {cv_score}')
# Make predictions and evaluate on the same training set
y_pred = elastic_net_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f'Training set accuracy: {accuracy}')
###################################
# Decision Tree and Random Forest #
###################################
# Decision Tree Classifier with GridSearchCV for hyperparameter tuning
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt_grid = GridSearchCV(DecisionTreeClassifier(), dt_params, cv=5, n_jobs=-1, verbose=1)
dt_grid.fit(X, y)
# Best parameters and training the best model
dt_best_params = dt_grid.best_params_
dt_best_model = dt_grid.best_estimator_
# Cross-validation score for Decision Tree
dt_cv_scores = cross_val_score(dt_best_model, X, y, cv=5)
print("Decision Tree Classifier CV Scores:", dt_cv_scores)
print("Mean CV Score:", dt_cv_scores.mean())
# Random Forest Classifier with GridSearchCV for hyperparameter tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, n_jobs=-1, verbose=1)
rf_grid.fit(X, y)
# Best parameters and training the best model
rf_best_params = rf_grid.best_params_
rf_best_model = rf_grid.best_estimator_
# Cross-validation score for Random Forest
rf_cv_scores = cross_val_score(rf_best_model, X, y, cv=5)
print("Random Forest Classifier CV Scores:", rf_cv_scores)
print("Mean CV Score:", rf_cv_scores.mean())
############################################
# Multi-class Support Vector Machine (SVM) #
############################################
# SVM with GridSearchCV for hyperparameter tuning
svm_params = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['linear', 'rbf', 'poly']
}
# Pipeline for scaling and SVM
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])
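# Keeping the scaler inside the pipeline means it is re-fit on each CV training
# fold, so no information from the held-out fold leaks into the scaling.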
svm_grid = GridSearchCV(pipeline, svm_params, cv=5, n_jobs=-1, verbose=1)
svm_grid.fit(X, y)
# Best parameters and training the best model
svm_best_params = svm_grid.best_params_
svm_best_model = svm_grid.best_estimator_
# Cross-validation score for SVM
svm_cv_scores = cross_val_score(svm_best_model, X, y, cv=5)
print("SVM Classifier CV Scores:", svm_cv_scores)
print("Mean CV Score:", svm_cv_scores.mean())
###############################
# SVM Misclassification Error #
###############################
# Separate features and labels
X_test = test_counts_df.drop(columns=['label'])
y_test = test_counts_df['label']
# The fitted pipeline re-applies the scaler learned on the training data
# before the SVC predicts, so the raw test features can be passed directly
y_pred = svm_best_model.predict(X_test)
# Calculate the misclassification rate
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
print(f"Test Accuracy: {accuracy}")
print(f"Test Misclassification Rate: {misclassification_rate}")
####################
# SVM Prediction 1 #
####################
# Make predictions on the test data (the pipeline handles the scaling internally)
yhat1 = svm_best_model.predict(X_test)
# Save predictions to file
pd.DataFrame(yhat1).to_csv("myfinal_prediction1.txt", header=False, index=False)
####################
# SVM Prediction 2 #
####################
# Separate features (labels are not used as they are all -1)
X_test_new = test_counts_new_df.drop(columns=['label'])
# Make predictions on the new test data (the pipeline handles the scaling internally)
yhat2 = svm_best_model.predict(X_test_new)
# Save new predictions to file
pd.DataFrame(yhat2).to_csv("myfinal_prediction2.txt", header=False, index=False)
##################################################
# Multi-class Linear Discriminant Analysis (LDA) #
##################################################
# Pipeline for scaling and LDA
pipeline = Pipeline([
    ('scaler', StandardScaler()),          # Standardize the features
    ('lda', LinearDiscriminantAnalysis())  # Apply LDA
])
# Cross-validation score for LDA
lda_cv_scores = cross_val_score(pipeline, X, y, cv=5, n_jobs=-1)
print("LDA Classifier CV Scores:", lda_cv_scores)
print("Mean CV Score:", lda_cv_scores.mean())
#############################
# Deep Neural Network (DNN) #
#############################
# Define the neural network model
def create_model():
    model = Sequential()
    model.add(Dense(128, input_dim=X.shape[1], activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
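# sparse_categorical_crossentropy expects integer class labels (0-9), which is
# exactly what the 'label' column provides, so no one-hot encoding is needed.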
# Wrap the Keras model so it can be used with scikit-learn
keras_model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)
# Standardize features and create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize the features
    ('keras', keras_model)         # Apply the Keras model
])
# Define cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=kfold, n_jobs=-1)
print("DNN Classifier CV Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
######################################
# Convolutional Neural Network (CNN) #
######################################
# prepare the training data
new_column_names = ['label'] + ['x' + str(i) for i in range(1, 785)]
train_binary_df.columns = new_column_names
# Separate features and labels
X = train_binary_df.drop(columns=['label'])
y = train_binary_df['label']
# The binary pixel features are already 0/1, so no standardization is applied
# Reshape the flat 784-pixel rows into 28x28 single-channel images for the CNN
X_reshaped = X.values.reshape(-1, 28, 28, 1)
# Define the CNN model
def create_model():
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Wrap the Keras model so it can be used with scikit-learn
keras_model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)
# Define cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Perform cross-validation
cv_scores = cross_val_score(keras_model, X_reshaped, y, cv=kfold, n_jobs=-1)
print("CNN Classifier CV Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
# Train the model on the full training data
keras_model.fit(X_reshaped, y)
###############################
# CNN Misclassification Error #
###############################
# prepare the testing data
test_binary_df.columns = new_column_names
# Separate features and labels
X_test = test_binary_df.drop(columns=['label'])
y_test = test_binary_df['label']
# Reshape the test pixels the same way as the training data (no scaling, as above)
X_test_reshaped = X_test.values.reshape(-1, 28, 28, 1)
# Make predictions on the test data
y_pred = keras_model.predict(X_test_reshaped)
# Calculate the misclassification rate
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy
print(f"Test Accuracy: {accuracy}")
print(f"Test Misclassification Rate: {misclassification_rate}")