-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgmm.py
59 lines (48 loc) · 2.1 KB
/
gmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
# Input CSV paths handed to preprocess_data by the __main__ block of this script.
# NOTE(review): the two names use different capitalisation ('Sine_MP_' vs
# 'sine_mp_'); on a case-sensitive filesystem each must match the file on disk
# exactly — confirm both are spelled as intended.
train_file = 'Sine_MP_train.csv'
test_file = 'sine_mp_test.csv'
def preprocess_data(train_file, test_file, target_col, feature_start=20):
    """Load the train/test CSVs and return standardised features plus targets.

    Features are every column from position ``feature_start`` onward; the
    target is the column named ``target_col``. The scaler is fitted on the
    training features only and then applied to both splits, so no test
    statistics influence the scaling.

    Args:
        train_file: Path to the training CSV.
        test_file: Path to the test CSV.
        target_col: Name of the target column (must exist in both files).
        feature_start: Zero-based position of the first feature column.
            Defaults to 20, the offset this script has always used.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``: the X values are the
        scaled feature arrays returned by ``StandardScaler``; the y values
        are the target columns as pandas Series.

    Raises:
        KeyError: If ``target_col`` is missing from either CSV.
    """
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    def _feature_columns(frame):
        # Positional slice first, then drop the target if it happens to fall
        # inside that slice so the label can never leak into the features.
        sliced = frame.iloc[:, feature_start:]
        return sliced.drop(columns=[target_col], errors="ignore")

    X_train = _feature_columns(train_data)
    y_train = train_data[target_col]
    X_test = _feature_columns(test_data)
    y_test = test_data[target_col]

    # Fit on train only; reuse the same mean/std for the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, y_train, X_test, y_test
def gmm_regression(X_train, y_train, X_test, y_test, n_components=5):
    """Predict targets as the mean training target of each GMM component.

    A Gaussian mixture is fitted on ``X_train`` (features only). Every sample
    is hard-assigned to a component, and its prediction is the mean of the
    training targets assigned to that same component.

    Args:
        X_train: Training feature matrix the mixture is fitted on.
        y_train: Training targets, one per row of ``X_train``.
        X_test: Test feature matrix.
        y_test: Test targets; used only to compute the reported RMSE.
        n_components: Number of mixture components.

    Returns:
        Tuple ``(train_pred, test_pred)`` of NumPy arrays holding the
        per-sample predictions for the training and test sets.

    Side effects:
        Prints the train and test RMSE to stdout.
    """
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(X_train)

    # Hard-assign each sample exactly once; the labels are reused for both the
    # per-component means and the final predictions.
    train_labels = gmm.predict(X_train)
    test_labels = gmm.predict(X_test)

    y_train_values = np.asarray(y_train, dtype=float)

    # A component can end up with no training members after hard assignment.
    # Its mean would be NaN, which breaks the RMSE as soon as a test sample is
    # assigned to it, so fall back to the overall training mean instead.
    cluster_means = np.full(n_components, y_train_values.mean())
    for component in range(n_components):
        members = y_train_values[train_labels == component]
        if members.size:
            cluster_means[component] = members.mean()

    train_pred = cluster_means[train_labels]
    test_pred = cluster_means[test_labels]

    train_rmse = np.sqrt(MSE(y_train, train_pred))
    test_rmse = np.sqrt(MSE(y_test, test_pred))
    print(f"GMM - Train RMSE: {train_rmse:.3f}, Test RMSE: {test_rmse:.3f}")
    return train_pred, test_pred
# Plot Results
def plot_results(y_test, predictions, title, filename):
    """Scatter actual vs. predicted values by sample index, then save and show.

    Args:
        y_test: Actual target values (drawn in blue).
        predictions: Predicted target values (drawn in red).
        title: Title placed above the plot.
        filename: Output path the figure is written to at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    # Each series is plotted against its own positional index.
    actual_positions = range(len(y_test))
    predicted_positions = range(len(predictions))
    ax.scatter(actual_positions, y_test, color='blue', label='Actual')
    ax.scatter(predicted_positions, predictions, color='red', label='Predicted')

    ax.set_title(title)
    ax.set_xlabel('Test Sample Index')
    ax.set_ylabel('Energy per Atom')
    ax.legend()
    ax.grid(True)

    # Save before showing so the file is written regardless of the backend.
    fig.savefig(filename, dpi=300)
    plt.show()
# Main Execution
if __name__ == "__main__":
    # Pipeline: load + scale the data, fit the GMM regressor, plot test results.
    target_col = 'energy_per_atom'

    X_train, y_train, X_test, y_test = preprocess_data(
        train_file=train_file,
        test_file=test_file,
        target_col=target_col,
    )

    train_pred, test_pred = gmm_regression(
        X_train,
        y_train,
        X_test,
        y_test,
        n_components=5,
    )

    plot_results(
        y_test=y_test,
        predictions=test_pred,
        title="GMM: Actual vs Predicted",
        filename="gmm_actual_vs_predicted.png",
    )