-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathKmeans_NonLinear_Dimensionality_Reduction.py
52 lines (40 loc) · 2.36 KB
/
Kmeans_NonLinear_Dimensionality_Reduction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 9 16:28:48 2022
source: https://medium.com/analytics-vidhya/less-known-applications-of-k-means-clustering-dimensionality-reduction-anomaly-detection-and-908f4bee155f#:~:text=Non%20Linear%20Dimensionality%20Reduction%20using,number%20of%20clusters%20to%202.
Using KMeans for non-linear dimensionality reduction and then for regression.
New features are created from the distance between each point and the centroids calculated by KMeans.
This improves the regression score (r2) by 2%-3%.
@author: antona
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, preprocessing, model_selection, pipeline, ensemble, tree, datasets, cluster
from sklearn.cluster import KMeans
sns.set(style='white', font_scale=1.4)

### Load the Data
# Load the Boston housing dataset a single time and reuse the returned bunch,
# instead of rebuilding it for the data, the column names and the target.
# NOTE: datasets.load_boston() was removed in scikit-learn 1.2, so this script
# requires an older scikit-learn release to run as written.
boston = datasets.load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target  # regression target, aligned row-by-row with `data`

# Keep only the selected feature columns.
features = ['CRIM', 'LSTAT', 'RM', 'AGE', 'INDUS', 'NOX', 'DIS']
data = data[features]
# Scale and Fit KMeans to data as part of a pipeline
# transform() calculates the distance of each data point from each cluster center.
def ScatterPlotCentroidDistance(data, n_clusters=2, random_state=None):
    """Scatter-plot each sample's distance to the first two KMeans centroids.

    A StandardScaler + KMeans pipeline is fitted on ``data``. The pipeline's
    ``transform`` then turns every row into its distances from the cluster
    centers, and the first two distance columns are plotted against each other.

    Parameters
    ----------
    data : DataFrame or 2-D array of numeric features
        Samples used both to fit the pipeline and to be projected.
    n_clusters : int, default 2
        Number of KMeans clusters (one distance column per cluster).
        Must be at least 2 because two columns are plotted.
    random_state : int or None, default None
        Forwarded to KMeans; pass an int for a reproducible plot.

    Raises
    ------
    ValueError
        If ``n_clusters`` is smaller than 2.
    """
    if n_clusters < 2:
        raise ValueError('n_clusters must be at least 2 to draw a 2-D scatter plot')

    # Fit on the argument itself rather than on a module-level global, so the
    # function does not depend on a variable defined elsewhere in the script.
    kmeans = pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        cluster.KMeans(n_clusters=n_clusters, random_state=random_state),
    ).fit(data)

    # One distance column per cluster: 'Comp 1', 'Comp 2', ...
    columns = [f'Comp {i}' for i in range(1, n_clusters + 1)]
    lower_dim = pd.DataFrame(kmeans.transform(data), columns=columns)
    lower_dim.plot.scatter(x='Comp 1', y='Comp 2', grid=True, figsize=(10, 7))
### Train Test Split
# Hold out 20% of the rows with a fixed seed so the split is repeatable.
# Note that X_test and y_test are not used by the steps below.
split_parts = model_selection.train_test_split(data, y, test_size=.2, random_state=10)
X_train, X_test, y_train, y_test = split_parts

# Visualise the training rows in centroid-distance space.
ScatterPlotCentroidDistance(X_train)

# Both models are evaluated the same way: 10-fold cross-validation scored with r2.
cv_settings = {'cv': 10, 'scoring': 'r2'}

### Fit a Linear Model
# Baseline: plain linear regression on the raw selected features.
model = linear_model.LinearRegression()
score = model_selection.cross_val_score(model, X_train, y_train, **cv_settings)
print(f'Average LR r2: {np.mean(score)}')

### Perform the transformation using K-Means and train a linear model on the transformed features
# Scale, replace the features by their distances to 5 centroids, then regress.
km_steps = [
    preprocessing.StandardScaler(),
    cluster.KMeans(n_clusters=5),
    linear_model.LinearRegression(),
]
km = pipeline.make_pipeline(*km_steps)
score = model_selection.cross_val_score(km, X_train, y_train, **cv_settings)
print(f'Average KMeans + LR r2: {np.mean(score)}')