-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocessing.py
80 lines (67 loc) · 2.82 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
import numpy as np
class DataPreprocessor:
    """Base preprocessing step that can optionally wrap another step.

    On its own, ``preprocess`` returns the data unchanged. Calling an
    instance with another preprocessor (``step(inner)``) stores ``inner`` as
    the wrapped step and returns ``step``; from then on ``preprocess`` runs
    the wrapped step first. A subclass whose ``preprocess`` begins with
    ``super().preprocess(data)`` therefore runs the wrapped chain before
    applying its own transformation.
    """

    # Class-level default so subclasses whose ``__init__`` does not call
    # ``super().__init__()`` still start out with nothing wrapped.
    _wrapped = None

    def __call__(self, wrapped):
        """Wrap *wrapped* so that it runs before this step.

        Args:
            wrapped: Any object exposing a callable ``preprocess(data)``.

        Returns:
            This instance, so wrapping calls can be chained.

        Raises:
            TypeError: If *wrapped* has no callable ``preprocess`` attribute.
            ValueError: If *wrapped* is this very instance.
        """
        if not callable(getattr(wrapped, "preprocess", None)):
            raise TypeError(
                "wrapped object must provide a callable preprocess(data), "
                f"got {type(wrapped).__name__}"
            )
        if wrapped is self:
            # Wrapping itself would make preprocess() recurse forever.
            raise ValueError("a preprocessor cannot wrap itself")
        self._wrapped = wrapped
        return self

    def preprocess(self, data):
        """Return *data* after running the wrapped step, if there is one.

        With nothing wrapped the input object itself is returned unchanged.
        """
        if self._wrapped is None:
            return data
        return self._wrapped.preprocess(data)
class ImputerDecorator(DataPreprocessor):
    """Fill missing values column by column with a summary statistic.

    Supported strategies are ``'mean'``, ``'median'`` and ``'mode'``. The
    mean and median are computed over numeric columns only, so non-numeric
    columns are left untouched instead of making the reduction fail; the
    mode strategy fills every column with its first mode.
    """

    _STRATEGIES = ('mean', 'median', 'mode')

    def __init__(self, strategy='mean'):
        """Store the imputation strategy.

        Args:
            strategy: One of ``'mean'``, ``'median'`` or ``'mode'``.

        Raises:
            ValueError: If *strategy* is not a supported value.
        """
        super().__init__()
        if strategy not in self._STRATEGIES:
            raise ValueError(
                f"unsupported imputation strategy {strategy!r}; "
                f"expected one of {self._STRATEGIES}"
            )
        self.strategy = strategy

    def preprocess(self, data):
        """Return a copy of *data* with missing values filled.

        The input is not modified; the filled data is returned as a new
        object.

        Args:
            data: Expected to be a pandas DataFrame.

        Raises:
            ValueError: If ``self.strategy`` was changed to an unsupported
                value after construction.
        """
        data = super().preprocess(data)
        if self.strategy == 'mean':
            fill_values = data.mean(numeric_only=True)
        elif self.strategy == 'median':
            fill_values = data.median(numeric_only=True)
        elif self.strategy == 'mode':
            modes = data.mode()
            if modes.empty:
                # No rows means no mode to take and nothing to fill.
                return data.copy()
            fill_values = modes.iloc[0]
        else:
            raise ValueError(
                f"unsupported imputation strategy {self.strategy!r}; "
                f"expected one of {self._STRATEGIES}"
            )
        # fillna with a Series fills each column with the value under the
        # matching label; columns without an entry are left as they are.
        return data.fillna(fill_values)
class NormalizationDecorator(DataPreprocessor):
    """Min-max scale every numeric column into the range [0, 1]."""

    def preprocess(self, data):
        """Return a copy of *data* with its numeric columns min-max scaled.

        Non-numeric columns are passed through unchanged. A column whose
        minimum equals its maximum is mapped to 0.0 instead of being turned
        into NaN by a 0/0 division.

        Args:
            data: Expected to be a pandas DataFrame.
        """
        data = super().preprocess(data)
        numeric_cols = data.select_dtypes(include='number').columns
        if numeric_cols.empty:
            return data

        numeric = data[numeric_cols]
        low = numeric.min()
        span = numeric.max() - low
        # Constant columns have a zero span; divide by 1 there so the
        # numerator (all zeros) yields 0.0 rather than NaN.
        span = span.where(span > 0, 1.0)

        result = data.copy()
        result[numeric_cols] = (numeric - low) / span
        return result
class StandardizationDecorator(DataPreprocessor):
    """Rescale every numeric column to zero mean and unit standard deviation."""

    def preprocess(self, data):
        """Return a copy of *data* with its numeric columns standardized.

        Each numeric column becomes ``(x - mean) / std`` using pandas'
        default sample standard deviation. Non-numeric columns are passed
        through unchanged. Where the standard deviation is zero or undefined
        (a constant column, or a single row) the column is only centred, so
        it becomes 0.0 rather than NaN or infinity.

        Args:
            data: Expected to be a pandas DataFrame.
        """
        data = super().preprocess(data)
        numeric_cols = data.select_dtypes(include='number').columns
        if numeric_cols.empty:
            return data

        numeric = data[numeric_cols]
        spread = numeric.std()
        # ``spread > 0`` is False for both 0 and NaN, so either case falls
        # back to dividing by 1.
        spread = spread.where(spread > 0, 1.0)

        result = data.copy()
        result[numeric_cols] = (numeric - numeric.mean()) / spread
        return result
class OutlierHandlerDecorator(DataPreprocessor):
    """Replace outliers in numeric columns with missing values.

    Only the ``'z-score'`` method is implemented: a value is an outlier when
    its absolute z-score within its column exceeds ``threshold``. Other
    methods (for example IQR or capping) are not implemented yet.
    """

    _METHODS = ('z-score',)

    def __init__(self, method='z-score', threshold=3, imputer=None):
        """Configure outlier detection.

        Args:
            method: Detection method; currently only ``'z-score'``.
            threshold: Absolute z-score above which a value is an outlier.
            imputer: Optional object with a ``preprocess(data)`` method that
                is applied after masking to fill the removed values. When
                omitted, outliers are left as NaN.

        Raises:
            ValueError: If *method* is not supported.
        """
        super().__init__()
        if method not in self._METHODS:
            raise ValueError(
                f"unsupported outlier method {method!r}; "
                f"expected one of {self._METHODS}"
            )
        self.method = method
        self.threshold = threshold
        self.imputer = imputer

    def preprocess(self, data):
        """Return a copy of *data* with numeric outliers masked.

        The input is not modified. Non-numeric columns are passed through
        unchanged. A column with zero or undefined standard deviation yields
        NaN z-scores, which never exceed the threshold, so nothing in it is
        masked.

        Args:
            data: Expected to be a pandas DataFrame.
        """
        data = super().preprocess(data)
        result = data.copy()

        if self.method == 'z-score':
            numeric_cols = result.select_dtypes(include='number').columns
            if not numeric_cols.empty:
                numeric = result[numeric_cols]
                z_scores = ((numeric - numeric.mean()) / numeric.std()).abs()
                # mask() replaces flagged cells with NaN.
                result[numeric_cols] = numeric.mask(z_scores > self.threshold)

        # Re-imputation is delegated to an explicit imputer: the base class
        # knows nothing about imputation, so calling super() again could not
        # fill the values removed above.
        if self.imputer is not None:
            result = self.imputer.preprocess(result)
        return result
class CategoricalEncodingDecorator(DataPreprocessor):
    """Encode categorical columns; only one-hot encoding is implemented.

    Other encodings (for example label or target encoding) are not
    implemented yet.
    """

    _ENCODINGS = ('one-hot',)

    def __init__(self, encoding_type='one-hot'):
        """Store the encoding type.

        Args:
            encoding_type: Currently only ``'one-hot'``.

        Raises:
            ValueError: If *encoding_type* is not supported.
        """
        super().__init__()
        if encoding_type not in self._ENCODINGS:
            raise ValueError(
                f"unsupported encoding type {encoding_type!r}; "
                f"expected one of {self._ENCODINGS}"
            )
        self.encoding_type = encoding_type

    def preprocess(self, data):
        """Return *data* with its categorical columns one-hot encoded.

        Args:
            data: Expected to be a pandas DataFrame.
        """
        data = super().preprocess(data)
        if self.encoding_type == 'one-hot':
            # With ``columns`` left unset, get_dummies encodes every object,
            # string and category column, not just object-dtype ones.
            data = pd.get_dummies(data)
        return data
class MappingDecorator(DataPreprocessor):
    """Replace the values of selected columns through per-column mappings."""

    def __init__(self, mapping_dict=None):
        """Store the per-column mappings.

        Args:
            mapping_dict: Dict of ``column name -> mapping``, where each
                mapping is anything ``Series.map`` accepts. ``None`` (the
                default) means no columns are remapped.
        """
        super().__init__()
        self.mapping_dict = {} if mapping_dict is None else mapping_dict

    def preprocess(self, data):
        """Return a copy of *data* with the configured columns remapped.

        The input is not modified. Each configured column is passed through
        ``Series.map``; with a dict mapping, values that have no entry
        become NaN.

        Args:
            data: Expected to be a pandas DataFrame.

        Raises:
            KeyError: If a configured column is not present in *data*.
        """
        data = super().preprocess(data)
        if not self.mapping_dict:
            return data

        # Work on a copy so the caller's frame is not changed by the column
        # assignments below.
        result = data.copy()
        for column, mapping in self.mapping_dict.items():
            result[column] = result[column].map(mapping)
        return result
# Example pipeline; runs only when this file is executed as a script.
if __name__ == "__main__":
    # All-numeric sample (with missing values) so every step can run on
    # every column.
    data = pd.DataFrame({
        "age": [25.0, 32.0, np.nan, 41.0, 38.0],
        "income": [48000.0, np.nan, 61000.0, 75000.0, 52000.0],
        "tier": [1, 2, 3, 1, 2],
    })

    # Create a preprocessing pipeline: steps are applied in list order, each
    # one receiving the output of the previous step.
    steps = [
        ImputerDecorator(strategy='mean'),
        NormalizationDecorator(),
        CategoricalEncodingDecorator(),
        OutlierHandlerDecorator(),
        # Mapping runs after normalization, so the keys are the scaled
        # "tier" values (1, 2, 3 -> 0.0, 0.5, 1.0).
        MappingDecorator({"tier": {0.0: "low", 0.5: "mid", 1.0: "high"}}),
    ]

    # Preprocess the data
    preprocessed_data = data
    for step in steps:
        preprocessed_data = step.preprocess(preprocessed_data)
    print(preprocessed_data)