random_ensembler.py
import numpy as np
from sklearn.exceptions import NotFittedError


class RandomEnsembler:

    def __init__(self, estimator_class, fitted_params, n_ensembles=5):
        """
        Convenient way to average predictions from multiple instances of a
        single classifier, using the best parameters found from some type of
        hyperparameter search.

        Parameters:
        -----------
        estimator_class :
            a scikit-learn estimator class. Note: it should not be instantiated;
            e.g. `RandomForestClassifier`, NOT `RandomForestClassifier()`
        fitted_params :
            dictionary, a set of (preferably optimized) parameters passed to
            each estimator instance
        n_ensembles :
            integer, number of instances to ensemble over (default: 5)
        """
        # Check that the algorithm implements a random seed
        assert hasattr(estimator_class(), 'random_state')
        self.estimator_class = estimator_class
        # Sanity check on input parameters
        assert isinstance(fitted_params, dict)
        self.fitted_params = fitted_params
        self.n_ensembles = int(n_ensembles)
        # Tracks whether `fit` has been called
        self.is_fitted = False
        # The individual fitted models are stored here
        self.models = list()

    def get_models(self):
        """
        Returns the individual fitted models if `fit` has been called.
        Otherwise returns `None`.
        """
        if self.is_fitted:
            return self.models
        else:
            return None

    def fit(self, X_train, y_train, **kwargs):
        """
        Parameters:
        -----------
        X_train :
            array-like, shape (n_samples_train, n_features). Training data
        y_train :
            array-like, shape (n_samples_train,). Training labels
        **kwargs :
            other parameters passed to the estimator's `fit` method, if applicable
        """
        # Generate and train a new model for each random seed
        for _ in range(self.n_ensembles):
            model = self.estimator_class(
                random_state=np.random.randint(low=0, high=int(1e9)))
            model.set_params(**self.fitted_params)
            model.fit(X_train, y_train, **kwargs)
            self.models.append(model)
            del model
        self.is_fitted = True
        return self

    def predict(self, X_test):
        """
        Parameters:
        -----------
        X_test :
            matrix-like, shape (n_samples_test, n_features). Test data to predict on.

        Returns:
        -----------
        ensembled_preds :
            numpy array, shape (n_samples_test,). Majority-vote predictions,
            which often improve on a single classifier trained on the same data.
        """
        if not self.is_fitted:
            raise NotFittedError('Call `fit` first.')
        preds = [m.predict(X_test) for m in self.models]
        ensembled_preds = []
        for jj in range(X_test.shape[0]):
            # Collect the jj-th prediction from every model and take the majority vote
            ensembled_preds.append(
                self._majority_vote([preds[ii][jj]
                                     for ii in range(self.n_ensembles)]))
        return np.array(ensembled_preds)

    def predict_proba(self, X_test):
        """
        Parameters:
        -----------
        X_test :
            matrix-like, shape (n_samples_test, n_features). Test data to predict on.

        Returns:
        -----------
        ensembled_preds :
            numpy array, shape (n_samples_test, n_classes). Probability predictions
            averaged over the ensemble, which often improve on a single classifier
            trained on the same data.
        """
        if not self.is_fitted:
            raise NotFittedError('Call `fit` first.')
        if not hasattr(self.estimator_class(), 'predict_proba'):
            raise TypeError('Estimator should implement `predict_proba`.')
        preds = [m.predict_proba(X_test) for m in self.models]
        ensembled_preds = []
        for jj in range(X_test.shape[0]):
            # Average the per-class probability vectors across the ensemble
            ensembled_preds.append(
                self._average_prob([preds[ii][jj]
                                    for ii in range(self.n_ensembles)]))
        return np.array(ensembled_preds)

    def _majority_vote(self, lst):
        # Most frequent label among the ensemble's predictions
        return max(set(lst), key=lst.count)

    def _average_prob(self, lst):
        # Element-wise mean of the class-probability vectors; axis=0 keeps one
        # probability per class instead of collapsing everything to a scalar
        return np.array(lst).mean(axis=0)
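

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original API): shows one way to
# drive the ensembler end-to-end with a scikit-learn classifier. The toy
# dataset, the choice of `RandomForestClassifier`, and the `best_params`
# values are assumptions standing in for whatever a real hyperparameter
# search would have produced.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    # Toy data; in practice X and y come from your own problem
    X, y = make_classification(n_samples=500, n_features=20, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Hypothetical "best" parameters, e.g. from a grid or randomized search
    best_params = {'n_estimators': 200, 'max_depth': 6}

    ens = RandomEnsembler(RandomForestClassifier, best_params, n_ensembles=5)
    ens.fit(X_train, y_train)

    labels = ens.predict(X_test)        # majority-vote labels, (n_samples_test,)
    probas = ens.predict_proba(X_test)  # averaged probabilities, (n_samples_test, n_classes)
    print(labels[:5])
    print(probas[:2])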