binary_gaussian.py
import json
from dataclasses import dataclass
from io import TextIOWrapper
from typing import Literal

import numpy as np
import numpy.typing as npt

from project.classifiers.classifier import Classifier
from project.funcs.base import corr, cov, vcol
from project.funcs.dcf import optimal_bayes_threshold
from project.funcs.log_pdf import log_pdf_gaussian
from project.funcs.pca import pca


@dataclass
class BinaryGaussian(Classifier):
    """
    Binary Gaussian classifier with multivariate, naive or tied covariance.

    Attributes:
        C (list[npt.NDArray]): Covariance matrices of the classes.
        mu (list[npt.NDArray]): Means of the classes.
        corr (list[npt.NDArray]): Correlation matrices of the classes.
        llr (npt.NDArray): Log likelihood ratio of the classifier.
        accuracy (float): Accuracy of the classifier.
        error_rate (float): Error rate of the classifier.
        _type (Literal["naive", "tied", "multivariate"]): Type of classifier to use.
        _S (npt.NDArray): Scores of the classifier.
        _pca_dims (int): Number of dimensions to keep after PCA.
        _slicer (slice): Slice to apply to the data.
        _fitted (bool): Whether the classifier has been fitted or not.
    """

    def __init__(self, classifier: Literal["naive", "tied", "multivariate"]) -> None:
        self._type = classifier
        self._fitted = False

    @property
    def llr(self):
        """
        Log likelihood ratio of the classifier. llr(xₜ) = log(𝑓(xₜ|h₁) / 𝑓(xₜ|h₀))
        """
        if not hasattr(self, "_S"):
            raise ValueError("Scores have not been computed yet.")

        return self._S[1] - self._S[0]
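
    # For reference: with the class-conditional Gaussians estimated in `fit`,
    # the property above evaluates llr(xₜ) = log 𝒩(xₜ; μ₁, C₁) - log 𝒩(xₜ; μ₀, C₀),
    # since `scores` fills _S with the per-class Gaussian log densities.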

    @staticmethod
    def from_json(data):
        """
        Rebuild a fitted classifier from the JSON produced by `to_json`.
        """
        decoded = (
            json.load(data) if isinstance(data, TextIOWrapper) else json.loads(data)
        )

        cl = BinaryGaussian(decoded["type"])
        cl.mu = [np.array(mu) for mu in decoded["mu"]]
        cl.C = (
            [np.array(C) for C in decoded["C"]]
            if cl._type == "multivariate"
            else (
                # Naive: only the diagonals were stored.
                [np.diag(C) for C in decoded["C"]]
                if cl._type == "naive"
                # Tied: a single shared covariance matrix was stored.
                else [np.array(decoded["C"][0])] * 2
            )
        )
        cl._pca_dims = decoded["pca_dims"]
        # The slice (if any) is stored as a [start, stop, step] triple.
        cl._slicer = slice(*decoded["slicer"]) if decoded["slicer"] else None
        cl._fitted = True

        return cl

    def fit(
        self,
        X: npt.NDArray[np.float64],
        y: npt.ArrayLike,
        *,
        slicer: slice | None = None,
        pca_dims: int | None = None,
    ) -> "BinaryGaussian":
        """
        Fit the Gaussian classifier to the training data.

        Args:
            X (npt.NDArray[np.float64]): Training data.
            y (npt.ArrayLike): Training labels.
            slicer (slice, optional): Slice to apply to the data. Defaults to None.
            pca_dims (int, optional): Number of dimensions to keep after PCA,
                if one wants to apply it as a pre-processing step. Defaults to None.

        Returns:
            BinaryGaussian: The fitted classifier.
        """
        self._pca_dims = pca_dims
        self._slicer = slicer

        X = X[:, slicer] if slicer else X
        y = np.asarray(y)  # ensure boolean masking below works on array-likes

        if pca_dims:
            X = pca(X, pca_dims)[1]

        split = [X[y == k] for k in [0, 1]]

        self.C = [cov(split[k].T) for k in [0, 1]]
        self.mu = [np.mean(split[k], axis=0) for k in [0, 1]]
        self.corr = [corr(split[k].T) for k in [0, 1]]

        if self._type == "tied":
            Sw = np.average(  # Within-class covariance matrix
                [self.C[k] for k in [0, 1]],
                axis=0,
                # Weight each class covariance by its number of samples
                weights=np.array([len(split[k]) for k in [0, 1]]),
            )

            # If tied then the ML estimate of the covariance matrix is
            # the within class covariance matrix
            self.C = [Sw, Sw]
        elif self._type == "naive":
            # If naive then the ML estimate of the covariance matrix is
            # the diagonal of the sample covariance matrix
            self.C = [np.diag(np.diag(self.C[k])) for k in [0, 1]]

        self._fitted = True

        return self

    def predict(
        self,
        X: npt.NDArray[np.float64],
        y: npt.ArrayLike | None = None,
        *,
        pi_T: float = 0.5,
        C_fn: float = 1,
        C_fp: float = 1,
    ) -> npt.ArrayLike:
        """
        Predict the class of the samples in the validation set.

        Args:
            X (npt.NDArray[np.float64]): Validation set.
            y (npt.ArrayLike, optional): True labels of the validation set; if
                provided, the accuracy and error rate will be computed. Defaults to None.
            pi_T (float, optional): Prior of the True class. Defaults to 0.5.
            C_fn (float, optional): Cost of false negatives. Defaults to 1.
            C_fp (float, optional): Cost of false positives. Defaults to 1.

        Returns:
            ArrayLike: Predicted classes of the samples in the validation set.
        """
        if not self._fitted:
            raise ValueError("Classifier has not been fitted yet.")

        self.scores(X)

        predictions = self.llr > optimal_bayes_threshold(pi_T, C_fn, C_fp)

        if y is not None:
            self.accuracy = np.mean(predictions == y) * 100
            self.error_rate = 100 - self.accuracy

        return predictions
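
    # Note on the decision rule in `predict`: `optimal_bayes_threshold` is a
    # project helper assumed to return the standard Bayes-optimal threshold for
    # a binary task with prior pi_T and costs C_fn, C_fp,
    #     t = -log( (pi_T * C_fn) / ((1 - pi_T) * C_fp) ),
    # so a sample is assigned to the positive class whenever llr(x) > t.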

    def scores(self, X):
        """
        Compute the per-class log densities of the samples in `X`.
        """
        if not self._fitted:
            raise ValueError("Classifier has not been fitted yet.")

        if self._slicer:
            X = X[:, self._slicer]

        if self._pca_dims:
            X = pca(X, self._pca_dims)[1]

        X = X.T

        self._S = np.zeros((2, X.shape[1]))

        for i in [0, 1]:
            mu = self.mu[i]
            C = self.C[i]

            self._S[i, :] = log_pdf_gaussian(X, vcol(mu), C)

        return self._S
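
    # `log_pdf_gaussian` is a project helper assumed to compute the multivariate
    # Gaussian log density column-wise on the (D, N) matrix passed by `scores`:
    #     log 𝒩(x; μ, C) = -D/2·log(2π) - 1/2·log|C| - 1/2·(x - μ)ᵀ C⁻¹ (x - μ)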

    def to_json(self, fp=None):
        """
        Serialize the fitted classifier; returns a dict if `fp` is None,
        otherwise dumps JSON to the file-like object `fp`.
        """
        if not self._fitted:
            raise ValueError("Classifier has not been fitted yet.")

        data = {
            "type": self._type,
            "mu": [mu.tolist() for mu in self.mu],
            "C": (  # store more efficiently
                [C.tolist() for C in self.C]
                if self._type == "multivariate"
                else (
                    # Naive: the covariances are diagonal, store only the diagonals.
                    [np.diag(C).tolist() for C in self.C]
                    if self._type == "naive"
                    # Tied: both classes share the same covariance, store it once.
                    else [self.C[0].tolist()]
                )
            ),
            "pca_dims": self._pca_dims,
            # A slice object is not JSON serializable, store it as [start, stop, step].
            "slicer": (
                [self._slicer.start, self._slicer.stop, self._slicer.step]
                if self._slicer
                else None
            ),
        }

        if fp is None:
            return data

        json.dump(data, fp)
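

# Minimal usage sketch (illustrative, not part of the original module): it relies
# only on the public API defined above plus synthetic data; the sample sizes,
# feature count and hyper-parameters are arbitrary assumptions.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    # Two synthetic Gaussian classes with shifted means.
    X0 = rng.normal(loc=0.0, scale=1.0, size=(500, 6))
    X1 = rng.normal(loc=1.0, scale=1.0, size=(500, 6))
    X = np.vstack([X0, X1])
    y = np.concatenate([np.zeros(500, dtype=int), np.ones(500, dtype=int)])

    # Fit a tied-covariance model with a 4-dimensional PCA pre-processing step.
    cl = BinaryGaussian("tied").fit(X, y, pca_dims=4)
    cl.predict(X, y, pi_T=0.5, C_fn=1, C_fp=1)
    print(f"accuracy: {cl.accuracy:.2f}%  error rate: {cl.error_rate:.2f}%")

    # Round-trip the fitted model through its JSON representation.
    restored = BinaryGaussian.from_json(json.dumps(cl.to_json()))
    assert np.allclose(restored.scores(X), cl.scores(X))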