-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_bayes.py
66 lines (59 loc) · 2.72 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from __future__ import division, print_function
import numpy as np
from pca import get_normalized, get_scaled, get_normalized_and_scaled
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and standard deviation are estimated in `fit`.
    `predict` assigns each sample to the class with the largest (unnormalised)
    posterior P(Y) * P(x1|Y) * ... * P(xN|Y).
    """

    # Lower bound applied to standard deviations so that a feature which is
    # constant within a class does not cause a division by zero.
    _MIN_STD = 1e-9

    def __init__(self):
        self.classes = None
        self.X = None
        self.y = None
        # Gaussian distribution parameters: one (means, stds) tuple per class,
        # stored in the same order as self.classes.
        self.parameters = []

    def fit(self, X, y):
        """Estimate the per-class feature means and standard deviations.

        X is the training data (one row per sample) and y holds the class
        label of each row. Calling fit again discards the parameters of any
        previous fit.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        # Rebuild from scratch: appending to the old list would leave stale
        # parameters in front of the new ones after a second fit.
        self.parameters = []
        for c in self.classes:
            # Only select the rows whose label equals the given class
            rows_of_c = X[y == c]
            self.parameters.append(
                (rows_of_c.mean(axis=0), rows_of_c.std(axis=0)))

    def calculate_probability(self, means, stds, X):
        """Return the Gaussian density of X for the given means and stds.

        Computes 1 / (sqrt(2*pi) * std) * exp(-(x - mean)^2 / (2 * std^2))
        element-wise. `stds` are standard deviations (not variances); values
        below `_MIN_STD` are raised to `_MIN_STD`.
        """
        stds = np.maximum(stds, NaiveBayes._MIN_STD)
        coefficient = 1.0 / (np.sqrt(2.0 * np.pi) * stds)
        exponent = -np.power(X - means, 2) / (2.0 * stds ** 2)
        return coefficient * np.exp(exponent)

    def _log_probability(self, means, stds, X):
        """Return the element-wise natural log of the Gaussian density."""
        stds = np.maximum(stds, NaiveBayes._MIN_STD)
        return (-0.5 * np.log(2.0 * np.pi) - np.log(stds)
                - np.power(X - means, 2) / (2.0 * stds ** 2))

    def calculate_prior(self, c):
        """Return the prior of class c.

        The prior is the number of training samples labelled c divided by
        the total number of training samples.
        """
        n_class_instances = np.count_nonzero(self.y == c)
        n_total_instances = np.shape(self.X)[0]
        return float(n_class_instances) / n_total_instances

    def predict(self, X):
        """Return the most probable class label for every row of X.

        Classifies with Bayes' rule, P(Y|X) = P(X|Y) * P(Y) / P(X):
          P(X|Y) - Gaussian likelihood; under the naive independence
                   assumption it is the product of the per-feature densities
          P(Y)   - prior (given by calculate_prior)
          P(X)   - identical for every class, so it is ignored
        The scores are accumulated as sums of logarithms rather than products
        of densities, because the product underflows to zero when there are
        many features. The ranking of the classes is unchanged.
        """
        X = np.asarray(X, dtype=float)
        log_posteriors = np.empty((X.shape[0], len(self.classes)))
        for idx, c in enumerate(self.classes):
            mean, std = self.parameters[idx]
            # log P(Y) + sum_i log P(x_i | Y)
            log_likelihood = self._log_probability(mean, std, X).sum(axis=1)
            log_posteriors[:, idx] = (
                np.log(self.calculate_prior(c)) + log_likelihood)
        # Column idx corresponds to self.classes[idx], so the argmax can index
        # the class array directly.
        return self.classes[np.argmax(log_posteriors, axis=1)]