-
Notifications
You must be signed in to change notification settings - Fork 1
/
knnAlgorithm.py
172 lines (125 loc) · 4.81 KB
/
knnAlgorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# coding: utf-8
# In[1]:
# Philip Tenteromano
# 1/28/2019
#
# KNN Algorithm implementation
# Using training and test .arff files
# Outputs to a newly created results.arff file
# to read arff file
from scipy.io import arff
# for euclidean distance
import scipy
# pandas <3
import pandas as pd
# for output
from shutil import copyfile
# In[2]:
# get training file into a dataframe
training = arff.loadarff('train.arff')
trainDf = pd.DataFrame(training[0])
# In[3]:
# get testing file into a dataframe
testing = arff.loadarff('test.arff')
testDf = pd.DataFrame(testing[0])
# In[4]:
# get our results file - avoid manipulating test.arff
copyfile('test.arff', 'results.arff')
# In[5]:
# replace the title of results.arr from HW1_TEST to HW1_RESULTS
line_to_replace = 0
my_file = 'results.arff'
with open(my_file, 'r+') as file:
lines = file.readlines()
lines[line_to_replace] = '@relation HW1_RESULTS\n'
with open(my_file, 'w') as file:
file.writelines( lines )
# In[6]:
# used to append to the resulting arff file
def appendToResults(testIndex, classifiedLabel):
# the dataframe index and file line are off by 9
# IE - the first sample is index 0 in df, but index 9 on the file
append_to_line = testIndex + 9
my_file = 'results.arff'
with open(my_file, 'r+') as file:
# get array of lines
lines = file.readlines()
# strip newline, append text, add newline
lines[append_to_line] = lines[append_to_line].rstrip() + ',' + classifiedLabel + '\n'
with open(my_file, 'w') as file:
# write the lines back to file
file.writelines(lines)
# In[7]:
# KNN algorithm
def kNearestNeighbors(test, train, k=None):
'''
KNN Algorithm
Takes a 'test' and 'train' dataframe, and 'k' as an integer
TODO: implement my own euclidean distance algorithm
'''
# check for valid k
if (k == None) or (k < 1):
print("K needs to be atleast 1")
return
# STEP 1
# find euclidean distance, temporarily slicing out the 'class_label' of training data
distDf = scipy.spatial.distance.cdist(testDf.iloc[:], trainDf.iloc[:,:-1], metric='euclidean')
# every row is a test data sample, cols are distance with that training data index
distDf = pd.DataFrame(distDf)
# STEP 2
# init variables for k-minimum values and voter-log dictionary
minK = []
# [index 0 = 'versicolor' | index 1 = 'setosa' | index 2 = 'virginica']
voter = {'votes': [ 0, 0, 0], 'sums': [ 0, 0, 0]}
indexPair = {'versicolor': 0, 'setosa': 1, 'virginica': 2}
# STEP 3 - primary loop
# iterate over rows in the distance matrix
for index, row in distDf.iterrows():
# sort the distances, slice the first 'k' indices
minK = row.argsort()[:k]
# STEP 4
# iterate over the k-min values (we have our close neighbors)
for i in range(k):
# grab the label from taining using found index (decode to get rid of 'b')
label = trainDf.iloc[minK[i]]['CLASS_LABEL'].decode("utf-8")
# grab the voter index corresponding to label and increment that vote
indexing = indexPair[label]
voter['votes'][indexing] += 1
# add distances to vote to break ties
distK = distDf.loc[index, minK[i]]
voter['sums'][indexing] += distK
# STEP 5
# get winning label from voter object
maxVotesIndex = voter['votes'].index(max(voter['votes']))
winner = voter['votes'][maxVotesIndex] # num of votes, not label!
tiebreaker = False
# check for tiebreaker (equal votes for all even k)
for i, val in enumerate(voter['votes']):
# if tiebreaker, use min distance sums
if i != maxVotesIndex and val == winner:
tiebreaker = True
minDistIndex = voter['sums'].index(min(voter['sums']))
# reverse search to get the correct WINNING label
for flowerKey, flowerIndex in indexPair.items():
if tiebreaker:
if flowerIndex == minDistIndex:
winner = flowerKey
break
else:
if flowerIndex == maxVotesIndex:
winner = flowerKey
break
# STEP 6
# finally, find line to append and add to arff file
appendToResults(index, winner)
# STEP 7
# reset the votes and dist sums for the next test sample
for i in range(len(voter['votes'])):
voter['votes'][i] = 0
voter['sums'][i] = 0
# In[8]:
# we want to run the algorithm for the first 5 kVals
# namely, k = {1,3,5,7,9}
# each call will append the result to the results.arff file
for kVals in range(1,10,2):
kNearestNeighbors(testDf, trainDf, kVals)