-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathperf_comparison.py
138 lines (118 loc) · 3.81 KB
/
perf_comparison.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# import time
# from sklearn.metrics.pairwise import euclidean_distances
import time
from strsimpy.cosine import Cosine
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.damerau import Damerau
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.metric_lcs import MetricLCS
import pandas as pd
import numpy as np
# import nltk
# import sklearn
# from numpy import argmax
# # nltk.download('punkt')
# Dataset whose `name` column every metric below is scored against.
df = pd.read_csv('./new_dataset2.csv')

# Benchmark queries, lower-cased once up front so the scoring functions can
# compare them directly against lower-cased dataset names.
search_strings = [
    query.lower()
    for query in (
        'Corn Nuts Ranch Bag',
        'Jolly Time Popcorn',
        'Roasted Turkey Gravy',
        'Simply Organic Seasoning',
        'M&b Curry',
        'French Fries with Sauce',
        'Biscuits and Tea',
        'Potato Chips',
        'Chocolate Ice Cream',
        'Choco Chip Cookies with Nuts',
        'Cheese Popcorn',
        'Cheese Sandwich',
        'Gelatin Free Cookies',
        'Jam and Butter',
        'Organic Olive Oil for cooking',
    )
]

# One shared instance per string metric, reused across every query.
normalized_levenshtein = NormalizedLevenshtein()
cosine = Cosine(2)  # constructor argument 2: presumably the shingle length k
damerau = Damerau()
jarowinkler = JaroWinkler()
metric_lcs = MetricLCS()

# Collected timings: one list per query, filled in by the driver loop.
time_values = []
# print(df.head())
# print(normalized_levenshtein.distance("ABCDE", "ABCE"))
def calNL(inputString):
    """Score every dataset name against the query with normalized Levenshtein.

    Prints the three entries with the smallest distance and returns the time
    spent scoring, in whole milliseconds. Sorting and printing happen after
    the clock is stopped, so they are not part of the measurement.
    """
    started = time.perf_counter()
    scored = [
        [
            normalized_levenshtein.distance(
                inputString, record['name'].lower()),
            record['name'],
        ]
        for _, record in df.iterrows()
    ]
    finished = time.perf_counter()

    # Ascending: a smaller distance means a closer match.
    scored.sort()
    print(f'Normalized Levenshtein: {scored[:3]}\n')
    return int(round((finished - started) * 1000))
def calCo(inputString):
    """Score every dataset name against the query with cosine similarity.

    Prints the three entries with the highest similarity and returns the time
    spent scoring, in whole milliseconds. Sorting and printing happen after
    the clock is stopped, so they are not part of the measurement.
    """
    started = time.perf_counter()
    # The query profile is the same for every row, so build it once per call
    # instead of once per row. It stays inside the timed region because it is
    # still part of the cost of answering this query.
    query_profile = cosine.get_profile(inputString)
    scored = [
        [
            cosine.similarity_profiles(
                query_profile, cosine.get_profile(record['name'].lower())),
            record['name'],
        ]
        # iterrows() is kept so the per-row loop overhead matches the other
        # metric benchmarks in this file.
        for _, record in df.iterrows()
    ]
    finished = time.perf_counter()

    # Descending: a larger similarity means a closer match.
    scored.sort(reverse=True)
    print(f'Cosine Similarity: {scored[:3]}\n')
    return int(round((finished - started) * 1000))
def calJW(inputString):
    """Score every dataset name against the query with Jaro-Winkler similarity.

    Prints the three entries with the highest similarity and returns the time
    spent scoring, in whole milliseconds. Sorting and printing happen after
    the clock is stopped, so they are not part of the measurement.
    """
    started = time.perf_counter()
    scored = [
        [
            jarowinkler.similarity(inputString, record['name'].lower()),
            record['name'],
        ]
        for _, record in df.iterrows()
    ]
    finished = time.perf_counter()

    # Descending: a larger similarity means a closer match.
    scored.sort(reverse=True)
    print(f'Jaro Winkler: {scored[:3]}\n')
    return int(round((finished - started) * 1000))
def calDA(inputString):
    """Score every dataset name against the query with Damerau distance.

    Prints the three entries with the smallest distance and returns the time
    spent scoring, in whole milliseconds. Sorting and printing happen after
    the clock is stopped, so they are not part of the measurement.
    """
    started = time.perf_counter()
    scored = [
        [
            damerau.distance(inputString, record['name'].lower()),
            record['name'],
        ]
        for _, record in df.iterrows()
    ]
    finished = time.perf_counter()

    # Ascending: a smaller distance means a closer match.
    scored.sort()
    print(f'Damerau Lev: {scored[:3]}\n')
    return int(round((finished - started) * 1000))
def calMLCS(inputString):
    """Score every dataset name against the query with Metric LCS distance.

    Prints the three entries with the smallest distance and returns the time
    spent scoring, in whole milliseconds. Sorting and printing happen after
    the clock is stopped, so they are not part of the measurement.
    """
    started = time.perf_counter()
    scored = [
        [
            # Lower-case the candidate, as the other metric functions do.
            # The queries are already lower-cased, so comparing against the
            # raw name would count case differences as mismatches and skew
            # the ranking.
            metric_lcs.distance(inputString, record['name'].lower()),
            record['name'],  # original casing is kept for display
        ]
        for _, record in df.iterrows()
    ]
    finished = time.perf_counter()

    # Ascending: a smaller distance means a closer match.
    scored.sort()
    print(f'Metric LCS: {scored[:3]}\n')
    return int(round((finished - started) * 1000))
# Benchmarks in the order their timings appear in each result row.
benchmarks = (calNL, calCo, calJW, calDA, calMLCS)

for query in search_strings:
    print(f'\n{query}----------------\n')
    # Row layout: [query, NL ms, cosine ms, Jaro-Winkler ms, Damerau ms, LCS ms]
    timings = [benchmark(query) for benchmark in benchmarks]
    time_values.append([query] + timings)

print('-------runtimes in order-----')
for timing_row in time_values:
    print(timing_row)