-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_trie.py
50 lines (43 loc) · 1.9 KB
/
create_trie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import csv
from Trie import Trie
def makeMappings(path):
trie = Trie()
count = 0
# with open('./unigram_freq_words_only.csv') as f:
with open(path) as f:
reader = csv.reader(f)
for row in reader:
trie.insert(row[0], count)
count += 1
with open('./mappings.csv', 'w') as f:
writer = csv.writer(f)
# titles
writer.writerow(["from", "to", "letters saved", "percentage of word saved", "word precedence"])
# not a word and has a unique path should return the prefix -> prefix + full unique path suffix
def dfs(node, prefix):
if len(node.children) == 0:
return node.char
currPre = prefix + node.char
if len(node.children) == 1:
for child in node.children.values():
suffix = dfs(child, currPre)
from_chars = currPre + suffix[0]
to_chars = currPre + suffix
if not node.is_word:
return node.char + suffix
if len(suffix) > 1:
lettDiff = len(to_chars) - len(from_chars)
lettRat = lettDiff / len(to_chars)
writer.writerow([from_chars, to_chars, lettDiff, lettRat, trie.getFreq(to_chars)])
return node.char
else:
for child in node.children.values():
suffix = dfs(child, currPre)
if len(suffix) > 1:
from_chars = currPre + suffix[0]
to_chars = currPre + suffix
lettDiff = len(to_chars) - len(from_chars)
lettRat = lettDiff / len(to_chars)
writer.writerow([from_chars, to_chars, lettDiff, lettRat, trie.getFreq(to_chars)])
return node.char
dfs(trie.root, "")