Skip to content

Commit

Permalink
all nighter and finalize the code
Browse files Browse the repository at this point in the history
  • Loading branch information
dodoels committed Nov 29, 2018
1 parent 16ca2fc commit 3c7f28d
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 43 deletions.
117 changes: 79 additions & 38 deletions predict.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
from preprocessor import Preprocessor
import os
import re
import math
import pickle
import re


# Maps upper-case language codes to the display names used in trace output.
language = {
    'EN': 'English',
    'FR': 'French'
}
# Maps the n-gram order (1 or 2) to the model-file prefix used on disk.
model_type = {
    1: 'unigram',
    2: 'bigram'
}


class Predict:
    """Detects the language of sentences using pre-trained n-gram models."""

    def __init__(self, n, smooth=0.5):
        """Load every trained model for the requested n-gram order.

        :param n: n-gram order; only 1 (unigram) or 2 (bigram) is supported
        :param smooth: add-delta smoothing value applied to unseen grams
        """
        self.smooth = smooth
        self.n = n
        # Validate the order BEFORE touching the disk so an unsupported n
        # fails fast instead of loading model files it cannot use.
        if n < 1 or n > 2:
            print('Only support unigram or bigram.')
            os._exit(1)
        self.model_list = self.__load_model__()
        if len(self.model_list) == 0:
            print('Please train first.')
            os._exit(1)
Expand All @@ -26,40 +33,74 @@ def run(self):
p.run()
for file_name in os.listdir('temp/'):
with open('temp/' + file_name, 'r+', encoding='utf-8', errors='ignore') as test_file:
for sentence in test_file.read().split('\n'):
if sentence == '':
continue
result = self.__calculate_sentence_probability__(sentence)
for idx, sentence in enumerate(filter(None, test_file.read().split('\n'))):
# run prediction on every model
print_log = list()
sentence_probabilities = dict()
for model in self.model_list:
sentence_probability, log = self.__calculate_sentence_probability__(sentence, model)
print_log.append(log)
sentence_probabilities[model['lang']] = sentence_probability
detected = language[sorted(sentence_probabilities, key=lambda x: sentence_probabilities[x])[-1]
.upper()]
self.__save_trace__(sentence, idx, print_log, detected)

def __load_model__(self, prefix):
model_list = dict()
def __calculate_sentence_probability__(self, sentence, model):
    """Score *sentence* under one language *model*.

    Returns ``(log10_probability, trace)`` where the trace is a list of
    one-entry dicts mapping each gram to its formatted log line.
    """
    trace = list()
    order = model['n']
    words = sentence.split()
    # Bigrams get '#' word boundaries so the whole sentence can be scored
    # as one string (e.g. '#birds#build#nests#'); unigrams just use the
    # concatenated characters.
    if order == 2:
        flattened = '#' + '#'.join(words) + '#'
    else:
        flattened = ''.join(words)
    # Slide a window of size `order` over the string to collect the grams,
    # e.g. ['#b', 'bi', 'ir', 'rd', 'd#', ...].
    grams = [flattened[i:i + order] for i in range(len(flattened) - order + 1)]
    total = 0
    for gram in grams:
        trace.append({gram: list()})
        known = model['probability'].get(gram)
        if known is not None:
            # Gram was seen during training: use its stored probability.
            gram_probability = known
        else:
            # Unseen gram: fall back to the add-delta estimate.
            smoothed_vocab = float(len(model['probability'])) * float(self.smooth)
            gram_probability = float(self.smooth) / (float(model['size']) + smoothed_vocab)
        total += math.log(gram_probability, 10)
        trace[-1][gram].append(
            language[model['lang'].upper()].upper() + ': P('
            + '|'.join(reversed(gram))
            + ') = ' + str(gram_probability)
            + ' ==> log prob of sentence so far: ' + str(total))
    return total, trace

def __load_model__(self):
    """Load every pickled model in output/ matching the configured order.

    :return: list of model dicts as dumped by train.py (keys: probability,
        size, lang, n, type)
    """
    prefix = model_type[1] if self.n == 1 else model_type[2]
    model_list = list()
    for file_name in os.listdir('output/'):
        # Only pick up pickles for the right order, e.g. 'unigramEN.pkl'.
        # startswith instead of re.match: the prefix is a literal, not a regex.
        if file_name.startswith(prefix) and file_name.endswith('.pkl'):
            # 'rb' (not 'rb+'): the model file is only read, never written here.
            # NOTE(review): pickle.load trusts the file — fine for files this
            # project wrote itself; never point output/ at untrusted data.
            with open('output/' + file_name, 'rb') as model:
                model_list.append(pickle.load(model))
    return model_list

# NOTE(review): this is the pre-commit version of the scorer, superseded by
# __calculate_sentence_probability__(self, sentence, model) in this same diff.
def __calculate_sentence_probability__(self, sentence):
    # One running log-probability per trained language model.
    sentence_probability = dict()
    for lang in self.model_list.keys():
        sentence_probability[lang] = 0
        for term in sentence.split():
            if term == '':
                continue
            sentence_probability[lang] += self.__calculate_term_probability(term, lang)
            # NOTE(review): stops after the first term — looks like leftover
            # debugging; confirm before reusing this version.
            break
        # NOTE(review): stops after the first language as well.
        break
    return sentence_probability
def __save_trace__(self, sentence, idx, print_log, detected):
    """Write the per-gram scoring trace for one sentence to output/out<idx+1>.txt.

    :param sentence: the raw input sentence
    :param idx: zero-based sentence index within the current test file
    :param print_log: one trace list per model, as returned by
        __calculate_sentence_probability__ — parallel lists of {gram: [line]}
    :param detected: display name of the winning language
    """
    # NOTE(review): the file name only uses the per-file sentence index, so a
    # later test file's sentence i overwrites an earlier file's outi+1.txt —
    # confirm this is intended.
    with open('output/out' + str(idx + 1) + '.txt', 'w+', encoding='utf-8', errors='ignore') as st_log:
        st_log.write(sentence + '\n\n')
        st_log.write(model_type[self.n].upper() + ' MODEL:\n')
        # Walk gram positions (j) of the first model's trace; all models are
        # assumed to produce traces of equal length over the same grams.
        for j in range(len(print_log[0])):
            for gram, gram_log in print_log[0][j].items():
                st_log.write('\n' + model_type[self.n].upper() + ': ' + gram + '\n')
            # For each model (i), emit its formatted line for gram position j.
            for i in range(len(print_log)):
                for s in print_log[i][j].values():
                    st_log.write(s[0] + '\n')
        st_log.write('According to the ' + model_type[self.n] + ' model, the sentence is in ' + detected)
        print(sentence + ' [' + detected + ']')
        st_log.write('\n---------------- ')

def __calculate_term_probability(self, term, lang):
    """Sum the natural-log probabilities of *term*'s n-grams for *lang*.

    :param term: a single whitespace-free token
    :param lang: key into self.model_list
    :return: the accumulated log probability of the term
    """
    if self.n > 1:
        # Mark word boundaries so edge grams like '#b' and 'd#' are scored.
        term = '#' + term + '#'
    grams = [term[i:i + self.n] for i in range(len(term) - self.n + 1)]
    term_probability = 0
    for gram in grams:
        # NOTE(review): raises KeyError for grams absent from the model; the
        # newer scorer in this diff applies add-delta smoothing here instead.
        term_probability += math.log(self.model_list[lang][gram])
    # Bug fix: the accumulated value was previously discarded ('return 0').
    return term_probability


18 changes: 13 additions & 5 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ def __init__(self, n, lang, smooth=0.5):
self.frequency = dict()
self.probability = dict()
self.smooth = smooth
if n < 1 or n > 2:
print('Only support unigram or bigram.')
os._exit(1)
preprocessor = Preprocessor()
preprocessor.run()

Expand All @@ -32,7 +35,7 @@ def __create_dictionary__(self):
# for every term in the corpus
for term in corpus.read().split():
# for every character in the term
if self.n > 1:
if self.n == 2:
term = '#' + term + '#'
for i in range(len(term) - self.n + 1):
# create term and set frequency, or increase by 1 if it exists
Expand All @@ -50,15 +53,20 @@ def __calculate_gram_probability__(self):
def __dump_model__(self):
    """Persist the trained model: human-readable .txt plus a .pkl payload.

    Writes output/<prefix><LANG>.txt with one 'P(x|y) = p' line per gram,
    and output/<prefix><LANG>.pkl with everything predict.py needs.
    """
    # n is validated to be 1 or 2 in __init__, so two branches suffice
    # (the old 'self.n + "gram"' fallback was an int+str TypeError anyway).
    prefix = 'unigram' if self.n == 1 else 'bigram'
    base = 'output/' + prefix + self.lang.upper()
    with open(base + '.txt', 'w+', encoding='utf-8', errors='ignore') as model:
        for gram in sorted(self.probability):
            # Conditional-probability notation: gram 'xy' prints as P(y|x).
            key = '|'.join(reversed(gram))
            model.write('P(' + key + ') = ' + str(self.probability[gram]) + '\n')
    with open(base + '.pkl', 'wb+') as model:
        payload = {
            'probability': self.probability,
            # Total gram count of the corpus, needed for smoothing at predict time.
            'size': sum(self.frequency.values()),
            'lang': self.lang,
            'n': self.n,
            'type': prefix,
        }
        pickle.dump(payload, model)


0 comments on commit 3c7f28d

Please sign in to comment.