From 3c7f28d9dd4833e161db39e37aa99c62ac7d981e Mon Sep 17 00:00:00 2001 From: choitwao Date: Thu, 29 Nov 2018 09:02:41 -0500 Subject: [PATCH] all nighter and finalize the code --- predict.py | 117 ++++++++++++++++++++++++++++++++++++----------------- train.py | 18 ++++++--- 2 files changed, 92 insertions(+), 43 deletions(-) diff --git a/predict.py b/predict.py index eb6424f..3f3cde1 100644 --- a/predict.py +++ b/predict.py @@ -1,22 +1,29 @@ from preprocessor import Preprocessor import os -import re import math import pickle +import re + + +language = { + 'EN': 'English', + 'FR': 'French' +} +model_type = { + 1: 'unigram', + 2: 'bigram' +} class Predict: - def __init__(self, n): + def __init__(self, n, smooth=0.5): + self.smooth = smooth self.n = n - if self.n == 1: - prefix = 'unigram' - elif self.n == 2: - prefix = 'bigram' - else: - prefix = str(self.n) + 'gram' - self.model_list = self.__load_model__(prefix) - print(self.model_list) + self.model_list = self.__load_model__() + if n < 1 or n > 2: + print('Only support unigram or bigram.') + os._exit(1) if len(self.model_list) == 0: print('Please train first.') os._exit(1) @@ -26,40 +33,74 @@ def run(self): p.run() for file_name in os.listdir('temp/'): with open('temp/' + file_name, 'r+', encoding='utf-8', errors='ignore') as test_file: - for sentence in test_file.read().split('\n'): - if sentence == '': - continue - result = self.__calculate_sentence_probability__(sentence) + for idx, sentence in enumerate(filter(None, test_file.read().split('\n'))): + # run prediction on every model + print_log = list() + sentence_probabilities = dict() + for model in self.model_list: + sentence_probability, log = self.__calculate_sentence_probability__(sentence, model) + print_log.append(log) + sentence_probabilities[model['lang']] = sentence_probability + detected = language[sorted(sentence_probabilities, key=lambda x: sentence_probabilities[x])[-1] + .upper()] + self.__save_trace__(sentence, idx, print_log, detected) - def __load_model__(self, prefix): - model_list = dict() + def __calculate_sentence_probability__(self, sentence, model): + log = list() + # append boundary when n = 2 + # treat the sentence as a word since all just addition + # i.e. #birds#build#nests# + if model['n'] == 2: + p_sentence = '#'.join(filter(None, sentence.split())) + p_sentence = '#' + p_sentence + '#' + else: + p_sentence = ''.join(filter(None, sentence.split())) + # get ['#b', 'bi', 'ir' 'rd', 'd#' ...., 's#'] + grams = [p_sentence[i:i + model['n']] for i in range(len(p_sentence) - model['n'] + 1)] + # get the probabilities of each gram + sentence_probability = 0 + for gram in grams: + log.append({ + gram: list() + }) + # if the gram is in the dictionary, get the probability + if gram in model['probability']: + gram_probability = model['probability'][gram] + # if not, calculate the add delta value + else: + denominator = float(model['size']) + ( + float(len(model['probability'])) * float(self.smooth)) + numerator = float(self.smooth) + gram_probability = numerator / denominator + sentence_probability += math.log(gram_probability, 10) + log[-1][gram].append(language[model['lang'].upper()].upper() + ': P(' + '|' + .join(reversed([char for char in gram])) + + ') = ' + str(gram_probability) + ' ==> log prob of sentence so far: ' + + str(sentence_probability)) + return sentence_probability, log + + def __load_model__(self): + model_list = list() + prefix = model_type[1] if self.n == 1 else model_type[2] for file_name in os.listdir('output/'): - print(file_name) if re.match(prefix, file_name) and file_name.endswith('.pkl'): with open('output/' + file_name, 'rb+') as model: - lang = file_name.replace(prefix, '').replace('.pkl', '') - model_list[lang] = pickle.load(model) + model_list.append(pickle.load(model)) return model_list - def __calculate_sentence_probability__(self, sentence): - sentence_probability = dict() - for lang in self.model_list.keys(): - sentence_probability[lang] = 0 - for term in sentence.split(): - if term == '': - continue - sentence_probability[lang] += self.__calculate_term_probability(term, lang) - break - break - return sentence_probability + def __save_trace__(self, sentence, idx, print_log, detected): + with open('output/out' + str(idx + 1) + '.txt', 'w+', encoding='utf-8', errors='ignore') as st_log: + st_log.write(sentence + '\n\n') + st_log.write(model_type[self.n].upper() + ' MODEL:\n') + for j in range(len(print_log[0])): + for gram, gram_log in print_log[0][j].items(): + st_log.write('\n' + model_type[self.n].upper() + ': ' + gram + '\n') + for i in range(len(print_log)): + for s in print_log[i][j].values(): + st_log.write(s[0] + '\n') + st_log.write('According to the ' + model_type[self.n] + ' model, the sentence is in ' + detected) + print(sentence + ' [' + detected + ']') + st_log.write('\n---------------- ') - def __calculate_term_probability(self, term, lang): - if self.n > 1: - term = '#' + term + '#' - grams = [term[i:i + self.n] for i in range(len(term) - self.n + 1)] - term_probability = 0 - for gram in grams: - term_probability += math.log(self.model_list[lang][gram]) - return 0 diff --git a/train.py b/train.py index fd8cadb..edea2f9 100644 --- a/train.py +++ b/train.py @@ -12,6 +12,9 @@ def __init__(self, n, lang, smooth=0.5): self.frequency = dict() self.probability = dict() self.smooth = smooth + if n < 1 or n > 2: + print('Only support unigram or bigram.') + os._exit(1) preprocessor = Preprocessor() preprocessor.run() @@ -32,7 +35,7 @@ def __create_dictionary__(self): # for every term in the corpus for term in corpus.read().split(): # for every character in the term - if self.n > 1: + if self.n == 2: term = '#' + term + '#' for i in range(len(term) - self.n + 1): # create term and set frequency, or increase by 1 if it exists @@ -50,15 +53,20 @@ def __calculate_gram_probability__(self): def __dump_model__(self): if self.n == 1: prefix = 'unigram' - elif self.n == 2: - prefix = 'bigram' else: - prefix = self.n + 'gram' + prefix = 'bigram' with open('output/' + prefix + self.lang.upper() + '.txt', 'w+', encoding='utf-8', errors='ignore') as model: for gram in sorted(self.probability): key = '|'.join(reversed([char for char in gram])) model.write('P(' + key + ') = ' + str(self.probability[gram]) + '\n') with open('output/' + prefix + self.lang.upper() + '.pkl', 'wb+') as model: - pickle.dump(self.probability, model) + s = { + 'probability': self.probability, + 'size': sum(self.frequency.values()), + 'lang': self.lang, + 'n': self.n, + 'type': prefix + } + pickle.dump(s, model)