From 3c7f28d9dd4833e161db39e37aa99c62ac7d981e Mon Sep 17 00:00:00 2001
From: choitwao <rkevincai@hotmail.com>
Date: Thu, 29 Nov 2018 09:02:41 -0500
Subject: [PATCH] all nighter and finalize the code

---
 predict.py | 117 ++++++++++++++++++++++++++++++++++++-----------------
 train.py   |  18 ++++++---
 2 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/predict.py b/predict.py
index eb6424f..3f3cde1 100644
--- a/predict.py
+++ b/predict.py
@@ -1,22 +1,29 @@
 from preprocessor import Preprocessor
 import os
-import re
 import math
 import pickle
+import re
+
+
+language = {
+    'EN': 'English',
+    'FR': 'French'
+}
+model_type = {
+    1: 'unigram',
+    2: 'bigram'
+}
 
 
 class Predict:
 
-    def __init__(self, n):
+    def __init__(self, n, smooth=0.5):
+        self.smooth = smooth
         self.n = n
-        if self.n == 1:
-            prefix = 'unigram'
-        elif self.n == 2:
-            prefix = 'bigram'
-        else:
-            prefix = str(self.n) + 'gram'
-        self.model_list = self.__load_model__(prefix)
-        print(self.model_list)
+        self.model_list = self.__load_model__()
+        if n < 1 or n > 2:
+            print('Only support unigram or bigram.')
+            os._exit(1)
         if len(self.model_list) == 0:
             print('Please train first.')
             os._exit(1)
@@ -26,40 +33,74 @@ def run(self):
         p.run()
         for file_name in os.listdir('temp/'):
             with open('temp/' + file_name, 'r+', encoding='utf-8', errors='ignore') as test_file:
-                for sentence in test_file.read().split('\n'):
-                    if sentence == '':
-                        continue
-                    result = self.__calculate_sentence_probability__(sentence)
+                for idx, sentence in enumerate(filter(None, test_file.read().split('\n'))):
+                    # run prediction on every model
+                    print_log = list()
+                    sentence_probabilities = dict()
+                    for model in self.model_list:
+                        sentence_probability, log = self.__calculate_sentence_probability__(sentence, model)
+                        print_log.append(log)
+                        sentence_probabilities[model['lang']] = sentence_probability
+                    detected = language[sorted(sentence_probabilities, key=lambda x: sentence_probabilities[x])[-1]
+                        .upper()]
+                    self.__save_trace__(sentence, idx, print_log, detected)
 
-    def __load_model__(self, prefix):
-        model_list = dict()
+    def __calculate_sentence_probability__(self, sentence, model):
+        log = list()
+        # add '#' word-boundary markers when n = 2
+        # treat the whole sentence as one string, since the log probabilities of its grams simply add up
+        # e.g. #birds#build#nests#
+        if model['n'] == 2:
+            p_sentence = '#'.join(filter(None, sentence.split()))
+            p_sentence = '#' + p_sentence + '#'
+        else:
+            p_sentence = ''.join(filter(None, sentence.split()))
+        # get ['#b', 'bi', 'ir', 'rd', 'ds', ..., 's#']
+        grams = [p_sentence[i:i + model['n']] for i in range(len(p_sentence) - model['n'] + 1)]
+        # get the probabilities of each gram
+        sentence_probability = 0
+        for gram in grams:
+            log.append({
+                gram: list()
+            })
+            # if the gram is in the dictionary, get the probability
+            if gram in model['probability']:
+                gram_probability = model['probability'][gram]
+            # if not, fall back to the add-delta smoothed probability of an unseen gram
+            else:
+                denominator = float(model['size']) + (
+                        float(len(model['probability'])) * float(self.smooth))
+                numerator = float(self.smooth)
+                gram_probability = numerator / denominator
+            sentence_probability += math.log(gram_probability, 10)
+            log[-1][gram].append(language[model['lang'].upper()].upper() + ': P(' + '|'
+                                 .join(reversed([char for char in gram]))
+                                 + ') = ' + str(gram_probability) + '  ==> log prob of sentence so far: '
+                                 + str(sentence_probability))
+        return sentence_probability, log
+
+    def __load_model__(self):
+        model_list = list()
+        prefix = model_type[1] if self.n == 1 else model_type[2]
         for file_name in os.listdir('output/'):
-            print(file_name)
             if re.match(prefix, file_name) and file_name.endswith('.pkl'):
                 with open('output/' + file_name, 'rb+') as model:
-                    lang = file_name.replace(prefix, '').replace('.pkl', '')
-                    model_list[lang] = pickle.load(model)
+                    model_list.append(pickle.load(model))
         return model_list
 
-    def __calculate_sentence_probability__(self, sentence):
-        sentence_probability = dict()
-        for lang in self.model_list.keys():
-            sentence_probability[lang] = 0
-            for term in sentence.split():
-                if term == '':
-                    continue
-                sentence_probability[lang] += self.__calculate_term_probability(term, lang)
-                break
-            break
-        return sentence_probability
+    def __save_trace__(self, sentence, idx, print_log, detected):
+        with open('output/out' + str(idx + 1) + '.txt', 'w+', encoding='utf-8', errors='ignore') as st_log:
+            st_log.write(sentence + '\n\n')
+            st_log.write(model_type[self.n].upper() + ' MODEL:\n')
+            for j in range(len(print_log[0])):
+                for gram, gram_log in print_log[0][j].items():
+                    st_log.write('\n' + model_type[self.n].upper() + ': ' + gram + '\n')
+                    for i in range(len(print_log)):
+                        for s in print_log[i][j].values():
+                            st_log.write(s[0] + '\n')
+            st_log.write('According to the ' + model_type[self.n] + ' model, the sentence is in ' + detected)
+            print(sentence + ' [' + detected + ']')
+            st_log.write('\n---------------- ')
 
-    def __calculate_term_probability(self, term, lang):
-        if self.n > 1:
-            term = '#' + term + '#'
-        grams = [term[i:i + self.n] for i in range(len(term) - self.n + 1)]
-        term_probability = 0
-        for gram in grams:
-            term_probability += math.log(self.model_list[lang][gram])
-        return 0
 
 
diff --git a/train.py b/train.py
index fd8cadb..edea2f9 100644
--- a/train.py
+++ b/train.py
@@ -12,6 +12,9 @@ def __init__(self, n, lang, smooth=0.5):
         self.frequency = dict()
         self.probability = dict()
         self.smooth = smooth
+        if n < 1 or n > 2:
+            print('Only support unigram or bigram.')
+            os._exit(1)
         preprocessor = Preprocessor()
         preprocessor.run()
 
@@ -32,7 +35,7 @@ def __create_dictionary__(self):
                 # for every term in the corpus
                 for term in corpus.read().split():
                     # for every character in the term
-                    if self.n > 1:
+                    if self.n == 2:
                         term = '#' + term + '#'
                     for i in range(len(term) - self.n + 1):
                         # create term and set frequency, or increase by 1 if it exists
@@ -50,15 +53,20 @@ def __calculate_gram_probability__(self):
     def __dump_model__(self):
         if self.n == 1:
             prefix = 'unigram'
-        elif self.n == 2:
-            prefix = 'bigram'
         else:
-            prefix = self.n + 'gram'
+            prefix = 'bigram'
         with open('output/' + prefix + self.lang.upper() + '.txt', 'w+', encoding='utf-8', errors='ignore') as model:
             for gram in sorted(self.probability):
                 key = '|'.join(reversed([char for char in gram]))
                 model.write('P(' + key + ') = ' + str(self.probability[gram]) + '\n')
         with open('output/' + prefix + self.lang.upper() + '.pkl', 'wb+') as model:
-            pickle.dump(self.probability, model)
+            s = {
+                'probability': self.probability,
+                'size': sum(self.frequency.values()),
+                'lang': self.lang,
+                'n': self.n,
+                'type': prefix
+            }
+            pickle.dump(s, model)