From 453fbe11c632e71d1edbe7ec9db56fab36b931ba Mon Sep 17 00:00:00 2001
From: Shubh Goyal
Date: Tue, 2 Apr 2024 14:32:49 +0530
Subject: [PATCH 1/4] Adds Spello

- Adds new spello model to spell_check module
---
 src/spell_check/spello/README.md              |   0
 src/spell_check/spello/local/Dockerfile       |  26 ++++
 src/spell_check/spello/local/README.md        |  38 +++++
 src/spell_check/spello/local/__init__.py      |   2 +
 src/spell_check/spello/local/api.py           |  46 ++++++
 src/spell_check/spello/local/model.py         | 137 ++++++++++++++++++
 src/spell_check/spello/local/request.py       |  10 ++
 src/spell_check/spello/local/requirements.txt |   4 +
 8 files changed, 263 insertions(+)
 create mode 100644 src/spell_check/spello/README.md
 create mode 100644 src/spell_check/spello/local/Dockerfile
 create mode 100644 src/spell_check/spello/local/README.md
 create mode 100644 src/spell_check/spello/local/__init__.py
 create mode 100644 src/spell_check/spello/local/api.py
 create mode 100644 src/spell_check/spello/local/model.py
 create mode 100644 src/spell_check/spello/local/request.py
 create mode 100644 src/spell_check/spello/local/requirements.txt

diff --git a/src/spell_check/spello/README.md b/src/spell_check/spello/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/src/spell_check/spello/local/Dockerfile b/src/spell_check/spello/local/Dockerfile
new file mode 100644
index 0000000..ea5cf8b
--- /dev/null
+++ b/src/spell_check/spello/local/Dockerfile
@@ -0,0 +1,26 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install system packages required for building kenlm
+RUN apt-get update && apt-get install -y cmake g++ zlib1g-dev
+
+# Install requirements
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# Install wget
+RUN apt-get update && apt-get install -y wget
+
+# Download the files using wget
+RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
+RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
+
+# Copy the rest of the application code to the working directory
+COPY . /app/
+
+EXPOSE 8000
+
+# Set the entrypoint for the container
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
diff --git a/src/spell_check/spello/local/README.md b/src/spell_check/spello/local/README.md
new file mode 100644
index 0000000..4902903
--- /dev/null
+++ b/src/spell_check/spello/local/README.md
@@ -0,0 +1,38 @@
+**curl request for inference:**
+
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
+"lang" : "ory"
+}' http://localhost:8000/
+
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି"
+}' http://172.17.0.2:8000/
+
+curl -X POST -H "Content-Type: application/json" -d '{
+"text": "how to apply for go-sugem scheme for my paddi crop",
+"lang" : "eng"
+}' http://localhost:8000/
+
+
+
+**curl request for update:**
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": "ମିଶନରୀ",
+"lang" : "ory"
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": "go-sugem",
+"lang" : "eng"
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": ["how to apply for", "scheme for my paddi crop"],
+"lang" : "eng"
+}' http://localhost:8000/
diff --git a/src/spell_check/spello/local/__init__.py b/src/spell_check/spello/local/__init__.py
new file mode 100644
index 0000000..661113c
--- /dev/null
+++ b/src/spell_check/spello/local/__init__.py
@@ -0,0 +1,2 @@
+from .request import *
+from .model import *
diff --git a/src/spell_check/spello/local/api.py b/src/spell_check/spello/local/api.py
new file mode 100644
index 0000000..d189645
--- /dev/null
+++ b/src/spell_check/spello/local/api.py
@@ -0,0 +1,46 @@
+from model import Model
+from request import ModelRequest
+from quart import Quart, request
+import aiohttp
+
+app = Quart(__name__)
+
+model = None
+
+freq_dict_paths = {
+    'ory': 'freq_dict.txt',
+    'eng': 'freq_dict_eng.txt'
+}
+
+spello_model_paths = {
+    'ory': 'spello_model.pkl',
+    'eng': 'spello_model_eng.pkl'
+}
+
+
+@app.before_serving
+async def startup():
+    app.client = aiohttp.ClientSession()
+    global model
+    model = Model(app, freq_dict_paths)
+
+@app.route('/', methods=['POST'])
+async def infer():
+    global model
+    data = await request.get_json()
+    req = ModelRequest(**data)
+    result = await model.inference(req)
+    return result
+
+@app.route('/', methods=['PUT'])
+async def update():
+    # print("PUT")
+    global model
+    data = await request.get_json()
+    req = ModelRequest(**data)
+    result = await model.update(req)
+    return result
+
+
+if __name__ == "__main__":
+    app.run()
diff --git a/src/spell_check/spello/local/model.py b/src/spell_check/spello/local/model.py
new file mode 100644
index 0000000..5258c9b
--- /dev/null
+++ b/src/spell_check/spello/local/model.py
@@ -0,0 +1,137 @@
+from request import ModelRequest
+from spello.model import SpellCorrectionModel
+
+from collections import Counter
+
+
+freq_dict_paths = {
+    'ory': 'freq_dict.txt',
+    'eng': 'freq_dict_eng.txt'
+}
+
+spello_model_paths = {
+    'ory': 'spello_model.pkl',
+    'eng': 'spello_model_eng.pkl'
+}
+
+class TextCorrector:
+    # def __init__(self, freq_dict_paths):
+    def __init__(self, freq_dict_paths):
+        self.models = {
+            'ory': self.create_spello_model(freq_dict_paths['ory'], 'or'),
+            'eng': self.create_spello_model(freq_dict_paths['eng'], 'en')
+        }
+
+        self.freq_dict_paths = freq_dict_paths
+
+        # Set the default language
+        self.set_language('ory')
+
+    def set_language(self, lang):
+        # Switch the model and vocabulary based on language
+        self.model = self.models[lang]
+
+    def load_freq_dict(self, freq_dict_path):
+        freq_dict = {}
+
+        # read the frequency dictionary file
+        with open(freq_dict_path, 'r') as f:
+            freq_file = f.read().splitlines()
+
+        # create a dictionary from the frequency file
+        for line in freq_file:
+            word, freq = line.split()
+            freq_dict[word] = int(freq)
+
+        return freq_dict
+
+    def create_spello_model(self, freq_dict_path, language):
+        # load the frequency dictionary
+        freq_dict = self.load_freq_dict(freq_dict_path)
+
+        # create the spello model and train it
+        spello_model = SpellCorrectionModel(language=language)
+        spello_model.train(freq_dict)
+        # print('Loading model')
+
+        return spello_model
+
+    def make_correct_text(self, text, correction_dict):
+        corrected_list = text.split()
+        for i in range(len(corrected_list)):
+            word = corrected_list[i]
+            if word in correction_dict:
+                corrected_list[i] = correction_dict[word]
+
+        corrected_text = ' '.join(corrected_list)
+
+        return corrected_text
+
+    def correct_text_with_spello(self, text):
+        result = self.model.spell_correct(text)
+
+        corrected_text = result['spell_corrected_text']
+        correction_dict = result['correction_dict']
+
+        corrected_text = self.make_correct_text(corrected_text, correction_dict)
+
+        return corrected_text
+
+    def make_updation_counter(self, text):
+
+        if type(text) == list:
+            text = ' '.join(text)
+
+        # remove punctuations from the text
+        text = ''.join(e for e in text if e.isalnum() or e.isspace())
+        words = text.split()
+
+        # create a dictionary of words and their frequencies
+        dict = Counter(words)
+
+        return dict
+
+    def update_model(self, lang, text):
+        # update the frequency dictionary
+        current_freq_dict_counter = Counter(self.load_freq_dict(self.freq_dict_paths[lang]))
+        new_freq_dict_counter = self.make_updation_counter(text)
+
+        # merge the two frequency dictionaries
+        freq_dict_counter = current_freq_dict_counter + new_freq_dict_counter
+
+        freq_dict = {}
+        for word, freq in freq_dict_counter.items():
+            freq_dict[word] = int(freq)
+
+        with open(self.freq_dict_paths[lang], 'w') as f:
+            for word, freq in freq_dict.items():
+                f.write(word + ' ' + str(freq) + '\n')
+
+        # retrain the model with the updated frequency dictionary
+        self.models[lang].train(freq_dict)
+
+        return 'Model updated successfully'
+
+class Model():
+    def __init__(self, context, freq_dict_paths):
+        self.context = context
+        # self.text_corrector = TextCorrector(freq_dict_paths)
+        self.text_corrector = TextCorrector(freq_dict_paths)
+
+    async def inference(self, request: ModelRequest):
+        # Set the correct language model based on the request
+        self.text_corrector.set_language(request.lang)
+
+        corrected_text = self.text_corrector.correct_text_with_spello(
+            request.text
+        )
+        return corrected_text
+
+    async def update(self, request: ModelRequest):
+        # Set the correct language model based on the request
+        self.text_corrector.set_language(request.lang)
+
+        # Update the model with the new data
+        self.text_corrector.update_model(request.lang, request.text)
+
+        return 'Model updated successfully'
\ No newline at end of file
diff --git a/src/spell_check/spello/local/request.py b/src/spell_check/spello/local/request.py
new file mode 100644
index 0000000..1cd2a3f
--- /dev/null
+++ b/src/spell_check/spello/local/request.py
@@ -0,0 +1,10 @@
+import requests
+import json
+
+class ModelRequest():
+    def __init__(self, text, lang='ory'):
+        self.text = text
+        self.lang = lang
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
diff --git a/src/spell_check/spello/local/requirements.txt b/src/spell_check/spello/local/requirements.txt
new file mode 100644
index 0000000..41d690a
--- /dev/null
+++ b/src/spell_check/spello/local/requirements.txt
@@ -0,0 +1,4 @@
+quart
+aiohttp
+requests
+spello
\ No newline at end of file
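
For a quick local sanity check of the library calls that the new model.py wraps (SpellCorrectionModel, .train(), .spell_correct()), a minimal standalone sketch is shown below. The frequency counts are made-up placeholders; the service itself trains on the freq_dict.txt / freq_dict_eng.txt files downloaded in the Dockerfile above.

```python
from spello.model import SpellCorrectionModel

# Placeholder word frequencies -- the real service loads these from freq_dict_eng.txt.
freq_dict = {"how": 300, "to": 280, "apply": 120, "for": 260,
             "scheme": 95, "my": 180, "paddy": 40, "crop": 60}

sm = SpellCorrectionModel(language="en")   # 'en' / 'or', as in create_spello_model()
sm.train(freq_dict)                        # same dict-of-counts form used by model.py

result = sm.spell_correct("how to apply for scheme for my paddi crop")
print(result["spell_corrected_text"])      # corrected sentence
print(result["correction_dict"])           # e.g. {"paddi": "paddy"} if the correction fires
```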
From efff2128cb3a1e6dcb1971c1d2d5f1411fb50211 Mon Sep 17 00:00:00 2001
From: Shubh Goyal
Date: Tue, 2 Apr 2024 15:09:39 +0530
Subject: [PATCH 2/4] Updates kenlm

- adds symspell for word suggestions to the existing kenlm model
---
 src/spell_check/kenlm/local/Dockerfile |  2 +
 src/spell_check/kenlm/local/README.md  | 27 +++++++-
 src/spell_check/kenlm/local/api.py     | 16 ++++-
 src/spell_check/kenlm/local/model.py   | 93 +++++++++++++++++++++++---
 src/spell_check/kenlm/local/request.py |  8 +++
 5 files changed, 133 insertions(+), 13 deletions(-)

diff --git a/src/spell_check/kenlm/local/Dockerfile b/src/spell_check/kenlm/local/Dockerfile
index d792430..62a218d 100644
--- a/src/spell_check/kenlm/local/Dockerfile
+++ b/src/spell_check/kenlm/local/Dockerfile
@@ -17,8 +17,10 @@ RUN apt-get update && apt-get install -y wget
 # Download the files using wget
 RUN wget "https://drive.google.com/uc?export=download&id=1frSw5-qfRMgrYs4QL961s2yYuq2KplEM" -O '5gram_model.bin'
 RUN wget "https://drive.google.com/uc?export=download&id=1o31Z4TZbAOEt6E8Rx7VMONJOGJH-5Mwk" -O 'lexicon.txt'
+RUN wget "https://drive.google.com/uc?export=download&id=14cMmeDPlAODbRe37CdHLnhClGX7JXG-A" -O 'freq_dict.txt'
 RUN wget "https://drive.google.com/uc?export=download&id=1-Dtk5socjYdeGyqhbQzG-rvWJfWVFGqv" -O '5gram_model_eng.bin'
 RUN wget "https://drive.google.com/uc?export=download&id=1-59pDTvEXCMUZ-NQ8BwmCnHQZh4Eg6Gw" -O 'lexicon_eng.txt'
+RUN wget "https://drive.google.com/uc?export=download&id=1Ztj6k0A4BMi_o87qwSDKJQ6cyhvlvneD" -O 'freq_dict_eng.txt'
 
 # Copy the rest of the application code to the working directory
 COPY . /app/
diff --git a/src/spell_check/kenlm/local/README.md b/src/spell_check/kenlm/local/README.md
index a4708cd..6dcc49d 100644
--- a/src/spell_check/kenlm/local/README.md
+++ b/src/spell_check/kenlm/local/README.md
@@ -1,4 +1,4 @@
-.curl request :
+**curl request for inference:**
 
 curl -X POST -H "Content-Type: application/json" -d '{
 "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
@@ -8,7 +8,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
 "lang" : "ory"
 }' http://localhost:8000/
 
-
 curl -X POST -H "Content-Type: application/json" -d '{
 "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି",
 "BEAM_WIDTH": 5,
@@ -16,7 +15,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
 "max_distance": 1
 }' http://localhost:8000/
 
-
 curl -X POST -H "Content-Type: application/json" -d '{
 "text": "how to apply for go-sugem scheme for my paddi crop",
 "BEAM_WIDTH": 5,
@@ -24,3 +22,26 @@ curl -X POST -H "Content-Type: application/json" -d '{
 "max_distance": 1,
 "lang" : "eng"
 }' http://localhost:8000/
+
+
+**curl request for update:**
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": "ମିଶନରୀ",
+"lang" : "ory"
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": ["ପାମ ମିଶନରୀ ଉପରେ", "ରିହାତି ଧୈର୍ଯ ହୋଇଛି"]
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": "go-sugem",
+"lang" : "eng"
+}' http://localhost:8000/
+
+curl -X PUT -H "Content-Type: application/json" -d '{
+"text": ["how to apply for", "scheme for my paddi crop"],
+"lang" : "eng"
+}' http://localhost:8000/
diff --git a/src/spell_check/kenlm/local/api.py b/src/spell_check/kenlm/local/api.py
index 87230c0..de05f4a 100644
--- a/src/spell_check/kenlm/local/api.py
+++ b/src/spell_check/kenlm/local/api.py
@@ -1,5 +1,5 @@
 from model import Model
-from request import ModelRequest
+from request import ModelRequest, ModelUpdateRequest
 from quart import Quart, request
 import aiohttp
 
@@ -17,13 +17,17 @@
     'eng': 'lexicon_eng.txt'
 }
 
+freq_dict_paths = {
+    'ory': 'freq_dict.txt',
+    'eng': 'freq_dict_eng.txt'
+}
 
 @app.before_serving
 async def startup():
     app.client = aiohttp.ClientSession()
     global model
-    model = Model(app, model_paths, vocab_paths)
+    model = Model(app, model_paths, vocab_paths, freq_dict_paths)
 
 @app.route('/', methods=['POST'])
 async def embed():
@@ -33,5 +37,13 @@ async def embed():
     result = await model.inference(req)
     return result
 
+@app.route('/', methods=['PUT'])
+async def update():
+    global model
+    data = await request.get_json()
+    req = ModelUpdateRequest(**data)
+    result = await model.update_symspell(req)
+    return result
+
 if __name__ == "__main__":
     app.run()
diff --git a/src/spell_check/kenlm/local/model.py b/src/spell_check/kenlm/local/model.py
index 353af92..bd58a50 100644
--- a/src/spell_check/kenlm/local/model.py
+++ b/src/spell_check/kenlm/local/model.py
@@ -1,7 +1,11 @@
 import kenlm
-from request import ModelRequest
+from request import ModelRequest, ModelUpdateRequest
 import Levenshtein
 
+from symspellpy import SymSpell, Verbosity
+
+from collections import Counter
+
 model_paths = {
     'ory': '5gram_model.bin',
     'eng': '5gram_model_eng.bin'
@@ -12,9 +16,14 @@
     'eng': 'lexicon_eng.txt'
 }
 
+freq_dict_paths = {
+    'ory': 'freq_dict.txt',
+    'eng': 'freq_dict_eng.txt'
+}
+
 class TextCorrector:
-    def __init__(self, model_paths, vocab_paths):
+    def __init__(self, model_paths, vocab_paths, freq_dict_paths):
         # Initialize both models and vocabularies
         self.models = {
             'ory': kenlm.Model(model_paths['ory']),
@@ -24,6 +33,11 @@ def __init__(self, model_paths, vocab_paths):
             'ory': self.create_vocab_lexicon(vocab_paths['ory']),
             'eng': self.create_vocab_lexicon(vocab_paths['eng'])
         }
+
+        self.symspell_models = {
+            'ory': self.create_symspell_model(freq_dict_paths['ory']),
+            'eng': self.create_symspell_model(freq_dict_paths['eng'])
+        }
 
         # Set the default language
         self.set_language('ory')
@@ -31,6 +45,7 @@ def set_language(self, lang):
         # Switch the model and vocabulary based on language
         self.model = self.models[lang]
         self.vocab = self.vocabs[lang]
+        self.symspell_model = self.symspell_models[lang]
 
     def create_vocab_lexicon(self, lexicon_path):
         vocabulary = []
@@ -40,14 +55,23 @@ def create_vocab_lexicon(self, lexicon_path):
             vocabulary.append(word)
         return vocabulary
 
+    def create_symspell_model(self, freq_dict_path):
+        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
+        sym_spell.load_dictionary(freq_dict_path, term_index=0, count_index=1, separator=' ')
+        return sym_spell
+
+    # def generate_candidates(self, word, max_distance=1):
+    #     len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
+    #     filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
+    #     return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]
+
     def generate_candidates(self, word, max_distance=1):
-        len_range = range(len(word) - max_distance, len(word) + max_distance + 1)
-        filtered_vocab = [vocab_word for vocab_word in self.vocab if len(vocab_word) in len_range]
-        return [vocab_word for vocab_word in filtered_vocab if 0 <= Levenshtein.distance(word, vocab_word) <= max_distance]
+        suggestions = self.symspell_model.lookup(word, Verbosity.CLOSEST, max_distance)
+        return [suggestion.term for suggestion in suggestions]
 
     def beam_search(self, chunk, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5, max_distance=1):
         original_score = self.model.score(' '.join(chunk))
-        
+
         initial_candidates = self.generate_candidates(chunk[0], max_distance=1)
         if not initial_candidates:
             initial_candidates = [chunk[0]]
@@ -88,11 +112,55 @@ def correct_text_with_beam_search(self, text, BEAM_WIDTH=5, SCORE_THRESHOLD=1.5,
             corrected_sentences.append(best_sentence)
 
         return ' '.join(corrected_sentences)
+
+    def load_freq_dict(self, freq_dict_path):
+        freq_dict = {}
+        with open(freq_dict_path, 'r') as f:
+            for line in f:
+                word, freq = line.split()
+                freq_dict[word] = int(freq)
+        return freq_dict
+
+    def make_updation_counter(self, text):
+
+        if type(text) == list:
+            text = ' '.join(text)
+
+        # remove punctuations from the text
+        text = ''.join(e for e in text if e.isalnum() or e.isspace())
+        words = text.split()
+
+        # create a dictionary of words and their frequencies
+        dict = Counter(words)
+
+        return dict
+
+    def update_symspell_model(self, lang, text):
+        # update the frequency dictionary
+        current_freq_dict_counter = Counter(self.load_freq_dict(freq_dict_paths[lang]))
+        new_freq_dict_counter = self.make_updation_counter(text)
+
+        # merge the two frequency dictionaries
+        freq_dict_counter = current_freq_dict_counter + new_freq_dict_counter
+
+        freq_dict = {}
+        for word, freq in freq_dict_counter.items():
+            freq_dict[word] = int(freq)
+
+        with open(freq_dict_paths[lang], 'w') as f:
+            for word, freq in freq_dict.items():
+                f.write(word + ' ' + str(freq) + '\n')
+
+        # retrain the model with the updated frequency dictionary
+        self.symspell_models[lang] = self.create_symspell_model(freq_dict_paths[lang])
+
+        return 'Model updated successfully'
+
 
 class Model():
-    def __init__(self, context, model_paths, vocab_paths):
+    def __init__(self, context, model_paths, vocab_paths, freq_dict_paths):
         self.context = context
-        self.text_corrector = TextCorrector(model_paths, vocab_paths)
+        self.text_corrector = TextCorrector(model_paths, vocab_paths, freq_dict_paths)
 
     async def inference(self, request: ModelRequest):
         # Set the correct language model based on the request
@@ -105,3 +173,12 @@ async def inference(self, request: ModelRequest):
             max_distance=request.max_distance
         )
         return corrected_text
+
+    async def update_symspell(self, request: ModelUpdateRequest):
+        # Set the correct language model based on the request
+        self.text_corrector.set_language(request.lang)
+
+        # Update the model with the new data
+        self.text_corrector.update_symspell_model(request.lang, request.text)
+
+        return 'Model updated successfully'
diff --git a/src/spell_check/kenlm/local/request.py b/src/spell_check/kenlm/local/request.py
index 2e00903..c5d6fac 100644
--- a/src/spell_check/kenlm/local/request.py
+++ b/src/spell_check/kenlm/local/request.py
@@ -11,3 +11,11 @@ def __init__(self, text, BEAM_WIDTH, SCORE_THRESHOLD, max_distance, lang='ory'):
 
     def to_json(self):
         return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
+
+class ModelUpdateRequest():
+    def __init__(self, text, lang='ory'):
+        self.text = text
+        self.lang = lang
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

From ccf39a40c97a30e592fb31209cc87b6b60708574 Mon Sep 17 00:00:00 2001
From: Shubh Goyal
Date: Tue, 2 Apr 2024 15:10:23 +0530
Subject: [PATCH 3/4] updates spello README.md

- minor error (ip address was pushed in earlier commit), changes it to
  localhost now
---
 src/spell_check/spello/local/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/spell_check/spello/local/README.md b/src/spell_check/spello/local/README.md
index 4902903..2e6fbb5 100644
--- a/src/spell_check/spello/local/README.md
+++ b/src/spell_check/spello/local/README.md
@@ -7,7 +7,7 @@ curl -X POST -H "Content-Type: application/json" -d '{
 
 curl -X POST -H "Content-Type: application/json" -d '{
 "text": "ପାମ ମିଶନରୀ ଉପରେ କେତେ % ରିହାତି ଧୈର୍ଯ ହୋଇଛି"
-}' http://172.17.0.2:8000/
+}' http://localhost:8000/
 
 curl -X POST -H "Content-Type: application/json" -d '{
 "text": "how to apply for go-sugem scheme for my paddi crop",
@@ -15,7 +15,6 @@ curl -X POST -H "Content-Type: application/json" -d '{
 }' http://localhost:8000/
 
 
-
 **curl request for update:**
 
 curl -X PUT -H "Content-Type: application/json" -d '{

From f0237de842cfea380dc9ca993bfa7db571ea04c9 Mon Sep 17 00:00:00 2001
From: Shubh Goyal
Date: Tue, 2 Apr 2024 15:12:49 +0530
Subject: [PATCH 4/4] updates kenlm requirements.txt

---
 src/spell_check/kenlm/local/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/spell_check/kenlm/local/requirements.txt b/src/spell_check/kenlm/local/requirements.txt
index a77fa31..c2cc708 100644
--- a/src/spell_check/kenlm/local/requirements.txt
+++ b/src/spell_check/kenlm/local/requirements.txt
@@ -1,4 +1,5 @@
 quart
 aiohttp
 python-Levenshtein
-requests
\ No newline at end of file
+requests
+symspellpy
\ No newline at end of file
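
For reference, the curl requests in the two READMEs map one-to-one onto a small Python client. The sketch below is an assumption-laden example, not part of the patches: it assumes whichever container (spello or kenlm) is running has port 8000 published on localhost, exactly as in the curl examples, and it uses requests, which both requirements.txt files already list.

```python
import requests

BASE_URL = "http://localhost:8000/"  # assumes the container's port 8000 is published locally

# Spello service: POST takes just the text and a language code ('ory' or 'eng').
r = requests.post(BASE_URL, json={"text": "how to apply for go-sugem scheme for my paddi crop",
                                  "lang": "eng"})
print(r.text)  # corrected sentence

# KenLM service: POST additionally carries the beam-search parameters.
r = requests.post(BASE_URL, json={"text": "how to apply for go-sugem scheme for my paddi crop",
                                  "BEAM_WIDTH": 5, "SCORE_THRESHOLD": 1.5,
                                  "max_distance": 1, "lang": "eng"})
print(r.text)

# Both services: PUT merges new text (a string or a list of strings) into the
# frequency dictionary and retrains, returning 'Model updated successfully'.
r = requests.put(BASE_URL, json={"text": ["how to apply for", "scheme for my paddi crop"],
                                 "lang": "eng"})
print(r.text)
```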