Merge pull request #3 from benjamsf/add-kwic
Add initial support for KWIC analysis argument parsing from the GUI
benjamsf authored Apr 9, 2023
2 parents 46c33a1 + 71936aa commit b6001b8
Showing 9 changed files with 66 additions and 102 deletions.
13 changes: 6 additions & 7 deletions lutherscripts/cli.py
@@ -13,13 +13,12 @@


def add_arguments(parser):
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, or kwic")
parser.add_argument("-1", "--first-detail", type=str, help="First detail flag for operation")
parser.add_argument("-2", "--second-detail", type=int, help="Second detail flag for operation")
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, or kwic")
parser.add_argument("-1", "--first-detail", type=str, help="First detail flag for operation, depends on the operation")
parser.add_argument("-2", "--second-detail", type=int, help="Second detail flag for operation, depends on the operation")
parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")


def sentence_tokenize_latin(source_path, destination_path):
from src.text_preparation.cltk_sentencetokenize_latin_arg import main as cltk_sentencetokenize_latin
output = cltk_sentencetokenize_latin(source_path, destination_path)
@@ -35,8 +34,8 @@ def kwic_analysis(keyword, context_size, source_path, destination_path):
output = nltk_do_kwic_analysis(keyword, context_size, source_path, destination_path)
print(output.encode('utf-8'))

def freq_analysis(source_path, destination_path)
from src.text_processing.nltk_do_freqanalysis import main as nltk_do_freqanalysis)
def freq_analysis(source_path, destination_path):
from src.text_processing.nltk_do_freqanalysis import main as nltk_do_freqanalysis
output = nltk_do_freqanalysis(source_path, destination_path)
print(output.encode('utf-8'))

@@ -57,7 +56,7 @@ def cli_main():
print("Both -1 and -2 flags must be provided for the KWIC operation.")
else:
kwic_analysis(args.first_detail, args.second_detail, source_path, destination_path)
elif args.operation == 'freq-analysis'
elif args.operation == 'freq_analysis':
freq_analysis(source_path, destination_path)

if __name__ == '__main__':
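
For reference, the new operation can be driven end to end the same way gui.py drives it, by spawning lutherscripts-cli as a subprocess. A minimal sketch; the keyword, context size, and file paths are hypothetical:

import subprocess

# Hypothetical invocation of the new kwic_analysis operation; "gratia",
# the context size 5, and both paths are illustrative, not repo files.
subprocess.run([
    "lutherscripts-cli",
    "-o", "kwic_analysis",
    "-1", "gratia",         # first detail: the keyword to search for
    "-2", "5",              # second detail: words of context per side
    "-s", "tokenized.txt",  # path to the source text file
    "-d", "kwic.json",      # path to the JSON output file
])
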
41 changes: 34 additions & 7 deletions lutherscripts/gui.py
@@ -38,7 +38,7 @@ def readline(self):

def create_image_label(parent, root, frames):
lbl_luther_image = tk.Label(parent, image=frames[0])
lbl_luther_image.grid(row=0, rowspan=5, column=0, padx=10, pady=10)
lbl_luther_image.grid(row=0, rowspan=6, column=0, padx=10, pady=10)

def update_image():
nonlocal ind
@@ -102,21 +102,32 @@ def gui_main():
lbl_selected_output_file = tk.Label(root, textvariable=output_file_label)
lbl_selected_output_file.grid(row=1, column=3, padx=10, pady=10)

# Entry fields for arguments 1 and 2
lbl_argument1 = tk.Label(root, text="Argument 1 and 2:")
lbl_argument1.grid(row=2, column=1, padx=4, pady=4)
ent_argument1 = tk.Entry(root, width=10)
ent_argument1.grid(row=2, column=2, padx=4, pady=4)
ent_argument2 = tk.Entry(root, width=10)
ent_argument2.grid(row=2, column=3, padx=4, pady=4)


# Choose Operation
lbl_operation = tk.Label(root, text="Choose Operation:")
lbl_operation.grid(row=2, column=1, padx=10, pady=10)
lbl_operation.grid(row=4, column=1, padx=10, pady=10)

options = [
("word_tokenize_latin", "Tokenize Latin text by words"),
("sent_tokenize_latin", "Tokenize Latin text by sentences"),
("nltk_do_kwic", "Perform KWIC analysis"),
("kwic_analysis", "Perform KWIC analysis"),
("freq_analysis", "Perform word frequency analysis")
]

def update_explanation(*args):
explanations = {
"Tokenize Latin text by words": "This operation will tokenize your Latin text by words, which is required for further word-based natural language processing.",
"Tokenize Latin text by sentences": "This operation will tokenize your Latin text by sentences, which is useful for sentence-based natural language processing.",
"Perform KWIC analysis": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text.",
"Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text."
}

selected_operation = var_operation.get()
@@ -127,12 +138,12 @@ def update_explanation(*args):
var_operation.set(options[0][1])
var_operation.trace("w", update_explanation)
opt_operation = tk.OptionMenu(root, var_operation, *[option[1] for option in options])
opt_operation.grid(row=2, column=2, padx=10, pady=10)
opt_operation.grid(row=4, column=2, padx=10, pady=10)

explanation_label = tk.StringVar()
explanation_label.set("This operation will tokenize your Latin text by words, which is required for further word-based natural language processing.")
lbl_explanation = tk.Label(root, textvariable=explanation_label, wraplength=300)
lbl_explanation.grid(row=2, column=3, padx=10, pady=10)
lbl_explanation.grid(row=4, column=3, padx=10, pady=10)

# function to choose file and store the location in a variable
def choose_file():
@@ -150,7 +161,7 @@ def choose_output_file():

txt_terminal = tk.Text(root, height=20, width=1000)
txt_terminal.configure(state='normal')  # keep the widget editable so output can be written into it
txt_terminal.grid(row=4, column=1, columnspan=3, padx=10, pady=10, sticky='nsew')
txt_terminal.grid(row=5, column=1, columnspan=3, padx=10, pady=10, sticky='nsew')
root.grid_columnconfigure(1, weight=1)
root.grid_rowconfigure(5, weight=1)
sys.stdout = CustomTextRedirector(txt_terminal)
@@ -173,12 +184,19 @@ async def run_script_async():
source_path = os.path.normpath(location_raw_sourcetext)
destination_path = os.path.normpath(location_output)
cli_command = ['lutherscripts-cli', '-o', operation_name, '-s', source_path, '-d', destination_path]
# Add argument 1 and argument 2 for KWIC analysis
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
cli_command.extend(["-1", argument1, "-2", argument2])

process = await asyncio.create_subprocess_exec(
*cli_command,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)


output = ''
while True:
char = await process.stdout.read(1)
@@ -210,6 +228,15 @@ def start_operation():
asyncio.set_event_loop(loop)
print("Starting operation...")
print("Please wait, this might take couple of seconds...")

# Check the selected operation and validate arguments for KWIC analysis
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis")
return

ind = 0

@@ -235,7 +262,7 @@ def update_image():

# Start Operation! button
btn_play = tk.Button(root, text="Start Operation!", command=start_operation)
btn_play.grid(row=3, column=3, padx=10, pady=10)
btn_play.grid(row=6, column=3, padx=10, pady=10)

# Print a welcome message to the terminal
print("Welcome to Lutherscripts!")
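
The start_operation handler above recovers the CLI operation name from the menu's human-readable label with a list comprehension. The same lookup, restated in isolation with an abbreviated options list:

# Map a menu label back to its CLI operation name, as start_operation does.
options = [
    ("word_tokenize_latin", "Tokenize Latin text by words"),
    ("kwic_analysis", "Perform KWIC analysis"),
]

def operation_for_label(label):
    # Equivalent to [o[0] for o in options if o[1] == label][0]
    return next(name for name, text in options if text == label)

assert operation_for_label("Perform KWIC analysis") == "kwic_analysis"
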
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py
@@ -1,18 +1,14 @@
import io
import os
import re
import json
from cltk import NLP
from tqdm import tqdm
import logging
import sys
import io


__author__ = "benjamsf"
__license__ = "MIT"

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

def main(source_path, destination_path, progress_callback=None):
# Instantiate a Latin-specific NLP object
cltk_nlp = NLP(language="lat")
@@ -40,24 +36,11 @@ def main(source_path, destination_path, progress_callback=None):
if progress_callback:
progress_callback(len(sentence_tokens) / len(text_chunks))

# Capture the output in a string buffer
with io.StringIO() as buffer:
for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
doc = cltk_nlp(chunk)
for sentence in doc.sentences:
sentence_text = ' '.join([word.string for word in sentence.words])
sentence_tokens.append(sentence_text.strip())

buffer.write('\n'.join(sentence_tokens))

# Save the tokenized output to a file
# Save the tokenized output to a JSON file
output_file = os.path.abspath(destination_path)
with open(output_file, 'w', encoding='utf-8') as f:
f.write(buffer.getvalue())
json.dump(sentence_tokens, f, ensure_ascii=False)

# Print a message to confirm that the file has been saved
print(f'The tokenized output has been saved as {output_file}')

# Return the output as a string
return buffer.getvalue()
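
With this change the tokenizer emits a JSON array of sentence strings rather than newline-joined plain text, so downstream scripts read it back with json.load. A minimal sketch, assuming a hypothetical output path:

import json

# "sentences.json" is a placeholder for the destination_path argument.
with open("sentences.json", encoding="utf-8") as f:
    sentence_tokens = json.load(f)
print(f"Loaded {len(sentence_tokens)} sentences")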

50 changes: 0 additions & 50 deletions lutherscripts/src/text_preparation/tokenize_latin_old.py

This file was deleted.

7 changes: 4 additions & 3 deletions lutherscripts/src/text_processing/nltk_do_freqanalysis.py
@@ -5,10 +5,8 @@
from collections import OrderedDict
from tqdm import tqdm

__author__ = "benjamsf"
__license__ = "MIT"

def main(source_path, destination_path):
def main(source_path, destination_path, progress_callback=None):
# Load the tokenized text from the source file
with open(source_path, 'r', encoding='utf-8') as f:
tokenized_text = json.load(f)
@@ -17,6 +15,8 @@ def main(source_path, destination_path):
fdist = FreqDist()
for token in tqdm(tokenized_text, desc="Creating frequency distribution", unit="token"):
fdist[token] += 1
if progress_callback:
progress_callback(fdist.N() / len(tokenized_text))

# Sort the frequency distribution by frequency
sorted_fdist = OrderedDict(fdist.most_common())
@@ -28,3 +28,4 @@
# Print a message to confirm that the file has been saved
print(f'The frequency analysis has been saved as {destination_path}')
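
The frequency pass is an incremental FreqDist build; fdist.N() counts every observation seen so far, which is why N()/len(tokenized_text) works as a progress ratio. A self-contained sketch with made-up tokens:

from collections import OrderedDict
from nltk import FreqDist

# Toy tokens standing in for the JSON-loaded tokenized text.
tokens = ["gratia", "fides", "gratia"]
fdist = FreqDist()
for token in tokens:
    fdist[token] += 1
sorted_fdist = OrderedDict(fdist.most_common())
print(sorted_fdist)  # OrderedDict([('gratia', 2), ('fides', 1)])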


29 changes: 18 additions & 11 deletions lutherscripts/src/text_processing/nltk_do_kwic.py
@@ -1,14 +1,14 @@
import json
import os
import nltk
from nltk import word_tokenize
from nltk.text import Text
from tqdm import tqdm

__author__ = "benjamsf"
__license__ = "MIT"


def main(keyword, context_size, source_path, destination_path):
def main(keyword, context_size, source_path, destination_path, progress_callback=None):
# Load the tokenized text from the source file
with open(source_path, 'r', encoding='utf-8') as f:
tokenized_text = f.read()
@@ -17,21 +17,28 @@ def main(keyword, context_size, source_path, destination_path):
text = Text(nltk.word_tokenize(tokenized_text))

# Perform KWIC analysis for the keyword
kwic_results = list(text.concordance_list(keyword, lines=nltk.ConcordanceIndex(text.tokens).count(keyword), width=context_size))
concordance = nltk.ConcordanceIndex(text.tokens)
kwic_results = []
keyword_occurrences = concordance.offsets(keyword)
for index in tqdm(keyword_occurrences, desc="Analyzing keyword occurrences"):
left_context = text.tokens[max(0, index-context_size):index]
right_context = text.tokens[index+1:index+1+context_size]
kwic_results.append({
"left_context": " ".join(left_context),
"keyword": keyword,
"right_context": " ".join(right_context)
})
if progress_callback:
progress_callback(len(kwic_results) / len(keyword_occurrences))

# Convert the KWIC results into JSON format
kwic_json = []
for result in kwic_results:
kwic_json.append({
"left_context": " ".join(result[0]),
"keyword": result[1],
"right_context": " ".join(result[2])
})
kwic_json = json.dumps(kwic_results, ensure_ascii=False, indent=2)

# Save the KWIC analysis as a JSON file
with open(destination_path, 'w', encoding='utf-8') as f:
json.dump(kwic_json, f, ensure_ascii=False, indent=2)
f.write(kwic_json)

# Print a message to confirm that the file has been saved
print(f'The KWIC analysis has been saved as {destination_path}')
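
Each element appended to kwic_results is a plain dict, so the saved file is a JSON array of records of this shape (the context words here are hypothetical):

# One illustrative KWIC record as written to the destination file.
example_record = {
    "left_context": "non solum libenter",
    "keyword": "apprehendisse",
    "right_context": "sed ultro etiam",
}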


5 changes: 1 addition & 4 deletions lutherscripts/txt/jalla.json
@@ -1,4 +1 @@
Quod tardius diatribae tuae de libero arbitrio respondeo , Venerabilis Erasme , praeter spem omnium , praeterque morem meum accidit , qui hactenus eiusmodi occasiones scribendi , non solum libenter apprehendisse , sed ultro etiam quaesiisse visus sum
Mirabitur forte quispiam novam illam et insolitam , vel patientiam , vel formidinem Lutheri , quem nec tot iactatae voces et literae adversariorum excitarunt , Erasmo victoriam congratulantes et Io pean cantantes , Scilicet Maccabaeus ille et pervicacissimus assertor , invenit tandem dignum antagonistam , contra quem hiscere non audet
Quod tardius diatribae tuae de libero arbitrio respondeo , Venerabilis Erasme , praeter spem omnium , praeterque morem meum accidit , qui hactenus eiusmodi occasiones scribendi , non solum libenter apprehendisse , sed ultro etiam quaesiisse visus sum
Mirabitur forte quispiam novam illam et insolitam , vel patientiam , vel formidinem Lutheri , quem nec tot iactatae voces et literae adversariorum excitarunt , Erasmo victoriam congratulantes et Io pean cantantes , Scilicet Maccabaeus ille et pervicacissimus assertor , invenit tandem dignum antagonistam , contra quem hiscere non audet
["Quod", "tardius", "diatribae", "tuae", "de", "libero", "arbitrio", "respondeo", "Venerabilis", "Erasme", "praeter", "spem", "omnium", "praeterque", "morem", "meum", "accidit", "qui", "hactenus", "eiusmodi", "occasiones", "scribendi", "non", "solum", "libenter", "apprehendisse", "sed", "ultro", "etiam", "quaesiisse", "visus", "sum", "Mirabitur", "forte", "quispiam", "novam", "illam", "et", "insolitam", "vel", "patientiam", "vel", "formidinem", "Lutheri", "quem", "nec", "tot", "iactatae", "voces", "et", "literae", "adversariorum", "excitarunt", "Erasmo", "victoriam", "congratulantes", "et", "Io", "pean", "cantantes", "Scilicet", "Maccabaeus", "ille", "et", "pervicacissimus", "assertor", "invenit", "tandem", "dignum", "antagonistam", "contra", "quem", "hiscere", "non", "audet"]
