Commit: add working cli
benjamsf committed Apr 7, 2023
1 parent 7a91e05 commit 09b1027
Showing 19 changed files with 293 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -3,13 +3,13 @@
*.DS

# Sourcetext
txt/*txt
lutherscripts/txt/*txt

# Output text
output/*.txt
lutherscripts/output/*.txt

# Output
output/*.pkl
lutherscripts/output/*.pkl

# Prerequisites
*.d
Binary file not shown.
68 changes: 68 additions & 0 deletions lutherscripts-gui/gui.py
@@ -0,0 +1,68 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
import os
import subprocess

# create the main window
root = tk.Tk()
root.title("NLP Text Analysis")

# create the tabs
tab_parent = ttk.Notebook(root)
tab1 = ttk.Frame(tab_parent)
tab2 = ttk.Frame(tab_parent)
tab3 = ttk.Frame(tab_parent)
tab_parent.add(tab1, text="Config")
tab_parent.add(tab2, text="Text Preparation")
tab_parent.add(tab3, text="Text Processing")
tab_parent.pack(expand=1, fill='both')

# create widgets for Config tab
lbl_raw_sourcetext = tk.Label(tab1, text="Choose raw source text:")
lbl_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)
btn_raw_sourcetext = tk.Button(tab1, text="Browse...", command=lambda: choose_file())
btn_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)

# function to choose a file and store its location in a module-level variable
location_raw_sourcetext = None

def choose_file():
    global location_raw_sourcetext
    location_raw_sourcetext = filedialog.askopenfilename(title="Select a File")
    print(f"File selected: {location_raw_sourcetext}")

# create widgets for Text Preparation tab
lbl_operation = tk.Label(tab2, text="Choose Operation:")
lbl_operation.pack(side=tk.LEFT, padx=10, pady=10)
options = ["cltk_sentencetokenize_latin.py", "cltk_wordtokenize_latin.py"]
var_operation = tk.StringVar(tab2)
var_operation.set(options[0])
opt_operation = tk.OptionMenu(tab2, var_operation, *options)
opt_operation.pack(side=tk.LEFT, padx=10, pady=10)

def run_script():
    global location_raw_sourcetext, process
    if location_raw_sourcetext is None:
        txt_terminal.insert(tk.END, "Please choose a source file first.\n")
        return
    script_name = var_operation.get()
    # The tokenizer scripts live under lutherscripts/src/text_preparation,
    # a sibling of this lutherscripts-gui directory
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "../lutherscripts/src/text_preparation", script_name)
    # Pass arguments as a list so paths with spaces survive without shell quoting
    process = subprocess.Popen(["python", script_path, location_raw_sourcetext],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE)
    # Note: this read loop blocks the Tk event loop until the script finishes
    while True:
        output = process.stdout.readline()
        if output == b'' and process.poll() is not None:
            break
        if output:
            txt_terminal.insert(tk.END, output.decode('utf-8'))
    rc = process.poll()
    print(f"Return code: {rc}")

btn_play = tk.Button(tab2, text="Play!", command=run_script)
btn_play.pack(side=tk.LEFT, padx=10, pady=10)

txt_terminal = tk.Text(tab2, height=10, width=50)
txt_terminal.pack(side=tk.BOTTOM, padx=10, pady=10)

# create widgets for Text Processing tab
lbl_nothing = tk.Label(tab3, text="Nothing so far.")
lbl_nothing.pack(side=tk.LEFT, padx=10, pady=10)

# start the GUI
root.mainloop()
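The read loop in run_script blocks the Tk event loop until the child process exits, so the window freezes during long tokenizations. Below is a minimal non-blocking sketch, not part of the commit: it assumes the root, txt_terminal, tk, and subprocess names defined in gui.py above, and the helper name and 100 ms polling interval are illustrative.

import queue
import threading

def run_script_async(cmd):
    # Stream subprocess output into txt_terminal without blocking Tk:
    # a background thread reads lines into a queue, and the Tk event
    # loop drains the queue via root.after().
    q = queue.Queue()

    def reader():
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT, text=True)
        for line in proc.stdout:
            q.put(line)
        proc.wait()
        q.put(None)  # sentinel: the process has finished

    def poll():
        try:
            while True:
                line = q.get_nowait()
                if line is None:
                    return  # sentinel seen, stop polling
                txt_terminal.insert(tk.END, line)
        except queue.Empty:
            pass
        root.after(100, poll)  # check the queue again in 100 ms

    threading.Thread(target=reader, daemon=True).start()
    poll()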
51 changes: 51 additions & 0 deletions lutherscripts/cli.py
@@ -0,0 +1,51 @@
import argparse

def add_arguments_sentencetokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def add_arguments_wordtokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def sentence_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_sentencetokenize_latin_arg import main as cltk_sentencetokenize_latin
    output = cltk_sentencetokenize_latin(source_path, destination_path)
    print(output)

def word_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_wordtokenize_latin_arg import main as cltk_wordtokenize_latin
    output = cltk_wordtokenize_latin(source_path, destination_path)
    print(output)


def main():
    parser = argparse.ArgumentParser(description="NLP script launcher")

    subparsers = parser.add_subparsers(dest='subparser_name')

    sent_tokenize_parser = subparsers.add_parser('sent_tokenize_latin', help='Tokenize Latin text into sentences')
    add_arguments_sentencetokenize(sent_tokenize_parser)

    word_tokenize_parser = subparsers.add_parser('word_tokenize_latin', help='Tokenize Latin text into words')
    add_arguments_wordtokenize(word_tokenize_parser)

    args = parser.parse_args()

    # Dispatch once; each handler prints the tokenizer's returned output
    if args.subparser_name == 'sent_tokenize_latin':
        sentence_tokenize_latin(args.source_path, args.destination_path)
    elif args.subparser_name == 'word_tokenize_latin':
        word_tokenize_latin(args.source_path, args.destination_path)
    else:
        parser.print_help()

if __name__ == '__main__':
    main()
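With the subparsers in place the launcher can be driven from a shell. These invocations are illustrative (the file names are assumed, not taken from the commit); running from the repository root works because Python puts the script's own directory, which contains src/, at the front of sys.path:

python lutherscripts/cli.py sent_tokenize_latin -s lutherscripts/txt/dsa_prepared.txt -d lutherscripts/output/dsa_sentences.txt
python lutherscripts/cli.py word_tokenize_latin -s lutherscripts/txt/dsa_prepared.txt -d lutherscripts/output/dsa_words.txt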
1 change: 0 additions & 1 deletion lutherscripts/output/dsa_prepared_2.txt

This file was deleted.

3 changes: 3 additions & 0 deletions lutherscripts/src/install/install_fasttext-model.py
@@ -1,6 +1,9 @@
import os
import urllib.request

__author__ = "benjamsf"
__license__ = "MIT"

def download_fasttext_model():
print("Downloading Fasttext model for Latin...")

3 changes: 3 additions & 0 deletions lutherscripts/src/install/install_stanza-model.py
@@ -1,5 +1,8 @@
import stanza

__author__ = "benjamsf"
__license__ = "MIT"

def download_stanza_model():
print("Downloading Stanza model for Latin...")
stanza.download('la')
Binary file not shown.
Binary file not shown.
Binary file not shown.
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin.py
@@ -1,11 +1,13 @@
# Python script to prepare Latin text from a UTF-8 text file into a form usable in NLP
# Stage 2b, tokenize prepared text by sentences
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re
from cltk import NLP
from tqdm import tqdm
import argparse

# Instantiate a Latin-specific NLP object
cltk_nlp = NLP(language="lat")
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py
@@ -0,0 +1,55 @@
import os
import re
import logging
from cltk import NLP
from tqdm import tqdm

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    cltk_nlp = NLP(language="lat")

    logging.basicConfig(level=logging.ERROR)

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    logging.info(f"Reading input from file: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()
    logging.info(f"Input file contents:\n{input_text}")

    # Split the input text into smaller chunks based on punctuation,
    # dropping the empty strings re.split leaves after trailing delimiters
    chunk_delimiters = r'[.!?]+'
    text_chunks = [c for c in re.split(chunk_delimiters, input_text) if c.strip()]

    # Process the text_chunks with cltk_nlp and update the progress bar
    sentence_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
        doc = cltk_nlp(chunk)
        for sentence in doc.sentences:
            sentence_text = ' '.join([word.string for word in sentence.words])
            sentence_tokens.append(sentence_text.strip())

    # Join the sentence tokens into one newline-separated string
    output_text = '\n'.join(sentence_tokens)

    # Save the tokenized output to a file
    output_file = os.path.abspath(destination_path)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    # Print a message to confirm that the file has been saved
    print(f'The tokenized output has been saved as {output_file}')

    # Return the output as a string
    return output_text
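A quick illustration of the chunking step on toy input (not corpus text): re.split leaves an empty string after a trailing delimiter, which is why the list comprehension above filters on c.strip().

import re

chunks = re.split(r'[.!?]+', "Ita vero. Quid est?")
print(chunks)                            # ['Ita vero', ' Quid est', '']
print([c for c in chunks if c.strip()])  # ['Ita vero', ' Quid est']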
4 changes: 2 additions & 2 deletions lutherscripts/src/text_preparation/cltk_wordtokenize_latin.py
@@ -1,6 +1,6 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 2: tokenize the prepared text
# (c) Benjam Bröijer, licensed under the MIT License
__author__ = "benjamsf"
__license__ = "MIT"

import os
import string
53 changes: 53 additions & 0 deletions lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py
@@ -0,0 +1,53 @@
import os
import re
import logging
from cltk import NLP
from tqdm import tqdm

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    logging.basicConfig(level=logging.ERROR)
    cltk_nlp = NLP(language="lat")

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()

    # Split the input text into smaller chunks based on punctuation,
    # dropping the empty strings re.split leaves after trailing delimiters
    chunk_delimiters = r'[.!?]+'
    text_chunks = [c for c in re.split(chunk_delimiters, input_text) if c.strip()]

    # Process the text_chunks with cltk_nlp and collect the word tokens
    word_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing words"):
        doc = cltk_nlp(chunk)
        word_tokens.extend(word.string for word in doc.words)

    # Join the word tokens into one newline-separated string
    output_text = '\n'.join(word_tokens)

    # Save the tokenized output to a file
    output_file = os.path.abspath(destination_path)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    # Print a message to confirm that the file has been saved
    print(f'The tokenized output has been saved as {output_file}')

    # Return the output as a string
    return output_text


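Because main() takes plain paths and returns the tokenized text as a string, the module can also be called directly, bypassing the CLI. A hypothetical usage sketch, with illustrative file names; the import assumes the working directory is lutherscripts/, mirroring how cli.py resolves src.text_preparation:

from src.text_preparation.cltk_wordtokenize_latin_arg import main as word_tokenize

# Tokenize a prepared source text and keep the returned string
tokens_text = word_tokenize("txt/dsa_prepared.txt", "output/dsa_words.txt")
print(tokens_text.splitlines()[:10])  # first ten word tokens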
@@ -1,7 +1,9 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 1: Prepare the text
# Removes all square brackets, including page and verse numbers, and removes newlines & whitespace
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re

@@ -1,7 +1,9 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 1: Prepare the text
# Removes any square brackets [] and their contents, except for page and verse numbers
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re

3 changes: 3 additions & 0 deletions lutherscripts/src/text_processing/booknlp_run_booknlp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from booknlp.booknlp import BookNLP

__author__ = "benjamsf"
__license__ = "MIT"

model_params={
"pipeline":"entity,quote,supersense,event,coref",
"model":"big"
Empty file.
40 changes: 40 additions & 0 deletions lutherscripts/src/text_processing/nltk_do_kwic.py
@@ -0,0 +1,40 @@
import json
import os
import pickle

import nltk
from nltk.text import Text

__author__ = "benjamsf"
__license__ = "MIT"

# Define the source and destination file names and paths
script_path = os.path.dirname(os.path.abspath(__file__))
source_path = os.path.join(script_path, '../../output/dsa_tokenized.pkl')
destination_path = os.path.join(script_path, '../../output/dsa_kwic_analysis.json')

# Load the pickled token list from the source file
with open(source_path, 'rb') as f:
    tokenized_text = pickle.load(f)

# Create an NLTK Text object from the token list
text = Text(tokenized_text)

# Perform KWIC analysis for the keyword, requesting as many lines as
# there are occurrences so no match is dropped
keyword = "spe"
num_hits = len(nltk.ConcordanceIndex(text.tokens).offsets(keyword))
kwic_results = text.concordance_list(keyword, lines=num_hits)

# Convert the KWIC results into JSON format
kwic_json = []
for result in kwic_results:
    kwic_json.append({
        "left_context": " ".join(result.left),
        "keyword": result.query,
        "right_context": " ".join(result.right),
    })

# Save the KWIC analysis as a JSON file
with open(destination_path, 'w', encoding='utf-8') as f:
    json.dump(kwic_json, f, ensure_ascii=False, indent=2)

# Print a message to confirm that the file has been saved
print(f'The KWIC analysis has been saved as {destination_path}')
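A toy check of the concordance API used above, assuming NLTK 3.5 or newer, where concordance_list returns ConcordanceLine namedtuples whose left, query, and right fields hold token lists:

from nltk.text import Text

# Three hits for "spe"; each line carries its left/right context tokens
toy = Text("in spe contra spem in spe vivimus".split())
for line in toy.concordance_list("spe"):
    print(" ".join(line.left), "|", line.query, "|", " ".join(line.right))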
