Merge pull request #3 from benjamsf/add-kwic
Add initial support for KWIC analysis argument parsing from the GUI
benjamsf authored Apr 9, 2023
2 parents 46c33a1 + 71936aa commit b6001b8
Showing 9 changed files with 66 additions and 102 deletions.
13 changes: 6 additions & 7 deletions lutherscripts/cli.py
@@ -13,13 +13,12 @@


def add_arguments(parser):
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, or kwic")
parser.add_argument("-1", "--first-detail", type=str, help="First detail flag for operation")
parser.add_argument("-2", "--second-detail", type=int, help="Second detail flag for operation")
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, or kwic")
parser.add_argument("-1", "--first-detail", type=str, help="First detail flag for operation, depends on the operation")
parser.add_argument("-2", "--second-detail", type=int, help="Second detail flag for operation, depends on the operation")
parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")


def sentence_tokenize_latin(source_path, destination_path):
from src.text_preparation.cltk_sentencetokenize_latin_arg import main as cltk_sentencetokenize_latin
output = cltk_sentencetokenize_latin(source_path, destination_path)
@@ -35,8 +34,8 @@ def kwic_analysis(keyword, context_size, source_path, destination_path):
output = nltk_do_kwic_analysis(keyword, context_size, source_path, destination_path)
print(output.encode('utf-8'))

def freq_analysis(source_path, destination_path)
from src.text_processing.nltk_do_freqanalysis import main as nltk_do_freqanalysis)
def freq_analysis(source_path, destination_path):
from src.text_processing.nltk_do_freqanalysis import main as nltk_do_freqanalysis
output = nltk_do_freqanalysis(source_path, destination_path)
print(output.encode('utf-8'))

@@ -57,7 +56,7 @@ def cli_main():
print("Both -1 and -2 flags must be provided for the KWIC operation.")
else:
kwic_analysis(args.first_detail, args.second_detail, source_path, destination_path)
elif args.operation == 'freq-analysis'
elif args.operation == 'freq_analysis':
freq_analysis(source_path, destination_path)

if __name__ == '__main__':
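
For reference, the new operation can be driven end to end the same way gui.py drives it, by spawning lutherscripts-cli as a subprocess. A minimal sketch; the keyword, context size, and file paths are hypothetical:

import subprocess

# Hypothetical invocation of the new kwic_analysis operation; "gratia",
# the context size 5, and both paths are illustrative, not repo files.
subprocess.run([
    "lutherscripts-cli",
    "-o", "kwic_analysis",
    "-1", "gratia",         # first detail: the keyword to search for
    "-2", "5",              # second detail: words of context per side
    "-s", "tokenized.txt",  # path to the source text file
    "-d", "kwic.json",      # path to the JSON output file
])
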
41 changes: 34 additions & 7 deletions lutherscripts/gui.py
@@ -38,7 +38,7 @@ def readline(self):

def create_image_label(parent, root, frames):
lbl_luther_image = tk.Label(parent, image=frames[0])
lbl_luther_image.grid(row=0, rowspan=5, column=0, padx=10, pady=10)
lbl_luther_image.grid(row=0, rowspan=6, column=0, padx=10, pady=10)

def update_image():
nonlocal ind
@@ -102,21 +102,32 @@ def gui_main():
lbl_selected_output_file = tk.Label(root, textvariable=output_file_label)
lbl_selected_output_file.grid(row=1, column=3, padx=10, pady=10)

# Entry fields for arguments 1 and 2
lbl_argument1 = tk.Label(root, text="Argument 1 and 2:")
lbl_argument1.grid(row=2, column=1, padx=4, pady=4)
ent_argument1 = tk.Entry(root, width=10)
ent_argument1.grid(row=2, column=2, padx=4, pady=4)
ent_argument2 = tk.Entry(root, width=10)
ent_argument2.grid(row=2, column=3, padx=4, pady=4)


# Choose Operation
lbl_operation = tk.Label(root, text="Choose Operation:")
lbl_operation.grid(row=2, column=1, padx=10, pady=10)
lbl_operation.grid(row=4, column=1, padx=10, pady=10)

options = [
("word_tokenize_latin", "Tokenize Latin text by words"),
("sent_tokenize_latin", "Tokenize Latin text by sentences"),
("nltk_do_kwic", "Perform KWIC analysis"),
("kwic_analysis", "Perform KWIC analysis"),
("freq_analysis", "Perform word frequency analysis")
]

def update_explanation(*args):
explanations = {
"Tokenize Latin text by words": "This operation will tokenize your Latin text by words, which is required for further word-based natural language processing.",
"Tokenize Latin text by sentences": "This operation will tokenize your Latin text by sentences, which is useful for sentence-based natural language processing.",
"Perform KWIC analysis": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text.",
"Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text."
}

selected_operation = var_operation.get()
@@ -127,12 +138,12 @@ def update_explanation(*args):
var_operation.set(options[0][1])
var_operation.trace("w", update_explanation)
opt_operation = tk.OptionMenu(root, var_operation, *[option[1] for option in options])
opt_operation.grid(row=2, column=2, padx=10, pady=10)
opt_operation.grid(row=4, column=2, padx=10, pady=10)

explanation_label = tk.StringVar()
explanation_label.set("This operation will tokenize your Latin text by words, which is required for further word-based natural language processing.")
lbl_explanation = tk.Label(root, textvariable=explanation_label, wraplength=300)
lbl_explanation.grid(row=2, column=3, padx=10, pady=10)
lbl_explanation.grid(row=4, column=3, padx=10, pady=10)

# function to choose file and store the location in a variable
def choose_file():
@@ -150,7 +161,7 @@ def choose_output_file():

txt_terminal = tk.Text(root, height=20, width=1000)
txt_terminal.configure(state='normal')  # keep the widget editable so output can be written into it
txt_terminal.grid(row=4, column=1, columnspan=3, padx=10, pady=10, sticky='nsew')
txt_terminal.grid(row=5, column=1, columnspan=3, padx=10, pady=10, sticky='nsew')
root.grid_columnconfigure(1, weight=1)
root.grid_rowconfigure(5, weight=1)
sys.stdout = CustomTextRedirector(txt_terminal)
@@ -173,12 +184,19 @@ async def run_script_async():
source_path = os.path.normpath(location_raw_sourcetext)
destination_path = os.path.normpath(location_output)
cli_command = ['lutherscripts-cli', '-o', operation_name, '-s', source_path, '-d', destination_path]
# Add argument 1 and argument 2 for KWIC analysis
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
cli_command.extend(["-1", argument1, "-2", argument2])

process = await asyncio.create_subprocess_exec(
*cli_command,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)


output = ''
while True:
char = await process.stdout.read(1)
@@ -210,6 +228,15 @@ def start_operation():
asyncio.set_event_loop(loop)
print("Starting operation...")
print("Please wait, this might take couple of seconds...")

# Check the selected operation and validate arguments for KWIC analysis
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis")
return

ind = 0

@@ -235,7 +262,7 @@ def update_image():

# Start Operation! button
btn_play = tk.Button(root, text="Start Operation!", command=start_operation)
btn_play.grid(row=3, column=3, padx=10, pady=10)
btn_play.grid(row=6, column=3, padx=10, pady=10)

# Print a welcome message to the terminal
print("Welcome to Lutherscripts!")
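
The start_operation handler above recovers the CLI operation name from the menu's human-readable label with a list comprehension. The same lookup, restated in isolation with an abbreviated options list:

# Map a menu label back to its CLI operation name, as start_operation does.
options = [
    ("word_tokenize_latin", "Tokenize Latin text by words"),
    ("kwic_analysis", "Perform KWIC analysis"),
]

def operation_for_label(label):
    # Equivalent to [o[0] for o in options if o[1] == label][0]
    return next(name for name, text in options if text == label)

assert operation_for_label("Perform KWIC analysis") == "kwic_analysis"
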
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py
@@ -1,18 +1,14 @@
import io
import os
import re
import json
from cltk import NLP
from tqdm import tqdm
import logging
import sys
import io


__author__ = "benjamsf"
__license__ = "MIT"

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

def main(source_path, destination_path, progress_callback=None):
# Instantiate a Latin-specific NLP object
cltk_nlp = NLP(language="lat")
@@ -40,24 +36,11 @@ def main(source_path, destination_path, progress_callback=None):
if progress_callback:
progress_callback(len(sentence_tokens) / len(text_chunks))

# Capture the output in a string buffer
with io.StringIO() as buffer:
for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
doc = cltk_nlp(chunk)
for sentence in doc.sentences:
sentence_text = ' '.join([word.string for word in sentence.words])
sentence_tokens.append(sentence_text.strip())

buffer.write('\n'.join(sentence_tokens))

# Save the tokenized output to a file
# Save the tokenized output to a JSON file
output_file = os.path.abspath(destination_path)
with open(output_file, 'w', encoding='utf-8') as f:
f.write(buffer.getvalue())
json.dump(sentence_tokens, f, ensure_ascii=False)

# Print a message to confirm that the file has been saved
print(f'The tokenized output has been saved as {output_file}')

# Return the output as a string
return buffer.getvalue()
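
With this change the tokenizer emits a JSON array of sentence strings rather than newline-joined plain text, so downstream scripts read it back with json.load. A minimal sketch, assuming a hypothetical output path:

import json

# "sentences.json" is a placeholder for the destination_path argument.
with open("sentences.json", encoding="utf-8") as f:
    sentence_tokens = json.load(f)
print(f"Loaded {len(sentence_tokens)} sentences")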

50 changes: 0 additions & 50 deletions lutherscripts/src/text_preparation/tokenize_latin_old.py

This file was deleted.

7 changes: 4 additions & 3 deletions lutherscripts/src/text_processing/nltk_do_freqanalysis.py
@@ -5,10 +5,8 @@
from collections import OrderedDict
from tqdm import tqdm

__author__ = "benjamsf"
__license__ = "MIT"

def main(source_path, destination_path):
def main(source_path, destination_path, progress_callback=None):
# Load the tokenized text from the source file
with open(source_path, 'r', encoding='utf-8') as f:
tokenized_text = json.load(f)
@@ -17,6 +15,8 @@ def main(source_path, destination_path):
fdist = FreqDist()
for token in tqdm(tokenized_text, desc="Creating frequency distribution", unit="token"):
fdist[token] += 1
if progress_callback:
progress_callback(fdist.N() / len(tokenized_text))

# Sort the frequency distribution by frequency
sorted_fdist = OrderedDict(fdist.most_common())
@@ -28,3 +28,4 @@
# Print a message to confirm that the file has been saved
print(f'The frequency analysis has been saved as {destination_path}')
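
The frequency pass is an incremental FreqDist build; fdist.N() counts every observation seen so far, which is why N()/len(tokenized_text) works as a progress ratio. A self-contained sketch with made-up tokens:

from collections import OrderedDict
from nltk import FreqDist

# Toy tokens standing in for the JSON-loaded tokenized text.
tokens = ["gratia", "fides", "gratia"]
fdist = FreqDist()
for token in tokens:
    fdist[token] += 1
sorted_fdist = OrderedDict(fdist.most_common())
print(sorted_fdist)  # OrderedDict([('gratia', 2), ('fides', 1)])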


29 changes: 18 additions & 11 deletions lutherscripts/src/text_processing/nltk_do_kwic.py
@@ -1,14 +1,14 @@
import json
import os
import nltk
from nltk import word_tokenize
from nltk.text import Text
from tqdm import tqdm

__author__ = "benjamsf"
__license__ = "MIT"


def main(keyword, context_size, source_path, destination_path):
def main(keyword, context_size, source_path, destination_path, progress_callback=None):
# Load the tokenized text from the source file
with open(source_path, 'r', encoding='utf-8') as f:
tokenized_text = f.read()
@@ -17,21 +17,28 @@ def main(keyword, context_size, source_path, destination_path):
text = Text(nltk.word_tokenize(tokenized_text))

# Perform KWIC analysis for the keyword
kwic_results = list(text.concordance_list(keyword, lines=nltk.ConcordanceIndex(text.tokens).count(keyword), width=context_size))
concordance = nltk.ConcordanceIndex(text.tokens)
kwic_results = []
keyword_occurrences = concordance.offsets(keyword)
for index in tqdm(keyword_occurrences, desc="Analyzing keyword occurrences"):
left_context = text.tokens[max(0, index-context_size):index]
right_context = text.tokens[index+1:index+1+context_size]
kwic_results.append({
"left_context": " ".join(left_context),
"keyword": keyword,
"right_context": " ".join(right_context)
})
if progress_callback:
progress_callback(len(kwic_results) / len(keyword_occurrences))

# Convert the KWIC results into JSON format
kwic_json = []
for result in kwic_results:
kwic_json.append({
"left_context": " ".join(result[0]),
"keyword": result[1],
"right_context": " ".join(result[2])
})
kwic_json = json.dumps(kwic_results, ensure_ascii=False, indent=2)

# Save the KWIC analysis as a JSON file
with open(destination_path, 'w', encoding='utf-8') as f:
json.dump(kwic_json, f, ensure_ascii=False, indent=2)
f.write(kwic_json)

# Print a message to confirm that the file has been saved
print(f'The KWIC analysis has been saved as {destination_path}')
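
Each element appended to kwic_results is a plain dict, so the saved file is a JSON array of records of this shape (the context words here are hypothetical):

# One illustrative KWIC record as written to the destination file.
example_record = {
    "left_context": "non solum libenter",
    "keyword": "apprehendisse",
    "right_context": "sed ultro etiam",
}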


5 changes: 1 addition & 4 deletions lutherscripts/txt/jalla.json
@@ -1,4 +1 @@
Quod tardius diatribae tuae de libero arbitrio respondeo , Venerabilis Erasme , praeter spem omnium , praeterque morem meum accidit , qui hactenus eiusmodi occasiones scribendi , non solum libenter apprehendisse , sed ultro etiam quaesiisse visus sum
Mirabitur forte quispiam novam illam et insolitam , vel patientiam , vel formidinem Lutheri , quem nec tot iactatae voces et literae adversariorum excitarunt , Erasmo victoriam congratulantes et Io pean cantantes , Scilicet Maccabaeus ille et pervicacissimus assertor , invenit tandem dignum antagonistam , contra quem hiscere non audet
Quod tardius diatribae tuae de libero arbitrio respondeo , Venerabilis Erasme , praeter spem omnium , praeterque morem meum accidit , qui hactenus eiusmodi occasiones scribendi , non solum libenter apprehendisse , sed ultro etiam quaesiisse visus sum
Mirabitur forte quispiam novam illam et insolitam , vel patientiam , vel formidinem Lutheri , quem nec tot iactatae voces et literae adversariorum excitarunt , Erasmo victoriam congratulantes et Io pean cantantes , Scilicet Maccabaeus ille et pervicacissimus assertor , invenit tandem dignum antagonistam , contra quem hiscere non audet
["Quod", "tardius", "diatribae", "tuae", "de", "libero", "arbitrio", "respondeo", "Venerabilis", "Erasme", "praeter", "spem", "omnium", "praeterque", "morem", "meum", "accidit", "qui", "hactenus", "eiusmodi", "occasiones", "scribendi", "non", "solum", "libenter", "apprehendisse", "sed", "ultro", "etiam", "quaesiisse", "visus", "sum", "Mirabitur", "forte", "quispiam", "novam", "illam", "et", "insolitam", "vel", "patientiam", "vel", "formidinem", "Lutheri", "quem", "nec", "tot", "iactatae", "voces", "et", "literae", "adversariorum", "excitarunt", "Erasmo", "victoriam", "congratulantes", "et", "Io", "pean", "cantantes", "Scilicet", "Maccabaeus", "ille", "et", "pervicacissimus", "assertor", "invenit", "tandem", "dignum", "antagonistam", "contra", "quem", "hiscere", "non", "audet"]
