Commit: add working cli
benjamsf committed Apr 7, 2023
1 parent 7a91e05 commit 09b1027
Showing 19 changed files with 293 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -3,13 +3,13 @@
*.DS

# Sourcetext
txt/*txt
lutherscripts/txt/*txt

# Output text
output/*.txt
lutherscripts/output/*.txt

# Output
output/*.pkl
lutherscripts/output/*.pkl

# Prerequisites
*.d
Binary file not shown.
68 changes: 68 additions & 0 deletions lutherscripts-gui/gui.py
@@ -0,0 +1,68 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
import os
import subprocess

# create the main window
root = tk.Tk()
root.title("NLP Text Analysis")

# create the tabs
tab_parent = ttk.Notebook(root)
tab1 = ttk.Frame(tab_parent)
tab2 = ttk.Frame(tab_parent)
tab3 = ttk.Frame(tab_parent)
tab_parent.add(tab1, text="Config")
tab_parent.add(tab2, text="Text Preparation")
tab_parent.add(tab3, text="Text Processing")
tab_parent.pack(expand=1, fill='both')

# create widgets for Config tab
lbl_raw_sourcetext = tk.Label(tab1, text="Choose raw source text:")
lbl_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)
btn_raw_sourcetext = tk.Button(tab1, text="Browse...", command=lambda: choose_file())
btn_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)

# function to choose a file and store its location in a module-level variable
location_raw_sourcetext = None

def choose_file():
    global location_raw_sourcetext
    location_raw_sourcetext = filedialog.askopenfilename(title="Select a File")
    print(f"File selected: {location_raw_sourcetext}")

# create widgets for Text Preparation tab
lbl_operation = tk.Label(tab2, text="Choose Operation:")
lbl_operation.pack(side=tk.LEFT, padx=10, pady=10)
options = ["cltk_sentencetokenize_latin.py", "cltk_wordtokenize_latin.py"]
var_operation = tk.StringVar(tab2)
var_operation.set(options[0])
opt_operation = tk.OptionMenu(tab2, var_operation, *options)
opt_operation.pack(side=tk.LEFT, padx=10, pady=10)

def run_script():
    global location_raw_sourcetext, process
    if location_raw_sourcetext is None:
        txt_terminal.insert(tk.END, "Please choose a source file first.\n")
        return
    script_name = var_operation.get()
    # The tokenizer scripts live under lutherscripts/src/text_preparation,
    # a sibling of this lutherscripts-gui directory
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "../lutherscripts/src/text_preparation", script_name)
    # Pass arguments as a list so paths with spaces survive without shell quoting
    process = subprocess.Popen(["python", script_path, location_raw_sourcetext],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               stdin=subprocess.PIPE)
    # Note: this read loop blocks the Tk event loop until the script finishes
    while True:
        output = process.stdout.readline()
        if output == b'' and process.poll() is not None:
            break
        if output:
            txt_terminal.insert(tk.END, output.decode('utf-8'))
    rc = process.poll()
    print(f"Return code: {rc}")

btn_play = tk.Button(tab2, text="Play!", command=run_script)
btn_play.pack(side=tk.LEFT, padx=10, pady=10)

txt_terminal = tk.Text(tab2, height=10, width=50)
txt_terminal.pack(side=tk.BOTTOM, padx=10, pady=10)

# create widgets for Text Processing tab
lbl_nothing = tk.Label(tab3, text="Nothing so far.")
lbl_nothing.pack(side=tk.LEFT, padx=10, pady=10)

# start the GUI
root.mainloop()
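The read loop in run_script blocks the Tk event loop until the child process exits, so the window freezes during long tokenizations. Below is a minimal non-blocking sketch, not part of the commit: it assumes the root, txt_terminal, tk, and subprocess names defined in gui.py above, and the helper name and 100 ms polling interval are illustrative.

import queue
import threading

def run_script_async(cmd):
    # Stream subprocess output into txt_terminal without blocking Tk:
    # a background thread reads lines into a queue, and the Tk event
    # loop drains the queue via root.after().
    q = queue.Queue()

    def reader():
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT, text=True)
        for line in proc.stdout:
            q.put(line)
        proc.wait()
        q.put(None)  # sentinel: the process has finished

    def poll():
        try:
            while True:
                line = q.get_nowait()
                if line is None:
                    return  # sentinel seen, stop polling
                txt_terminal.insert(tk.END, line)
        except queue.Empty:
            pass
        root.after(100, poll)  # check the queue again in 100 ms

    threading.Thread(target=reader, daemon=True).start()
    poll()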
51 changes: 51 additions & 0 deletions lutherscripts/cli.py
@@ -0,0 +1,51 @@
import argparse

def add_arguments_sentencetokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def add_arguments_wordtokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def sentence_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_sentencetokenize_latin_arg import main as cltk_sentencetokenize_latin
    output = cltk_sentencetokenize_latin(source_path, destination_path)
    print(output)

def word_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_wordtokenize_latin_arg import main as cltk_wordtokenize_latin
    output = cltk_wordtokenize_latin(source_path, destination_path)
    print(output)


def main():
    parser = argparse.ArgumentParser(description="NLP script launcher")

    subparsers = parser.add_subparsers(dest='subparser_name')

    sent_tokenize_parser = subparsers.add_parser('sent_tokenize_latin', help='Tokenize Latin text into sentences')
    add_arguments_sentencetokenize(sent_tokenize_parser)

    word_tokenize_parser = subparsers.add_parser('word_tokenize_latin', help='Tokenize Latin text into words')
    add_arguments_wordtokenize(word_tokenize_parser)

    args = parser.parse_args()

    # Dispatch once; each handler prints the tokenizer's returned output
    if args.subparser_name == 'sent_tokenize_latin':
        sentence_tokenize_latin(args.source_path, args.destination_path)
    elif args.subparser_name == 'word_tokenize_latin':
        word_tokenize_latin(args.source_path, args.destination_path)
    else:
        parser.print_help()

if __name__ == '__main__':
    main()
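With the subparsers in place the launcher can be driven from a shell. These invocations are illustrative (the file names are assumed, not taken from the commit); running from the repository root works because Python puts the script's own directory, which contains src/, at the front of sys.path:

python lutherscripts/cli.py sent_tokenize_latin -s lutherscripts/txt/dsa_prepared.txt -d lutherscripts/output/dsa_sentences.txt
python lutherscripts/cli.py word_tokenize_latin -s lutherscripts/txt/dsa_prepared.txt -d lutherscripts/output/dsa_words.txt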
1 change: 0 additions & 1 deletion lutherscripts/output/dsa_prepared_2.txt

This file was deleted.

3 changes: 3 additions & 0 deletions lutherscripts/src/install/install_fasttext-model.py
@@ -1,6 +1,9 @@
import os
import urllib.request

__author__ = "benjamsf"
__license__ = "MIT"

def download_fasttext_model():
print("Downloading Fasttext model for Latin...")

3 changes: 3 additions & 0 deletions lutherscripts/src/install/install_stanza-model.py
@@ -1,5 +1,8 @@
import stanza

__author__ = "benjamsf"
__license__ = "MIT"

def download_stanza_model():
print("Downloading Stanza model for Latin...")
stanza.download('la')
Binary file not shown.
Binary file not shown.
Binary file not shown.
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin.py
@@ -1,11 +1,13 @@
# Python script to prepare Latin text from a UTF-8 text file into a form usable in NLP
# Stage 2b, tokenize prepared text by sentences
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re
from cltk import NLP
from tqdm import tqdm
import argparse

# Instantiate a Latin-specific NLP object
cltk_nlp = NLP(language="lat")
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py
@@ -0,0 +1,55 @@
import os
import re
import logging
from cltk import NLP
from tqdm import tqdm

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    cltk_nlp = NLP(language="lat")

    logging.basicConfig(level=logging.ERROR)

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    logging.info(f"Reading input from file: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()
    logging.info(f"Input file contents:\n{input_text}")

    # Split the input text into smaller chunks based on punctuation,
    # dropping the empty strings re.split leaves after trailing delimiters
    chunk_delimiters = r'[.!?]+'
    text_chunks = [c for c in re.split(chunk_delimiters, input_text) if c.strip()]

    # Process the text_chunks with cltk_nlp and update the progress bar
    sentence_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
        doc = cltk_nlp(chunk)
        for sentence in doc.sentences:
            sentence_text = ' '.join([word.string for word in sentence.words])
            sentence_tokens.append(sentence_text.strip())

    # Join the sentence tokens into one newline-separated string
    output_text = '\n'.join(sentence_tokens)

    # Save the tokenized output to a file
    output_file = os.path.abspath(destination_path)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    # Print a message to confirm that the file has been saved
    print(f'The tokenized output has been saved as {output_file}')

    # Return the output as a string
    return output_text
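A quick illustration of the chunking step on toy input (not corpus text): re.split leaves an empty string after a trailing delimiter, which is why the list comprehension above filters on c.strip().

import re

chunks = re.split(r'[.!?]+', "Ita vero. Quid est?")
print(chunks)                            # ['Ita vero', ' Quid est', '']
print([c for c in chunks if c.strip()])  # ['Ita vero', ' Quid est']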
4 changes: 2 additions & 2 deletions lutherscripts/src/text_preparation/cltk_wordtokenize_latin.py
@@ -1,6 +1,6 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 2: tokenize the prepared text
# (c) Benjam Bröijer, licensed under the MIT License
__author__ = "benjamsf"
__license__ = "MIT"

import os
import string
53 changes: 53 additions & 0 deletions lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py
@@ -0,0 +1,53 @@
import os
import re
import logging
from cltk import NLP
from tqdm import tqdm

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    logging.basicConfig(level=logging.ERROR)
    cltk_nlp = NLP(language="lat")

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()

    # Split the input text into smaller chunks based on punctuation,
    # dropping the empty strings re.split leaves after trailing delimiters
    chunk_delimiters = r'[.!?]+'
    text_chunks = [c for c in re.split(chunk_delimiters, input_text) if c.strip()]

    # Process the text_chunks with cltk_nlp and collect the word tokens
    word_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing words"):
        doc = cltk_nlp(chunk)
        word_tokens.extend(word.string for word in doc.words)

    # Join the word tokens into one newline-separated string
    output_text = '\n'.join(word_tokens)

    # Save the tokenized output to a file
    output_file = os.path.abspath(destination_path)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(output_text)

    # Print a message to confirm that the file has been saved
    print(f'The tokenized output has been saved as {output_file}')

    # Return the output as a string
    return output_text


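Because main() takes plain paths and returns the tokenized text as a string, the module can also be called directly, bypassing the CLI. A hypothetical usage sketch, with illustrative file names; the import assumes the working directory is lutherscripts/, mirroring how cli.py resolves src.text_preparation:

from src.text_preparation.cltk_wordtokenize_latin_arg import main as word_tokenize

# Tokenize a prepared source text and keep the returned string
tokens_text = word_tokenize("txt/dsa_prepared.txt", "output/dsa_words.txt")
print(tokens_text.splitlines()[:10])  # first ten word tokens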
@@ -1,7 +1,9 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 1: Prepare the text
# Removes all square brackets, including page and verse numbers, and removes newlines & whitespace
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re

@@ -1,7 +1,9 @@
# Python script to prepare Luther's Werke im WWW text into a form usable in NLP
# Stage 1: Prepare the text
# Removes any square brackets [] and their contents, except for page and verse numbers
# (c) Benjam Bröijer, licensed under the MIT License

__author__ = "benjamsf"
__license__ = "MIT"

import os
import re

3 changes: 3 additions & 0 deletions lutherscripts/src/text_processing/booknlp_run_booknlp.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from booknlp.booknlp import BookNLP

__author__ = "benjamsf"
__license__ = "MIT"

model_params={
"pipeline":"entity,quote,supersense,event,coref",
"model":"big"
Empty file.
40 changes: 40 additions & 0 deletions lutherscripts/src/text_processing/nltk_do_kwic.py
@@ -0,0 +1,40 @@
import json
import os
import pickle

import nltk
from nltk.text import Text

__author__ = "benjamsf"
__license__ = "MIT"

# Define the source and destination file names and paths
script_path = os.path.dirname(os.path.abspath(__file__))
source_path = os.path.join(script_path, '../../output/dsa_tokenized.pkl')
destination_path = os.path.join(script_path, '../../output/dsa_kwic_analysis.json')

# Load the pickled token list from the source file
with open(source_path, 'rb') as f:
    tokenized_text = pickle.load(f)

# Create an NLTK Text object from the token list
text = Text(tokenized_text)

# Perform KWIC analysis for the keyword, requesting as many lines as
# there are occurrences so no match is dropped
keyword = "spe"
num_hits = len(nltk.ConcordanceIndex(text.tokens).offsets(keyword))
kwic_results = text.concordance_list(keyword, lines=num_hits)

# Convert the KWIC results into JSON format
kwic_json = []
for result in kwic_results:
    kwic_json.append({
        "left_context": " ".join(result.left),
        "keyword": result.query,
        "right_context": " ".join(result.right),
    })

# Save the KWIC analysis as a JSON file
with open(destination_path, 'w', encoding='utf-8') as f:
    json.dump(kwic_json, f, ensure_ascii=False, indent=2)

# Print a message to confirm that the file has been saved
print(f'The KWIC analysis has been saved as {destination_path}')
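A toy check of the concordance API used above, assuming NLTK 3.5 or newer, where concordance_list returns ConcordanceLine namedtuples whose left, query, and right fields hold token lists:

from nltk.text import Text

# Three hits for "spe"; each line carries its left/right context tokens
toy = Text("in spe contra spem in spe vivimus".split())
for line in toy.concordance_list("spe"):
    print(" ".join(line.left), "|", line.query, "|", " ".join(line.right))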
