Commit
Showing 19 changed files with 293 additions and 12 deletions.
@@ -0,0 +1,68 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
import os
import subprocess

# create the main window
root = tk.Tk()
root.title("NLP Text Analysis")

# create the tabs
tab_parent = ttk.Notebook(root)
tab1 = ttk.Frame(tab_parent)
tab2 = ttk.Frame(tab_parent)
tab3 = ttk.Frame(tab_parent)
tab_parent.add(tab1, text="Config")
tab_parent.add(tab2, text="Text Preparation")
tab_parent.add(tab3, text="Text Processing")
tab_parent.pack(expand=1, fill='both')

# no file chosen yet; run_script checks for this before launching
location_raw_sourcetext = ""

# function to choose file and store the location in a variable
def choose_file():
    global location_raw_sourcetext
    location_raw_sourcetext = filedialog.askopenfilename(title="Select a File")
    print(f"File selected: {location_raw_sourcetext}")

# create widgets for Config tab
lbl_raw_sourcetext = tk.Label(tab1, text="Choose raw source text:")
lbl_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)
btn_raw_sourcetext = tk.Button(tab1, text="Browse...", command=choose_file)
btn_raw_sourcetext.pack(side=tk.LEFT, padx=10, pady=10)

# create widgets for Text Preparation tab
lbl_operation = tk.Label(tab2, text="Choose Operation:")
lbl_operation.pack(side=tk.LEFT, padx=10, pady=10)
options = ["cltk_sentencetokenize_latin.py", "cltk_wordtokenize_latin.py"]
var_operation = tk.StringVar(tab2)
var_operation.set(options[0])
opt_operation = tk.OptionMenu(tab2, var_operation, *options)
opt_operation.pack(side=tk.LEFT, padx=10, pady=10)

def run_script():
    global process
    if not location_raw_sourcetext:
        txt_terminal.insert(tk.END, "No source file selected.\n")
        return
    script_name = var_operation.get()
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../text_preparation", script_name)
    # pass the command as an argument list so paths with spaces survive and no shell is needed
    process = subprocess.Popen(["python", script_path, location_raw_sourcetext],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    while True:
        output = process.stdout.readline()
        if output == b'' and process.poll() is not None:
            break
        if output:
            txt_terminal.insert(tk.END, output.decode('utf-8'))
    rc = process.poll()
    print(f"Return code: {rc}")

btn_play = tk.Button(tab2, text="Play!", command=run_script)
btn_play.pack(side=tk.LEFT, padx=10, pady=10)

txt_terminal = tk.Text(tab2, height=10, width=50)
txt_terminal.pack(side=tk.BOTTOM, padx=10, pady=10)

# create widgets for Text Processing tab
lbl_nothing = tk.Label(tab3, text="Nothing so far.")
lbl_nothing.pack(side=tk.LEFT, padx=10, pady=10)

# start the GUI
root.mainloop()
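One caveat with run_script above: the readline loop runs on the Tk main thread, so the window freezes until the child script exits. Below is a minimal non-blocking sketch; stream_subprocess is a hypothetical helper, not part of this commit, and it assumes the same root and txt_terminal objects.

import queue
import subprocess
import threading
import tkinter as tk

def stream_subprocess(root, text_widget, cmd):
    q = queue.Queue()

    def reader():
        # run the command and push each output line onto the queue
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        for line in proc.stdout:
            q.put(line.decode('utf-8', errors='replace'))
        proc.wait()
        q.put(None)  # sentinel: the process has finished

    def poll():
        # drain the queue on the Tk thread; reschedule until the sentinel arrives
        try:
            while True:
                line = q.get_nowait()
                if line is None:
                    return
                text_widget.insert(tk.END, line)
        except queue.Empty:
            pass
        root.after(100, poll)

    threading.Thread(target=reader, daemon=True).start()
    poll()

Wired into the button handler, the call would look like stream_subprocess(root, txt_terminal, ["python", script_path, location_raw_sourcetext]).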
@@ -0,0 +1,51 @@
import argparse

def add_arguments_sentencetokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def add_arguments_wordtokenize(parser):
    parser.add_argument("-s", "--source-path", type=str, required=True, help="The path to the source text file")
    parser.add_argument("-d", "--destination-path", type=str, required=True, help="The path to the output file")

def sentence_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_sentencetokenize_latin_arg import main as cltk_sentencetokenize_latin
    output = cltk_sentencetokenize_latin(source_path, destination_path)
    print(output)

def word_tokenize_latin(source_path, destination_path):
    from src.text_preparation.cltk_wordtokenize_latin_arg import main as cltk_wordtokenize_latin
    output = cltk_wordtokenize_latin(source_path, destination_path)
    print(output)

def main():
    parser = argparse.ArgumentParser(description="NLP script launcher")

    subparsers = parser.add_subparsers(dest='subparser_name')

    sent_tokenize_parser = subparsers.add_parser('sent_tokenize_latin', help='Tokenize Latin text into sentences')
    add_arguments_sentencetokenize(sent_tokenize_parser)

    word_tokenize_parser = subparsers.add_parser('word_tokenize_latin', help='Tokenize Latin text into words')
    add_arguments_wordtokenize(word_tokenize_parser)

    args = parser.parse_args()

    # dispatch once to the chosen operation; each handler prints its own output
    if args.subparser_name == 'sent_tokenize_latin':
        sentence_tokenize_latin(args.source_path, args.destination_path)
    elif args.subparser_name == 'word_tokenize_latin':
        word_tokenize_latin(args.source_path, args.destination_path)

    return args

if __name__ == '__main__':
    main()
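Given the subparsers above, the launcher would be invoked along these lines (its own filename is not shown in this diff, so nlp_launcher.py is only a placeholder):

python nlp_launcher.py sent_tokenize_latin -s input.txt -d sentences.txt
python nlp_launcher.py word_tokenize_latin -s input.txt -d words.txt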
This file was deleted.
Binary file added (+1.69 KB): ...rscripts/src/text_preparation/__pycache__/cltk_sentencetokenize_latin_arg.cpython-310.pyc
Binary file added (+1.16 KB): lutherscripts/src/text_preparation/__pycache__/cltk_wordtokenize_latin.cpython-310.pyc
Binary file added (+1.76 KB): lutherscripts/src/text_preparation/__pycache__/cltk_wordtokenize_latin_arg.cpython-310.pyc
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin.py (6 changes: 4 additions & 2 deletions)
lutherscripts/src/text_preparation/cltk_sentencetokenize_latin_arg.py (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
import io
import os
import re
from cltk import NLP
from tqdm import tqdm
import logging

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    cltk_nlp = NLP(language="lat")

    logging.basicConfig(level=logging.ERROR)

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    logging.info(f"Reading input from file: {input_file}")

    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()
    logging.info(f"Input file contents:\n{input_text}")

    # Split the input text into smaller chunks based on punctuation
    chunk_delimiters = r'[.!?]+'
    text_chunks = re.split(chunk_delimiters, input_text)

    # Process the text_chunks with cltk_nlp and update the progress bar
    sentence_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
        doc = cltk_nlp(chunk)
        for sentence in doc.sentences:
            sentence_text = ' '.join([word.string for word in sentence.words])
            sentence_tokens.append(sentence_text.strip())

    print(sentence_tokens)

    # Capture the output in a string buffer
    with io.StringIO() as buffer:
        buffer.write('\n'.join(sentence_tokens))

        # Save the tokenized output to a file
        output_file = os.path.abspath(destination_path)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(buffer.getvalue())

        # Print a message to confirm that the file has been saved
        print(f'The tokenized output has been saved as {output_file}')

        # Return the output as a string
        return buffer.getvalue()
lutherscripts/src/text_preparation/cltk_wordtokenize_latin.py (4 changes: 2 additions & 2 deletions)
lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py (53 changes: 53 additions & 0 deletions)
@@ -0,0 +1,53 @@
import io
import os
import re
from cltk import NLP
from tqdm import tqdm
import logging

def main(source_path, destination_path):
    # Instantiate a Latin-specific NLP object
    logging.basicConfig(level=logging.ERROR)
    cltk_nlp = NLP(language="lat")

    # Load the Latin text from the source file
    input_file = os.path.abspath(source_path)
    with open(input_file, 'r', encoding='utf-8') as f:
        input_text = f.read()

    # Split the input text into smaller chunks based on punctuation
    chunk_delimiters = r'[.!?]+'
    text_chunks = re.split(chunk_delimiters, input_text)

    # Process the text_chunks with cltk_nlp and update the progress bar,
    # collecting individual word tokens from each processed chunk
    word_tokens = []
    for chunk in tqdm(text_chunks, desc="Tokenizing words"):
        doc = cltk_nlp(chunk)
        for word in doc.words:
            word_tokens.append(word.string)

    print(word_tokens)

    # Capture the output in a string buffer
    with io.StringIO() as buffer:
        buffer.write('\n'.join(word_tokens))

        # Save the tokenized output to a file
        output_file = os.path.abspath(destination_path)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(buffer.getvalue())

        # Print a message to confirm that the file has been saved
        print(f'The tokenized output has been saved as {destination_path}')

        # Return the output as a string
        return buffer.getvalue()
...cripts/src/text_preparation/prepare_WA.py → ...aration/lutherswerkeimwww_prepare-text.py (6 changes: 4 additions & 2 deletions)
...preparation/prepare_wa_withsitenumbers.py → ...rswerkeimwww_prepare_with_site_numbers.py (6 changes: 4 additions & 2 deletions)
Empty file.
@@ -0,0 +1,40 @@
import json
import os
import pickle
from nltk.text import Text

__author__ = "benjamsf"
__license__ = "MIT"

# Define the source and destination file names and paths
script_path = os.path.dirname(os.path.abspath(__file__))
source_path = os.path.join(script_path, '../../output/dsa_tokenized.pkl')
destination_path = os.path.join(script_path, '../../output/dsa_kwic_analysis.json')

# Load the tokenized text from the source file.
# The .pkl is a pickle, so unpickle it rather than reading it as plain text;
# it is assumed to contain a list of word tokens.
with open(source_path, 'rb') as f:
    tokenized_text = pickle.load(f)

# Create an NLTK Text object
text = Text(tokenized_text)

# Perform KWIC analysis for the keyword; request as many lines as there
# are occurrences so no match is dropped
keyword = "spe"
kwic_results = text.concordance_list(keyword, lines=text.count(keyword))

# Convert the KWIC results into JSON format
kwic_json = []
for result in kwic_results:
    kwic_json.append({
        "left_context": result.left_print,
        "keyword": result.query,
        "right_context": result.right_print
    })

# Save the KWIC analysis as a JSON file
with open(destination_path, 'w', encoding='utf-8') as f:
    json.dump(kwic_json, f, ensure_ascii=False, indent=2)

# Print a message to confirm that the file has been saved
print(f'The KWIC analysis has been saved as {destination_path}')
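For reference, each entry the script writes to dsa_kwic_analysis.json has this shape; the keys come from the loop above, while the context values here are placeholders:

[
  {
    "left_context": "...",
    "keyword": "spe",
    "right_context": "..."
  }
]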