Fix wordtokenizer
benjamsf committed Apr 7, 2023
1 parent 09b1027 commit a003277
Showing 2 changed files with 27 additions and 36 deletions.
@@ -17,7 +17,6 @@ def main(source_path, destination_path):
 
     with open(input_file, 'r', encoding='utf-8') as f:
         input_text = f.read()
-    logging.info(f"Input file contents:\n{input_text}")
 
     # Split the input text into smaller chunks based on punctuation
     chunk_delimiters = r'[.!?]+'
lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py (62 changes: 27 additions & 35 deletions)
@@ -1,53 +1,45 @@
-import io
 # Python script to prepare Luther's Werke im WWW text to form usable in NLP
 __author__ = "benjamsf"
 __license__ = "MIT"
 
 import os
-import re
+import string
 from cltk import NLP
 from tqdm import tqdm
 import logging
 
 
 def main(source_path, destination_path):
-    # Instantiate a Latin-specific NLP object
+
+    logging.basicConfig(level=logging.ERROR)
+    # Instantiate a Latin-specific NLP object
     cltk_nlp = NLP(language="lat")
 
     # Load the Latin text from the source file
     input_file = os.path.abspath(source_path)
+
+    # Load the Latin text from the source file
     with open(input_file, 'r', encoding='utf-8') as f:
         input_text = f.read()
 
-    # Split the input text into smaller chunks based on punctuation
-    chunk_delimiters = r'[.!?]+'
-    text_chunks = re.split(chunk_delimiters, input_text)
+    # Remove punctuation marks from the input text
+    translator = str.maketrans("", "", string.punctuation)
+    input_text_no_punctuation = input_text.translate(translator)
+
+    # Split the input text into smaller chunks
+    text_chunks = input_text_no_punctuation.split()
 
     # Process the text_chunks with cltk_nlp and update the progress bar
-    sentence_tokens = []
-    for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
+    word_tokens = []
+    for chunk in tqdm(text_chunks, desc="Tokenizing words"):
         doc = cltk_nlp(chunk)
-        for sentence in doc.sentences:
-            sentence_text = ' '.join([word.string for word in sentence.words])
-            sentence_tokens.append(sentence_text.strip())
-
-    print(sentence_tokens)
-
-    # Capture the output in a string buffer
-    with io.StringIO() as buffer:
-        for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
-            doc = cltk_nlp(chunk)
-            for sentence in doc.sentences:
-                sentence_text = ' '.join([word.string for word in sentence.words])
-                sentence_tokens.append(sentence_text.strip())
-
-        buffer.write('\n'.join(sentence_tokens))
-
-        # Save the tokenized output to a file
-        output_file = os.path.abspath(destination_path)
-        with open(output_file, 'w', encoding='utf-8') as f:
-            f.write(buffer.getvalue())
-
-        # Print a message to confirm that the file has been saved
-        print(f'The tokenized output has been saved as {destination_path}')
-
-        # Return the output as a string
-        return buffer.getvalue()
+        for word in doc.words:
+            word_tokens.append(word.string)
+
+    print(word_tokens)
+
+    # Save the tokenized output to a file
+    output_file = os.path.abspath(destination_path)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(' '.join(word_tokens))
+    # Print a message to confirm that the file has been saved
+    print(f'The tokenized output has been saved as {destination_path}')
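For context, a minimal sketch of how the updated word tokenizer could be driven from another script. The import path assumes the package layout mirrors the file path shown above, and the input/output paths are hypothetical examples, not files from the repository:

# Hypothetical usage sketch: import path and file paths are assumptions, not from this commit.
from lutherscripts.src.text_preparation.cltk_wordtokenize_latin_arg import main

# Reads the Latin source, strips punctuation, tokenizes words with CLTK,
# writes the space-separated tokens to the destination file, and prints a confirmation.
main("data/luther_latin_source.txt", "data/luther_latin_words.txt")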
