Fix wordtokenizer
benjamsf committed Apr 7, 2023
1 parent 09b1027 commit a003277
Showing 2 changed files with 27 additions and 36 deletions.
@@ -17,7 +17,6 @@ def main(source_path, destination_path):
 
     with open(input_file, 'r', encoding='utf-8') as f:
         input_text = f.read()
-    logging.info(f"Input file contents:\n{input_text}")
 
     # Split the input text into smaller chunks based on punctuation
     chunk_delimiters = r'[.!?]+'
lutherscripts/src/text_preparation/cltk_wordtokenize_latin_arg.py (62 changes: 27 additions & 35 deletions)
@@ -1,53 +1,45 @@
-import io
 # Python script to prepare Luther's Werke im WWW text to form usable in NLP
 __author__ = "benjamsf"
 __license__ = "MIT"
 
 import os
-import re
+import string
 from cltk import NLP
 from tqdm import tqdm
 import logging
 
 
 def main(source_path, destination_path):
-    # Instantiate a Latin-specific NLP object
+
+    logging.basicConfig(level=logging.ERROR)
+    # Instantiate a Latin-specific NLP object
     cltk_nlp = NLP(language="lat")
 
     # Load the Latin text from the source file
     input_file = os.path.abspath(source_path)
+
+    # Load the Latin text from the source file
     with open(input_file, 'r', encoding='utf-8') as f:
         input_text = f.read()
 
-    # Split the input text into smaller chunks based on punctuation
-    chunk_delimiters = r'[.!?]+'
-    text_chunks = re.split(chunk_delimiters, input_text)
+    # Remove punctuation marks from the input text
+    translator = str.maketrans("", "", string.punctuation)
+    input_text_no_punctuation = input_text.translate(translator)
+
+    # Split the input text into smaller chunks
+    text_chunks = input_text_no_punctuation.split()
 
     # Process the text_chunks with cltk_nlp and update the progress bar
-    sentence_tokens = []
-    for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
+    word_tokens = []
+    for chunk in tqdm(text_chunks, desc="Tokenizing words"):
         doc = cltk_nlp(chunk)
-        for sentence in doc.sentences:
-            sentence_text = ' '.join([word.string for word in sentence.words])
-            sentence_tokens.append(sentence_text.strip())
-
-    print(sentence_tokens)
-
-    # Capture the output in a string buffer
-    with io.StringIO() as buffer:
-        for chunk in tqdm(text_chunks, desc="Tokenizing sentences"):
-            doc = cltk_nlp(chunk)
-            for sentence in doc.sentences:
-                sentence_text = ' '.join([word.string for word in sentence.words])
-                sentence_tokens.append(sentence_text.strip())
-
-        buffer.write('\n'.join(sentence_tokens))
-
-        # Save the tokenized output to a file
-        output_file = os.path.abspath(destination_path)
-        with open(output_file, 'w', encoding='utf-8') as f:
-            f.write(buffer.getvalue())
-
-        # Print a message to confirm that the file has been saved
-        print(f'The tokenized output has been saved as {destination_path}')
-
-        # Return the output as a string
-        return buffer.getvalue()
+        for word in doc.words:
+            word_tokens.append(word.string)
+
+    print(word_tokens)
+
+    # Save the tokenized output to a file
+    output_file = os.path.abspath(destination_path)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(' '.join(word_tokens))
+    # Print a message to confirm that the file has been saved
+    print(f'The tokenized output has been saved as {destination_path}')
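For context, a minimal sketch of how the updated word tokenizer could be driven from another script. The import path assumes the package layout mirrors the file path shown above, and the input/output paths are hypothetical examples, not files from the repository:

# Hypothetical usage sketch: import path and file paths are assumptions, not from this commit.
from lutherscripts.src.text_preparation.cltk_wordtokenize_latin_arg import main

# Reads the Latin source, strips punctuation, tokenizes words with CLTK,
# writes the space-separated tokens to the destination file, and prints a confirmation.
main("data/luther_latin_source.txt", "data/luther_latin_words.txt")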
