From dad9f87eddd334244d8df12a0644761148627661 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 3 Jan 2025 05:43:10 -0800 Subject: [PATCH] feat:update ability to validate nltk assets --- unstructured/nlp/tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index 2113c9440f..1af428cb82 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -45,6 +45,8 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool: # Ensure NLTK data exists in the specified path (pre-baked in Docker) def validate_nltk_assets(): + if not os.path.exists(NLTK_DATA_PATH): + copy_nltk_packages() """Validate that required NLTK packages are preloaded in the image.""" required_assets = [ ("punkt_tab", "tokenizers"), @@ -57,7 +59,6 @@ def validate_nltk_assets(): f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'." ) - # Validate NLTK assets at import time validate_nltk_assets()