Skip to content

Commit

Permalink
file-base: add get_ntlk_temp_folder routine for downloading nltk file…
Browse files Browse the repository at this point in the history
…s in writable folder
  • Loading branch information
aldogonzalez8 committed Jan 15, 2025
1 parent 38eefec commit 6cb6630
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,24 @@
# Module-level placeholders, initialised to None at import time.
# NOTE(review): presumably rebound to the real `unstructured` partition
# functions elsewhere in this module — confirm against the full file.
unstructured_partition_pdf = None
unstructured_partition_docx = None
unstructured_partition_pptx = None
nltk_data_dir = "/tmp/nltk_data"


def get_ntlk_temp_folder():
    """
    Return a directory in which NLTK data files can be stored, creating it if needed.

    The preferred location is /airbyte/nltk_data: for non-root connectors /tmp is
    not currently writable (though it should be allowed in the future), so /airbyte
    is the safe choice for now. If that directory cannot be created, fall back to
    /tmp/nltk_data, which covers local development.

    An OSError raised while creating the fallback directory is not handled here
    and propagates to the caller.
    """
    preferred_dir = "/airbyte/nltk_data"
    fallback_dir = "/tmp/nltk_data"

    try:
        os.makedirs(preferred_dir, exist_ok=True)
    except OSError:
        # Preferred location is unavailable; use the fallback location instead.
        os.makedirs(fallback_dir, exist_ok=True)
        return fallback_dir

    return preferred_dir


try:
os.makedirs(nltk_data_dir, exist_ok=True)
nltk_data_dir = get_ntlk_temp_folder()
nltk.data.path.append(nltk_data_dir)
nltk.data.find("tokenizers/punkt.zip")
nltk.data.find("tokenizers/punkt_tab.zip")
Expand Down

0 comments on commit 6cb6630

Please sign in to comment.