Summary (free-form preamble; patch tools skip text before the first "diff --git" header):
  * CI workflows: NLTK_DATA now points at ${{ github.workspace }}/nltk_data
    instead of /home/notebook-user/nltk_data.
  * Dockerfile: the COPY of ./nltk_data is removed, and the nltk.data.find(...)
    check is replaced by a call to unstructured.nlp.tokenize.copy_nltk_packages().
NOTE(review): the Dockerfile context comment "Command to check if NLTK data has
been copied correctly" now sits above the copy_nltk_packages() call and looks
stale; consider rewording it in a follow-up change.
NOTE(review): context-line indentation and blank context lines were rebuilt from
the hunk line counts; confirm with "git apply --check" before applying.

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3002dbb898..8d364840dd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -13,7 +13,7 @@ permissions:
   contents: read
 
 env:
-  NLTK_DATA: /home/notebook-user/nltk_data
+  NLTK_DATA: ${{ github.workspace }}/nltk_data
 
 jobs:
   setup:
diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml
index 2d73116974..632f4ac3e1 100644
--- a/.github/workflows/ingest-test-fixtures-update-pr.yml
+++ b/.github/workflows/ingest-test-fixtures-update-pr.yml
@@ -14,7 +14,7 @@ jobs:
   setup:
     runs-on: ubuntu-latest-m
     env:
-      NLTK_DATA: /home/notebook-user/nltk_data
+      NLTK_DATA: ${{ github.workspace }}/nltk_data
     steps:
       - uses: actions/checkout@v3
       - uses: ./.github/actions/base-cache
@@ -24,7 +24,7 @@ jobs:
   setup_ingest:
     runs-on: ubuntu-latest
     env:
-      NLTK_DATA: /home/notebook-user/nltk_data
+      NLTK_DATA: ${{ github.workspace }}/nltk_data
     needs: [setup]
     steps:
       - uses: actions/checkout@v3
diff --git a/Dockerfile b/Dockerfile
index 201a435f6c..70fc9f54f7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,8 +9,6 @@ COPY unstructured unstructured
 COPY test_unstructured test_unstructured
 COPY example-docs example-docs
 
-# Copy the downloaded NLTK data folder to your local environment.s
-COPY ./nltk_data /home/notebook-user/nltk_data
 
 RUN chown -R notebook-user:notebook-user /app && \
   apk add font-ubuntu git && \
@@ -22,7 +20,7 @@ USER notebook-user
 RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
 
 # Command to check if NLTK data has been copied correctly
-RUN python3.11 -c "import nltk; print(nltk.data.find('tokenizers/punkt_tab'))"
+RUN python3.11 -c "from unstructured.nlp.tokenize import copy_nltk_packages; copy_nltk_packages()"
 
 RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
   python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"