aws-neuron · ningziwen · Dec 27, 2024 · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -18,11 +18,6 @@ jobs:
       HADOLINT_RECURSIVE: "true"
     steps:
       - uses: actions/checkout@v4
-      - uses: hadolint/hadolint-action@v3.1.0
-        with:
-          dockerfile: Dockerfile.neuron
-          recursive: true
-          failure-threshold: error # TODO: enable more linter rules other than error.
       - uses: hadolint/hadolint-action@v3.1.0
         with:
           dockerfile: Dockerfile.neuronx

diff --git a/README.md b/README.md
@@ -14,16 +14,24 @@ AWS Neuron Deep Learning Containers (DLCs) are a set of Docker images for traini
 
 | Framework                                                                                                                               | Neuron Packages                                                             | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL                                                                             | Other Packages    |
 |-----------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|--------------------|------------------------------|------------------------|--------------------------------------------------------------------------------------------|-------------------|
+| [PyTorch 2.5.1](https://github.com/aws-neuron/deep-learning-containers/blob/2.21.0/docker/pytorch/inference/2.5.1/Dockerfile.neuronx)   | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_inference, torch-neuronx, transformers-neuronx | Neuron 2.21.0      | trn1,trn2,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04  | torchserve 0.11.0 |
 | [PyTorch 2.1.2](https://github.com/aws-neuron/deep-learning-containers/blob/2.20.2/docker/pytorch/inference/2.1.2/Dockerfile.neuronx)   | aws-neuronx-tools, neuronx_distributed, torch-neuronx, transformers-neuronx | Neuron 2.20.2      | trn1,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04  | torchserve 0.11.0 |
 | [PyTorch 1.13.1](https://github.com/aws-neuron/deep-learning-containers/blob/2.20.2/docker/pytorch/inference/1.13.1/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, torch-neuronx, transformers-neuronx | Neuron 2.20.2      | trn1,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-inference-neuronx:1.13.1-neuronx-py310-sdk2.20.2-ubuntu20.04 | torchserve 0.11.0 |
 
 ### pytorch-training-neuronx
 
 | Framework                                                                                                                              | Neuron Packages                                       | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL                                                                            |
 |----------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------|--------------------|------------------------------|------------------------|-------------------------------------------------------------------------------------------|
+| [PyTorch 2.5.1](https://github.com/aws-neuron/deep-learning-containers/blob/2.21.0/docker/pytorch/training/2.5.1/Dockerfile.neuronx)   | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.21.0      | trn1,trn2,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-training-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04  |
 | [PyTorch 2.1.2](https://github.com/aws-neuron/deep-learning-containers/blob/2.20.2/docker/pytorch/training/2.1.2/Dockerfile.neuronx)   | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.20.2      | trn1,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-training-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04  |
 | [PyTorch 1.13.1](https://github.com/aws-neuron/deep-learning-containers/blob/2.20.2/docker/pytorch/training/1.13.1/Dockerfile.neuronx) | aws-neuronx-tools, neuronx_distributed, neuronx_distributed_training, torch-neuronx | Neuron 2.20.2      | trn1,inf2                    | 3.10 (py310)           | public.ecr.aws/neuron/pytorch-training-neuronx:1.13.1-neuronx-py310-sdk2.20.2-ubuntu20.04 |
 
+### jax-training-neuronx
+
+| Framework                                                                                                                              | Neuron Packages                 | Neuron SDK Version | Supported EC2 Instance Types | Python Version Options | ECR Public URL                                                                           | Other Packages    |
+|----------------------------------------------------------------------------------------------------------------------------------------|---------------------------------|--------------------|------------------------------|------------------------|------------------------------------------------------------------------------------------|-------------------|
+| [JAX 0.4](https://github.com/aws-neuron/deep-learning-containers/blob/2.21.0/docker/jax/training/0.4/Dockerfile.neuronx) | jax-neuronx, libneuronxla | Neuron 2.21.0      | trn1,trn2,inf2                        | 3.10 (py310)           | public.ecr.aws/neuron/jax-training-neuronx:0.4-neuronx-py310-sdk2.21.0-ubuntu22.04 | jaxlib 0.4 |
+
 ## Security
 
 See [SECURITY](SECURITY.md) for more information.

diff --git a/...ytorch/training/1.13.1/Dockerfile.neuronx → docker/jax/training/0.4/Dockerfile.neuronx b/...ytorch/training/1.13.1/Dockerfile.neuronx → docker/jax/training/0.4/Dockerfile.neuronx
@@ -1,23 +1,21 @@
-FROM public.ecr.aws/docker/library/ubuntu:20.04
+FROM public.ecr.aws/docker/library/ubuntu:22.04
 
-LABEL maintainer="Amazon AI"
 LABEL dlc_major_version="1"
+LABEL maintainer="Amazon AI"
 
 # Neuron SDK components version numbers
-ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.16.0
-ARG NEURONX_DISTRIBUTED_VERSION=0.9.0
-ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.0.1
-ARG NEURONX_CC_VERSION=2.15.143.0
-ARG NEURONX_COLLECTIVES_LIB_VERSION=2.22.33.0-d2128d1aa
-ARG NEURONX_RUNTIME_LIB_VERSION=2.22.19.0-5856c0b42
-ARG NEURONX_TOOLS_VERSION=2.19.0.0
+ARG NEURONX_RUNTIME_LIB_VERSION=2.23.110.0-9b5179492
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.133.0-3e70920f2
+ARG NEURONX_TOOLS_VERSION=2.20.204.0
+ARG NEURONX_CC_VERSION=2.16.345.0
+ARG NEURONX_JAX_TRAINING_VERSION=0.1.2
 
 ARG PYTHON=python3.10
 ARG PYTHON_VERSION=3.10.12
 ARG PIP=pip3
 ARG OMPI_VERSION=4.1.5
 
-# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20
+# This arg is required to stop docker build from waiting for region configuration while installing tzdata on Ubuntu 22
 ARG DEBIAN_FRONTEND=noninteractive
 
 # Python won’t try to write .pyc or .pyo files on the import of source modules
@@ -32,11 +30,8 @@
 ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
 ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
 ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
-ENV PATH /opt/aws/neuron/bin/:$PATH
-ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
-ENV DGLBACKEND=pytorch
 
 RUN apt-get update \
  && apt-get upgrade -y \
 && apt-get install -y --no-install-recommends \
    build-essential \
@@ -45,51 +40,41 @@
     curl \
     emacs \
     git \
+    gnupg2 \
+    gpg-agent \
     jq \
     libopencv-dev \
-    openjdk-8-jdk-headless \
-    openjdk-8-jdk \
-    openjdk-8-jre \
     libglib2.0-0 \
     libgl1-mesa-glx \
     libsm6 \
     libxext6 \
     libxrender-dev \
-    openjdk-11-jdk \
-    software-properties-common \
-    wget \
-    unzip \
-    vim \
-    zlib1g-dev \
-    openssl \
     libssl-dev \
     libsqlite3-dev \
     libgdbm-dev \
     libc6-dev \
     libbz2-dev \
     libncurses-dev \
-    tk-dev \
     libffi-dev \
     libcap-dev \
-    gnupg2 \
-    gpg-agent \
- && rm -rf /var/lib/apt/lists/* \
- && apt-get clean
-
-RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
-RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
-
-RUN apt-get update \
- && apt-get install -y \
-    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
-    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
-    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    libhwloc-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    openjdk-11-jdk \
+    openssl \
+    software-properties-common \
+    tk-dev \
+    unzip \
+    wget \
+    vim \
+    zlib1g-dev \
  && rm -rf /var/lib/apt/lists/* \
  && rm -rf /tmp/tmp* \
  && apt-get clean

 # Install Open MPI
 RUN mkdir -p /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
 && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
@@ -100,8 +85,17 @@
  && ldconfig \
  && rm -rf /tmp/openmpi
 
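+# Install packages and configure SSH for the MPI operator in k8s:
+# the ssh client stops recording and verifying host keys (UserKnownHostsFile /dev/null,
+# StrictHostKeyChecking no), and a commented-out StrictModes line in sshd_config
+# is replaced with "StrictModes no".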
+RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
+ && mkdir -p /var/run/sshd \
+ && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+ && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
+ && rm -rf /var/lib/apt/lists/* \
+ && apt-get clean
+
 # install Python
 RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
  && tar -xzf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure --enable-shared --prefix=/usr/local \
@@ -120,78 +114,33 @@
 # ompi_info to fail. This is only observed in CPU containers
 ENV PATH="$PATH:/home/.openmpi/bin"
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
 RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
 
-# Copy workaround script for incorrect hostname
-COPY changehostname.c /
-COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
 
-RUN ${PIP} install --no-cache-dir -U \
-    "bokeh>=2.3,<3" \
-    "awscli<2" \
-    scipy \
-    click \
-    "cryptography" \
-    "sagemaker>=2,<2.184" \
-    "sagemaker-pytorch-training" \
-    psutil==5.6.7 \
-    dataset \
-    transformers==4.36.2 \
-    Pillow
+# Register the Neuron apt repository and its signing key.
+# The suite is "jammy" to match the Ubuntu 22.04 base image of this Dockerfile.
+# The following apt-get step installs Neuron Tools, Collectives and Runtime from it.
+RUN echo "deb https://apt.repos.neuron.amazonaws.com jammy main" > /etc/apt/sources.list.d/neuron.list
+RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
 
-RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+RUN apt-get update \
+ && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -rf /tmp/tmp* \
+ && apt-get clean
+
+# Add Neuron PATH
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+
+# Install AWS CLI
+RUN ${PIP} install --no-cache-dir -U "awscli<2"
+
+# Install JAX & Neuron CC
 RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
- && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
- && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
-
-RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
-
-## Installation for Neuronx Distributed Training framework
-# Install Cython
-RUN pip install --no-cache-dir Cython
-
-# Copy the apex_setup.py file
-COPY apex_setup.py /root/apex_setup.py
-
-# Clone and build Apex
-RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
-    && cd /root/apex \
-    && git checkout 23.05 \
-    && cp /root/apex_setup.py setup.py \
-    && python3 setup.py bdist_wheel
-
-#Install dependencies from requirements and extras for SageMaker usecase
-RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \
-    && pip install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \
-    && pip install --force-reinstall "multiprocess==0.70.16" \
-    "dill==0.3.8" \
-    "torch==1.13.1"
-
-
-RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
-
-# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
-# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
-# awscli 1.25.47 has requirement docutils<0.17,>=0.10
-# etcd for kubernetes installation
-# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
-# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
-RUN ${PIP} install --no-cache-dir -U \
-    "attrs<24,>=23.1.0" \
-    "protobuf>=3.18.3,<=3.20.3" \
-    "docutils>=0.10,<0.17" \
-    "rsa<4.8,>=3.1.2" \
-    "python-etcd" \
-    "urllib3>=1.26.0,<1.27"
-
-# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
-RUN pip install --no-cache-dir -U \
-    "bokeh>=3.0.1,<4" \
-    "imageio>=2.22,<3" \
-    "opencv-python>=4.8.1.78" \
-    "plotly>=5.11,<6" \
-    "seaborn>=0.12,<1" \
-    "shap>=0.41,<1"
+ && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
+ && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com
 
 # EFA Installer does apt get. Make sure to run apt update before that
 RUN apt-get update
@@ -205,27 +154,14 @@
  && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
  && cd $HOME
 
-
 # Clean up after apt update
 RUN rm -rf /var/lib/apt/lists/* \
  && rm -rf /tmp/tmp* \
  && apt-get clean
 
-# Install some common packages used by training scripts
-# torchvision needed for MLP. since it depends on torch and torch neuron/torch
-# is already installed install it with nodeps
-RUN pip3 install --no-cache-dir --no-deps -U \
-    torchvision==0.14.*
-
-# Needed for running bert training scripts
-RUN pip3 install --no-cache-dir -U \
-    graphviz \
-    tensorboard==2.6 \
-    accelerate \
-    sentencepiece!=0.1.92 \
-    h5py \
-    requests
-
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh
 COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
 
 RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \
@@ -241,8 +177,8 @@
  && rm -rf ${HOME_DIR}/oss_compliance* \
  && rm -rf /tmp/tmp*
 
-RUN curl -o /license.txt  https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.13/license.txt
-
 # Starts framework
 ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
 CMD ["/bin/bash"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
diff --git a/docker/jax/training/0.4/Dockerfile.neuronx.cve_allowlist.json b/docker/jax/training/0.4/Dockerfile.neuronx.cve_allowlist.json
@@ -0,0 +1,27 @@
+{
+    "CVE-2024-35195": {
+        "description": "Requests is a HTTP library. Prior to 2.32.0, when making requests through a Requests `Session`, if the first request is made with `verify=False` to disable cert verification, all subsequent requests to the same host will continue to ignore cert verification regardless of changes to the value of `verify`. This behavior will continue for the lifecycle of the connection in the connection pool. This vulnerability is fixed in 2.32.0.",
+        "remediation": {
+            "recommendation": {
+                "text": "None Provided"
+            }
+        },
+        "score": 0.0,
+        "score_details": {},
+        "severity": "UNTRIAGED",
+        "source": "NVD",
+        "source_url": "https://nvd.nist.gov/vuln/detail/CVE-2024-35195",
+        "status": "ACTIVE",
+        "title": "CVE-2024-35195 - requests",
+        "vulnerability_id": "CVE-2024-35195",
+        "vulnerable_packages": [
+            {
+                "epoch": 0,
+                "filePath": "usr/local/lib/python3.10/site-packages/requests-2.31.0.dist-info/METADATA",
+                "name": "requests",
+                "packageManager": "PYTHONPKG",
+                "version": "2.31.0"
+            }
+        ]
+    }
+}