From c38371ec1dcd305c89b149fc650528ea57d5fc4b Mon Sep 17 00:00:00 2001 From: Chirag Bhatia AWS Date: Fri, 21 Feb 2025 16:22:48 -0800 Subject: [PATCH 1/3] Update docker/pytorch/inference/2.5.1/Dockerfile.neuronx --- .../inference/2.5.1/Dockerfile.neuronx | 94 +++++++++---------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/docker/pytorch/inference/2.5.1/Dockerfile.neuronx b/docker/pytorch/inference/2.5.1/Dockerfile.neuronx index 3c03f8e..f98809f 100644 --- a/docker/pytorch/inference/2.5.1/Dockerfile.neuronx +++ b/docker/pytorch/inference/2.5.1/Dockerfile.neuronx @@ -4,20 +4,10 @@ LABEL dlc_major_version="1" LABEL maintainer="Amazon AI" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true -# Neuron SDK components version numbers -ARG NEURONX_CC_VERSION=2.16.372.0 -ARG NEURONX_FRAMEWORK_VERSION=2.5.1.2.4.0 -ARG NEURONX_TRANSFORMERS_VERSION=0.13.380 -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 -ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 -ARG NEURONX_TOOLS_VERSION=2.20.204.0 -ARG NEURONX_DISTRIBUTED_VERSION=0.10.1 -ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.1.1 - ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 ARG TORCHSERVE_VERSION=0.11.0 -ARG SM_TOOLKIT_VERSION=2.0.21 +ARG SM_TOOLKIT_VERSION=2.0.25 ARG MAMBA_VERSION=23.1.0-4 # See http://bugs.python.org/issue19846 @@ -56,18 +46,6 @@ RUN apt-get update \ && rm -rf /tmp/tmp* \ && apt-get clean -RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list -RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - - -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - # https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \ mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \ @@ -100,7 +78,8 @@ RUN conda install -c conda-forge \ && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \ && pip install packaging \ enum-compat \ - ipython + ipython \ + && rm -rf ~/.cache/pip/* RUN pip install --no-cache-dir -U \ opencv-python>=4.8.1.78 \ @@ -111,43 +90,29 @@ RUN pip install --no-cache-dir -U \ "awscli<2" \ pandas==1.* \ boto3 \ - cryptography - -RUN pip install -U --extra-index-url https://pip.repos.neuron.amazonaws.com \ - neuronx-cc==$NEURONX_CC_VERSION \ - torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ - transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION \ - && pip install -U "protobuf>=3.18.3,<4" \ + cryptography \ + "protobuf>=3.18.3,<4" \ "transformers==4.45.*" \ torchserve==${TORCHSERVE_VERSION} \ torch-model-archiver==${TORCHSERVE_VERSION} \ && pip install --no-deps --no-cache-dir -U torchvision==0.20.* \ - && pip install --no-deps -U --extra-index-url https://pip.repos.neuron.amazonaws.com neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ - && pip install -U --extra-index-url https://pip.repos.neuron.amazonaws.com neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION + && rm -rf ~/.cache/pip/* RUN useradd -m model-server \ && mkdir -p /home/model-server/tmp /opt/ml/model \ && chown -R model-server /home/model-server /opt/ml/model -COPY neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py -COPY neuron-monitor.sh /usr/local/bin/neuron-monitor.sh -COPY torchserve-neuron.sh /usr/local/bin/entrypoint.sh +COPY --chmod=755 neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py +COPY --chmod=755 neuron-monitor.sh deep_learning_container.py /usr/local/bin/ +COPY --chmod=755 torchserve-neuron.sh /usr/local/bin/entrypoint.sh COPY config.properties /home/model-server -RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \ - && chmod +x /usr/local/bin/neuron-monitor.sh \ - && chmod +x /usr/local/bin/entrypoint.sh - -ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/deep_learning_container.py - -RUN pip install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" - -# patch default_pytorch_inference_handler.py to import torch_neuronx -RUN DEST_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \ +RUN pip install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" \ + # patch default_pytorch_inference_handler.py to import torch_neuronx + && DEST_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \ && DEST_FILE=${DEST_DIR}/default_pytorch_inference_handler.py \ - && sed -i "s/import torch/import torch, torch_neuronx/" ${DEST_FILE} + && sed -i "s/import torch/import torch, torch_neuronx/" ${DEST_FILE} \ + && rm -rf ~/.cache/pip/* RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -162,6 +127,37 @@ RUN HOME_DIR=/root \ RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.5/license.txt +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \ + && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +# Neuron SDK components version numbers +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 +ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 +ARG NEURONX_TOOLS_VERSION=2.20.204.0 + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +ARG NEURONX_CC_VERSION=2.16.372.0 +ARG NEURONX_FRAMEWORK_VERSION=2.5.1.2.4.0 +ARG NEURONX_TRANSFORMERS_VERSION=0.13.380 +ARG NEURONX_DISTRIBUTED_VERSION=0.10.1 +ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.1.1 + +RUN pip install -U --extra-index-url https://pip.repos.neuron.amazonaws.com \ + neuronx-cc==$NEURONX_CC_VERSION \ + torch-neuronx==$NEURONX_FRAMEWORK_VERSION \ + transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION \ + && pip install --no-deps -U --extra-index-url https://pip.repos.neuron.amazonaws.com neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \ + && pip install -U --extra-index-url https://pip.repos.neuron.amazonaws.com neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \ + && rm -rf ~/.cache/pip/* + EXPOSE 8080 8081 ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"] From 4892e43be66c1116d1db75dbbe925c074e0ae2bb Mon Sep 17 00:00:00 2001 From: Chirag Bhatia AWS Date: Fri, 21 Feb 2025 16:22:49 -0800 Subject: [PATCH 2/3] Update docker/pytorch/training/2.5.1/Dockerfile.neuronx --- .../pytorch/training/2.5.1/Dockerfile.neuronx | 213 +++++++++++------- 1 file changed, 136 insertions(+), 77 deletions(-) diff --git a/docker/pytorch/training/2.5.1/Dockerfile.neuronx b/docker/pytorch/training/2.5.1/Dockerfile.neuronx index 5a756c9..cbff21c 100644 --- a/docker/pytorch/training/2.5.1/Dockerfile.neuronx +++ b/docker/pytorch/training/2.5.1/Dockerfile.neuronx @@ -1,23 +1,16 @@ -FROM public.ecr.aws/docker/library/ubuntu:22.04 +ARG BUILD_STAGE=prod + +FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base LABEL maintainer="Amazon AI" LABEL dlc_major_version="1" -# Neuron SDK components version numbers -ARG NEURONX_DISTRIBUTED_VERSION=0.10.1 -ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.1.1 -ARG NEURONX_CC_VERSION=2.16.372.0 -ARG NEURONX_FRAMEWORK_VERSION=2.5.1.2.4.0 -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 -ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 -ARG NEURONX_TOOLS_VERSION=2.20.204.0 - ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 ARG PIP=pip3 ARG OMPI_VERSION=4.1.5 -# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 20 +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22 ARG DEBIAN_FRONTEND=noninteractive # Python won’t try to write .pyc or .pyo files on the import of source modules @@ -75,17 +68,6 @@ RUN apt-get update \ vim \ zlib1g-dev \ && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list -RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - - -RUN apt-get update \ - && apt-get install -y \ - aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ - aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ - aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ - && rm -rf /var/lib/apt/lists/* \ && rm -rf /tmp/tmp* \ && apt-get clean @@ -101,7 +83,17 @@ RUN mkdir -p /tmp/openmpi \ && ldconfig \ && rm -rf /tmp/openmpi -# install Python +# Install packages and configure SSH for MPI operator in k8s +RUN apt-get update && apt-get install -y openmpi-bin openssh-server \ + && mkdir -p /var/run/sshd \ + && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ + && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +# Install Python RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ && tar -xzf Python-$PYTHON_VERSION.tgz \ && cd Python-$PYTHON_VERSION \ @@ -112,7 +104,8 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ && ${PIP} --no-cache-dir install --upgrade \ pip \ - setuptools + setuptools \ + && rm -rf ~/.cache/pip/* WORKDIR / @@ -123,10 +116,6 @@ ENV PATH="$PATH:/home/.openmpi/bin" ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value -# Copy workaround script for incorrect hostname -COPY changehostname.c / -COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh - RUN ${PIP} install --no-cache-dir -U \ "bokeh>=2.3,<3" \ "awscli<2" \ @@ -138,39 +127,14 @@ RUN ${PIP} install --no-cache-dir -U \ psutil==5.6.7 \ dataset \ transformers==4.36.2 \ - Pillow + Pillow \ + && rm -rf ~/.cache/pip/* -RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ - && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ - && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com - -RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com - -## Installation for Neuronx Distributed Training framework -# Install Cython & wheel -RUN ${PIP} install --no-cache-dir Cython \ - && ${PIP} install --no-cache-dir wheel +RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt # Copy the apex_setup.py file COPY apex_setup.py /root/apex_setup.py -# Clone and build Apex -RUN git clone https://github.com/NVIDIA/apex.git /root/apex \ - && cd /root/apex \ - && git checkout 23.05 \ - && cp /root/apex_setup.py setup.py \ - && python3 setup.py bdist_wheel - -#Install dependencies from requirements and extras for SageMaker usecase -RUN wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \ - && ${PIP} install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \ - && ${PIP} install --force-reinstall "multiprocess==0.70.16" \ - "dill==0.3.8" \ - "torch==2.5.1" - - -RUN ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com - # attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0 # protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3 # awscli 1.25.47 has requirement docutils<0.17,>=0.10 @@ -183,20 +147,20 @@ RUN ${PIP} install --no-cache-dir -U \ "docutils>=0.10,<0.17" \ "rsa<4.8,>=3.1.2" \ "python-etcd" \ - "urllib3>=1.26.0,<1.27" - -# Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) -RUN pip install --no-cache-dir -U \ + "urllib3>=1.26.0,<1.27" \ + # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import) + && ${PIP} install --no-cache-dir -U \ "bokeh>=3.0.1,<4" \ "imageio>=2.22,<3" \ "opencv-python>=4.8.1.78" \ "plotly>=5.11,<6" \ "seaborn>=0.12,<1" \ - "shap>=0.41,<1" + "shap>=0.41,<1" \ + && rm -rf ~/.cache/pip/* # EFA Installer does apt get. Make sure to run apt update before that -RUN apt-get update -RUN cd $HOME \ +RUN apt-get update \ + && cd $HOME \ && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ && cat aws-efa-installer.key | gpg --fingerprint \ @@ -204,11 +168,8 @@ RUN cd $HOME \ && tar -xf aws-efa-installer-latest.tar.gz \ && cd aws-efa-installer \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME - - -# Clean up after apt update -RUN rm -rf /var/lib/apt/lists/* \ + && cd $HOME \ + && rm -rf /var/lib/apt/lists/* \ && rm -rf /tmp/tmp* \ && apt-get clean @@ -216,21 +177,24 @@ RUN rm -rf /var/lib/apt/lists/* \ # torchvision needed for MLP. since it depends on torch and torch neuron/torch # is already installed install it with nodeps RUN pip3 install --no-cache-dir --no-deps -U \ - torchvision==0.20.* - -# Needed for running bert training scripts -RUN pip3 install --no-cache-dir -U \ + torchvision==0.20.* \ + # Needed for running bert training scripts + && pip3 install --no-cache-dir -U \ graphviz \ tensorboard==2.6 \ accelerate \ sentencepiece!=0.1.92 \ h5py \ - requests - -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py + requests \ + # Install Cython & wheel + && ${PIP} install --no-cache-dir \ + Cython \ + wheel \ + && rm -rf ~/.cache/pip/* -RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \ - && chmod +x /usr/local/bin/deep_learning_container.py +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -244,6 +208,101 @@ RUN HOME_DIR=/root \ RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.5/license.txt +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \ + && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +# Neuron SDK components +ARG NEURON_ARTIFACT_PATH=/root/neuron_artifacts +ARG IGNORE_MISSING_NEURON_COMPONENTS=false +RUN IGNORE_MISSING_NEURON_COMPONENTS=$(echo ${IGNORE_MISSING_NEURON_COMPONENTS} | tr '[:upper:]' '[:lower:]') + +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 +ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 +ARG NEURONX_TOOLS_VERSION=2.20.204.0 + +ARG NEURONX_FRAMEWORK_VERSION=2.5.1.2.4.0 +ARG NEURONX_CC_VERSION=2.16.372.0 +ARG NEURONX_DISTRIBUTED_VERSION=0.10.1 +ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.1.1 + +FROM base AS dev + +RUN --mount=type=bind,source=apt,target=${NEURON_ARTIFACT_PATH}/apt \ + install_apt_package() { \ + pkg_name=$1; \ + version_arg=$2; \ + if [ -f "${NEURON_ARTIFACT_PATH}/apt/${version_arg}" ]; then \ + apt-get install -y ${NEURON_ARTIFACT_PATH}/apt/${version_arg}; \ + elif [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \ + apt-get install -y ${pkg_name}=${version_arg}; \ + else \ + echo "Ignoring package ${pkg_name}"; \ + fi; \ + } \ + && apt-get update \ + && install_apt_package "aws-neuronx-collectives" "${NEURONX_COLLECTIVES_LIB_VERSION}" \ + && install_apt_package "aws-neuronx-runtime-lib" "${NEURONX_RUNTIME_LIB_VERSION}" \ + && install_apt_package "aws-neuronx-tools" "${NEURONX_TOOLS_VERSION}" \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN --mount=type=bind,source=pip,target=${NEURON_ARTIFACT_PATH}/pip \ + install_pip_package() { \ + pkg_name=$1; \ + version_arg=$2; \ + extra_flags=$3; \ + if [ -f "${NEURON_ARTIFACT_PATH}/pip/${version_arg}" ]; then \ + ${PIP} install --force-reinstall --find-links ${NEURON_ARTIFACT_PATH}/pip \ + ${NEURON_ARTIFACT_PATH}/pip/${version_arg} ${extra_flags}; \ + elif [ "${IGNORE_MISSING_NEURON_COMPONENTS}" = "false" ]; then \ + ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall ${pkg_name}==${version_arg} ${extra_flags}; \ + else \ + echo "Ignoring package ${pkg_name}"; \ + fi; \ + } \ + && install_pip_package "torch-neuronx" "${NEURONX_FRAMEWORK_VERSION}" "" \ + && install_pip_package "neuronx-cc" "${NEURONX_CC_VERSION}" "" \ + && install_pip_package "neuronx_distributed" "${NEURONX_DISTRIBUTED_VERSION}" "--no-deps" \ + && install_pip_package "neuronx_distributed_training" "${NEURONX_DISTRIBUTED_TRAINING_VERSION}" "--no-deps" \ + && rm -rf ~/.cache/pip/* + +FROM base AS prod + +RUN apt-get update \ + && apt-get install -y \ + aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ + aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \ + aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean + +RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && ${PIP} install --force-reinstall --no-deps neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && rm -rf ~/.cache/pip/* + +FROM ${BUILD_STAGE} AS final + +## Installation for Neuronx Distributed Training framework +# Clone and build Apex +RUN git clone https://github.com/NVIDIA/apex.git /root/apex \ + && cd /root/apex \ + && git checkout 23.05 \ + && cp /root/apex_setup.py setup.py \ + && python3 setup.py bdist_wheel \ + # Install dependencies from requirements and extras for SageMaker usecase + && wget https://raw.githubusercontent.com/aws-neuron/neuronx-distributed-training/master/requirements.txt \ + && ${PIP} install --no-cache-dir -r requirements.txt /root/apex/dist/apex-0.1-py3-none-any.whl \ + && ${PIP} install --force-reinstall "multiprocess==0.70.16" \ + "dill==0.3.8" \ + "torch==2.5.1" \ + && rm -rf ~/.cache/pip/* + # Starts framework ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] CMD ["/bin/bash"] From b1ef0cd04a562218a7219eb496a7bd633397d289 Mon Sep 17 00:00:00 2001 From: Chirag Bhatia AWS Date: Fri, 21 Feb 2025 16:22:50 -0800 Subject: [PATCH 3/3] Update docker/jax/training/0.4/Dockerfile.neuronx --- docker/jax/training/0.4/Dockerfile.neuronx | 109 ++++++++++----------- 1 file changed, 52 insertions(+), 57 deletions(-) diff --git a/docker/jax/training/0.4/Dockerfile.neuronx b/docker/jax/training/0.4/Dockerfile.neuronx index 21ff482..92b9700 100644 --- a/docker/jax/training/0.4/Dockerfile.neuronx +++ b/docker/jax/training/0.4/Dockerfile.neuronx @@ -3,21 +3,13 @@ FROM public.ecr.aws/docker/library/ubuntu:22.04 LABEL dlc_major_version="1" LABEL maintainer="Amazon AI" -# Neuron SDK components version numbers -ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 -ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 -ARG NEURONX_TOOLS_VERSION=2.20.204.0 -ARG NEURONX_CC_VERSION=2.16.372.0 -ARG NEURONX_JAX_TRAINING_VERSION=0.1.2 - +# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22 +ARG DEBIAN_FRONTEND=noninteractive ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 ARG PIP=pip3 ARG OMPI_VERSION=4.1.5 -# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 22 -ARG DEBIAN_FRONTEND=noninteractive - # Python won’t try to write .pyc or .pyo files on the import of source modules # Force stdin, stdout and stderr to be totally unbuffered. Good for logging ENV PYTHONDONTWRITEBYTECODE=1 @@ -30,6 +22,7 @@ ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib" ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64" ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64" ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" +ENV PATH="/opt/aws/neuron/bin:${PATH}" RUN apt-get update \ && apt-get upgrade -y \ @@ -86,15 +79,17 @@ RUN mkdir -p /tmp/openmpi \ && rm -rf /tmp/openmpi # Install packages and configure SSH for MPI operator in k8s -RUN apt-get update && apt-get install -y openmpi-bin openssh-server \ +RUN apt-get update \ + && apt-get install -y openmpi-bin openssh-server \ && mkdir -p /var/run/sshd \ && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ && apt-get clean -# install Python +# Install Python RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ && tar -xzf Python-$PYTHON_VERSION.tgz \ && cd Python-$PYTHON_VERSION \ @@ -104,8 +99,26 @@ RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VER && ln -s /usr/local/bin/pip3 /usr/bin/pip \ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ && ${PIP} --no-cache-dir install --upgrade \ + "awscli<2" \ pip \ - setuptools + requests \ + setuptools \ + && rm -rf ~/.cache/pip/* + +# EFA Installer does apt get. Make sure to run apt update before that +RUN apt-get update \ + && cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ + && cat aws-efa-installer.key | gpg --fingerprint \ + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ + && tar -xf aws-efa-installer-latest.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && cd $HOME \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /tmp/tmp* \ + && apt-get clean WORKDIR / @@ -118,10 +131,29 @@ RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt -# Install Neuron Driver, Runtime and Tools -RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list -RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - +# Copy workaround script for incorrect hostname +COPY changehostname.c / +COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ + +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && rm -rf ${HOME_DIR}/oss_compliance* \ + && rm -rf /tmp/tmp* + +RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \ + && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - + +# Neuron SDK components version numbers +ARG NEURONX_RUNTIME_LIB_VERSION=2.23.112.0-9b5179492 +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.23.135.0-3e70920f2 +ARG NEURONX_TOOLS_VERSION=2.20.204.0 +# Install Neuron Driver, Runtime and Tools RUN apt-get update \ && apt-get install -y \ aws-neuronx-tools=$NEURONX_TOOLS_VERSION \ @@ -131,51 +163,14 @@ RUN apt-get update \ && rm -rf /tmp/tmp* \ && apt-get clean -# Add Neuron PATH -ENV PATH="/opt/aws/neuron/bin:${PATH}" - -# Install AWS CLI -RUN ${PIP} install --no-cache-dir -U "awscli<2" +ARG NEURONX_CC_VERSION=2.16.372.0 +ARG NEURONX_JAX_TRAINING_VERSION=0.1.2 # Install JAX & Neuron CC RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ - && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com - -# EFA Installer does apt get. Make sure to run apt update before that -RUN apt-get update -RUN cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ - && cat aws-efa-installer.key | gpg --fingerprint \ - && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ - && tar -xf aws-efa-installer-latest.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && cd $HOME - -# Clean up after apt update -RUN rm -rf /var/lib/apt/lists/* \ - && rm -rf /tmp/tmp* \ - && apt-get clean - -# Copy workaround script for incorrect hostname -COPY changehostname.c / -COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh -COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py - -RUN chmod +x /usr/local/bin/start_with_right_hostname.sh \ - && chmod +x /usr/local/bin/deep_learning_container.py - -RUN HOME_DIR=/root \ - && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ - && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ - && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ - && chmod +x /usr/local/bin/testOSSCompliance \ - && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ - && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ - && rm -rf ${HOME_DIR}/oss_compliance* \ - && rm -rf /tmp/tmp* + && ${PIP} install --force-reinstall jax-neuronx==$NEURONX_JAX_TRAINING_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \ + && rm -rf ~/.cache/pip/* # Starts framework ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]