forked from huggingface/tgi-gaudi
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <[email protected]> Co-authored-by: OlivierDehaene <[email protected]> Co-authored-by: Your Name <[email protected]>
- Loading branch information
1 parent
ed2a3f6
commit b2b5df0
Showing
22 changed files
with
575 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,7 +59,7 @@ jobs: | |
build-and-push-image: | ||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} | ||
group: ${{ github.workflow }}-build-and-push-image-${{ github.head_ref || github.run_id }} | ||
cancel-in-progress: true | ||
needs: start-runner # required to start the main job when the runner is ready | ||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner | ||
|
@@ -146,13 +146,103 @@ jobs: | |
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min | ||
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min | ||
|
||
build-and-push-image-rocm: | ||
concurrency: | ||
group: ${{ github.workflow }}-build-and-push-image-rocm-${{ github.head_ref || github.run_id }} | ||
cancel-in-progress: true | ||
needs: start-runner # required to start the main job when the runner is ready | ||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner | ||
permissions: | ||
contents: write | ||
packages: write | ||
# This is used to complete the identity challenge | ||
# with sigstore/fulcio when running outside of PRs. | ||
id-token: write | ||
security-events: write | ||
steps: | ||
- name: Checkout repository | ||
uses: actions/checkout@v3 | ||
- name: Initialize Docker Buildx | ||
uses: docker/[email protected] | ||
with: | ||
install: true | ||
- name: Inject slug/short variables | ||
uses: rlespinasse/[email protected] | ||
- name: Tailscale | ||
uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966 | ||
with: | ||
authkey: ${{ secrets.TAILSCALE_AUTHKEY }} | ||
- name: Login to GitHub Container Registry | ||
if: github.event_name != 'pull_request' | ||
uses: docker/login-action@v2 | ||
with: | ||
registry: ghcr.io | ||
username: ${{ github.actor }} | ||
password: ${{ secrets.GITHUB_TOKEN }} | ||
- name: Login to internal Container Registry | ||
uses: docker/[email protected] | ||
with: | ||
username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }} | ||
password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }} | ||
registry: registry.internal.huggingface.tech | ||
- name: Login to Azure Container Registry | ||
if: github.event_name != 'pull_request' | ||
uses: docker/[email protected] | ||
with: | ||
username: ${{ secrets.AZURE_DOCKER_USERNAME }} | ||
password: ${{ secrets.AZURE_DOCKER_PASSWORD }} | ||
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io | ||
# If pull request | ||
- name: Extract metadata (tags, labels) for Docker | ||
if: ${{ github.event_name == 'pull_request' }} | ||
id: meta-pr | ||
uses: docker/[email protected] | ||
with: | ||
images: | | ||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference | ||
tags: | | ||
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm | ||
# If main, release or tag | ||
- name: Extract metadata (tags, labels) for Docker | ||
if: ${{ github.event_name != 'pull_request' }} | ||
id: meta | ||
uses: docker/[email protected] | ||
with: | ||
flavor: | | ||
latest=false | ||
images: | | ||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference | ||
ghcr.io/huggingface/text-generation-inference | ||
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference | ||
tags: | | ||
type=semver,pattern={{version}}-rocm | ||
type=semver,pattern={{major}}.{{minor}}-rocm | ||
type=raw,value=latest-rocm,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} | ||
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-rocm | ||
- name: Build and push Docker image | ||
id: build-and-push | ||
uses: docker/build-push-action@v4 | ||
with: | ||
context: . | ||
file: Dockerfile_amd | ||
push: true | ||
platforms: 'linux/amd64' | ||
build-args: | | ||
GIT_SHA=${{ env.GITHUB_SHA }} | ||
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-rocm | ||
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} | ||
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} | ||
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min | ||
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min | ||
|
||
integration-tests: | ||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} | ||
cancel-in-progress: true | ||
needs: | ||
- start-runner | ||
- build-and-push-image # Wait for the docker image to be built | ||
- build-and-push-image-rocm | ||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner | ||
env: | ||
DOCKER_VOLUME: /cache | ||
|
@@ -187,6 +277,7 @@ jobs: | |
needs: | ||
- start-runner | ||
- build-and-push-image | ||
- build-and-push-image-rocm | ||
- integration-tests | ||
runs-on: ubuntu-latest | ||
env: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Rust builder | ||
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef | ||
WORKDIR /usr/src | ||
|
||
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse | ||
|
||
FROM chef as planner | ||
COPY Cargo.toml Cargo.toml | ||
COPY rust-toolchain.toml rust-toolchain.toml | ||
COPY proto proto | ||
COPY benchmark benchmark | ||
COPY router router | ||
COPY launcher launcher | ||
RUN cargo chef prepare --recipe-path recipe.json | ||
|
||
FROM chef AS builder | ||
|
||
ARG GIT_SHA | ||
ARG DOCKER_LABEL | ||
|
||
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ | ||
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ | ||
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ | ||
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ | ||
rm -f $PROTOC_ZIP | ||
|
||
COPY --from=planner /usr/src/recipe.json recipe.json | ||
RUN cargo chef cook --release --recipe-path recipe.json | ||
|
||
COPY Cargo.toml Cargo.toml | ||
COPY rust-toolchain.toml rust-toolchain.toml | ||
COPY proto proto | ||
COPY benchmark benchmark | ||
COPY router router | ||
COPY launcher launcher | ||
RUN cargo build --release | ||
|
||
# Text Generation Inference base image for ROCm | ||
FROM rocm/dev-ubuntu-20.04:5.7 as base | ||
|
||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
ca-certificates \ | ||
ccache \ | ||
curl \ | ||
git \ | ||
make \ | ||
libssl-dev \ | ||
g++ \ | ||
# Needed to build VLLM & flash. | ||
rocthrust-dev \ | ||
hipsparse-dev \ | ||
hipblas-dev && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Keep in sync with `server/pyproject.toml` | ||
ARG MAMBA_VERSION=23.1.0-1 | ||
ARG PYTORCH_VERSION='2.2.0.dev0' | ||
ARG ROCM_VERSION='5.7' | ||
ARG PYTHON_VERSION='3.10.10' | ||
# Automatically set by buildx | ||
ARG TARGETPLATFORM | ||
ENV PATH /opt/conda/bin:$PATH | ||
|
||
# TGI seems to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda. | ||
# Install mamba | ||
# translating Docker's TARGETPLATFORM into mamba arches | ||
RUN case ${TARGETPLATFORM} in \ | ||
"linux/arm64") MAMBA_ARCH=aarch64 ;; \ | ||
*) MAMBA_ARCH=x86_64 ;; \ | ||
esac && \ | ||
curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" | ||
RUN chmod +x ~/mambaforge.sh && \ | ||
bash ~/mambaforge.sh -b -p /opt/conda && \ | ||
mamba init && \ | ||
rm ~/mambaforge.sh | ||
|
||
# Install PyTorch nightly (2.2.0.dev20231106) compiled against ROCm 5.7, as vLLM cannot be compiled with ROCm 5.6. | ||
RUN pip install --pre torch==2.2.0.dev20231106 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 | ||
|
||
FROM base AS kernel-builder | ||
|
||
# Build vllm kernels | ||
FROM kernel-builder AS vllm-builder | ||
WORKDIR /usr/src | ||
|
||
COPY server/Makefile-vllm Makefile | ||
|
||
# Build specific version of vllm | ||
RUN make build-vllm-rocm | ||
|
||
# Build Flash Attention v2 kernels | ||
FROM kernel-builder AS flash-att-v2-builder | ||
WORKDIR /usr/src | ||
|
||
COPY server/Makefile-flash-att-v2 Makefile | ||
|
||
# Build specific version of flash attention v2 | ||
RUN make build-flash-attention-v2-rocm | ||
|
||
# Build Transformers CUDA kernels (gpt-neox and bloom) | ||
FROM kernel-builder as custom-kernels-builder | ||
WORKDIR /usr/src | ||
COPY server/custom_kernels/ . | ||
RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build | ||
|
||
FROM base as base-copy | ||
|
||
# Text Generation Inference base env | ||
ENV HUGGINGFACE_HUB_CACHE=/data \ | ||
HF_HUB_ENABLE_HF_TRANSFER=1 \ | ||
PORT=80 | ||
|
||
# Copy builds artifacts from vllm builder | ||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages | ||
|
||
# Copy build artifacts from flash attention v2 builder | ||
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages | ||
|
||
# Copy build artifacts from custom kernels builder | ||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages | ||
|
||
# Install flash-attention dependencies | ||
RUN pip install einops --no-cache-dir | ||
|
||
# Install server | ||
COPY proto proto | ||
COPY server server | ||
COPY server/Makefile server/Makefile | ||
RUN cd server && \ | ||
make gen-server && \ | ||
pip install -r requirements_rocm.txt && \ | ||
pip install ".[accelerate, peft]" --no-cache-dir | ||
|
||
# Install benchmarker | ||
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark | ||
# Install router | ||
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router | ||
# Install launcher | ||
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher | ||
|
||
# AWS Sagemaker compatible image | ||
FROM base-copy as sagemaker | ||
COPY sagemaker-entrypoint.sh entrypoint.sh | ||
RUN chmod +x entrypoint.sh | ||
|
||
ENTRYPOINT ["./entrypoint.sh"] | ||
|
||
# Final image | ||
FROM base-copy | ||
|
||
ENTRYPOINT ["text-generation-launcher"] | ||
CMD ["--json-output"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.