
Commit

feat(onnx): add onnx runtime for better CPU perf (huggingface#328)
OlivierDehaene authored Jul 5, 2024
1 parent 93b3f67 · commit 12e76fb
Showing 16 changed files with 626 additions and 210 deletions.
83 changes: 83 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -2,6 +2,7 @@
 members = [
     "backends",
     "backends/candle",
+    "backends/ort",
     "backends/core",
     "backends/python",
     "backends/grpc-client",
38 changes: 4 additions & 34 deletions Dockerfile
@@ -28,22 +28,9 @@
 ARG ACTIONS_CACHE_URL
 ARG ACTIONS_RUNTIME_TOKEN
 ARG SCCACHE_GHA_ENABLED

-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \
-    tee /etc/apt/sources.list.d/oneAPI.list
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    intel-oneapi-mkl-devel=2024.0.0-49656 \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN echo "int mkl_serv_intel_cpu_true() {return 1;}" > fakeintel.c && \
-    gcc -shared -fPIC -o libfakeintel.so fakeintel.c
-
 COPY --from=planner /usr/src/recipe.json recipe.json

-RUN cargo chef cook --release --features candle --features mkl-dynamic --no-default-features --recipe-path recipe.json && sccache -s
+RUN cargo chef cook --release --features ort --no-default-features --recipe-path recipe.json && sccache -s

 COPY backends backends
 COPY core core

@@ -53,7 +40,7 @@ COPY Cargo.lock ./

 FROM builder as http-builder

-RUN cargo build --release --bin text-embeddings-router -F candle -F mkl-dynamic -F http --no-default-features && sccache -s
+RUN cargo build --release --bin text-embeddings-router -F ort -F http --no-default-features && sccache -s

 FROM builder as grpc-builder

@@ -65,35 +52,18 @@ RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \

 COPY proto proto

-RUN cargo build --release --bin text-embeddings-router -F grpc -F candle -F mkl-dynamic --no-default-features && sccache -s
+RUN cargo build --release --bin text-embeddings-router -F grpc -F ort --no-default-features && sccache -s

 FROM debian:bookworm-slim as base

 ENV HUGGINGFACE_HUB_CACHE=/data \
-    PORT=80 \
-    MKL_ENABLE_INSTRUCTIONS=AVX512_E4 \
-    RAYON_NUM_THREADS=8 \
-    LD_PRELOAD=/usr/local/libfakeintel.so \
-    LD_LIBRARY_PATH=/usr/local/lib
+    PORT=80

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    libomp-dev \
     ca-certificates \
     libssl-dev \
     curl \
     && rm -rf /var/lib/apt/lists/*

-# Copy a lot of the Intel shared objects because of the mkl_serv_intel_cpu_true patch...
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_lp64.so.2 /usr/local/lib/libmkl_intel_lp64.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_thread.so.2 /usr/local/lib/libmkl_intel_thread.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so.2 /usr/local/lib/libmkl_core.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_def.so.2 /usr/local/lib/libmkl_vml_def.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_def.so.2 /usr/local/lib/libmkl_def.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx2.so.2 /usr/local/lib/libmkl_vml_avx2.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_vml_avx512.so.2 /usr/local/lib/libmkl_vml_avx512.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx2.so.2 /usr/local/lib/libmkl_avx2.so.2
-COPY --from=builder /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_avx512.so.2 /usr/local/lib/libmkl_avx512.so.2
-COPY --from=builder /usr/src/libfakeintel.so /usr/local/libfakeintel.so
-
 FROM base as grpc
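The CPU build thus drops the entire MKL toolchain (the oneAPI apt repository, the `mkl_serv_intel_cpu_true` shim, and the copied `libmkl_*` shared objects) in favor of ONNX Runtime, whose CPU kernels ship with the `ort` crate's downloaded binaries. The router can be built the same way outside Docker; a minimal sketch using the exact feature flags the new Dockerfile passes, run from the repository root:

```
cargo build --release --bin text-embeddings-router -F ort -F http --no-default-features
```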
92 changes: 0 additions & 92 deletions Dockerfile-arm64

This file was deleted.

8 changes: 4 additions & 4 deletions README.md
@@ -72,9 +72,9 @@ Below are some examples of the currently supported models:

 | MTEB Rank | Model Size | Model Type | Model ID |
 |-----------|---------------------|-------------|--------------------------------------------------------------------------------------------------|
 | 1 | 7B (Very Expensive) | Mistral | [Salesforce/SFR-Embedding-2_R](https://hf.co/Salesforce/SFR-Embedding-2_R) |
-| 2 | 7B (Very Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-7B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-7B-instruct) |
-| 9 | 1.5B (Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) |
-| 15 | 0.4B | Alibaba GTE | [Alibaba-NLP/gte-large-en-v1.5](Alibaba-NLP/gte-large-en-v1.5) |
+| 2 | 7B (Very Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-7B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-7B-instruct) |
+| 9 | 1.5B (Expensive) | Qwen2 | [Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://hf.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct) |
+| 15 | 0.4B | Alibaba GTE | [Alibaba-NLP/gte-large-en-v1.5](https://hf.co/Alibaba-NLP/gte-large-en-v1.5) |
 | 20 | 0.3B | Bert | [WhereIsAI/UAE-Large-V1](https://hf.co/WhereIsAI/UAE-Large-V1) |
 | 24 | 0.5B | XLM-RoBERTa | [intfloat/multilingual-e5-large-instruct](https://hf.co/intfloat/multilingual-e5-large-instruct) |
 | N/A | 0.1B | NomicBert | [nomic-ai/nomic-embed-text-v1](https://hf.co/nomic-ai/nomic-embed-text-v1) |

@@ -568,7 +568,7 @@ supported via Docker. As such inference will be CPU bound and most likely pretty
 M1/M2 ARM CPU.

 ```
-docker build . -f Dockerfile-arm64 --platform=linux/arm64
+docker build . -f Dockerfile --platform=linux/arm64
 ```

 ## Examples
4 changes: 4 additions & 0 deletions backends/Cargo.toml
@@ -7,15 +7,19 @@ homepage.workspace = true

 [dependencies]
 clap = { workspace = true, optional = true }
+hf-hub = { workspace = true }
+serde_json = { workspace = true }
 text-embeddings-backend-core = { path = "core" }
 text-embeddings-backend-python = { path = "python", optional = true }
 text-embeddings-backend-candle = { path = "candle", optional = true }
+text-embeddings-backend-ort = { path = "ort", optional = true }
 tokio = { workspace = true }
 tracing = { workspace = true }

 [features]
 clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
+ort = ["dep:text-embeddings-backend-ort"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
 metal = ["text-embeddings-backend-candle?/metal"]
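The new backend is wired in like the existing optional ones: an optional path dependency plus a `dep:` feature that activates it, so `text-embeddings-backend-ort` is only compiled when building with `-F ort`. A minimal sketch of the compile-time selection this pattern enables (the enum and function are illustrative, not this crate's actual API):

```
// Sketch: Cargo features become compile-time booleans via cfg!, so the
// router can branch on (or report) the backend it was built with.
pub enum BackendKind {
    Ort,
    Candle,
    None,
}

pub fn compiled_backend() -> BackendKind {
    // cfg! checks the enabled Cargo features at compile time; arms for
    // disabled features are constant-folded away.
    if cfg!(feature = "ort") {
        BackendKind::Ort
    } else if cfg!(feature = "candle") {
        BackendKind::Candle
    } else {
        BackendKind::None
    }
}
```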
17 changes: 17 additions & 0 deletions backends/ort/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "text-embeddings-backend-ort"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
+anyhow = { workspace = true }
+nohash-hasher = { workspace = true }
+ndarray = "0.15.6"
+ort = { version = "2.0.0-rc.2", default-features = false, features = ["download-binaries", "half", "onednn", "ndarray"] }
+text-embeddings-backend-core = { path = "../core" }
+tracing = { workspace = true }
+thiserror = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
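The backend crate builds on `ort` (Rust bindings to ONNX Runtime) with `ndarray` interop for tensor I/O; the `download-binaries` feature fetches a prebuilt ONNX Runtime at build time and `onednn` enables the oneDNN execution provider. A rough sketch of loading and running an exported encoder with this rc.2 API; the input and output tensor names are assumptions typical of BERT-style ONNX exports, not necessarily what this backend uses:

```
use anyhow::Result;
use ndarray::Array2;
use ort::{GraphOptimizationLevel, Session};

fn embed_batch(
    model_path: &str,
    input_ids: Array2<i64>,
    attention_mask: Array2<i64>,
) -> Result<Vec<f32>> {
    // Build a session over the exported ONNX graph; Level3 enables the
    // most aggressive graph optimizations (fusion, constant folding).
    let session = Session::builder()?
        .with_optimization_level(GraphOptimizationLevel::Level3)?
        .commit_from_file(model_path)?;

    // Tensor names assumed here: "input_ids" / "attention_mask" in,
    // "last_hidden_state" out.
    let outputs = session.run(ort::inputs![
        "input_ids" => input_ids,
        "attention_mask" => attention_mask,
    ]?)?;

    let hidden = outputs["last_hidden_state"].try_extract_tensor::<f32>()?;
    // Flattens [batch, seq_len, hidden]; real pooling (CLS or mean)
    // would go here before returning embeddings.
    Ok(hidden.iter().copied().collect())
}
```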
