Commit

Merge branch 'main' into mvafin/support_awq
eaidova authored Dec 17, 2024
2 parents cf2fc8b + a76be08 commit ae8c7db
Showing 50 changed files with 2,300 additions and 1,103 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/dockerfile_sanity.yml
@@ -5,13 +5,13 @@ on:
branches:
- main
paths:
- "docker/Dockerfile.intel"

- 'Dockerfile.ipex'
pull_request:
branches:
- main
paths:
- "docker/Dockerfile.intel"
- 'Dockerfile.ipex'

jobs:
build_and_run:
@@ -27,7 +27,7 @@ jobs:
- name: Build and Run Docker Image
run: |
IMAGE_NAME="intel_image:latest"
- docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+ docker build -f Dockerfile.ipex -t $IMAGE_NAME .
if [ $? -ne 0 ]; then
echo "Docker image build failed."
exit 1
2 changes: 1 addition & 1 deletion .github/workflows/test_inc.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
torch-version: ["2.4.*", "2.5.0"]
torch-version: ["2.4.0", "2.5.*"]

runs-on: ubuntu-22.04

8 changes: 2 additions & 6 deletions .github/workflows/test_ipex.yml
@@ -18,8 +18,8 @@ jobs:
strategy:
fail-fast: false
matrix:
torch-version: ["2.2.0", "2.3.*"]
transformers-version: ["4.39.0", "4.44.*"]
transformers-version: ["4.46.0", "4.46.3"]
torch-version: ["2.4.0", "2.5.*"]

runs-on: ubuntu-22.04

@@ -38,10 +38,6 @@ jobs:
pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }}
- - if: ${{ matrix.torch-version == '2.2.0' }}
-   name: Downgrade Numpy
-   run: pip install numpy==1.*

- name: Assert versions
run: |
python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))"
5 changes: 3 additions & 2 deletions .github/workflows/test_openvino.yml
@@ -1,6 +1,7 @@
name: OpenVINO - Test

on:
+ workflow_dispatch:
push:
branches:
- main
@@ -46,9 +47,9 @@ jobs:
pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
- if: ${{ matrix.transformers-version != 'latest' }}
- name: Downgrade Transformers and Accelerate
+ name: Install specific dependencies and versions required for older transformers
run: |
- pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+ pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
- if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*modeling*'}}
name: Install auto-gptq, autoawq
88 changes: 88 additions & 0 deletions .github/workflows/test_openvino_full.yml
@@ -0,0 +1,88 @@
name: OpenVINO - Full Test

on:
workflow_dispatch:
schedule:
- cron: "41 3 * * *" # run every day at 3:41
push:
branches:
- v*-release
pull_request:
types: [opened, synchronize, reopened, labeled]

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
build:
if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || (github.event_name == 'push') || contains( github.event.pull_request.labels.*.name, 'openvino-test') }}
strategy:
fail-fast: false
matrix:
include:
- python-version: "3.9"
os: "ubuntu-22.04"
transformers-version: "latest"
openvino: "ov-stable"
nncf: "nncf-stable"
- python-version: "3.9"
os: "ubuntu-22.04"
transformers-version: "latest"
openvino: "ov-nightly"
nncf: "nncf-stable"
- python-version: "3.9"
os: "ubuntu-22.04"
transformers-version: "latest"
openvino: "ov-stable"
nncf: "nncf-develop"
- python-version: "3.9"
os: "ubuntu-22.04"
transformers-version: "latest"
openvino: "ov-nightly"
nncf: "nncf-develop"

runs-on: ${{ matrix.os }}

steps:
- uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
# Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install .[tests]
- name: Install openvino-nightly
if: ${{ matrix.openvino == 'ov-nightly' }}
run: pip install --pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly

- name: Install openvino release
if: ${{ matrix.openvino == 'ov-stable' }}
run: pip install .[openvino]

- name: Install nncf develop
if: ${{ matrix.nncf == 'nncf-develop' }}
run: pip install git+https://github.com/openvinotoolkit/nncf.git

- name: Install nncf release
if: ${{ matrix.nncf == 'nncf-stable' }}
run: pip install .[nncf]

- name: Install the lowest compatible transformers version
if: ${{ matrix.transformers-version != 'latest' }}
run: pip install transformers==${{ matrix.transformers-version }}

- name: Pip freeze
run: pip freeze

- name: OpenVINO tests
run: pytest tests/openvino --durations=0
env:
RUN_SLOW: 1
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
17 changes: 7 additions & 10 deletions .github/workflows/test_openvino_slow.yml
@@ -25,9 +25,7 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-22.04", "windows-2019"]
- openvino-version: ["stable", "nightly"]
transformers-version: ["4.36.0", "latest"]
- nncf: ["nncf", "git+https://github.com/openvinotoolkit/nncf.git"]

runs-on: ${{ matrix.os }}

@@ -47,14 +45,9 @@ jobs:
pip install .[openvino,tests] transformers[testing]
pip uninstall -y nncf
- - if: ${{ matrix.openvino-version == 'nightly' }}
-   name: Install nightly OpenVINO
-   run: |
-     pip install openvino openvino-tokenizers --pre --upgrade --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- if: ${{ matrix.transformers-version != 'latest' }}
- name: Downgrade Transformers and Accelerate
- run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.*
+ name: Install specific dependencies and versions required for older transformers
+ run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator

- if: ${{ matrix.transformers-version == 'latest' }}
name: Install auto-gptq, autoawq
@@ -70,7 +63,11 @@ jobs:
- name: Install dependencies (slow)
run: |
- pip install ${{ matrix.nncf }}
+ pip install .[nncf]
+ - if: ${{ matrix.transformers-version != 'latest' }}
+   name: Downgrade Transformers and Accelerate
+   run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.*

- name: Test with Pytest (slow)
run: |
73 changes: 73 additions & 0 deletions Dockerfile.ipex
@@ -0,0 +1,73 @@
ARG PLATFORM=cpu

FROM ubuntu:22.04 as cpu
WORKDIR /usr/src/
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
git \
curl \
vim \
build-essential \
ccache \
libgoogle-perftools-dev \
numactl \
cmake \
libjpeg-dev \
pybind11-dev \
libpng-dev \
python3 \
python3-pip \
&& rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

ARG IPEX_VERSION=2.5.0
ARG PYTORCH_VERSION=2.5.1
ARG TORCHVISION_VERSION=0.20.1+cpu
ARG TORCHAUDIO_VERSION=2.5.1+cpu

RUN python3 -m pip install --no-cache-dir \
torch==${PYTORCH_VERSION}+cpu \
torchvision==${TORCHVISION_VERSION} \
torchaudio==${TORCHAUDIO_VERSION} \
--index-url https://download.pytorch.org/whl/cpu && \
python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
python3 -m pip install --no-cache-dir py-libnuma

ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
WORKDIR /usr/src/

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
git \
curl \
vim \
ccache \
libgoogle-perftools-dev \
numactl \
libjpeg-dev \
pybind11-dev \
libpng-dev \
&& rm -rf /var/lib/apt/lists/*"
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils

FROM ${PLATFORM}

COPY optimum optimum
COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
RUN pip install .
6 changes: 3 additions & 3 deletions README.md
@@ -6,7 +6,7 @@

🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.

- [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+ [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations such as faster attention and operator fusion.

Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies so that users can easily generate a quantized model. Users can apply static, dynamic and quantization-aware training approaches while specifying an expected accuracy criterion. It also supports different weight-pruning techniques, enabling the creation of a pruned model that meets a predefined sparsity target.
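
As a minimal sketch of the quantization workflow described above, the snippet below applies dynamic post-training quantization through the `INCQuantizer` API exposed by `optimum.intel`; the checkpoint name and save directory are illustrative assumptions and are not part of this commit.

```python
# Minimal sketch, assuming the INCQuantizer API from optimum.intel.
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel import INCQuantizer
from transformers import AutoModelForSequenceClassification

# Example checkpoint, chosen for illustration only.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

# Dynamic post-training quantization does not require a calibration dataset.
quantization_config = PostTrainingQuantConfig(approach="dynamic")
quantizer = INCQuantizer.from_pretrained(model)
quantizer.quantize(quantization_config=quantization_config, save_directory="quantized_model")
```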

@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)


## IPEX
- To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+ To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint and apply IPEX operator optimizations (supported operators are replaced with customized IPEX operators).
```diff
from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
@@ -168,7 +168,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with

model_id = "gpt2"
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
53 changes: 0 additions & 53 deletions docker/Dockerfile.intel

This file was deleted.

7 changes: 4 additions & 3 deletions docs/source/ipex/inference.mdx
@@ -14,8 +14,8 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m

## Loading

- You can load your model and apply IPEX optimizations (including weight prepacking and graph mode). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
- For now, support is only enabled for CPUs and the original model will be exported via TorchScript. In the future `torch.compile` will be used and model exported via TorchScript will get deprecated.
+ You can load your model and apply IPEX optimizations (`torch.compile` is applied, except for text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+ For now, support is enabled for Intel CPUs and GPUs. Models previously converted to TorchScript will be deprecated in v1.22.

```diff
import torch
@@ -25,7 +25,7 @@ For now, support is only enabled for CPUs and the original model will be exporte

model_id = "gpt2"
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
@@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au
| `IPEXModelForMaskedLM` | `fill-mask` |
| `IPEXModelForAudioClassification` | `audio-classification` |
| `IPEXModelForCausalLM` | `text-generation` |
+ | `IPEXModelForSeq2SeqLM` | `text2text-generation` |
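
A minimal usage sketch for the newly listed `IPEXModelForSeq2SeqLM` class; the checkpoint and prompt are illustrative assumptions rather than part of this change.

```python
# Minimal sketch, assuming a T5-style checkpoint: IPEXModelForSeq2SeqLM is used
# as a drop-in replacement for the corresponding transformers Auto class.
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForSeq2SeqLM

model_id = "t5-small"  # example checkpoint, an assumption for illustration
model = IPEXModelForSeq2SeqLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print(pipe("translate English to French: The house is wonderful."))
```
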
1 change: 1 addition & 0 deletions docs/source/ipex/models.mdx
@@ -40,6 +40,7 @@ Here is the list of the supported architectures :
- Roberta
- Roformer
- SqueezeBert
+ - T5
- UniSpeech
- Vit
- Wav2Vec2