From cfa1c49105fba44facf310fb8080373b3762b83e Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Fri, 7 Jun 2024 20:51:06 +0200
Subject: [PATCH] Fix compatibility with transformers < v4.39.0 release (#754)

* fix compatibility with transformers < v4.38.0

* add minimum supported transformers version in test

* update setup

* model id fixes

* fix model id

* fix mixtral patcher
---
 .github/workflows/test_openvino.yml         |  2 ++
 optimum/exporters/openvino/model_patcher.py | 10 +++++---
 optimum/intel/openvino/modeling_decoder.py  | 10 +++++---
 setup.py                                    |  2 +-
 tests/openvino/test_modeling.py             | 27 ++++++++++++---------
 tests/openvino/test_quantization.py         |  6 +++--
 tests/openvino/test_training.py             |  3 +++
 tests/openvino/utils_tests.py               |  2 +-
 8 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index a0bc916c75..37cf81fecc 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -21,6 +21,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.12"]
+        transformers-version: ["4.36.0", "4.41.*"]
         os: [ubuntu-latest]
 
     runs-on: ${{ matrix.os }}
@@ -35,6 +36,7 @@ jobs:
           python -m pip install --upgrade pip
           # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install transformers==${{ matrix.transformers-version }}
          pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
       - name: Test with Pytest
         run: |
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index 0265b3a5fc..9138bed02c 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -20,7 +20,6 @@
 
 import torch
 import torch.nn.functional as F
-from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.utils import is_tf_available
 
@@ -36,6 +35,7 @@
 
 if TYPE_CHECKING:
+    from transformers.cache_utils import Cache
     from transformers.modeling_utils import PreTrainedModel
 
     from optimum.exporters.onnx.config import OnnxConfig
 
@@ -131,7 +131,10 @@ def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torc
         # the current expert. We need to make sure to multiply the output hidden
         # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
         current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
-        current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+        if is_transformers_version("<", "4.37.0"):
+            current_hidden_states = expert_layer(current_state, routing_weights[top_x, idx, None])
+        else:
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
 
         final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
     final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
@@ -1656,9 +1659,10 @@ def _dbrx_update_causal_mask_latest(
     attention_mask: torch.Tensor,
     input_tensor: torch.Tensor,
     cache_position: torch.Tensor,
-    past_key_values: Cache,
+    past_key_values: "Cache",
     output_attentions: bool,
 ):
+    from transformers.cache_utils import StaticCache
     from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
     # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index b504d6482c..d7f8929aa8 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -38,7 +38,7 @@
 
 from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful
 from ...exporters.openvino.stateful import model_has_state
-from ..utils.import_utils import is_nncf_available
+from ..utils.import_utils import is_nncf_available, is_transformers_version
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
 from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
@@ -633,8 +633,12 @@ def generate(
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
-        _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs)
-        generation_mode = _generation_config.get_generation_mode(assistant_model)
+        if is_transformers_version(">=", "4.39.0"):
+            _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs)
+            generation_mode = _generation_config.get_generation_mode(assistant_model)
+        else:
+            _generation_config = generation_config or self.generation_config
+            generation_mode = self._get_generation_mode(_generation_config, assistant_model)
 
         is_beam_search = generation_mode in [
             GenerationMode.BEAM_SEARCH,
diff --git a/setup.py b/setup.py
index b8869f46ac..4eb89c2a3d 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
 
 TESTS_REQUIRE = [
     "accelerate",
-    "pytest<8.2",
+    "pytest>=7.2.0,<8.0.0",
     "parameterized",
     "Pillow",
     "evaluate",
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 0cb332276c..262a5396f5 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -81,7 +81,7 @@
 from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder
 from optimum.intel.openvino.modeling_timm import TimmImageProcessor
 from optimum.intel.openvino.utils import _print_compiled_model_properties
-from optimum.intel.utils.import_utils import is_openvino_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from optimum.utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
     DIFFUSION_MODEL_UNET_SUBFOLDER,
@@ -528,8 +528,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "chatglm",
         "codegen",
         "codegen2",
-        # "data2vec-text", # TODO : enable when enabled in exporters
-        "gemma",
         "gpt2",
         "gpt_neo",
         "gpt_neox",
@@ -540,15 +538,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "mistral",
         "mixtral",
         "mpt",
-        "olmo",
         "opt",
         "pegasus",
         "qwen",
-        "qwen2",
-        "stablelm",
-        "starcoder2",
         "phi",
-        "phi3",
         "internlm2",
         "orion",
         "falcon",
@@ -556,15 +549,26 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "persimmon",
         "biogpt",
         "gpt_neox_japanese",
-        "cohere",
         "xglm",
         "aquila",
         "aquila2",
         "xverse",
         "internlm",
-        "dbrx",
-        "qwen2-moe",
     )
+
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES += (
+            "gemma",
+            "olmo",
+            "stablelm",
+            "starcoder2",
+            "dbrx",
+            "phi3",
+            "cohere",
+            "qwen2",
+            "qwen2-moe",
+        )
+
     GENERATION_LENGTH = 100
     REMOTE_CODE_MODELS = (
         "chatglm",
@@ -575,7 +579,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "qwen",
         "internlm2",
         "orion",
-        "phi3",
         "aquila",
         "aquila2",
         "xverse",
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index b7ed36d3e6..bae0ad772f 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -63,7 +63,7 @@
 
 from optimum.intel.openvino.configuration import OVQuantizationMethod, OVQuantizationConfigBase
 from optimum.intel.openvino.quantization import InferRequestWrapper
-from optimum.intel.utils.import_utils import is_openvino_version
+from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from utils_tests import MODEL_NAMES, get_num_quantized_nodes, _ARCHITECTURES_TO_EXPECTED_INT8
 
 _TASK_TO_DATASET = {
@@ -89,6 +89,9 @@ def test_automodel_static_quantization(self, model_cls, model_name, expected_fak
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         file_name = "openvino_quantized_model.xml"
 
+        if model_name == "bert" and is_transformers_version("<", "4.41.0"):
+            expected_fake_quantize = 32
+
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
 
@@ -114,7 +117,6 @@ def preprocess_function(examples, tokenizer):
                 ov_config=ov_config,
             )
             model = model_cls.from_pretrained(tmp_dir, file_name=file_name)
-
             num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model)
             self.assertEqual(expected_fake_quantize, num_fake_quantize)
             self.assertEqual(expected_int8, num_int8)
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 89d644319c..639a77b4a6 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -54,6 +54,7 @@
 )
 from optimum.intel.openvino.trainer import DEFAULT_QUANTIZATION_CONFIG, OVTrainer
 from optimum.intel.openvino.utils import OV_XML_FILE_NAME
+from optimum.intel.utils.import_utils import is_transformers_version
 
 
 F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}
@@ -463,6 +464,7 @@ class OVTrainerTextClassificationTrainingTest(OVTrainerBaseTrainingTest):
     task = "sequence-classification"
 
     @parameterized.expand(OVTRAINER_TEXT_CLASSIFICATION_TEST_DESCRIPTORS.items())
+    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
@@ -611,6 +613,7 @@ class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest):
     task = "image-classification"
 
     @parameterized.expand(OVTRAINER_IMAGE_CLASSIFICATION_TEST_DESCRIPTORS.items())
+    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index f19c248f77..a92af8c0a9 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -89,7 +89,7 @@
     "persimmon": "hf-internal-testing/tiny-random-PersimmonForCausalLM",
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     "phi": "echarlaix/tiny-random-PhiForCausalLM",
-    "phi3": "katuni4ka/tiny-random-phi3",
+    "phi3": "Xenova/tiny-random-Phi3ForCausalLM",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
     "qwen": "katuni4ka/tiny-random-qwen",
     "qwen2": "fxmarty/tiny-dummy-qwen2",
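
Editor's note: the recurring pattern in this patch is gating each fix at runtime on the installed transformers version via `is_transformers_version`. Below is a minimal sketch of how such a helper can be built with `packaging`; it is a hypothetical re-implementation for illustration, not the actual code of `optimum.intel.utils.import_utils.is_transformers_version`, which may differ in detail.

import operator

import transformers
from packaging import version

# Map comparison strings to the corresponding operator functions.
_OPS = {"<": operator.lt, "<=": operator.le, "==": operator.eq, ">=": operator.ge, ">": operator.gt}


def is_transformers_version(comparison: str, reference: str) -> bool:
    """Compare the installed transformers version against a reference string."""
    installed = version.parse(transformers.__version__)
    return _OPS[comparison](installed, version.parse(reference))


# Usage mirroring the modeling_decoder.py hunk: prefer the API introduced in
# transformers 4.39, fall back to the legacy method otherwise.
if is_transformers_version(">=", "4.39.0"):
    pass  # new path, e.g. self._prepare_generation_config(...)
else:
    pass  # legacy path, e.g. self._get_generation_mode(...)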
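The model_patcher.py hunks additionally rely on deferring imports of classes that only exist in newer transformers releases (`Cache`, `StaticCache`): the module-level import is replaced by a TYPE_CHECKING-only import plus a runtime import inside the function that needs the class. A sketch of that pattern with a hypothetical function name, not the module's actual code:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only, never executed at runtime, so older
    # transformers releases without cache_utils still import this module.
    from transformers.cache_utils import Cache


def _uses_static_cache(past_key_values: "Cache") -> bool:
    # Deferred runtime import: evaluated only when this code path runs,
    # i.e. on transformers versions recent enough to ship StaticCache.
    from transformers.cache_utils import StaticCache

    return isinstance(past_key_values, StaticCache)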