Skip to content

Commit

Permalink
Restore SDPA in Gemma2 models for transformers > 4.45 (#976)
Browse files Browse the repository at this point in the history
* Restore SDPA in Gemma2 models for transformers > 4.45

* Update tests/openvino/test_modeling.py

* Update tests/openvino/test_modeling.py
  • Loading branch information
eaidova authored Oct 28, 2024
1 parent 635f939 commit 936d272
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 0 deletions.
20 changes: 20 additions & 0 deletions optimum/exporters/openvino/model_patcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2505,6 +2505,26 @@ def patched_forward(*args, **kwargs):

self.patched_forward = patched_forward

def __enter__(self):
    """Enter the patching context and switch eager Gemma2 attention to SDPA.

    After the parent patcher has been entered, and only when the installed
    transformers version is >= 4.45.0, every decoder layer whose
    ``self_attn`` is an instance of the eager Gemma2 attention class gets
    its ``forward`` replaced by the SDPA attention class's ``forward``,
    bound to that same module. The previous ``forward`` is kept on the
    module as ``_orig_forward`` so that it can be put back on exit.
    """
    super().__enter__()
    if not is_transformers_version(">=", "4.45.0"):
        return

    # Imported lazily: this lookup is only attempted on transformers
    # versions that passed the check above.
    from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES

    sdpa_cls = GEMMA2_ATTENTION_CLASSES["sdpa"]
    eager_cls = GEMMA2_ATTENTION_CLASSES["eager"]

    for decoder_layer in self._model.model.layers:
        attention = decoder_layer.self_attn
        if not isinstance(attention, eager_cls):
            continue
        # Remember the current forward before overriding it on the instance.
        attention._orig_forward = attention.forward
        attention.forward = types.MethodType(sdpa_cls.forward, attention)

def __exit__(self, exc_type, exc_value, traceback):
    """Leave the patching context and restore the saved attention forwards.

    The parent patcher is exited first. Then, when the installed
    transformers version is >= 4.45.0, every decoder layer whose
    ``self_attn`` carries an ``_orig_forward`` attribute has that value
    assigned back to ``forward``.

    Args:
        exc_type: Exception type passed by the ``with`` statement, or None.
        exc_value: Exception instance passed by the ``with`` statement, or None.
        traceback: Traceback passed by the ``with`` statement, or None.
    """
    super().__exit__(exc_type, exc_value, traceback)
    if not is_transformers_version(">=", "4.45.0"):
        return

    for decoder_layer in self._model.model.layers:
        attention = decoder_layer.self_attn
        # Only modules that have a saved forward are touched.
        if hasattr(attention, "_orig_forward"):
            attention.forward = attention._orig_forward


def _decilm_attn_forward(
self,
Expand Down
8 changes: 8 additions & 0 deletions tests/openvino/test_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,10 @@ def test_compare_to_transformers(self, model_arch):
if model_arch in self.REMOTE_CODE_MODELS:
model_kwargs = {"trust_remote_code": True}

# Starting from transformers 4.45.0, gemma2 uses eager attention by default, while the OpenVINO model uses SDPA
if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
model_kwargs["attn_implementation"] = "sdpa"

ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
self.assertIsInstance(ov_model.config, PretrainedConfig)
self.assertTrue(ov_model.use_cache)
Expand Down Expand Up @@ -1094,6 +1098,10 @@ def test_beam_search(self, model_arch):
"config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
"trust_remote_code": True,
}

# Starting from transformers 4.45.0, gemma2 uses eager attention by default, while the OpenVINO model uses SDPA
if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
model_kwargs["attn_implementation"] = "sdpa"
# The Qwen tokenizer does not support padding; the chatglm and glm4 testing models produce NaNs, which are incompatible with beam search
if model_arch in ["qwen", "chatglm", "glm4"]:
return
Expand Down

0 comments on commit 936d272

Please sign in to comment.