From 82d01766ea590a77a5e6f9113e7920f7c48425c5 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Wed, 13 Nov 2024 19:58:09 +0400
Subject: [PATCH] update after rebase

---
 optimum/intel/openvino/modeling_visual_language.py | 10 +++-------
 tests/openvino/test_modeling.py                    | 14 +++++++++++---
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index d6987b2a39..73353d1d02 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -305,7 +305,7 @@ def __init__(
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only and enable_compilation,
+            compile=self._compile_only or enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -648,11 +648,8 @@ def forward(
         position_ids=None,
         image_bound=None,
         tgt_sizes=None,
-        images=None,
         **kwargs,
     ):
-        if pixel_values is None and images is not None:
-            pixel_values = images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -756,7 +753,6 @@ def prepare_inputs_for_generation(
                 "image_sizes": image_sizes,
                 "image_bound": kwargs.get("image_bound"),
                 "tgt_sizes": kwargs.get("tgt_sizes"),
-                "images": kwargs.get("images"),
             }
         )
         return model_inputs
@@ -780,7 +776,7 @@ def preprocess_inputs(
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
     auto_model_class = LlavaForConditionalGeneration
-    
+
     def __init__(
         self,
         language_model: ov.Model,
@@ -1849,7 +1845,7 @@ def preprocess_inputs(
             attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
         if image is not None:
-            result["images"] = torch.unsqueeze(processor(images=image, return_tensors="pt")["pixel_values"][0], 0)
+            result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"]
         return result
 
 
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index f2acf464db..b71d2ebe01 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import gc
 import os
 import tempfile
@@ -1997,6 +1998,7 @@ def test_compare_to_transformers(self, model_arch):
             self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
+        transformers_inputs = copy.deepcopy(inputs)
         ov_model.to("AUTO")
         self.assertTrue("AUTO" in ov_model._device)
         self.assertTrue("AUTO" in ov_model.vision_embeddings._device)
@@ -2029,11 +2031,17 @@ def test_compare_to_transformers(self, model_arch):
             self.assertTrue("CPU" in getattr(ov_model, additional_part)._device)
             self.assertTrue(getattr(ov_model, additional_part).request is None)
 
+        # nanollava's pixel_values input is named "images"
+        if model_arch == "nanollava":
+            pixel_values = transformers_inputs.pop("pixel_values", None)
+            transformers_inputs["images"] = pixel_values
         # pytorch minicpmv is not designed to be used via forward
-        if model_arch in ["minicpmv", "internvl2"]:
+        if model_arch not in ["minicpmv", "internvl2"]:
+            set_seed(SEED)
+            ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
-                transformers_outputs = transformers_model(**inputs)
+                transformers_outputs = transformers_model(**transformers_inputs)
             self.assertTrue(
                 torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
                 f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
@@ -2053,7 +2061,7 @@ def test_compare_to_transformers(self, model_arch):
 
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
         with torch.no_grad():
-            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+            transformers_outputs = transformers_model.generate(**transformers_inputs, generation_config=gen_config)
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
         if model_arch in ["minicpmv", "internvl2"]:
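
Note (illustrative, not part of the patch): a minimal standalone sketch of the
key-rename pattern the test above relies on. After this patch the OpenVINO
model consumes the image tensor as "pixel_values", while the original
nanollava PyTorch model still expects it as "images"; the dict contents and
tensor shapes below are placeholders.

    import copy

    import torch

    # Inputs shaped like the output of preprocess_inputs() after this patch:
    # the image tensor is keyed "pixel_values" (shape is a placeholder).
    inputs = {
        "input_ids": torch.tensor([[1, 2, 3]]),
        "attention_mask": torch.ones(1, 3, dtype=torch.int64),
        "pixel_values": torch.rand(1, 3, 384, 384),
    }

    # Deep-copy so the OpenVINO inputs stay untouched, then move the tensor
    # under the "images" key for the reference PyTorch model, as the test does.
    transformers_inputs = copy.deepcopy(inputs)
    transformers_inputs["images"] = transformers_inputs.pop("pixel_values")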