diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 1c0e35cca2..813b9ace2e 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -695,8 +695,10 @@ def forward( image_grid_thw=None, video_grid_thw=None, rope_deltas=None, + images=None, **kwargs, ): + pixel_values = pixel_values if pixel_values is not None else images inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( input_ids, pixel_values, @@ -794,6 +796,9 @@ def prepare_inputs_for_generation( else: model_inputs = {"input_ids": input_ids} + if pixel_values is None: + pixel_values = kwargs.get("images") + model_inputs.update( { "position_ids": position_ids, @@ -1907,7 +1912,7 @@ def preprocess_inputs( attention_mask = torch.ones_like(input_ids, dtype=torch.int64) result = {"input_ids": input_ids, "attention_mask": attention_mask} if image is not None: - result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"] + result["images"] = processor(images=[image], return_tensors="pt")["pixel_values"] return result diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 57d4b64764..097f20991a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -2182,11 +2182,7 @@ def test_compare_to_transformers(self, model_arch): ov_model.clear_requests() self._check_device_and_request(ov_model, test_device, False) - # nanollava pixel_values input named as images - if model_arch == "nanollava": - pixel_values = transformers_inputs.pop("pixel_values", None) - transformers_inputs["images"] = pixel_values - # pytorch minicpmv is not designed to be used via forward + # pytorch minicpmv and internvl2 are not designed to be used via forward if model_arch not in ["minicpmv", "internvl2"]: set_seed(SEED) ov_outputs = ov_model(**inputs)