From 040ee1270f0c38d953ee5c2a00281b1d3046ddbd Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Fri, 15 Nov 2024 10:03:08 +0400
Subject: [PATCH] fix device selection for compilation language model in vlm and model saving (#967)

* fix device selection for compilation language model in vlm

* add more tests

* extend tests for vlm

* update tests

* update after rebase

* disable test for old transformers

* Update tests/openvino/test_modeling.py

* fix typo

* Apply suggestions from code review

Co-authored-by: Nikita Savelyev

* add components

* fix after rebase

* reuse test image

* Update tests/openvino/test_modeling.py

Co-authored-by: Nikita Savelyev

---------

Co-authored-by: Nikita Savelyev
---
 optimum/intel/openvino/modeling_base.py       |   5 +-
 .../openvino/modeling_visual_language.py      | 150 ++++++++++++------
 optimum/intel/openvino/utils.py               |   3 +
 tests/openvino/test_export.py                 |  17 +-
 tests/openvino/test_exporters_cli.py          |   3 +
 tests/openvino/test_modeling.py               | 150 +++++++++++++++---
 tests/openvino/utils_tests.py                 |   2 +
 7 files changed, 255 insertions(+), 75 deletions(-)

diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 320d77c4ca..8e936e09cf 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -782,7 +782,7 @@ def __init__(
             for inputs in self.model.inputs
         }
         self.ov_config = ov_config or {**self.parent_model.ov_config}
-        self.request = None
+        self.request = None if not self.parent_model._compile_only else self.model
         self._model_name = model_name
         self.config = self.parent_model.config
         self._model_dir = Path(model_dir or parent_model._model_save_dir)
@@ -832,3 +832,6 @@ def __call__(self, *args, **kwargs):
 
     def forward(self, *args, **kwargs):
         raise NotImplementedError
+
+    def clear_requests(self):
+        self.request = None
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index 35d91488d4..e438d69e85 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -16,6 +16,7 @@
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
+    AutoModelForCausalLM,
     GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
@@ -30,7 +31,23 @@
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
-from .utils import TemporaryDirectory
+from .utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+)
+
+
+try:
+    from transformers import LlavaForConditionalGeneration
+except ImportError:
+    LlavaForConditionalGeneration = None
+
+try:
+    from transformers import LlavaNextForConditionalGeneration
+except ImportError:
+    LlavaNextForConditionalGeneration = None
 
 
 logger = logging.getLogger(__name__)
@@ -67,13 +84,19 @@ def __init__(
     def compile(self):
         if self.request is None:
             logger.info(f"Compiling the Language model to {self._device} ...")
-            self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request()
+            super().compile()
         self._compile_text_emb()
 
     def _compile_text_emb(self):
         if self.text_emb_request is None:
             logger.info(f"Compiling the Text embeddings model to {self._device} ...")
-            self.text_emb_request = core.compile_model(self.text_emb_model, self._device, self.ov_config)
+            if self._compile_only:
+                self.text_emb_request = self.text_emb_model
+            else:
+                logger.info(f"Compiling the Text embeddings model to {self._device} ...")
+                self.text_emb_request = self._compile_model(
+                    self.text_emb_model, self._device, self.ov_config, self.model_save_dir
+                )
 
     def clear_requests(self):
         if self._compile_only:
@@ -238,12 +261,18 @@ def forward(self, img_features):
         return self.request(img_features)[0]
 
 
-MODEL_PARTS_CLS_MAPPING = {"resampler": OVResampler, "vision_projection": OVVisionProjection}
+MODEL_PARTS_CLS_MAPPING = {
+    "resampler": OVResampler,
+    "language_model": OVModelWithEmbedForCausalLM,
+    "vision_embeddings": OVVisionEmbedding,
+    "vision_projection": OVVisionProjection,
+}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
     export_feature = "image-text-to-text"
     additional_parts = []
+    auto_model_class = AutoModelForCausalLM
 
     def __init__(
         self,
@@ -285,11 +314,11 @@ def __init__(
             self.lm_model,
             self.text_embeddings_model,
             config=config,
-            deivce=device,
+            device=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only and enable_compilation,
+            compile=self._compile_only or enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -315,19 +344,15 @@ def clear_requests(self):
                 "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        self.language_model.clear_requests()
-        components = [self.vision_embeddings] + [getattr(self, part) for part in self.additional_parts]
-        for component in components:
-            if component is not None:
-                component.request = None
+        for _, component in self.components.items():
+            component.clear_requests()
 
     def compile(self):
-        self.language_model.compile()
-        self.vision_embeddings._compile()
-        for part in self.additional_parts:
-            part_model = getattr(self, part, None)
-            if part_model is not None:
-                part_model._compile()
+        for _, component in self.components.items():
+            if isinstance(component, OVModelPart):
+                component._compile()
+            else:
+                component.compile()
 
     def _save_config(self, save_directory):
         """
@@ -345,21 +370,21 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embeddings_model, self.vision_embeddings_model]
-        dst_file_names = [
-            "openvino_language_model.xml",
-            "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings_model.xml",
-        ]
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                src_files.append(model)
-                dst_file_names.append(f"openvino_{part}_model.xml")
+        src_models = self.submodels
+        dst_file_names = {
+            "lm_model": OV_LANGUAGE_MODEL_NAME,
+            "text_embeddings_model": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_model": OV_VISION_EMBEDDINGS_MODEL_NAME,
+        }
+        for name in self._submodel_names:
+            if name not in dst_file_names:
+                dst_file_names[name] = f"openvino_{name}.xml"
 
-        for src_file, dst_file_name in zip(src_files, dst_file_names):
+        for name in self._submodel_names:
+            model = src_models[name]
+            dst_file_name = dst_file_names[name]
             dst_path = os.path.join(save_directory, dst_file_name)
-            ov.save_model(src_file, dst_path, compress_to_fp16=False)
+            ov.save_model(model, dst_path, compress_to_fp16=False)
 
         self._save_openvino_config(save_directory)
         if self.generation_config is not None:
@@ -429,14 +454,18 @@ def _from_pretrained(
             token = use_auth_token
 
         model_file_names = {
-            "language_model": "openvino_language_model.xml",
-            "text_embeddings": "openvino_text_embeddings_model.xml",
-            "vision_embeddings": "openvino_vision_embeddings_model.xml",
+            "language_model": OV_LANGUAGE_MODEL_NAME,
+            "language_model_bin": OV_LANGUAGE_MODEL_NAME.replace(".xml", ".bin"),
+            "text_embeddings": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "text_embeddings_bin": OV_TEXT_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
+            "vision_embeddings": OV_VISION_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_bin": OV_VISION_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
         }
 
         model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
+            model_file_names[part + "_bin"] = f"openvino_{part}_model.bin"
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
             # Load model from a local directory
@@ -593,6 +622,28 @@ def _from_transformers(
             **kwargs,
         )
 
+    @property
+    def _component_names(self):
+        base_components = ["language_model", "vision_embeddings"]
+        additional_components = [part for part in self.additional_parts if getattr(self, part, None) is not None]
+        return base_components + additional_components
+
+    @property
+    def components(self):
+        return {component_name: getattr(self, component_name) for component_name in self._component_names}
+
+    @property
+    def _submodel_names(self):
+        model_names = ["lm_model", "text_embeddings_model", "vision_embeddings_model"]
+        for part in self.additional_parts:
+            if getattr(self, part, None) is not None:
+                model_names.append(part + "_model")
+        return model_names
+
+    @property
+    def submodels(self):
+        return {submodel_name: getattr(self, submodel_name) for submodel_name in self._submodel_names}
+
     def reshape(self, batch_size: int, sequence_length: int):
         logger.warning("Static shapes are not supported for causal language model.")
         return self
@@ -601,17 +652,14 @@ def half(self):
         """
         Converts all the model weights to FP16 for more efficient inference on GPU.
         """
-        apply_moc_transformations(self.lm_model, cf=False)
-        compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embeddings_model, cf=False)
-        compress_model_transformation(self.text_embeddings_model)
-        apply_moc_transformations(self.vision_embeddings_model, cf=False)
-        compress_model_transformation(self.vision_embeddings_model)
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                apply_moc_transformations(model, cf=False)
-                compress_model_transformation(model)
+        for _, submodel in self.submodels.items():
+            apply_moc_transformations(submodel, cf=False)
+            compress_model_transformation(submodel)
+        return self
+
+    def to(self, device):
+        self.language_model.to(device)
+        super().to(device)
         return self
 
     def forward(
@@ -625,11 +673,8 @@ def forward(
         position_ids=None,
         image_bound=None,
         tgt_sizes=None,
-        images=None,
         **kwargs,
     ):
-        if pixel_values is None and images is not None:
-            pixel_values = images
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
             input_ids,
             pixel_values,
@@ -733,7 +778,6 @@ def prepare_inputs_for_generation(
                 "image_sizes": image_sizes,
                 "image_bound": kwargs.get("image_bound"),
                 "tgt_sizes": kwargs.get("tgt_sizes"),
-                "images": kwargs.get("images"),
             }
         )
         return model_inputs
@@ -756,6 +800,8 @@ def preprocess_inputs(
 
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
+    auto_model_class = LlavaForConditionalGeneration
+
     def __init__(
         self,
         language_model: ov.Model,
@@ -941,6 +987,8 @@ def preprocess_inputs(
 
 
 class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM):
+    auto_model_class = LlavaNextForConditionalGeneration
+
     # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         from transformers.models.llava_next.modeling_llava_next import get_anyres_image_grid_shape, unpad_image
@@ -1211,7 +1259,7 @@ def get_text_embeddings(self, input_ids, **kwargs):
         return super().get_text_embeddings(for_inputs_embeds_ids, **kwargs)
 
 
-class _OvInternVLForCausalLM(OVModelForVisualCausalLM):
+class _OVInternVLForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
@@ -1822,7 +1870,7 @@ def preprocess_inputs(
         attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
         result = {"input_ids": input_ids, "attention_mask": attention_mask}
         if image is not None:
-            result["images"] = torch.unsqueeze(processor(images=image, return_tensors="pt")["pixel_values"][0], 0)
+            result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"]
         return result
 
 
@@ -1979,8 +2027,8 @@ def preprocess_inputs(
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
-    "internvl_chat": _OvInternVLForCausalLM,
     "minicpmv": _OVMiniCPMVForCausalLM,
     "llava-qwen2": _OVNanoLlavaForCausalLM,
     "phi3_v": _OVPhi3VisionForCausalLM,
+    "internvl_chat": _OVInternVLForCausalLM,
 }
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 68458c85bf..cf5060f420 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -42,6 +42,9 @@
 OV_ENCODER_NAME = "openvino_encoder_model.xml"
 OV_DECODER_NAME = "openvino_decoder_model.xml"
 OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml"
+OV_TEXT_EMBEDDINGS_MODEL_NAME = "openvino_text_embeddings_model.xml"
+OV_LANGUAGE_MODEL_NAME = "openvino_language_model.xml"
+OV_VISION_EMBEDDINGS_MODEL_NAME = "openvino_vision_embeddings_model.xml"
 
 OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml"
 OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml"
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index 80a020d2bd..4c42f8a337 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -41,12 +41,14 @@
     OVModelForSequenceClassification,
     OVModelForSpeechSeq2Seq,
     OVModelForTokenClassification,
+    OVModelForVisualCausalLM,
     OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLImg2ImgPipeline,
     OVStableDiffusionXLPipeline,
 )
 from optimum.intel.openvino.modeling_base import OVBaseModel
+from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING
 from optimum.intel.openvino.utils import TemporaryDirectory
 from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
@@ -70,12 +72,13 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-xl": OVStableDiffusionXLPipeline,
         "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline,
         "latent-consistency": OVLatentConsistencyModelPipeline,
+        "llava": OVModelForVisualCausalLM,
     }
 
     if is_transformers_version(">=", "4.45"):
         SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline})
 
-    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper")
+    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava")
 
     def _openvino_export(
         self,
@@ -93,6 +96,10 @@ def _openvino_export(
             model_class = TasksManager.get_model_class_for_task(task, library=library_name)
             model = model_class(f"hf_hub:{model_name}", pretrained=True, exportable=True)
             TasksManager.standardize_model_attributes(model_name, model, library_name=library_name)
+        elif model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
         else:
             model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
@@ -144,8 +151,12 @@ def test_export_with_custom_gen_config(self, model_type):
         task = auto_model.export_feature
         model_name = MODEL_NAMES[model_type]
         loading_kwargs = {"attn_implementation": "eager"} if model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED else {}
-
-        model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
+        if model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
+        else:
+            model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
         model.generation_config.top_k = 42
         model.generation_config.do_sample = True
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index be73b68152..f218fa05ba 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -93,6 +93,7 @@ class OVCLIExportTestCase(unittest.TestCase):
         "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") else 0,
         "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") else 2,
         "flux": 4 if is_tokenizers_version("<", "0.20") else 0,
+        "llava": 2 if is_tokenizers_version("<", "0.20") else 0,
     }
 
     SUPPORTED_SD_HYBRID_ARCHITECTURES = [
@@ -244,6 +245,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
         elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"):
             models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+        elif task.startswith("image-text-to-text"):
+            models = [model.language_model, model.vision_embeddings]
         else:
             models = [model]
 
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index d9921e91ec..a9d0600e50 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import gc
 import os
 import tempfile
@@ -61,7 +62,7 @@
 )
 from transformers.onnx.utils import get_preprocessor
 from transformers.testing_utils import slow
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
 
 from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
 from optimum.intel import (
@@ -93,10 +94,14 @@
 from optimum.intel.openvino.modeling_visual_language import (
     MODEL_PARTS_CLS_MAPPING,
     MODEL_TYPE_TO_CLS_MAPPING,
-    OVModelWithEmbedForCausalLM,
-    OVVisionEmbedding,
 )
-from optimum.intel.openvino.utils import TemporaryDirectory, _print_compiled_model_properties
+from optimum.intel.openvino.utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+    _print_compiled_model_properties,
+)
 from optimum.intel.pipelines import pipeline as optimum_pipeline
 from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from optimum.intel.utils.modeling_utils import _find_files_matching_pattern
@@ -135,6 +140,7 @@ def __init__(self, *args, **kwargs):
         self.OV_DECODER_MODEL_ID = "helenai/gpt2-ov"
         self.OV_SEQ2SEQ_MODEL_ID = "echarlaix/t5-small-openvino"
         self.OV_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-stable-diffusion-openvino"
+        self.OV_VLM_MODEL_ID = "katuni4ka/tiny-random-llava-ov"
 
     def test_load_from_hub_and_save_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID)
@@ -223,6 +229,76 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         del model
         gc.collect()
 
+    @unittest.skipIf(
+        is_transformers_version("<", "4.45"),
+        "model tokenizer exported with tokenizers 0.20 is not compatible with old transformers",
+    )
+    def test_load_from_hub_and_save_visual_language_model(self):
+        model_id = self.OV_VLM_MODEL_ID
+        processor = get_preprocessor(model_id)
+        prompt = "<image>\n What is shown in this image?"
+        image = Image.open(
+            requests.get(
+                TEST_IMAGE_URL,
+                stream=True,
+            ).raw
+        )
+        loaded_model = OVModelForVisualCausalLM.from_pretrained(model_id)
+        self.assertIsInstance(loaded_model, MODEL_TYPE_TO_CLS_MAPPING[loaded_model.config.model_type])
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
+        self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component.model, ov.Model)
+            if component_name == "language_model":
+                self.assertEqual(component.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
+                self.assertIsInstance(component.text_emb_model, ov.Model)
+                self.assertEqual(component.text_emb_request.get_property("PERFORMANCE_HINT"), "LATENCY")
+            else:
+                self.assertEqual(component.request.get_property("PERFORMANCE_HINT"), "LATENCY")
+
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        set_seed(SEED)
+        loaded_model_outputs = loaded_model(**inputs)
+
+        with TemporaryDirectory() as tmpdirname:
+            loaded_model.save_pretrained(tmpdirname)
+            folder_contents = os.listdir(tmpdirname)
+            model_files = [
+                OV_LANGUAGE_MODEL_NAME,
+                OV_TEXT_EMBEDDINGS_MODEL_NAME,
+                OV_VISION_EMBEDDINGS_MODEL_NAME,
+            ]
+            model_files += [f"openvino_{part}_model.xml" for part in loaded_model.additional_parts]
+            for xml_file_name in model_files:
+                self.assertTrue(xml_file_name in folder_contents)
+                self.assertTrue(xml_file_name.replace(".xml", ".bin") in folder_contents)
+            model = OVModelForVisualCausalLM.from_pretrained(tmpdirname)
+            compile_only_model = OVModelForVisualCausalLM.from_pretrained(tmpdirname, compile_only=True)
+            for _, submodel in compile_only_model.submodels.items():
+                self.assertIsInstance(submodel, ov.runtime.CompiledModel)
+            for component_name, component in compile_only_model.components.items():
+                self.assertIsInstance(component.model, ov.runtime.CompiledModel)
+                if component_name == "language_model":
+                    self.assertIsInstance(component.request, ov.runtime.InferRequest)
+                    self.assertIsInstance(component.text_emb_model, ov.runtime.CompiledModel)
+                    self.assertIsInstance(component.text_emb_request, ov.runtime.CompiledModel)
+                else:
+                    self.assertIsInstance(component.request, ov.runtime.CompiledModel)
+
+            outputs = compile_only_model(**inputs)
+            self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+            del compile_only_model
+
+        outputs = model(**inputs)
+        self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+        del loaded_model
+        del model
+        gc.collect()
+
     def test_load_from_hub_and_save_seq2seq_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
@@ -1332,7 +1408,7 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         transformers_model = AutoModelForImageClassification.from_pretrained(model_id)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1358,7 +1434,7 @@ def test_pipeline(self, model_arch):
         model.eval()
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
         pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor)
-        inputs = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        inputs = TEST_IMAGE_URL
         outputs = pipe(inputs)
         self.assertEqual(pipe.device, model.device)
         self.assertGreaterEqual(outputs[0]["score"], 0.0)
@@ -1379,7 +1455,7 @@ def test_compare_to_timm(self, model_id):
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         timm_model = timm.create_model(model_id, pretrained=True)
         preprocessor = TimmImageProcessor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1886,7 +1962,7 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
 
     IMAGE = Image.open(
         requests.get(
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            TEST_IMAGE_URL,
             stream=True,
         ).raw
     )
@@ -1902,6 +1978,15 @@ def get_transformer_model_class(self, model_arch):
             return LlavaNextForConditionalGeneration
         return AutoModelForCausalLM
 
+    def _check_device_and_request(self, ov_model, expected_device, has_request):
+        request_check_fn = self.assertFalse if has_request else self.assertTrue
+        self.assertEqual(ov_model._device, expected_device)
+        for component_name, component in ov_model.components.items():
+            if component_name == "language_model":
+                request_check_fn(component.text_emb_request is None)
+            self.assertEqual(component._device, expected_device)
+            request_check_fn(component.request is None)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
         prompt = "What is shown in this image?"
@@ -1922,23 +2007,35 @@ def test_compare_to_transformers(self, model_arch):
         preprocessors = self.get_preprocessors(model_arch)
         set_seed(SEED)
         ov_model = OVModelForVisualCausalLM.from_pretrained(
-            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS, compile=False
         )
         self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
-        self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
-        self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
-        for additional_part in ov_model.additional_parts:
-            self.assertTrue(hasattr(ov_model, additional_part))
-            self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
+        for component_name, component in ov_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
-        # pytorch minicpmv and internvl are not designed to be used via forward
+        transformers_inputs = copy.deepcopy(inputs)
+        test_device = "AUTO"
+        ov_model.to(test_device)
+        self._check_device_and_request(ov_model, test_device, False)
+        test_device = "CPU"
+        ov_model.to(test_device)
+        ov_model.compile()
+        self._check_device_and_request(ov_model, test_device, True)
+        ov_model.clear_requests()
+        self._check_device_and_request(ov_model, test_device, False)
+
+        # nanollava pixel_values input named as images
+        if model_arch == "nanollava":
+            pixel_values = transformers_inputs.pop("pixel_values", None)
+            transformers_inputs["images"] = pixel_values
+        # pytorch minicpmv is not designed to be used via forward
         if model_arch not in ["minicpmv", "internvl2"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
             with torch.no_grad():
-                transformers_outputs = transformers_model(**inputs)
+                transformers_outputs = transformers_model(**transformers_inputs)
             self.assertTrue(
                 torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
                 f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
@@ -1958,7 +2055,7 @@ def test_compare_to_transformers(self, model_arch):
             ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
             set_seed(SEED)
             with torch.no_grad():
-                transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+                transformers_outputs = transformers_model.generate(**transformers_inputs, generation_config=gen_config)
 
             # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
             if model_arch in ["minicpmv", "internvl2"]:
@@ -2079,6 +2176,19 @@ def get_preprocessors(self, model_arch):
             preprocessors = {"processor": processor, "tokenizer": None}
         return preprocessors
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_model_can_be_loaded_after_saving(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        with TemporaryDirectory() as save_dir:
+            ov_model = OVModelForVisualCausalLM.from_pretrained(
+                model_id, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            ov_model.save_pretrained(save_dir)
+            ov_restored_model = OVModelForVisualCausalLM.from_pretrained(
+                save_dir, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            self.assertIsInstance(ov_restored_model, type(ov_model))
+
 
 class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ("whisper",)
@@ -2156,7 +2266,7 @@ class OVModelForVision2SeqIntegrationTest(unittest.TestCase):
     SPEEDUP_CACHE = 1.1
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2263,7 +2373,7 @@ class OVModelForCustomTasksIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES = ["vit-with-hidden-states"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2347,7 +2457,7 @@ class OVModelForOpenCLIPZeroShortImageClassificationTest(unittest.TestCase):
     OV_MODEL_ID_IR = MODEL_NAMES["open-clip-ov"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 394151cc3e..dde7bafd33 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -188,6 +188,8 @@
     "nanollava": (30, 30, 2),
 }
 
+TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
+
 
 def get_num_quantized_nodes(model):
     num_fake_quantize = 0
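
For reference, a minimal usage sketch (not part of the patch) of the behaviour the new tests exercise: the target device is now forwarded to the language-model component before compilation, requests can be cleared per component, and a visual-language model survives a save/reload round trip. It assumes the tiny `katuni4ka/tiny-random-llava-ov` checkpoint referenced in the tests above; any image-text-to-text OpenVINO model should behave the same way.

# Minimal sketch, assuming the tiny LLaVA test checkpoint used in the tests above.
from optimum.intel import OVModelForVisualCausalLM

model = OVModelForVisualCausalLM.from_pretrained("katuni4ka/tiny-random-llava-ov", compile=False)

model.to("AUTO")        # the device is propagated to the language_model component as well
model.compile()         # compiles the language model, text embeddings and vision embeddings
model.clear_requests()  # drops the compiled requests on every component

model.save_pretrained("ov_llava")  # writes openvino_language_model.xml, openvino_text_embeddings_model.xml, ...
restored = OVModelForVisualCausalLM.from_pretrained("ov_llava", compile=False)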