diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index cd337fdde7..e89fa9b449 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -315,31 +315,6 @@ def quantize(
         else:
             raise TypeError(f"Unsupported model type: {type(self.model)}")

-    def _check_model_state(self, sub_model_names: List[str] = None):
-        message_template = (
-            "Couldn't apply optimization to the model because it was already compressed with config: {}. "
-            "To avoid this issue, set load_in_8bit=False in the from_pretrained method when using the optimum-intel API, "
-            "or explicitly specify the desired weight format using --weight_format fp16/fp32 for CLI."
-        )
-
-        def check_rt_info(ov_model):
-            rt_info = ov_model.get_rt_info()
-            if "nncf" in rt_info:
-                model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
-                model_quantization_config = rt_info["nncf"].get("quantization", None)
-                if model_weight_compression_config is not None:
-                    raise RuntimeError(message_template.format(model_weight_compression_config))
-                elif model_quantization_config is not None:
-                    raise RuntimeError(message_template.format(model_quantization_config))
-
-        if sub_model_names is None:
-            check_rt_info(self.model.model)
-        else:
-            for name in sub_model_names:
-                if hasattr(self.model, name):
-                    ov_model = getattr(self.model, name).model
-                    check_rt_info(ov_model)
-
     def _quantize_ovbasemodel(
         self,
         ov_config: OVConfig,
@@ -350,7 +325,7 @@ def _quantize_ovbasemodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
-        from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper, OVModelForSeq2SeqLM
+        from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper
         from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM

         if is_diffusers_available():
@@ -429,7 +404,6 @@ def _quantize_ovbasemodel(
                         "text_encoder_2",
                         "text_encoder_3",
                     ]
-                    self._check_model_state(sub_model_names)
                     sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
                     for sub_model in sub_models:
                         _weight_only_quantization(sub_model.model, quantization_config_copy, **kwargs)
@@ -447,7 +421,6 @@ def _quantize_ovbasemodel(
                     self.model.clear_requests()
                 else:
                     # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc.
-                    self._check_model_state()
                     self.model.model = _hybrid_quantization(
                         self.model.model, quantization_config, calibration_dataset, **kwargs
                     )
@@ -463,31 +436,19 @@ def _quantize_ovbasemodel(
                     "transformer",
                     "text_encoder_3",
                 ]
-                self._check_model_state(sub_model_names)
                 sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
                 for sub_model in sub_models:
                     _weight_only_quantization(sub_model.model, quantization_config, **kwargs)
                 self.model.clear_requests()
             elif isinstance(self.model, OVModelForVisualCausalLM):
                 language_model = self.model.language_model
-                sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
-                self._check_model_state(sub_model_names + ["language_model"])
                 _weight_only_quantization(language_model.model, quantization_config, calibration_dataset, **kwargs)
+                sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
                 sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
                 for sub_model in sub_models:
                     _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True), **kwargs)
                 self.model.clear_requests()
-            elif isinstance(self.model, OVModelForSeq2SeqLM):
-                sub_model_names = ["encoder", "decoder"]
-                if self.model.decoder_with_past is not None:
-                    sub_model_names.append("decoder_with_past")
-                self._check_model_state(sub_model_names)
-                sub_models = [getattr(self.model, name) for name in sub_model_names]
-                for sub_model in sub_models:
-                    _weight_only_quantization(sub_model, quantization_config, **kwargs)
-                self.model.clear_requests()
             else:
-                self._check_model_state()
                 _weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs)
                 self.model.request = None
         else:
@@ -499,7 +460,6 @@ def _quantize_ovbasemodel(

             # Quantize model(s)
             if isinstance(self.model, _OVModelForWhisper):
-                self._check_model_state(["encoder_model", "decoder_model", "decoder_with_past_model"])
                 self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
             else:
                 quantized_model = _full_quantization(
@@ -1050,6 +1010,7 @@ def _weight_only_quantization(
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
     **kwargs,
 ) -> openvino.runtime.Model:
+    _verify_not_optimized(model)
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1106,6 +1067,7 @@ def _full_quantization(
     calibration_dataset: nncf.Dataset,
     **kwargs,
 ):
+    _verify_not_optimized(model)
     advanced_parameters_kwargs = {}
     if quantization_config.smooth_quant_alpha is not None:
         advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1227,3 +1189,20 @@ def _hybrid_quantization(
         **kwargs,
     )
     return quantized_model
+
+
+def _verify_not_optimized(ov_model):
+    message_template = (
+        "Cannot apply optimization to the model because it was already optimized with the following config: {}. "
+        "To avoid this issue, set load_in_8bit=False and do not pass a quantization_config when exporting with .from_pretrained(), "
+        "or explicitly specify the weight format with --weight_format fp16/fp32 when using the CLI."
+    )
+
+    rt_info = ov_model.get_rt_info()
+    if "nncf" in rt_info:
+        model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
+        model_quantization_config = rt_info["nncf"].get("quantization", None)
+        if model_weight_compression_config is not None:
+            raise RuntimeError(message_template.format(model_weight_compression_config))
+        elif model_quantization_config is not None:
+            raise RuntimeError(message_template.format(model_quantization_config))
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 9baedd223c..c4c0ff247d 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -214,7 +214,9 @@ def preprocess_function(examples, tokenizer):
             # Verify that the configuration is correctly saved and loaded
             loaded_config = OVConfig.from_pretrained(tmp_dir)
             self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
-            check_optimization_not_applicable_to_optimized_model(model, quantization_config=OVWeightQuantizationConfig(bits=8))
+            check_optimization_not_applicable_to_optimized_model(
+                model, quantization_config=OVWeightQuantizationConfig(bits=8)
+            )

     @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
     def test_ov_model_static_quantization_with_auto_dataset(
@@ -256,7 +258,6 @@ def test_ov_model_static_quantization_with_auto_dataset(
             self.assertTrue("logits" in outputs)
         else:
             raise Exception("Unexpected model class.")
-        check_optimization_not_applicable_to_optimized_model(ov_model, quantization_config=quantization_config)


 class OVWeightCompressionTest(unittest.TestCase):
@@ -747,7 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
             self.assertEqual(0, num_weight_nodes["int4"])

             model.save_pretrained(tmp_dir)
-        check_optimization_not_applicable_to_optimized_model(model, quantization_config=quantization_config)
+        check_optimization_not_applicable_to_optimized_model(model, quantization_config)

     def test_stable_diffusion_with_weight_compression(self):
         int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True)
@@ -762,8 +763,10 @@ def test_stable_diffusion_with_weight_compression(self):
         self.assertEqual(0, num_fake_nodes)
         self.assertEqual(242, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
-        quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID)
-        check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config=quantization_config)
+        quantization_config = OVWeightQuantizationConfig(
+            bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID
+        )
+        check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config)

     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
@@ -1338,5 +1341,8 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching):

 def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
     quantizer = OVQuantizer(model)
-    with pytest.raises(RuntimeError, match="Cannot apply optimization to the model because it was already optimized with the following config"):
-        quantizer.quantize(quantization_config=quantization_config)
+    with pytest.raises(
+        RuntimeError,
+        match="Cannot apply optimization to the model because it was already optimized with the following config",
+    ):
+        quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
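
Note (illustrative, not part of the patch): a minimal sketch of how the relocated check surfaces to users after this change, assuming a small causal LM is exported with 8-bit weight compression; the "gpt2" model id is only a placeholder.

# Sketch only: re-quantizing a model that was already compressed at export time
# should now fail inside _weight_only_quantization() via _verify_not_optimized(),
# which inspects the "nncf" section of the OpenVINO model's rt_info.
from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)  # placeholder model id
quantizer = OVQuantizer(model)
try:
    quantizer.quantize(ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=4)))
except RuntimeError as err:
    print(err)  # "Cannot apply optimization to the model because it was already optimized ..."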