Commit

Raise an error when OVQuantizer is invoked on a compressed model (#1122)

* Raise an error when OVQuantizer is invoked on an already compressed model

* Update tests

* Apply comments
l-bat authored Jan 27, 2025
1 parent 479577a commit 5b990b4
Showing 2 changed files with 44 additions and 0 deletions.
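
For orientation, a minimal sketch (not part of the diff) of the scenario this commit guards against, assuming a small causal LM checkpoint; the model name below is only illustrative:

# Sketch only: loading an already compressed model and trying to quantize it again.
# The checkpoint name is an assumption for illustration, not taken from the diff.
from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Exporting with load_in_8bit=True yields a model whose weights are already INT8-compressed.
model = OVModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", export=True, load_in_8bit=True)

quantizer = OVQuantizer(model)
# After this change, the call below raises RuntimeError instead of optimizing the model a second time.
quantizer.quantize(ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8)))
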
19 changes: 19 additions & 0 deletions optimum/intel/openvino/quantization.py
@@ -1010,6 +1010,7 @@ def _weight_only_quantization(
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
     **kwargs,
 ) -> openvino.runtime.Model:
+    _verify_not_optimized(model)
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1066,6 +1067,7 @@ def _full_quantization(
     calibration_dataset: nncf.Dataset,
     **kwargs,
 ):
+    _verify_not_optimized(model)
     advanced_parameters_kwargs = {}
     if quantization_config.smooth_quant_alpha is not None:
         advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1187,3 +1189,20 @@ def _hybrid_quantization(
         **kwargs,
     )
     return quantized_model
+
+
+def _verify_not_optimized(ov_model):
+    message_template = (
+        "Cannot apply optimization to the model because it was already optimized with the following config: {}. "
+        "To avoid this issue, check that you set load_in_8bit=False or not using quantization_config at export in the .from_pretrained(), "
+        "or explicitly specify weight format with --weight_format fp16/fp32 when using CLI."
+    )
+
+    rt_info = ov_model.get_rt_info()
+    if "nncf" in rt_info:
+        model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
+        model_quantization_config = rt_info["nncf"].get("quantization", None)
+        if model_weight_compression_config is not None:
+            raise RuntimeError(message_template.format(model_weight_compression_config))
+        elif model_quantization_config is not None:
+            raise RuntimeError(message_template.format(model_quantization_config))
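
As background (not part of the diff), the new check keys off the NNCF metadata that compressed models store in their runtime info; a minimal sketch of inspecting it directly, where the IR path is an assumption:

# Sketch only: reading the rt_info that _verify_not_optimized inspects.
# The IR file path is an assumption for illustration.
import openvino as ov

ov_model = ov.Core().read_model("openvino_model.xml")
rt_info = ov_model.get_rt_info()
if "nncf" in rt_info:
    # NNCF-compressed models record their optimization settings under the "nncf" key,
    # e.g. a "weight_compression" or "quantization" sub-entry.
    print(rt_info["nncf"])
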
25 changes: 25 additions & 0 deletions tests/openvino/test_quantization.py
@@ -214,6 +214,9 @@ def preprocess_function(examples, tokenizer):
             # Verify that the configuration is correctly saved and loaded
             loaded_config = OVConfig.from_pretrained(tmp_dir)
             self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
+            check_optimization_not_applicable_to_optimized_model(
+                model, quantization_config=OVWeightQuantizationConfig(bits=8)
+            )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
     def test_ov_model_static_quantization_with_auto_dataset(
@@ -718,6 +721,13 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
         else:
             models = [model]
 
+        if model_type == "open-clip":
+            pytest.skip(reason="ticket 161043")
+        elif model_type == "t5":
+            pytest.skip(reason="ticket 160958")
+        else:
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})
+
         expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
         for i, model in enumerate(models):
             _, num_weight_nodes = get_num_quantized_nodes(model)
@@ -738,6 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
             self.assertEqual(0, num_weight_nodes["int4"])
 
             model.save_pretrained(tmp_dir)
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config)
 
     def test_stable_diffusion_with_weight_compression(self):
         int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True)
@@ -752,6 +763,10 @@ def test_stable_diffusion_with_weight_compression(self):
         self.assertEqual(0, num_fake_nodes)
         self.assertEqual(242, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
+        quantization_config = OVWeightQuantizationConfig(
+            bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID
+        )
+        check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
@@ -797,6 +812,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
             if model_id == "facebook/opt-125m":
                 for key, value in self.DEFAULT_INT4_CONFIG.items():
                     self.assertEqual(value, getattr(openvino_config.quantization_config, key))
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
@@ -1321,3 +1337,12 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching):
         else:
             # Without caching, encoder hidden states tensors will be unique for each collected input
             self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2)
+
+
+def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
+    quantizer = OVQuantizer(model)
+    with pytest.raises(
+        RuntimeError,
+        match="Cannot apply optimization to the model because it was already optimized with the following config",
+    ):
+        quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
