apply comments
l-bat committed Jan 23, 2025

1 parent e51f426 commit 92facae
Showing 2 changed files with 34 additions and 49 deletions.
63 changes: 21 additions & 42 deletions optimum/intel/openvino/quantization.py
@@ -315,31 +315,6 @@ def quantize(
else:
raise TypeError(f"Unsupported model type: {type(self.model)}")

def _check_model_state(self, sub_model_names: List[str] = None):
message_template = (
"Couldn't apply optimization to the model because it was already compressed with config: {}. "
"To avoid this issue, set load_in_8bit=False in the from_pretrained method when using the optimum-intel API, "
"or explicitly specify the desired weight format using --weight_format fp16/fp32 for CLI."
)

def check_rt_info(ov_model):
rt_info = ov_model.get_rt_info()
if "nncf" in rt_info:
model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
model_quantization_config = rt_info["nncf"].get("quantization", None)
if model_weight_compression_config is not None:
raise RuntimeError(message_template.format(model_weight_compression_config))
elif model_quantization_config is not None:
raise RuntimeError(message_template.format(model_quantization_config))

if sub_model_names is None:
check_rt_info(self.model.model)
else:
for name in sub_model_names:
if hasattr(self.model, name):
ov_model = getattr(self.model, name).model
check_rt_info(ov_model)

def _quantize_ovbasemodel(
self,
ov_config: OVConfig,
@@ -350,7 +325,7 @@ def _quantize_ovbasemodel(
remove_unused_columns: bool = True,
**kwargs,
):
from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper, OVModelForSeq2SeqLM
from optimum.intel.openvino.modeling_seq2seq import _OVModelForWhisper
from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM

if is_diffusers_available():
@@ -429,7 +404,6 @@ def _quantize_ovbasemodel(
"text_encoder_2",
"text_encoder_3",
]
self._check_model_state(sub_model_names)
sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
for sub_model in sub_models:
_weight_only_quantization(sub_model.model, quantization_config_copy, **kwargs)
@@ -447,7 +421,6 @@ def _quantize_ovbasemodel(
self.model.clear_requests()
else:
# The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc.
self._check_model_state()
self.model.model = _hybrid_quantization(
self.model.model, quantization_config, calibration_dataset, **kwargs
)
@@ -463,31 +436,19 @@ def _quantize_ovbasemodel(
"transformer",
"text_encoder_3",
]
self._check_model_state(sub_model_names)
sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
for sub_model in sub_models:
_weight_only_quantization(sub_model.model, quantization_config, **kwargs)
self.model.clear_requests()
elif isinstance(self.model, OVModelForVisualCausalLM):
language_model = self.model.language_model
sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
self._check_model_state(sub_model_names + ["language_model"])
_weight_only_quantization(language_model.model, quantization_config, calibration_dataset, **kwargs)
sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
for sub_model in sub_models:
_weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True), **kwargs)
self.model.clear_requests()
elif isinstance(self.model, OVModelForSeq2SeqLM):
sub_model_names = ["encoder", "decoder"]
if self.model.decoder_with_past is not None:
sub_model_names.append("decoder_with_past")
self._check_model_state(sub_model_names)
sub_models = [getattr(self.model, name) for name in sub_model_names]
for sub_model in sub_models:
_weight_only_quantization(sub_model, quantization_config, **kwargs)
self.model.clear_requests()
else:
self._check_model_state()
_weight_only_quantization(self.model.model, quantization_config, calibration_dataset, **kwargs)
self.model.request = None
else:
@@ -499,7 +460,6 @@ def _quantize_ovbasemodel(

# Quantize model(s)
if isinstance(self.model, _OVModelForWhisper):
self._check_model_state(["encoder_model", "decoder_model", "decoder_with_past_model"])
self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
else:
quantized_model = _full_quantization(
@@ -1050,6 +1010,7 @@ def _weight_only_quantization(
calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
**kwargs,
) -> openvino.runtime.Model:
_verify_not_optimized(model)
config = quantization_config
if isinstance(config, dict):
config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1106,6 +1067,7 @@ def _full_quantization(
calibration_dataset: nncf.Dataset,
**kwargs,
):
_verify_not_optimized(model)
advanced_parameters_kwargs = {}
if quantization_config.smooth_quant_alpha is not None:
advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1227,3 +1189,20 @@ def _hybrid_quantization(
**kwargs,
)
return quantized_model


def _verify_not_optimized(ov_model):
message_template = (
"Cannot apply optimization to the model because it was already optimized with the following config: {}. "
"To avoid this issue, check that you set load_in_8bit=False or not using quantization_config at export in the .from_pretrained(), "
"or explicitly specify weight format with --weight_format fp16/fp32 when using CLI."
)

rt_info = ov_model.get_rt_info()
if "nncf" in rt_info:
model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
model_quantization_config = rt_info["nncf"].get("quantization", None)
if model_weight_compression_config is not None:
raise RuntimeError(message_template.format(model_weight_compression_config))
elif model_quantization_config is not None:
raise RuntimeError(message_template.format(model_quantization_config))
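
As a side note, here is a minimal standalone sketch of when the new guard trips. The toy model and hand-set rt_info are illustrative assumptions; on a real model exported with load_in_8bit=True, NNCF writes this metadata itself, and _verify_not_optimized is the helper added in the hunk above.

import openvino as ov
import openvino.runtime.opset13 as ops

# Build a tiny throwaway model.
param = ops.parameter([1, 4], ov.Type.f32)
model = ov.Model([ops.relu(param)], [param], "toy")

# Mimic the metadata NNCF records after 8-bit weight compression
# (the value here is an illustrative stand-in, not the exact config NNCF stores).
model.set_rt_info("{'bits': 8, 'sym': True}", ["nncf", "weight_compression"])

try:
    _verify_not_optimized(model)  # the helper defined in the diff above
except RuntimeError as err:
    print(err)  # "Cannot apply optimization to the model because it was already optimized ..."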
20 changes: 13 additions & 7 deletions tests/openvino/test_quantization.py
@@ -214,7 +214,9 @@ def preprocess_function(examples, tokenizer):
# Verify that the configuration is correctly saved and loaded
loaded_config = OVConfig.from_pretrained(tmp_dir)
self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
check_optimization_not_applicable_to_optimized_model(model, quantization_config=OVWeightQuantizationConfig(bits=8))
check_optimization_not_applicable_to_optimized_model(
model, quantization_config=OVWeightQuantizationConfig(bits=8)
)

@parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
def test_ov_model_static_quantization_with_auto_dataset(
@@ -256,7 +258,6 @@ def test_ov_model_static_quantization_with_auto_dataset(
self.assertTrue("logits" in outputs)
else:
raise Exception("Unexpected model class.")
check_optimization_not_applicable_to_optimized_model(ov_model, quantization_config=quantization_config)


class OVWeightCompressionTest(unittest.TestCase):
@@ -747,7 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
self.assertEqual(0, num_weight_nodes["int4"])

model.save_pretrained(tmp_dir)
check_optimization_not_applicable_to_optimized_model(model, quantization_config=quantization_config)
check_optimization_not_applicable_to_optimized_model(model, quantization_config)

def test_stable_diffusion_with_weight_compression(self):
int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True)
@@ -762,8 +763,10 @@ def test_stable_diffusion_with_weight_compression(self):
self.assertEqual(0, num_fake_nodes)
self.assertEqual(242, num_weight_nodes["int8"])
self.assertEqual(0, num_weight_nodes["int4"])
quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID)
check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config=quantization_config)
quantization_config = OVWeightQuantizationConfig(
bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID
)
check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config)

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
def test_ovmodel_hybrid_quantization_with_custom_dataset(
@@ -1338,5 +1341,8 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching):

def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
quantizer = OVQuantizer(model)
with pytest.raises(RuntimeError, match="Cannot apply optimization to the model because it was already optimized with the following config"):
quantizer.quantize(quantization_config=quantization_config)
with pytest.raises(
RuntimeError,
match="Cannot apply optimization to the model because it was already optimized with the following config",
):
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))

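For reference, a hedged usage sketch of the updated helper; the checkpoint id is a placeholder for whatever tiny test model the suite uses, not one taken from this diff.

from optimum.intel import OVModelForSequenceClassification, OVWeightQuantizationConfig

# Export with 8-bit weight compression applied, so the model is already optimized.
model = OVModelForSequenceClassification.from_pretrained(
    "hf-internal-testing/tiny-random-bert",  # placeholder checkpoint
    export=True,
    load_in_8bit=True,
)

# A second optimization attempt must now raise the RuntimeError matched above.
check_optimization_not_applicable_to_optimized_model(model, OVWeightQuantizationConfig(bits=8))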