Align rt_info of int8 models compressed by default and via config (#1130)

huggingface · Jan 29, 2025 · a59bb41 · a59bb41
1 parent 3ef8ae2
commit a59bb41
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 1 deletion.
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
@@ -345,7 +345,7 @@ def run(self):
                 is_int8 = self.args.weight_format == "int8"
                 quantization_config = {
                     "bits": 8 if is_int8 else 4,
-                    "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
+                    "ratio": 1.0 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
                     "sym": self.args.sym or False,
                     "group_size": -1 if is_int8 else self.args.group_size,
                     "all_layers": None if is_int8 else self.args.all_layers,

diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
@@ -488,6 +488,12 @@ class StoreAttr(object):
             from optimum.intel.openvino.quantization import _weight_only_quantization
 
             _weight_only_quantization(submodel, quantization_config)
+            # KV cache compression is disabled when no quantization config is provided.
+            # Remove KV_CACHE_PRECISION from runtime_options so that automatic int8 compression and compression via an explicit config produce the same rt_info.
+            if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
+                prev_rt_info = submodel.get_rt_info("runtime_options").value
+                prev_rt_info.pop("KV_CACHE_PRECISION")
+                submodel.set_rt_info(prev_rt_info, "runtime_options")
             compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
             save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
             del submodel