Skip to content

Commit

Permalink
align rt_info int8 models compressed by default and via config (#1130)
Browse files Browse the repository at this point in the history
  • Loading branch information
eaidova authored Jan 29, 2025
1 parent 3ef8ae2 commit a59bb41
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 1 addition & 1 deletion optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def run(self):
is_int8 = self.args.weight_format == "int8"
quantization_config = {
"bits": 8 if is_int8 else 4,
"ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
"ratio": 1.0 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
"sym": self.args.sym or False,
"group_size": -1 if is_int8 else self.args.group_size,
"all_layers": None if is_int8 else self.args.all_layers,
Expand Down
6 changes: 6 additions & 0 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,12 @@ class StoreAttr(object):
from optimum.intel.openvino.quantization import _weight_only_quantization

_weight_only_quantization(submodel, quantization_config)
# KV cache compression is disabled when no quantization config is provided.
# To keep the result of automatic int8 compression aligned with compression via an explicit config,
# remove the KV_CACHE_PRECISION entry from the "runtime_options" rt_info.
if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]):
prev_rt_info = submodel.get_rt_info("runtime_options").value
prev_rt_info.pop("KV_CACHE_PRECISION")
submodel.set_rt_info(prev_rt_info, "runtime_options")
compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
del submodel
Expand Down

0 comments on commit a59bb41

Please sign in to comment.