From 439d61f79cf55d5d0b28334f577b6ac3c5ced28f Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Wed, 10 Jul 2024 18:32:36 +0200
Subject: [PATCH] Set default 4-bit compression ratio to 1.0 (#815)

* Set default 4-bit compression ratio to 1.0

* update doc

* set default ratio using default config
---
 docs/source/openvino/export.mdx     | 2 +-
 optimum/commands/export/openvino.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
index eed980076d..c11016fde1 100644
--- a/docs/source/openvino/export.mdx
+++ b/docs/source/openvino/export.mdx
@@ -60,7 +60,7 @@ Optional arguments:
   --pad-token-id PAD_TOKEN_ID
                         This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it.
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while
-                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8.
+                        20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for int4 quantization. Recommended value is 128 and -1 will results in per-column quantization.
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index ee1f62388f..2bdee32e17 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -102,7 +102,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
        default=None,
        help=(
            "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
        ),
    )
    optional_group.add_argument(
@@ -277,7 +277,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
        else:
            quantization_config = {
                "bits": 8 if is_int8 else 4,
-                "ratio": 1 if is_int8 else (self.args.ratio or 0.8),
+                "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
                "sym": self.args.sym or False,
                "group_size": -1 if is_int8 else self.args.group_size,
                "all_layers": None if is_int8 else self.args.all_layers,