Fp8 implementation (#1100)
* Fp8 implementation

* All datasets support

* Added test

* Update test

* Correctness

* Correctness

* Update docs/source/openvino/export.mdx

Co-authored-by: Alexander Kozlov <[email protected]>

* Change test model

* Apply comments

---------

Co-authored-by: Alexander Kozlov <[email protected]>
nikita-malininn and AlexKoff88 authored Jan 16, 2025
1 parent feaf027 commit 878b474
Showing 6 changed files with 65 additions and 66 deletions.
9 changes: 4 additions & 5 deletions docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:

```text
usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt,tf}] [--trust-remote-code]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8}]
[--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}] [--quant-mode {int8,f8e4m3,f8e5m2}]
[--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
[--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
[--group-size GROUP_SIZE] [--backup-precision {none,int8_sym,int8_asym}]
@@ -67,10 +67,9 @@ Optional arguments:
on your local machine arbitrary code present in the model repository.
--weight-format {fp32,fp16,int8,int4,mxfp4,nf4}
The weight format of the exported model.
--quant-mode {int8}
--quant-mode {int8,f8e4m3,f8e5m2}
Quantization precision mode. This is used for applying full model quantization including
activations. The only currently supported choice is 'int8' for int8 quantization of both
weights and activations.
activations.
--library {transformers,diffusers,timm,sentence_transformers,open_clip}
The library used to load the model before export. If not provided, will attempt to infer the
local checkpoint's library
@@ -166,7 +165,7 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with
</Tip>


Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to `int8`. This will quantize both weights and activations of Linear, Convolutional and some other layers to int8. Currently this is only supported for speech-to-text models. Please see example below.
Besides weight-only quantization, you can also apply full model quantization including activations by setting `--quant-mode` to the preferred precision. This will quantize both weights and activations of Linear, Convolutional and some other layers to the selected precision. Please see the example below.

```bash
optimum-cli export openvino -m openai/whisper-large-v3-turbo --quant-mode int8 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9 ./whisper-large-v3-turbo
```
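For reference, a minimal Python-API sketch of the same FP8 full-quantization flow. This is a hypothetical usage example, not part of the commit: the `OVQuantizationConfig` parameter names follow the docstring touched in this change, and the model ID mirrors the CLI example above.

```python
from optimum.intel import OVModelForSpeechSeq2Seq, OVQuantizationConfig

# Full model quantization (weights + activations) to FP8 E4M3, assuming the
# parameter names documented in this commit.
quantization_config = OVQuantizationConfig(
    weight_format="f8e4m3",      # FP8 E4M3 weights
    activation_format="f8e4m3",  # FP8 E4M3 activations
    dataset="librispeech",       # predefined calibration dataset for speech models
    num_samples=32,              # calibration subset size
    smooth_quant_alpha=0.9,      # SmoothQuant alpha, as in the CLI example above
)

model = OVModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3-turbo",
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("whisper-large-v3-turbo-fp8")
```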
6 changes: 1 addition & 5 deletions optimum/commands/export/openvino.py
@@ -78,11 +78,10 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--quant-mode",
type=str,
choices=["int8"],
choices=["int8", "f8e4m3", "f8e5m2"],
default=None,
help=(
"Quantization precision mode. This is used for applying full model quantization including activations. "
"The only currently supported choice is 'int8' for int8 quantization of both weights and activations."
),
)
optional_group.add_argument(
@@ -365,9 +364,6 @@ def run(self):
quantization_config["trust_remote_code"] = self.args.trust_remote_code
ov_config = OVConfig(quantization_config=quantization_config)
else:
if self.args.quant_mode != "int8":
raise ValueError("Only 'int8' quantization mode is currently supported.")

quantization_config = {
"weight_format": self.args.quant_mode,
"activation_format": self.args.quant_mode,
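With the int8-only guard removed, the command now forwards whichever `--quant-mode` value was given. A rough sketch of the resulting `quantization_config` dict for `--quant-mode f8e4m3`; only `weight_format` and `activation_format` are taken verbatim from the code above, the other keys are assumptions based on the surrounding CLI options.

```python
# Hypothetical result of parsing:
# --quant-mode f8e4m3 --dataset librispeech --num-samples 32 --smooth-quant-alpha 0.9
quantization_config = {
    "weight_format": "f8e4m3",      # from self.args.quant_mode
    "activation_format": "f8e4m3",  # from self.args.quant_mode
    "dataset": "librispeech",       # from --dataset
    "num_samples": 32,              # from --num-samples
    "smooth_quant_alpha": 0.9,      # from --smooth-quant-alpha
}
```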
31 changes: 11 additions & 20 deletions optimum/intel/openvino/configuration.py
@@ -26,7 +26,7 @@
from optimum.configuration_utils import BaseConfig

from ..utils.import_utils import is_nncf_available
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_SPEECH_TO_TEXT_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS


if is_nncf_available():
@@ -638,9 +638,9 @@ def __init__(
SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and
reduces quantization error.
weight_format (`str`, defaults to "int8"):
Data format weights are quantized to. Possible values: ['int8'].
Data format weights are quantized to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
activation_format (`str`, defaults to "int8"):
Data format activations are compressed to. Possible values: ['int8'].
Data format activations are compressed to. Possible values: ['int8', 'f8e4m3', 'f8e5m2'].
"""
super().__init__(
bits=bits,
@@ -658,6 +658,13 @@ def __init__(
self.overflow_fix = overflow_fix
self.smooth_quant_alpha = smooth_quant_alpha
self.activation_format = activation_format

f8_formats = ["f8e4m3", "f8e5m2"]
if self.activation_format in f8_formats and self.weight_format in f8_formats:
logger.info(
f"{self.activation_format} for activations and {self.weight_format} weights were found. A symmetrical scheme will be used."
)
self.sym = True
self.post_init()

def post_init(self):
@@ -669,24 +676,11 @@
if self.bits != 8:
raise ValueError(f"Only support 8-bit for static quantization but found {self.bits}")

if self.dataset is not None:
if self.dataset not in PREDEFINED_SPEECH_TO_TEXT_DATASETS:
raise ValueError(
f"You have entered the following string value for dataset: {self.dataset}. But it is not supported."
f" Currently you can only choose {list(PREDEFINED_SPEECH_TO_TEXT_DATASETS.keys())}."
)

if self.smooth_quant_alpha is not None and not (0 <= self.smooth_quant_alpha <= 1):
raise ValueError(
f"SmoothQuant alpha parameter must be in range [0, 1], but found {self.smooth_quant_alpha}"
)

if self.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")

if self.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")


class OVConfig(BaseConfig):
CONFIG_NAME = "openvino_config.json"
@@ -711,10 +705,7 @@ def __init__(
"compression", None
) # A field for backward-compatibility of training-time compression parameters
if self.quantization_config is not None:
if isinstance(self.quantization_config, OVWeightQuantizationConfig):
self.dtype = self.quantization_config.weight_format
else:
self.dtype = "int8"
self.dtype = self.quantization_config.weight_format
else:
self.dtype = dtype

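A short sketch of the behaviour added to `OVQuantizationConfig` above: when both weights and activations use an FP8 format, the config switches itself to a symmetric scheme. Hypothetical usage, assuming the constructor arguments shown in the docstring:

```python
from optimum.intel import OVQuantizationConfig

config = OVQuantizationConfig(
    weight_format="f8e5m2",
    activation_format="f8e5m2",
    dataset="wikitext2",  # the speech-only dataset restriction is dropped in this commit
)
print(config.sym)  # True - forced for FP8 weight/activation pairs by the check above
```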
13 changes: 8 additions & 5 deletions optimum/intel/openvino/quantization.py
@@ -458,11 +458,6 @@ def _quantize_ovbasemodel(
if calibration_dataset is None:
raise ValueError("Calibration dataset is required to run quantization.")

if quantization_config.weight_format != "int8":
raise ValueError("Only 'int8' weight format is currently supported.")
if quantization_config.activation_format != "int8":
raise ValueError("Only 'int8' activation format is currently supported.")

# Quantize model(s)
if isinstance(self.model, _OVModelForWhisper):
self._quantize_whisper_model(quantization_config, calibration_dataset, **kwargs)
@@ -1077,6 +1072,14 @@ def _full_quantization(
matmul=quantization_config.smooth_quant_alpha
)

q_mode_map = {
"f8e4m3": nncf.QuantizationMode.FP8_E4M3,
"f8e5m2": nncf.QuantizationMode.FP8_E5M2,
}

if quantization_config.activation_format in q_mode_map:
kwargs.update({"mode": q_mode_map[quantization_config.activation_format]})

quantized_model = nncf.quantize(
model,
calibration_dataset,
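Outside of optimum-intel, the same FP8 modes can be exercised directly through NNCF, which is what the mapping above ultimately selects. A minimal standalone sketch, assuming `ov_model` is an `openvino.Model` and `calibration_dataset` is an `nncf.Dataset`:

```python
import nncf

# Full FP8 (E4M3) quantization of an OpenVINO model; FP8 implies a symmetric scheme.
quantized_model = nncf.quantize(
    ov_model,
    calibration_dataset,
    mode=nncf.QuantizationMode.FP8_E4M3,  # or nncf.QuantizationMode.FP8_E5M2
    subset_size=32,                       # number of calibration samples to use
)
```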
34 changes: 22 additions & 12 deletions tests/openvino/test_exporters_cli.py
@@ -118,10 +118,19 @@ class OVCLIExportTestCase(unittest.TestCase):
(
"automatic-speech-recognition",
"whisper",
"--quant-mode int8 --dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
"int8",
"--dataset librispeech --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
(14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25),
(14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18),
),
(
"text-generation",
"llama",
"f8e4m3",
"--dataset wikitext2 --num-samples 1 --smooth-quant-alpha 0.9 --trust-remote-code",
(13,),
(16,),
),
]

TEST_4BIT_CONFIGURATIONS = [
@@ -411,30 +420,31 @@ def test_exporters_cli_full_quantization(
self,
task: str,
model_type: str,
quant_mode: str,
option: str,
expected_num_fq_nodes_per_model: Tuple[int],
expected_num_f_nodes_per_model: Tuple[int],
expected_num_weight_nodes_per_model: Tuple[int],
):
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} {option} {tmpdir}",
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --quant-mode {quant_mode} {option} {tmpdir}",
shell=True,
check=True,
)
model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(tmpdir)

submodels = []
models = [model]
if task == "automatic-speech-recognition":
submodels = [model.encoder, model.decoder]
models = [model.encoder, model.decoder]
if model.decoder_with_past is not None:
submodels.append(model.decoder_with_past)
models.append(model.decoder_with_past)
else:
expected_num_fq_nodes_per_model = expected_num_fq_nodes_per_model[:-1]
self.assertEqual(len(expected_num_fq_nodes_per_model), len(submodels))
for i, model in enumerate(submodels):
actual_num_fq_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_fq_nodes_per_model[i], actual_num_fq_nodes)
self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes["int8"])
expected_num_f_nodes_per_model = expected_num_f_nodes_per_model[:-1]
self.assertEqual(len(expected_num_f_nodes_per_model), len(models))
for i, model in enumerate(models):
actual_num_f_nodes, actual_num_weight_nodes = get_num_quantized_nodes(model)
self.assertEqual(expected_num_f_nodes_per_model[i], actual_num_f_nodes)
self.assertEqual(expected_num_weight_nodes_per_model[i], actual_num_weight_nodes[quant_mode])

def test_exporters_cli_int4_with_local_model_and_default_config(self):
with TemporaryDirectory() as tmpdir:
38 changes: 19 additions & 19 deletions tests/openvino/utils_tests.py
@@ -206,31 +206,31 @@


def get_num_quantized_nodes(model):
num_fake_quantize = 0
num_weight_nodes = {
"int8": 0,
"int4": 0,
"f4e2m1": 0,
"f8e8m0": 0,
"nf4": 0,
num_fake_nodes = 0
types_map = {
"i8": "int8",
"u8": "int8",
"i4": "int4",
"u4": "int4",
"f4e2m1": "f4e2m1",
"f8e8m0": "f8e8m0",
"nf4": "nf4",
"f8e4m3": "f8e4m3",
"f8e5m2": "f8e5m2",
}
num_weight_nodes = {n: 0 for n in types_map.values()}
ov_model = model if isinstance(model, ov.Model) else model.model
for elem in ov_model.get_ops():
if "FakeQuantize" in elem.name:
num_fake_quantize += 1
num_fake_nodes += 1
if "FakeConvert" in elem.name:
num_fake_nodes += 1
for i in range(elem.get_output_size()):
type_name = elem.get_output_element_type(i).get_type_name()
if type_name in ["i8", "u8"]:
num_weight_nodes["int8"] += 1
if type_name in ["i4", "u4"]:
num_weight_nodes["int4"] += 1
if type_name == "f4e2m1":
num_weight_nodes["f4e2m1"] += 1
if type_name == "f8e8m0":
num_weight_nodes["f8e8m0"] += 1
if type_name == "nf4":
num_weight_nodes["nf4"] += 1
return num_fake_quantize, num_weight_nodes
if type_name in types_map:
name = types_map[type_name]
num_weight_nodes[name] += 1
return num_fake_nodes, num_weight_nodes


@contextmanager
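Hypothetical usage of the refactored helper on an exported model; the checkpoint path is illustrative, and the import assumes the test utilities above are on the path (they live in `tests/openvino/utils_tests.py`).

```python
from optimum.intel import OVModelForCausalLM
from utils_tests import get_num_quantized_nodes  # tests/openvino/utils_tests.py

model = OVModelForCausalLM.from_pretrained("./llama-fp8")  # illustrative local path
num_fake_nodes, num_weight_nodes = get_num_quantized_nodes(model)
print(num_fake_nodes)              # FakeQuantize + FakeConvert ops in the graph
print(num_weight_nodes["f8e4m3"])  # weight constants stored as f8e4m3
```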
