Commit

Fix format
echarlaix committed Apr 11, 2024
1 parent 1541969 commit d48bcb2
Showing 3 changed files with 22 additions and 18 deletions.
5 changes: 3 additions & 2 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -140,8 +140,9 @@ def _from_pretrained(
quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json")
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoaround"}:

from intel_extension_for_transformers.transformers.modeling.modeling_auto import _BaseQBitsAutoModelClass
from intel_extension_for_transformers.transformers.modeling.modeling_auto import (
_BaseQBitsAutoModelClass,
)

_BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class

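For context, a minimal standalone sketch of the dispatch rule this hunk touches, assuming a local directory containing the quantize_config.json written at export time (the path below is a placeholder):

from pathlib import Path

from transformers import PretrainedConfig

# Placeholder directory; in _from_pretrained this is the resolved model_save_dir.
model_save_dir = Path("./quantized-model")
quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json")
algorithm = getattr(quantization_config, "quant_method", None)
# Only these weight-only methods are routed through ITREX's _BaseQBitsAutoModelClass.
use_itrex_loader = algorithm in {"rtn", "gptq", "awq", "autoaround"}
print(f"quant_method={algorithm!r}, ITREX weight-only loader: {use_itrex_loader}")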
28 changes: 18 additions & 10 deletions optimum/intel/neural_compressor/quantization.py
@@ -71,6 +71,7 @@
)
from .utils import INCDataLoader, _cfgs_to_fx_cfgs


INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.4.0"

if is_intel_extension_for_transformers_available():
@@ -81,8 +82,12 @@
)
from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model
from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit
from intel_extension_for_transformers.transformers.utils.config import ITREXQuantizationConfigMixin
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig, AwqConfig
from intel_extension_for_transformers.transformers.utils.config import (
AwqConfig,
GPTQConfig,
ITREXQuantizationConfigMixin,
RtnConfig,
)


logger = logging.getLogger(__name__)
@@ -195,7 +200,9 @@ def quantize(
self._set_task()

if kwargs.pop("weight_only", None) is None:
logger.warning(f"`weight_only` is deprecated. Use `quantization_config` instead to specify which methodology and quantization parameters to apply.")
logger.warning(
"`weight_only` is deprecated. Use `quantization_config` instead to specify which methodology and quantization parameters to apply."
)

if (
isinstance(quantization_config, PostTrainingQuantConfig)
@@ -209,15 +216,15 @@
)

if save_onnx_model:
if not isinstance(quantization_config, PostTrainingQuantConfig) or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC:

if (
not isinstance(quantization_config, PostTrainingQuantConfig)
or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
):
logger.warning("ONNX export for dynamic and weight only quantized model is not supported.")
save_onnx_model = False


# ITREX Weight Only Quantization
if not isinstance(quantization_config, PostTrainingQuantConfig):

# check neural-compressor version
if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION):
raise ImportError(
@@ -237,7 +244,9 @@
raise ValueError("")

if not isinstance(quantization_config, (GPTQConfig, RtnConfig)):
raise ValueError(f"Weight-only quantization only supports the RTN and GPTQ algorithms for now, but got {quantization_config}")
raise ValueError(
f"Weight-only quantization only supports the RTN and GPTQ algorithms for now, but got {quantization_config}"
)

if calibration_dataset is None and isinstance(quantization_config, (GPTQConfig, AwqConfig)):
raise ValueError(
@@ -273,7 +282,6 @@
logger.warning("ONNX export is no supported for model with quantized embeddings")
save_onnx_model = False


if not isinstance(quantization_config, PostTrainingQuantConfig):
if use_cpu:
# will remove after intel-extension-for-transformers 1.3.3 release.
@@ -287,7 +295,7 @@ def quantize(
self._quantized_model = convert_to_quantized_model(
self._original_model, quantization_config, device=quantization_config.device
)

self._quantized_model.quantization_config = quantization_config
self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model)
self._quantized_model.save_pretrained(save_directory)
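To make the ITREX weight-only branch above concrete, here is a hedged usage sketch assuming the public INCQuantizer entry point; RTN needs no calibration dataset, and the output directory name is a placeholder:

from intel_extension_for_transformers.transformers.utils.config import RtnConfig
from transformers import AutoModelForCausalLM

from optimum.intel import INCQuantizer

model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-GPTNeoForCausalLM")
quantizer = INCQuantizer.from_pretrained(model)
# RtnConfig is not a PostTrainingQuantConfig, so quantize() takes the ITREX
# weight-only path and saves the model via save_low_bit.
quantizer.quantize(
    quantization_config=RtnConfig(),  # constructor arguments left at their defaults here
    save_directory="tiny-gptneo-rtn",  # placeholder output directory
)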
7 changes: 1 addition & 6 deletions tests/neural_compressor/test_optimization.py
@@ -87,17 +87,12 @@ class OptimizationTest(INCTestMixin):
"hf-internal-testing/tiny-random-GPTNeoForCausalLM",
)




WEIGHT_ONLY_CONFIG = (
("rtn", "int4_clip"),
("gptq", "int4_clip"),
("rtn", "int8"),
)



@parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
def test_dynamic_quantization(self, task, model_name, expected_quantized_matmuls):
quantization_config = PostTrainingQuantConfig(approach="dynamic")
@@ -244,7 +239,7 @@ def test_weight_only_quantization(self, methodology, weight_dtype):
with torch.no_grad():
loaded_outputs = loaded_model(**tokens)
# quantizer_outputs = quantizer_model(**tokens)

self.assertTrue("logits" in loaded_outputs)
self.assertIsInstance(loaded_outputs.logits, torch.Tensor)
self.assertTrue("past_key_values" in loaded_outputs)
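As a follow-up to the test above, a hedged sketch of the round trip it asserts on: reload the saved weight-only model and check that a forward pass returns logits and past_key_values (the save directory is a placeholder taken from the quantization sketch earlier):

import torch
from transformers import AutoTokenizer

from optimum.intel import INCModelForCausalLM

save_directory = "tiny-gptneo-rtn"  # placeholder: directory written by the quantizer sketch above
loaded_model = INCModelForCausalLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-GPTNeoForCausalLM")
tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    loaded_outputs = loaded_model(**tokens)
assert "logits" in loaded_outputs
assert "past_key_values" in loaded_outputs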
