Fix compatibility for latest itrex version #658

Merged: 14 commits (Apr 18, 2024)
13 changes: 9 additions & 4 deletions .github/workflows/test_inc.yml
@@ -32,11 +32,16 @@ jobs:
python -m pip install --upgrade pip
pip install cmake
pip install py-cpuinfo
pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
pip install .[neural-compressor,diffusers,tests]
pip install intel-extension-for-pytorch==2.1.100
pip install intel-extension-for-transformers==1.3.2
pip install intel-extension-for-transformers
pip install peft
- name: Test with Pytest
run: |
pytest tests/neural_compressor/
pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
- name: Test IPEX
run: |
pip uninstall intel-extension-for-transformers
pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
pip install intel-extension-for-pytorch
pytest tests/neural_compressor/test_ipex.py

41 changes: 21 additions & 20 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -64,8 +64,7 @@


if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig

from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""

@@ -227,8 +226,9 @@ class OptimizationArguments:
metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
)
quantization_methodology: str = field(
default="RTN",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
choices=["rtn", "gptq"],
default="rtn",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
)
damp_percent: float = field(
default=0.01,
@@ -662,22 +662,23 @@ def compute_metrics(eval_preds):
raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
if optim_args.apply_pruning or optim_args.apply_distillation:
raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
if optim_args.quantization_methodology == "GPTQ":
algorithm_args = {
"act_order": False,
"percdamp": optim_args.damp_percent,
"block_size": optim_args.gptq_block_size,
"nsamples": optim_args.num_calibration_samples,
"use_max_length": optim_args.use_max_length,
"pad_max_length": optim_args.pad_max_length,
}
quantization_config = WeightOnlyQuantConfig(
weight_dtype=optim_args.weight_dtype,
group_size=optim_args.group_size,
scheme=optim_args.weight_only_scheme,
algorithm=optim_args.quantization_methodology,
algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
)

algorithm_args = {
"weight_dtype": optim_args.weight_dtype,
"sym": optim_args.weight_only_scheme == "sym",
"group_size": optim_args.group_size,
}

if optim_args.quantization_methodology == "gptq":
quantization_config = GPTQConfig(
damp_percent=optim_args.damp_percent,
nsamples=optim_args.num_calibration_samples,
blocksize=optim_args.gptq_block_size,
**algorithm_args,
)
else:
quantization_config = RtnConfig(**algorithm_args)

else:
quantization_config = PostTrainingQuantConfig(
approach=optim_args.quantization_approach, recipes=recipes
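For context, a minimal sketch (not part of the patch) of how the updated example maps its weight-only options onto the new ITREX config classes. The helper name build_woq_config and the default values are illustrative; the GPTQConfig/RtnConfig keyword names mirror the diff above and assume intel-extension-for-transformers >= 1.4.

# Illustrative helper, not part of the patch: selects the ITREX weight-only config
# the same way the updated run_clm.py does.
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig


def build_woq_config(methodology="rtn", weight_dtype="int4_clip", scheme="sym",
                     group_size=32, damp_percent=0.01, nsamples=128, blocksize=128):
    # Shared weight-only arguments: the old `scheme` string becomes a boolean `sym` flag.
    common = {"weight_dtype": weight_dtype, "sym": scheme == "sym", "group_size": group_size}
    if methodology == "gptq":
        # GPTQ additionally takes its calibration-related knobs.
        return GPTQConfig(damp_percent=damp_percent, nsamples=nsamples, blocksize=blocksize, **common)
    return RtnConfig(**common)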
24 changes: 10 additions & 14 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -67,11 +67,6 @@
"""


if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM as ITREX_WOQ_MODEL
from intel_extension_for_transformers.transformers.utils import WeightOnlyQuantConfig


class INCModel(OptimizedModel):
auto_model_class = AutoModel
export_feature = "feature-extraction"
@@ -142,15 +137,16 @@ def _from_pretrained(
msg = None
if is_intel_extension_for_transformers_available():
try:
quantization_config = WeightOnlyQuantConfig.from_pretrained(model_id)
algorithm = getattr(quantization_config, "algorithm", None)
if algorithm is not None and quantization_config.algorithm.lower() in {
"rtn",
"gptq",
"awq",
"autoaround",
}:
return ITREX_WOQ_MODEL.from_pretrained(
quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json")
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoaround"}:
from intel_extension_for_transformers.transformers.modeling.modeling_auto import (
_BaseQBitsAutoModelClass,
)

_BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class

return _BaseQBitsAutoModelClass.from_pretrained(
pretrained_model_name_or_path=model_id,
use_auth_token=use_auth_token,
revision=revision,
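A rough sketch (not part of the patch) of the loading path this change enables: when the saved directory contains a quantize_config.json whose quant_method names an ITREX weight-only algorithm, loading is delegated to ITREX's auto model class. The function name load_maybe_woq and the fallback branch are illustrative assumptions.

from pathlib import Path
from transformers import AutoModelForCausalLM, PretrainedConfig


def load_maybe_woq(model_dir):
    # Illustrative loader mirroring the updated INCModel._from_pretrained logic.
    quantize_config_file = Path(model_dir) / "quantize_config.json"
    if quantize_config_file.exists():
        quantization_config = PretrainedConfig.from_pretrained(quantize_config_file)
        # Method names as listed in the patch above.
        if getattr(quantization_config, "quant_method", None) in {"rtn", "gptq", "awq", "autoaround"}:
            from intel_extension_for_transformers.transformers.modeling.modeling_auto import (
                _BaseQBitsAutoModelClass,
            )
            # Point the ITREX auto class at the desired transformers class before loading.
            _BaseQBitsAutoModelClass.ORIG_MODEL = AutoModelForCausalLM
            return _BaseQBitsAutoModelClass.from_pretrained(model_dir)
    # Regular (non weight-only-quantized) checkpoints go through the usual path.
    return AutoModelForCausalLM.from_pretrained(model_dir)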
27 changes: 0 additions & 27 deletions optimum/intel/neural_compressor/modeling_decoder.py

This file was deleted.

118 changes: 56 additions & 62 deletions optimum/intel/neural_compressor/quantization.py
@@ -72,24 +72,30 @@
from .utils import INCDataLoader, _cfgs_to_fx_cfgs


INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.4.0"

if is_intel_extension_for_transformers_available():
INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.3.2"
if is_intel_extension_for_transformers_version("!=", INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION):
raise ImportError(
f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_intel_extension_for_transformers_version}, "
f"but only version {INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION} is supported."
)
from intel_extension_for_transformers.llm.quantization.utils import convert_to_quantized_model
from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model
from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from intel_extension_for_transformers.transformers.utils.config import (
AwqConfig,
GPTQConfig,
ITREXQuantizationConfigMixin,
RtnConfig,
)


logger = logging.getLogger(__name__)

NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0"
NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0"
IPEX_MINIMUM_VERSION = "2.1.0"
_ITREX_TORCH_VERSION = "2.1.0"
ITREX_MINIMUM_TORCH_VERSION = "2.2.0"

if is_neural_compressor_version("<", NEURAL_COMPRESSOR_MINIMUM_VERSION):
raise ImportError(
@@ -152,21 +158,20 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs):

def quantize(
self,
quantization_config: Union["PostTrainingQuantConfig", "WeightOnlyQuantConfig"],
quantization_config: Union["PostTrainingQuantConfig", "ITREXQuantizationConfigMixin"],
save_directory: Union[str, Path],
calibration_dataset: Dataset = None,
batch_size: int = 8,
data_collator: Optional[DataCollator] = None,
remove_unused_columns: bool = True,
file_name: str = None,
weight_only: bool = False,
**kwargs,
):
"""
Quantize a model given the optimization specifications defined in `quantization_config`.

Args:
quantization_config (`Union[PostTrainingQuantConfig, WeightOnlyQuantConfig]`):
quantization_config (`Union[PostTrainingQuantConfig, ITREXQuantizationConfigMixin]`):
The configuration containing the parameters related to quantization.
save_directory (`Union[str, Path]`):
The directory where the quantized model should be saved.
@@ -178,26 +183,48 @@ def quantize(
The function to use to form a batch from a list of elements of the calibration dataset.
remove_unused_columns (`bool`, defaults to `True`):
Whether or not to remove the columns unused by the model forward method.
weight_only (`bool`, defaults to `False`):
Whether compress weights to integer precision (4-bit by default) while keeping activations
floating-point. Fits best for LLM footprint reduction and performance acceleration.
"""
save_directory = Path(save_directory)
save_directory.mkdir(parents=True, exist_ok=True)
save_onnx_model = kwargs.pop("save_onnx_model", False)
device = kwargs.pop("device", "cpu")
use_cpu = device == torch.device("cpu") or device == "cpu"
use_xpu = device == torch.device("xpu") or device == "xpu"
calibration_dataloader = None

if save_onnx_model and (isinstance(self._original_model, ORTModel) or weight_only):
if save_onnx_model and isinstance(self._original_model, ORTModel):
save_onnx_model = False
logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False")

default_name = WEIGHTS_NAME if not isinstance(self._original_model, ORTModel) else ONNX_WEIGHTS_NAME
calibration_dataloader = None
self._set_task()

if weight_only or not isinstance(quantization_config, PostTrainingQuantConfig):
if kwargs.pop("weight_only", None) is not None:
logger.warning(
"`weight_only` is deprecated. Use `quantization_config` instead to specify which methodology and quantization pamraters to apply."
)

if (
isinstance(quantization_config, PostTrainingQuantConfig)
and quantization_config.backend == "ipex"
and is_ipex_version("<", IPEX_MINIMUM_VERSION)
and "generation" in self.task
):
raise ImportError(
f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, "
f"but only version {IPEX_MINIMUM_VERSION} or higher is supported."
)

if save_onnx_model:
if (
not isinstance(quantization_config, PostTrainingQuantConfig)
or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
):
logger.warning("ONNX export for dynamic and weight only quantized model is not supported.")
save_onnx_model = False

# ITREX Weight Only Quantization
if not isinstance(quantization_config, PostTrainingQuantConfig):
# check neural-compressor version
if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION):
raise ImportError(
@@ -207,53 +234,42 @@
if not is_intel_extension_for_transformers_available():
raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("Weight only quantization"))

if is_torch_version("!=", _ITREX_TORCH_VERSION):
if is_torch_version("<", ITREX_MINIMUM_TORCH_VERSION):
raise ImportError(
f"Found an incompatible version of `torch`. Found version {_torch_version}, "
f"but only version {_ITREX_TORCH_VERSION} is supported."
f"but only version {ITREX_MINIMUM_TORCH_VERSION} or higher is supported."
)

if quantization_config is None:
quantization_config = WeightOnlyQuantConfig()
algo = "RTN"
elif isinstance(quantization_config, WeightOnlyQuantConfig):
algo = quantization_config.algorithm
else:
raise TypeError(
f"For weight-only quantization, `quantization_config` should be an instance of `WeightOnlyQuantConfig`, but got: {type(quantization_config)} instead."
)
if not isinstance(quantization_config, ITREXQuantizationConfigMixin):
raise ValueError("")

if algo not in ["RTN", "GPTQ"]:
raise ValueError(f"Weight-only quantization is only support RTN and GPTQ algorithm now!But got {algo}")
if not isinstance(quantization_config, (GPTQConfig, RtnConfig)):
raise ValueError(
f"Weight-only quantization is only support RTN and GPTQ algorithm now! But got {quantization_config}"
)

if calibration_dataset is None and quantization_config.tokenizer is None and ("GPTQ" in algo):
if calibration_dataset is None and isinstance(quantization_config, (GPTQConfig, AwqConfig)):
raise ValueError(
"Weight-only quantization needs a calibration dataset for both GPTQ and AWQ methodologies."
)

if calibration_dataset is None:
calibration_dataloader = None
else:
if calibration_dataset is not None:
calibration_dataloader = self._get_calibration_dataloader(
calibration_dataset=calibration_dataset,
batch_size=batch_size,
remove_unused_columns=remove_unused_columns,
data_collator=data_collator,
use_label=False if "GPTQ" in algo else True,
use_label=not isinstance(quantization_config, (GPTQConfig)),
)
quantization_config.calib_dataloader = calibration_dataloader

save_onnx_model = False

elif INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.STATIC:
# Since PyTorch fx trace does not really require an example_inputs, only need calibration_dataset or calibration_fn here.
if calibration_dataset is None and self.calibration_fn is None:
raise ValueError(
"Post-training static quantization needs a calibration dataset or a calibration_function."
)
if calibration_dataset is None:
calibration_dataloader = None
else:
if calibration_dataset is not None:
quantization_config.calibration_sampling_size = len(calibration_dataset)
calibration_dataloader = self._get_calibration_dataloader(
calibration_dataset=calibration_dataset,
@@ -266,45 +282,24 @@
logger.warning("ONNX export is no supported for model with quantized embeddings")
save_onnx_model = False

else:
# Disable ONNX export for dynamically quantized model as deprecated in neural-compressor>=2.2.0
if save_onnx_model:
logger.warning(
"ONNX export for dynamic quantized model is no longer supported by neural-compressor>=2.2.0. "
"To apply dynamic quantization on an ONNX model, you can use optimum.onnxruntime.ORTQuantizer"
)
save_onnx_model = False

if (
isinstance(quantization_config, PostTrainingQuantConfig)
and quantization_config.backend == "ipex"
and is_ipex_version("<", IPEX_MINIMUM_VERSION)
and "generation" in self.task
):
raise ImportError(
f"Found an incompatible version of intel-extension-for-pytorch. Found version {_ipex_version}, "
f"but only version {IPEX_MINIMUM_VERSION} or higher is supported."
)

if not isinstance(quantization_config, PostTrainingQuantConfig):
if use_cpu:
# will remove after intel-extension-for-transformers 1.3.3 release.
quantization_config.device = "cpu"
quantization_config.post_init()
quantization_config.post_init_cpu()
elif use_xpu:
# will remove after intel-extension-for-transformers 1.3.3 release.
quantization_config.device = "xpu"
quantization_config.post_init_xpu()

self._quantized_model = convert_to_quantized_model(
self._original_model, quantization_config, device=quantization_config.device
)
# will remove after intel-extension-for-transformers 1.3.3 release.
if hasattr(quantization_config, "calib_dataloader"):
quantization_config.calib_dataloader = None

self._quantized_model.quantization_config = quantization_config
self._quantized_model.save_pretrained = types.MethodType(save_low_bit, self._quantized_model)
# Save the quantized model
self._quantized_model.save_pretrained(save_directory)

else:
if isinstance(self._original_model.config, PretrainedConfig):
self._original_model.config.backend = quantization_config.backend
@@ -376,7 +371,6 @@ def quantize(
self._save_pretrained(compressed_model, output_path)
quantization_config = INCConfig(quantization=quantization_config, save_onnx_model=save_onnx_model)
quantization_config.save_pretrained(save_directory)
return self._quantized_model

@staticmethod
def _save_pretrained(model: Union[PyTorchModel, IPEXModel], output_path: str):
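To show how these pieces fit together after the change, below is a hedged end-to-end sketch of calling the updated INCQuantizer.quantize with the ITREX configs: RTN needs no calibration data, while GPTQ requires a calibration_dataset. The model id, dataset, and argument values are placeholders, not taken from the PR.

from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
from optimum.intel import INCQuantizer

model_id = "facebook/opt-125m"  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# RTN: data-free weight-only quantization.
quantizer = INCQuantizer.from_pretrained(model)
quantizer.quantize(
    quantization_config=RtnConfig(weight_dtype="int4_clip", group_size=32),  # illustrative values
    save_directory="opt125m-woq-rtn",
)

# GPTQ: needs a calibration dataset, passed through `calibration_dataset`.
quantizer = INCQuantizer.from_pretrained(AutoModelForCausalLM.from_pretrained(model_id))
calibration_dataset = quantizer.get_calibration_dataset(
    "wikitext",
    dataset_config_name="wikitext-2-raw-v1",
    preprocess_function=lambda examples: tokenizer(examples["text"]),
    num_samples=128,
    dataset_split="train",
)
quantizer.quantize(
    quantization_config=GPTQConfig(damp_percent=0.01, nsamples=128, blocksize=128),
    calibration_dataset=calibration_dataset,
    save_directory="opt125m-woq-gptq",
)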