v1.6.0: INC API refactoring
Refactoring of the INC API for neural-compressor v2.0 (#118)
The `INCQuantizer` should be used to apply post-training (dynamic or static) quantization.
```python
from transformers import AutoModelForQuestionAnswering
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel.neural_compressor import INCQuantizer

model_name = "distilbert-base-cased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Load the quantization configuration detailing the quantization we wish to apply
quantization_config = PostTrainingQuantConfig(approach="dynamic")
quantizer = INCQuantizer.from_pretrained(model)

# Apply dynamic quantization and save the resulting model in the given directory
quantizer.quantize(quantization_config=quantization_config, save_directory="quantized_model")
```
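Static quantization additionally needs a calibration step over representative data. A minimal sketch of that path, assuming the `get_calibration_dataset` helper and using `glue`/`sst2` purely as an illustrative dataset:

```python
from functools import partial
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel.neural_compressor import INCQuantizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

# Static quantization calibrates activation ranges on a small sample of data
quantization_config = PostTrainingQuantConfig(approach="static")
quantizer = INCQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=100,
    dataset_split="train",
)
quantizer.quantize(
    quantization_config=quantization_config,
    calibration_dataset=calibration_dataset,
    save_directory="quantized_model_static",
)
```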
The `INCTrainer` should be used to apply and combine, during training, compression techniques such as pruning, quantization, and distillation.
```diff
 from transformers import TrainingArguments, default_data_collator
-from transformers import Trainer
+from optimum.intel.neural_compressor import INCTrainer
+from neural_compressor import QuantizationAwareTrainingConfig

 # Load the quantization configuration detailing the quantization we wish to apply
+quantization_config = QuantizationAwareTrainingConfig()

-trainer = Trainer(
+trainer = INCTrainer(
     model=model,
+    quantization_config=quantization_config,
     args=TrainingArguments("quantized_model", num_train_epochs=3.0),
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
     compute_metrics=compute_metrics,
     tokenizer=tokenizer,
     data_collator=default_data_collator,
 )
 train_result = trainer.train()
 trainer.save_model()
```
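The same trainer can combine several compression techniques in one training run. A hedged sketch, assuming a `pruning_config` argument backed by neural-compressor's `WeightPruningConfig` (all parameter values below are illustrative, not recommendations):

```python
from neural_compressor import QuantizationAwareTrainingConfig, WeightPruningConfig
from optimum.intel.neural_compressor import INCTrainer
from transformers import TrainingArguments, default_data_collator

# Illustrative pruning configuration: magnitude pruning towards 20% sparsity
pruning_config = WeightPruningConfig(
    pruning_type="magnitude",
    start_step=0,
    end_step=15,
    target_sparsity=0.2,
)

trainer = INCTrainer(
    model=model,
    quantization_config=QuantizationAwareTrainingConfig(),
    pruning_config=pruning_config,  # applied alongside quantization-aware training
    args=TrainingArguments("quantized_model", num_train_epochs=3.0),
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)
```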
To load a quantized model, you can just replace your `AutoModelForXxx` class with the corresponding `INCModelForXxx` class.
```python
from optimum.intel.neural_compressor import INCModelForSequenceClassification

loaded_model_from_hub = INCModelForSequenceClassification.from_pretrained(
    "Intel/distilbert-base-uncased-finetuned-sst-2-english-int8-dynamic"
)
```
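The loaded model should behave like its transformers counterpart; assuming it remains compatible with the standard `pipeline` API, a quick inference check could look like:

```python
from transformers import AutoTokenizer, pipeline

model_id = "Intel/distilbert-base-uncased-finetuned-sst-2-english-int8-dynamic"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run the quantized model through a standard transformers pipeline
classifier = pipeline("text-classification", model=loaded_model_from_hub, tokenizer=tokenizer)
print(classifier("I really liked this release."))
```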