
Commit 4535927

Merge branch 'main' into numa
IlyasMoutawwakil authored Aug 26, 2024
2 parents c825876 + 1f3d0c2 commit 4535927
Showing 24 changed files with 811 additions and 228 deletions.
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -44,6 +44,7 @@ Here is the list of the supported architectures :
- DistilBert
- Electra
- Encoder Decoder
- Exaone
- Falcon
- Flaubert
- GLM-4
40 changes: 26 additions & 14 deletions optimum/commands/export/openvino.py
@@ -190,6 +190,24 @@ def parse_args_openvino(parser: "ArgumentParser"):
)


def no_compression_parameter_provided(args):
return all(
(
it is None
for it in (
args.ratio,
args.group_size,
args.sym,
args.all_layers,
args.dataset,
args.num_samples,
args.awq,
args.sensitivity_metric,
)
)
)


class OVExportCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.")

@@ -230,23 +248,17 @@ def run(self):

if self.args.weight_format is None:
ov_config = None
if not no_compression_parameter_provided(self.args):
logger.warning(
"The provided compression parameters will not affect conversion because of the missing --weight-format argument."
)
elif self.args.weight_format in {"fp16", "fp32"}:
ov_config = OVConfig(dtype=self.args.weight_format)
else:
is_int8 = self.args.weight_format == "int8"

# For int4 quantization if not parameter is provided, then use the default config if exist
if (
not is_int8
and self.args.ratio is None
and self.args.group_size is None
and self.args.sym is None
and self.args.all_layers is None
and self.args.dataset is None
and self.args.num_samples is None
and self.args.awq is None
and self.args.sensitivity_metric is None
):
# For int4 quantization if no parameter is provided, then use the default config if exist
if no_compression_parameter_provided(self.args) and not is_int8:
quantization_config = get_default_int4_config(self.args.model)
else:
quantization_config = {
@@ -305,7 +317,7 @@ def run(self):
model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
model.save_pretrained(self.args.output)
if not self.args.disable_convert_tokenizer:
maybe_convert_tokenizers(library_name, self.args.output, model)
maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
elif task.startswith("text-generation") and quantize_with_dataset:
from optimum.intel import OVModelForCausalLM

@@ -324,7 +336,7 @@ def run(self):
preprocessors = maybe_load_preprocessors(
self.args.model, trust_remote_code=self.args.trust_remote_code
)
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors)
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
else:
# TODO : add input shapes
main_export(
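A minimal, self-contained sketch of how the new `no_compression_parameter_provided` helper behaves; the `argparse.Namespace` below is a hypothetical stand-in for the parsed CLI arguments, not an actual `OVExportCommand` invocation:

```python
from argparse import Namespace


def no_compression_parameter_provided(args):
    # True only when none of the compression-related CLI options were passed.
    return all(
        it is None
        for it in (
            args.ratio, args.group_size, args.sym, args.all_layers,
            args.dataset, args.num_samples, args.awq, args.sensitivity_metric,
        )
    )


# Hypothetical defaults mimicking `optimum-cli export openvino` with no compression options set.
args = Namespace(
    ratio=None, group_size=None, sym=None, all_layers=None,
    dataset=None, num_samples=None, awq=None, sensitivity_metric=None,
)
assert no_compression_parameter_provided(args)      # nothing set: the default int4 config may be used
args.ratio = 0.8
assert not no_compression_parameter_provided(args)  # any explicit option: quantization_config is built from args
```

In `run()`, this helper replaces the long inline chain of `is None` checks: with `--weight-format int4` and no other compression option, the command falls back to `get_default_int4_config(self.args.model)`, and the same helper now also triggers a warning when compression parameters are passed without `--weight-format`.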
48 changes: 48 additions & 0 deletions optimum/exporters/ipex/model_patcher.py
@@ -13,6 +13,8 @@
# limitations under the License.

from transformers.models.bert.modeling_bert import BertIntermediate
from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconForCausalLM
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer,
LlamaForCausalLM,
@@ -22,10 +24,14 @@
from transformers.models.vit.modeling_vit import ViTIntermediate

from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version
from optimum.intel.utils.modeling_utils import replace_customized_linear_with_linear

from .modeling_utils import (
_IPEX_MINIMUM_VERSION_FOR_PATCHING,
_gpt2_block_forward,
_ipex_rms_layer_norm_forward,
_IPEXFalconDecoderLayer,
_IPEXGPT2Attention,
_IPEXIntermediate,
_IPEXLlamaDecoderLayer,
_llama_model_forward,
@@ -67,18 +73,56 @@ def patch_op(m, target_m, new_op_name, new_op):


def _patch_llama_model(model):
"""
Patch llama model:
1. Use IPEX Rope and IAKV cache
2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add)
"""
convert_functions(model, LlamaModel, "forward", _llama_model_forward)
convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
return model


def _patch_falcon_model(model):
"""
Patch falcon model:
1. Disable SDPA so the attention mask will be compatible to ipex attention.
2. Use IPEX Rope and IAKV cache
3. Linear fusion with (Linear + Gelu) and (Linear + Add + Add)
"""
model.transformer._use_sdpa = False
replace_customized_linear_with_linear(model)
convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config)
return model


def _patch_gpt2_model(model):
"""
Patch gpt2 model:
1. Disable SDPA so the attention mask will be compatible to ipex attention.
2. Use IAKV cache
"""
model.transformer._attn_implementation = "eager"
convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config)
convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
return model


def _patch_bert_model(model):
"""
Patch bert model:
1. Linear fusion with Linear + Gelu
"""
convert_class(model, BertIntermediate, _IPEXIntermediate)
return model


def _patch_vit_model(model):
"""
Patch vit model:
1. Linear fusion with Linear + Gelu
"""
convert_class(model, ViTIntermediate, _IPEXIntermediate)
return model

@@ -94,6 +138,10 @@ def _patch_model(model):
)
if isinstance(model, LlamaForCausalLM):
model = _patch_llama_model(model)
elif isinstance(model, FalconForCausalLM):
model = _patch_falcon_model(model)
elif isinstance(model, GPT2LMHeadModel):
model = _patch_gpt2_model(model)
elif model.config.model_type == "bert":
model = _patch_bert_model(model)
elif model.config.model_type == "vit":
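The `convert_functions` and `convert_class` helpers used by these `_patch_*` functions come from `.modeling_utils` and are not shown in this diff. The snippet below is only a rough sketch, under assumed semantics, of the instance-level method rebinding that this style of patching relies on; the `_sketch` names are hypothetical and do not match the optimum-intel implementation:

```python
import torch
from torch import nn


def convert_functions_sketch(model: nn.Module, target_cls, func_name: str, new_func):
    # Walk the module tree and rebind `func_name` on every instance of `target_cls`.
    for module in model.modules():
        if isinstance(module, target_cls):
            # Bind as an instance attribute so Module.__call__ dispatches to the replacement.
            setattr(module, func_name, new_func.__get__(module, module.__class__))
    return model


def _linear_forward_sketch(self, hidden_states):
    # Toy replacement forward: delegate to the original class-level implementation.
    return nn.Linear.forward(self, hidden_states)


model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
convert_functions_sketch(model, nn.Linear, "forward", _linear_forward_sketch)
print(model(torch.randn(3, 4)).shape)  # torch.Size([3, 2])
```

`_patch_model` itself dispatches either on the model class (`LlamaForCausalLM`, `FalconForCausalLM`, `GPT2LMHeadModel`) or on `model.config.model_type` (`bert`, `vit`) and applies the corresponding `_patch_*` function, each of which combines forward replacement (`convert_functions`) with module substitution (`convert_class`).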
