
Commit 4535927

Merge branch 'main' into numa
IlyasMoutawwakil authored Aug 26, 2024
2 parents c825876 + 1f3d0c2 commit 4535927
Showing 24 changed files with 811 additions and 228 deletions.
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -44,6 +44,7 @@ Here is the list of the supported architectures :
- DistilBert
- Electra
- Encoder Decoder
- Exaone
- Falcon
- Flaubert
- GLM-4
40 changes: 26 additions & 14 deletions optimum/commands/export/openvino.py
@@ -190,6 +190,24 @@ def parse_args_openvino(parser: "ArgumentParser"):
)


def no_compression_parameter_provided(args):
return all(
(
it is None
for it in (
args.ratio,
args.group_size,
args.sym,
args.all_layers,
args.dataset,
args.num_samples,
args.awq,
args.sensitivity_metric,
)
)
)


class OVExportCommand(BaseOptimumCLICommand):
COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.")

@@ -230,23 +248,17 @@ def run(self):

if self.args.weight_format is None:
ov_config = None
if not no_compression_parameter_provided(self.args):
logger.warning(
"The provided compression parameters will not affect conversion because of the missing --weight-format argument."
)
elif self.args.weight_format in {"fp16", "fp32"}:
ov_config = OVConfig(dtype=self.args.weight_format)
else:
is_int8 = self.args.weight_format == "int8"

# For int4 quantization if not parameter is provided, then use the default config if exist
if (
not is_int8
and self.args.ratio is None
and self.args.group_size is None
and self.args.sym is None
and self.args.all_layers is None
and self.args.dataset is None
and self.args.num_samples is None
and self.args.awq is None
and self.args.sensitivity_metric is None
):
# For int4 quantization if no parameter is provided, then use the default config if exist
if no_compression_parameter_provided(self.args) and not is_int8:
quantization_config = get_default_int4_config(self.args.model)
else:
quantization_config = {
@@ -305,7 +317,7 @@ def run(self):
model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
model.save_pretrained(self.args.output)
if not self.args.disable_convert_tokenizer:
maybe_convert_tokenizers(library_name, self.args.output, model)
maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
elif task.startswith("text-generation") and quantize_with_dataset:
from optimum.intel import OVModelForCausalLM

@@ -324,7 +336,7 @@ def run(self):
preprocessors = maybe_load_preprocessors(
self.args.model, trust_remote_code=self.args.trust_remote_code
)
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors)
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
else:
# TODO : add input shapes
main_export(
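A minimal, self-contained sketch of how the new `no_compression_parameter_provided` helper behaves; the `argparse.Namespace` below is a hypothetical stand-in for the parsed CLI arguments, not an actual `OVExportCommand` invocation:

```python
from argparse import Namespace


def no_compression_parameter_provided(args):
    # True only when none of the compression-related CLI options were passed.
    return all(
        it is None
        for it in (
            args.ratio, args.group_size, args.sym, args.all_layers,
            args.dataset, args.num_samples, args.awq, args.sensitivity_metric,
        )
    )


# Hypothetical defaults mimicking `optimum-cli export openvino` with no compression options set.
args = Namespace(
    ratio=None, group_size=None, sym=None, all_layers=None,
    dataset=None, num_samples=None, awq=None, sensitivity_metric=None,
)
assert no_compression_parameter_provided(args)      # nothing set: the default int4 config may be used
args.ratio = 0.8
assert not no_compression_parameter_provided(args)  # any explicit option: quantization_config is built from args
```

In `run()`, this helper replaces the long inline chain of `is None` checks: with `--weight-format int4` and no other compression option, the command falls back to `get_default_int4_config(self.args.model)`, and the same helper now also triggers a warning when compression parameters are passed without `--weight-format`.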
48 changes: 48 additions & 0 deletions optimum/exporters/ipex/model_patcher.py
@@ -13,6 +13,8 @@
# limitations under the License.

from transformers.models.bert.modeling_bert import BertIntermediate
from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconForCausalLM
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer,
LlamaForCausalLM,
@@ -22,10 +24,14 @@
from transformers.models.vit.modeling_vit import ViTIntermediate

from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version
from optimum.intel.utils.modeling_utils import replace_customized_linear_with_linear

from .modeling_utils import (
_IPEX_MINIMUM_VERSION_FOR_PATCHING,
_gpt2_block_forward,
_ipex_rms_layer_norm_forward,
_IPEXFalconDecoderLayer,
_IPEXGPT2Attention,
_IPEXIntermediate,
_IPEXLlamaDecoderLayer,
_llama_model_forward,
@@ -67,18 +73,56 @@ def patch_op(m, target_m, new_op_name, new_op):


def _patch_llama_model(model):
"""
Patch llama model:
1. Use IPEX Rope and IAKV cache
2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add)
"""
convert_functions(model, LlamaModel, "forward", _llama_model_forward)
convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
return model


def _patch_falcon_model(model):
"""
Patch falcon model:
1. Disable SDPA so the attention mask will be compatible to ipex attention.
2. Use IPEX Rope and IAKV cache
3. Linear fusion with (Linear + Gelu) and (Linear + Add + Add)
"""
model.transformer._use_sdpa = False
replace_customized_linear_with_linear(model)
convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config)
return model


def _patch_gpt2_model(model):
"""
Patch gpt2 model:
1. Disable SDPA so the attention mask will be compatible to ipex attention.
2. Use IAKV cache
"""
model.transformer._attn_implementation = "eager"
convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config)
convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
return model


def _patch_bert_model(model):
"""
Patch bert model:
1. Linear fusion with Linear + Gelu
"""
convert_class(model, BertIntermediate, _IPEXIntermediate)
return model


def _patch_vit_model(model):
"""
Patch vit model:
1. Linear fusion with Linear + Gelu
"""
convert_class(model, ViTIntermediate, _IPEXIntermediate)
return model

@@ -94,6 +138,10 @@ def _patch_model(model):
)
if isinstance(model, LlamaForCausalLM):
model = _patch_llama_model(model)
elif isinstance(model, FalconForCausalLM):
model = _patch_falcon_model(model)
elif isinstance(model, GPT2LMHeadModel):
model = _patch_gpt2_model(model)
elif model.config.model_type == "bert":
model = _patch_bert_model(model)
elif model.config.model_type == "vit":
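The `convert_functions` and `convert_class` helpers used by these `_patch_*` functions come from `.modeling_utils` and are not shown in this diff. The snippet below is only a rough sketch, under assumed semantics, of the instance-level method rebinding that this style of patching relies on; the `_sketch` names are hypothetical and do not match the optimum-intel implementation:

```python
import torch
from torch import nn


def convert_functions_sketch(model: nn.Module, target_cls, func_name: str, new_func):
    # Walk the module tree and rebind `func_name` on every instance of `target_cls`.
    for module in model.modules():
        if isinstance(module, target_cls):
            # Bind as an instance attribute so Module.__call__ dispatches to the replacement.
            setattr(module, func_name, new_func.__get__(module, module.__class__))
    return model


def _linear_forward_sketch(self, hidden_states):
    # Toy replacement forward: delegate to the original class-level implementation.
    return nn.Linear.forward(self, hidden_states)


model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
convert_functions_sketch(model, nn.Linear, "forward", _linear_forward_sketch)
print(model(torch.randn(3, 4)).shape)  # torch.Size([3, 2])
```

`_patch_model` itself dispatches either on the model class (`LlamaForCausalLM`, `FalconForCausalLM`, `GPT2LMHeadModel`) or on `model.config.model_type` (`bert`, `vit`) and applies the corresponding `_patch_*` function, each of which combines forward replacement (`convert_functions`) with module substitution (`convert_class`).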
