
Commit

Merge branch 'huggingface:main' into main
rbrugaro authored Jul 19, 2024
2 parents 138b1e4 + b362180 commit 86930cd
Showing 14 changed files with 272 additions and 169 deletions.
40 changes: 12 additions & 28 deletions optimum/commands/export/openvino.py
@@ -189,14 +189,6 @@ def parse_args_openvino(parser: "ArgumentParser"):
action="store_true",
help="Do not add converted tokenizer and detokenizer OpenVINO models.",
)
# TODO : deprecated
optional_group.add_argument("--fp16", action="store_true", help="Compress weights to fp16")
optional_group.add_argument("--int8", action="store_true", help="Compress weights to int8")
optional_group.add_argument(
"--convert-tokenizer",
action="store_true",
help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
)


class OVExportCommand(BaseOptimumCLICommand):
@@ -235,24 +227,19 @@ def _get_default_int4_config(model_id_or_path, library_name):

return _DEFAULT_4BIT_CONFIG

library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library)
if library_name == "sentence_transformers" and self.args.library is None:
logger.warning(
"Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`."
"`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
)
library_name = "transformers"

if self.args.fp16:
logger.warning(
"`--fp16` option is deprecated and will be removed in a future version. Use `--weight-format` instead."
if self.args.library is None:
# TODO: add revision, subfolder and token to args
library_name = TasksManager._infer_library_from_model_name_or_path(
model_name_or_path=self.args.model, cache_dir=self.args.cache_dir
)
self.args.weight_format = "fp16"
if self.args.int8:
logger.warning(
"`--int8` option is deprecated and will be removed in a future version. Use `--weight-format` instead."
)
self.args.weight_format = "int8"
if library_name == "sentence_transformers":
logger.warning(
"Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`."
"`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
)
library_name = "transformers"
else:
library_name = self.args.library

if self.args.weight_format is None:
ov_config = None
@@ -296,9 +283,6 @@ def _get_default_int4_config(model_id_or_path, library_name):
quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
ov_config = OVConfig(quantization_config=quantization_config)

if self.args.convert_tokenizer:
logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")

quantization_config = ov_config.quantization_config if ov_config else None
quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None
task = infer_task(self.args.task, self.args.model)
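For reference, the shim deleted above only rewrote the deprecated flags into the surviving `--weight-format` option (e.g. `optimum-cli export openvino --model <model_id> --weight-format int8 <output_dir>` replaces `--int8`). A minimal sketch of that mapping, with an illustrative helper name:

from typing import Optional

def map_deprecated_flags(fp16: bool, int8: bool) -> Optional[str]:
    # Mirrors the removed back-compat logic: each deprecated flag was an
    # alias for a --weight-format value.
    if fp16:
        return "fp16"  # formerly --fp16
    if int8:
        return "int8"  # formerly --int8
    return None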
49 changes: 34 additions & 15 deletions optimum/exporters/openvino/__main__.py
@@ -47,11 +47,24 @@
logger = logging.getLogger(__name__)


def infer_task(task, model_name_or_path):
def infer_task(
task,
model_name_or_path,
subfolder: str = "",
revision: Optional[str] = None,
cache_dir: str = HUGGINGFACE_HUB_CACHE,
token: Optional[Union[bool, str]] = None,
):
task = TasksManager.map_from_synonym(task)
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
task = TasksManager._infer_task_from_model_name_or_path(
model_name_or_path=model_name_or_path,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
)
except KeyError as e:
raise KeyError(
f"The task could not be automatically inferred. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
@@ -193,19 +206,27 @@ def main_export(
ov_config = OVConfig(quantization_config=q_config)

original_task = task
task = infer_task(task, model_name_or_path)
framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
library_name_is_not_provided = library_name is None
library_name = TasksManager.infer_library_from_model(
model_name_or_path, subfolder=subfolder, library_name=library_name
task = infer_task(
task, model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
)
framework = TasksManager.determine_framework(
model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
)

if library_name == "sentence_transformers" and library_name_is_not_provided:
logger.warning(
"Library name is not specified. There are multiple possible variants: `sentence_tenasformers`, `transformers`."
"`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
if library_name is None:
library_name = TasksManager._infer_library_from_model_name_or_path(
model_name_or_path=model_name_or_path,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
)
library_name = "transformers"
if library_name == "sentence_transformers":
logger.warning(
"Library name is not specified. There are multiple possible variants: `sentence_tenasformers`, `transformers`."
"`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers"
)
library_name = "transformers"

do_gptq_patching = False
custom_architecture = False
@@ -317,9 +338,7 @@ class StoreAttr(object):
)
model.config.pad_token_id = pad_token_id

if "stable-diffusion" in task:
model_type = "stable-diffusion"
elif hasattr(model.config, "export_model_type"):
if hasattr(model.config, "export_model_type"):
model_type = model.config.export_model_type.replace("_", "-")
else:
model_type = model.config.model_type.replace("_", "-")
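The reworked `infer_task` now threads the Hub-related arguments (`subfolder`, `revision`, `cache_dir`, `token`) through to `TasksManager`, so task inference also works for pinned revisions and gated models. A minimal usage sketch, assuming the module path from this diff and using `gpt2` purely as an illustrative model id:

from optimum.exporters.openvino.__main__ import infer_task

# task="auto" triggers inference from the model on the Hub; revision and
# token are forwarded to the underlying TasksManager helper.
task = infer_task("auto", "gpt2", revision=None, token=None)
print(task)  # e.g. "text-generation"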
63 changes: 49 additions & 14 deletions optimum/exporters/openvino/convert.py
@@ -64,7 +64,7 @@
from transformers.modeling_utils import PreTrainedModel

if is_diffusers_available():
from diffusers import ModelMixin
from diffusers import DiffusionPipeline, ModelMixin

if is_tf_available():
from transformers.modeling_tf_utils import TFPreTrainedModel
@@ -74,7 +74,7 @@
from optimum.intel.openvino.configuration import OVConfig


def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None):
def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None):
compress_to_fp16 = False

if ov_config is not None:
Expand All @@ -90,13 +90,12 @@ def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None):

compress_to_fp16 = ov_config.dtype == "fp16"

library_name = TasksManager.infer_library_from_model(Path(path).parent)
model = _add_version_info_to_model(model, library_name)
save_model(model, path, compress_to_fp16)


def export(
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"],
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"],
config: OnnxConfig,
output: Path,
opset: Optional[int] = None,
@@ -139,7 +138,7 @@ def export(
)

if "diffusers" in str(model.__class__) and not is_diffusers_available():
raise ImportError("The pip package `diffusers` is required to export stable diffusion models to ONNX.")
raise ImportError("The package `diffusers` is required to export diffusion models to OpenVINO.")

if stateful:
# This will be checked anyway after the model conversion, but checking it earlier saves the user time if an unsuitable version is used
@@ -198,7 +197,19 @@ def export_tensorflow(
onnx_path = Path(output).with_suffix(".onnx")
input_names, output_names = export_tensorflow_onnx(model, config, opset, onnx_path)
ov_model = convert_model(str(onnx_path))
_save_model(ov_model, output.parent / output, ov_config=ov_config)

if model.__class__.__module__.startswith("optimum"):
# for wrapped models
library_name = TasksManager._infer_library_from_model_or_model_class(model=model.model)
else:
library_name = TasksManager._infer_library_from_model_or_model_class(model=model)

_save_model(
ov_model,
output.parent / output,
ov_config=ov_config,
library_name=library_name,
)
return input_names, output_names, True


@@ -251,7 +262,19 @@ def export_pytorch_via_onnx(
)
torch.onnx.export = orig_torch_onnx_export
ov_model = convert_model(str(onnx_output))
_save_model(ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, ov_config=ov_config)

if model.__class__.__module__.startswith("optimum"):
# for wrapped models
library_name = TasksManager._infer_library_from_model_or_model_class(model=model.model)
else:
library_name = TasksManager._infer_library_from_model_or_model_class(model=model)

_save_model(
ov_model,
output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output,
ov_config=ov_config,
library_name=library_name,
)
return input_names, output_names, True


@@ -413,7 +436,18 @@ def ts_patched_forward(*args, **kwargs):
if stateful:
patch_stateful(model.config, ov_model)

_save_model(ov_model, output, ov_config=ov_config)
if model.__module__.startswith("optimum"):
# for wrapped models like timm in optimum.intel.openvino.modeling_timm
library_name = TasksManager._infer_library_from_model_or_model_class(model=model.model)
else:
library_name = TasksManager._infer_library_from_model_or_model_class(model=model)

_save_model(
ov_model,
output,
ov_config=ov_config,
library_name=library_name,
)
clear_class_registry()
del model
gc.collect()
@@ -422,7 +456,7 @@ def ts_patched_forward(*args, **kwargs):

def export_models(
models_and_export_configs: Dict[
str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"]
str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"], "OnnxConfig"]
],
output_dir: Path,
opset: Optional[int] = None,
@@ -491,7 +525,7 @@ def export_models(


def export_from_model(
model: Union["PreTrainedModel", "TFPreTrainedModel"],
model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"],
output: Union[str, Path],
task: Optional[str] = None,
ov_config: Optional["OVConfig"] = None,
@@ -505,14 +539,15 @@ def export_from_model(
trust_remote_code: bool = False,
**kwargs_shapes,
):
model_kwargs = model_kwargs or {}

if ov_config is not None and ov_config.quantization_config and not is_nncf_available():
raise ImportError(
f"Compression of the weights to {ov_config.quantization_config} requires nncf, please install it with `pip install nncf`"
)

model_kwargs = model_kwargs or {}
library_name = TasksManager._infer_library_from_model(model)
TasksManager.standardize_model_attributes(model, library_name)
library_name = TasksManager._infer_library_from_model_or_model_class(model=model)
TasksManager.standardize_model_attributes(model)

if hasattr(model.config, "export_model_type"):
model_type = model.config.export_model_type.replace("_", "-")
@@ -521,7 +556,7 @@

custom_architecture = library_name == "transformers" and model_type not in TasksManager._SUPPORTED_MODEL_TYPE

if task is not None:
if task is not None and task != "auto":
task = TasksManager.map_from_synonym(task)
else:
try:
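The repeated `__module__.startswith("optimum")` check above exists because optimum wrappers (such as the timm wrapper in `optimum.intel.openvino.modeling_timm`) hold the real model under `.model`, and library inference must run against the wrapped instance. A condensed sketch of that dispatch; the helper name is illustrative, and `_infer_library_from_model_or_model_class` is the private helper this diff relies on:

from optimum.exporters.tasks import TasksManager

def resolve_library_name(model):
    # Unwrap optimum wrappers before inferring the source library
    # (transformers, diffusers, timm, sentence_transformers, ...).
    if model.__class__.__module__.startswith("optimum"):
        return TasksManager._infer_library_from_model_or_model_class(model=model.model)
    return TasksManager._infer_library_from_model_or_model_class(model=model)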
3 changes: 1 addition & 2 deletions optimum/exporters/openvino/model_configs.py
@@ -49,7 +49,6 @@
ChatGLMModelPatcher,
CodeGenModelPatcher,
DBRXModelPatcher,
GemmaModelPatcher,
InternLM2Patcher,
InternLMModelPatcher,
JaisModelPatcher,
@@ -319,7 +318,7 @@ class GemmaOpenVINOConfig(GemmaOnnxConfig):
def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return GemmaModelPatcher(self, model, model_kwargs=model_kwargs)
return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)


@register_in_tasks_manager(
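With `GemmaModelPatcher` gone, Gemma reuses the shared `LlamaModelPatcher` defined in `model_patcher.py` (next file), since both models need the same causal-mask and rotary-embedding fixes. Patchers are context managers, as their `__enter__`/`__exit__` methods below show; a hedged sketch of how one wraps conversion (not the exact call site):

from typing import Any, Dict, Optional

def convert_with_patching(config, model, model_kwargs: Optional[Dict[str, Any]] = None):
    # patch_model_for_export returns a ModelPatcher; entering it installs
    # the patched forwards, exiting restores the originals.
    with config.patch_model_for_export(model, model_kwargs=model_kwargs):
        ...  # trace/convert the patched model here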
63 changes: 39 additions & 24 deletions optimum/exporters/openvino/model_patcher.py
@@ -497,50 +497,65 @@ def _llama_gemma_update_causal_mask_latest(
_llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy


class GemmaModelPatcher(DecoderModelPatcher):
def llama_gemma_rotary_emb_forward(self, x, position_ids, seq_len=None):
# adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L104
_seq_len = torch.max(position_ids) + 1 if seq_len is None else seq_len
if _seq_len > self.embed_positions.shape[0]:
if seq_len is None:
return self._orig_forward(x, position_ids)
else:
return self._orig_forward(x, position_ids, seq_len)
sincos = self.embed_positions[position_ids]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
return cos, sin


class LlamaModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()

# gemma has some accuracy issues with bf16 with transformers >= 4.39
# llama/gemma has some accuracy issues with bf16 with transformers >= 4.39
# fill causal mask in a slightly different way to avoid overflow on some platforms
if is_transformers_version(">=", "4.39.0"):
self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
self._model.model._update_causal_mask = types.MethodType(
_llama_gemma_update_causal_mask, self._model.model
)

# init inv_freq for torchscript tracing
# https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108
for layer in self._model.model.layers:
if layer.self_attn.rotary_emb.inv_freq is None:
rotary_emb = layer.self_attn.rotary_emb
layer.self_attn.rotary_emb.inv_freq = 1.0 / (
rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim)
)
max_positions = self._model.config.max_position_embeddings

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if hasattr(self._model.model, "_orig_update_causal_mask"):
self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
# cos/sin for rotary position embeddings also have bf16 accuracy and efficiency issues, since they are recalculated at every step
# use precomputed values instead
def create_sinusoidal_positions(num_pos: int, dim: int, base: int = 10000) -> torch.Tensor:
# adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L101
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))

sinusoid_inp = torch.einsum(
"i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq
).float()
emb = torch.cat((sinusoid_inp, sinusoid_inp), dim=-1)
return torch.cat((torch.sin(emb), torch.cos(emb)), dim=1)

class LlamaModelPatcher(DecoderModelPatcher):
def __enter__(self):
super().__enter__()
base = self._model.model.layers[0].self_attn.rotary_emb.base
dim = self._model.model.layers[0].self_attn.rotary_emb.dim
embed_positions = create_sinusoidal_positions(max_positions, dim, base)

# llama has some accuracy issues with bf16 with transformers >= 4.39
# fill causal mask in a slightly different way to avoid overflow on some platforms
if is_transformers_version(">=", "4.39.0"):
self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
self._model.model._update_causal_mask = types.MethodType(
_llama_gemma_update_causal_mask, self._model.model
)
for layer in self._model.model.layers:
layer.self_attn.rotary_emb.register_buffer("embed_positions", embed_positions)
layer.self_attn.rotary_emb._orig_forward = layer.self_attn.rotary_emb.forward

layer.self_attn.rotary_emb.forward = types.MethodType(
llama_gemma_rotary_emb_forward, layer.self_attn.rotary_emb
)

def __exit__(self, exc_type, exc_value, traceback):
super().__exit__(exc_type, exc_value, traceback)
if hasattr(self._model.model, "_orig_update_causal_mask"):
self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask

for layer in self._model.model.layers:
layer.self_attn.rotary_emb.forward = layer.self_attn.rotary_emb._orig_forward


SUPPORT_SDPA = is_torch_version(">", "2.1.0")

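The unified patcher replaces per-step rotary cos/sin computation with a precomputed sinusoidal table that the patched `forward` simply indexes with `position_ids`. A standalone sketch of the table construction and lookup, with tiny illustrative sizes:

import torch

def create_sinusoidal_positions(num_pos: int, dim: int, base: int = 10000) -> torch.Tensor:
    # Same construction as in the patcher above: per-position frequencies,
    # duplicated, then sin and cos halves concatenated on the last axis.
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
    emb = torch.cat((sinusoid_inp, sinusoid_inp), dim=-1)
    return torch.cat((torch.sin(emb), torch.cos(emb)), dim=1)

table = create_sinusoidal_positions(num_pos=16, dim=8)  # shape (16, 16)
position_ids = torch.tensor([[0, 1, 2]])
sincos = table[position_ids]  # (1, 3, 16), as in llama_gemma_rotary_emb_forward
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
print(sin.shape, cos.shape)  # torch.Size([1, 3, 8]) each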
