From 89497bee74bca21ef96aaa7971d96f53a43c275d Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 27 Dec 2024 16:29:23 +0400 Subject: [PATCH 1/8] support sana text2image --- optimum/exporters/openvino/convert.py | 70 +++++++++++++++++++++ optimum/exporters/openvino/model_configs.py | 49 +++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 22a3ca884e..7677b0158f 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1016,6 +1016,7 @@ def get_diffusion_models_for_export_ext( is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") + is_sana = pipeline.__class__.__name__.startswith("Sana") is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3 is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel") @@ -1034,11 +1035,80 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) + elif is_sana: + models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: raise ValueError(f"Unsupported pipeline type `{pipeline.__class__.__name__}` provided") return None, models_for_export +def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): + DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4 + DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4 + models_for_export = {} + text_encoder = pipeline.text_encoder + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + 
model_type="gemma2-text-encoder", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + text_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.caption_channels + transformer.config.requires_aesthetics_score = False + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="sana-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + vae_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + 
vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + vae_decoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + return models_for_export + + def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export = {} diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4b1dbb50b8..80c2ff24a8 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -57,6 +57,7 @@ DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, + DummySeq2SeqDecoderTextInputGenerator ) from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig @@ -133,6 +134,8 @@ def init_model_configs(): if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} + TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" supported_model_types = [ "_SUPPORTED_MODEL_TYPE", @@ -1890,6 +1893,52 @@ def rename_ambiguous_inputs(self, inputs): class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): pass +@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") +class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): + @property + def 
inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"} + } + + +class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "decoder_input_ids", + "decoder_attention_mask", + "encoder_outputs", + "encoder_hidden_states", + "encoder_attention_mask" + ) + + +class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name not in ["sample", "latent_sample"]: + return super().generate(input_name, framework, int_dtype, float_dtype) + return self.random_float_tensor( + shape=[self.batch_size, self.num_channels, self.height, self.width], + framework=framework, + dtype=float_dtype, + ) + +@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") +class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + hidden_size="cross_attention_dim", + vocab_size="attention_head_dim", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} + return common_inputs + class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( From 6f3deaed9e4cb75a178f683ba1c93256fed087c7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 13 Jan 2025 20:20:04 +0400 Subject: [PATCH 2/8] add pipeline --- optimum/commands/export/openvino.py | 7 ++ optimum/exporters/openvino/__main__.py | 6 ++ 
optimum/exporters/openvino/convert.py | 66 ++++++++++++++++--- optimum/exporters/openvino/model_configs.py | 56 +++++++++++----- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 36 ++++++++-- .../dummy_openvino_and_diffusers_objects.py | 11 ++++ tests/openvino/test_diffusion.py | 54 ++++++++++----- tests/openvino/test_exporters_cli.py | 8 ++- tests/openvino/utils_tests.py | 2 + 11 files changed, 203 insertions(+), 46 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 20e2d7ca33..d8c4edafda 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -105,6 +105,12 @@ def parse_args_openvino(parser: "ArgumentParser"): "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it." ), ) + optional_group.add_argument( + "--variant", + type=str, + default=None, + help=("Select a variant of the model to export."), + ) optional_group.add_argument( "--ratio", type=float, @@ -463,5 +469,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, + model_variant=self.args.variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 4047ab64aa..59a9dc41ab 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,6 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, + model_variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -237,6 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = model_loading_kwargs or {} + if model_variant is not None: + loading_kwargs["variant"] = model_variant if library_name == "transformers": config = 
AutoConfig.from_pretrained( model_name_or_path, @@ -347,6 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): + _loading_kwargs = {} if model_variant is None else {"variant": model_variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, @@ -355,6 +359,7 @@ class StoreAttr(object): local_files_only=local_files_only, force_download=force_download, trust_remote_code=trust_remote_code, + **_loading_kwargs, ) if dtype in [torch.float16, torch.bfloat16]: loading_kwargs["torch_dtype"] = dtype @@ -364,6 +369,7 @@ class StoreAttr(object): if library_name == "open_clip": model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir) else: + logger.warn(loading_kwargs) model = TasksManager.get_model_from_task( task, model_name_or_path, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 7677b0158f..cb5bf95181 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1013,6 +1013,7 @@ def _get_submodels_and_export_configs( def get_diffusion_models_for_export_ext( pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" ): +<<<<<<< HEAD is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") @@ -1035,6 +1036,52 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) +======= + if is_diffusers_version(">=", "0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline + + sd3_pipes = [StableDiffusion3Pipeline, 
StableDiffusion3Img2ImgPipeline] + if is_diffusers_version(">=", "0.30.0"): + from diffusers import StableDiffusion3InpaintPipeline + + sd3_pipes.append(StableDiffusion3InpaintPipeline) + + is_sd3 = isinstance(pipeline, tuple(sd3_pipes)) + logger.warn(f"IS SD3 {pipeline} {is_sd3}") + else: + is_sd3 = False + + if is_diffusers_version(">=", "0.30.0"): + from diffusers import FluxPipeline + + flux_pipes = [FluxPipeline] + + if is_diffusers_version(">=", "0.31.0"): + from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline + + flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline]) + + if is_diffusers_version(">=", "0.32.0"): + from diffusers import FluxFillPipeline + + flux_pipes.append(FluxFillPipeline) + + is_flux = isinstance(pipeline, tuple(flux_pipes)) + else: + is_flux = False + + if is_diffusers_version(">=", "0.32.0"): + from diffusers import SanaPipeline + + is_sana = isinstance(pipeline, SanaPipeline) + else: + is_sana = False + + if not any([is_sana, is_flux, is_sd3]): + return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + if is_sd3: + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) +>>>>>>> add pipeline elif is_sana: models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: @@ -1043,17 +1090,15 @@ def get_diffusion_models_for_export_ext( def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): - DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4 - DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4 models_for_export = {} text_encoder = pipeline.text_encoder text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=text_encoder, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", - model_type="gemma2-text-encoder", - ) + model=text_encoder, + exporter=exporter, + library_name="diffusers", + 
task="feature-extraction", + model_type="gemma2-text-encoder", + ) text_encoder_export_config = text_encoder_config_constructor( pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) @@ -1076,13 +1121,13 @@ def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export["transformer"] = (transformer, transformer_export_config) # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 vae_encoder = copy.deepcopy(pipeline.vae) - vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]} vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, exporter=exporter, library_name="diffusers", task="semantic-segmentation", - model_type="vae-encoder", + model_type="dcae-encoder", ) vae_encoder_export_config = vae_config_constructor( vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype @@ -1140,6 +1185,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): task="semantic-segmentation", model_type="sd3-transformer", ) + logger.warn(f"TRANSFORMER COFG {export_config_constructor}") transformer_export_config = export_config_constructor( pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype ) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 80c2ff24a8..1f7695cf82 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -41,6 +41,7 @@ PhiOnnxConfig, T5OnnxConfig, UNetOnnxConfig, + VaeEncoderOnnxConfig, VisionOnnxConfig, WhisperOnnxConfig, ) @@ -57,7 +58,6 @@ DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, - DummySeq2SeqDecoderTextInputGenerator ) from optimum.utils.normalized_config import 
NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig @@ -1893,52 +1893,78 @@ def rename_ambiguous_inputs(self, inputs): class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): pass + @register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "input_ids": {0: "batch_size", 1: "sequence_length"}, - "attention_mask": {0: "batch_size", 1: "sequence_length"} + "attention_mask": {0: "batch_size", 1: "sequence_length"}, } -class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): +class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( "decoder_input_ids", "decoder_attention_mask", "encoder_outputs", "encoder_hidden_states", - "encoder_attention_mask" + "encoder_attention_mask", ) -class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator): - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - if input_name not in ["sample", "latent_sample"]: - return super().generate(input_name, framework, int_dtype, float_dtype) - return self.random_float_tensor( - shape=[self.batch_size, self.num_channels, self.height, self.width], - framework=framework, - dtype=float_dtype, - ) +class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"] // 8, + height: int = DEFAULT_DUMMY_SHAPES["height"] // 8, + # Reduce img shape by 4 for FLUX to reduce memory usage on conversion + **kwargs, + ): + super().__init__(task, normalized_config, batch_size, num_channels, width=width, 
height=height, **kwargs) + @register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", num_channels="in_channels", - hidden_size="cross_attention_dim", + hidden_size="caption_channels", vocab_size="attention_head_dim", allow_new=True, ) - DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummySanaTransformerVisionInputGenerator, + DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator, + ) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + @property def inputs(self): common_inputs = super().inputs common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} return common_inputs + def rename_ambiguous_inputs(self, inputs): + # The input name in the model signature is `x, hence the export input name is updated. 
+ hidden_states = inputs.pop("sample", None) + if hidden_states is not None: + inputs["hidden_states"] = hidden_states + return inputs + + +@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers") +class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + } + class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 91aaf57ae0..2c85dcc98d 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -127,6 +127,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", @@ -150,6 +151,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8945dc6382..d3142ad802 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -91,6 +91,7 @@ OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVSanaPipeline, OVStableDiffusion3Img2ImgPipeline, OVStableDiffusion3InpaintPipeline, OVStableDiffusion3Pipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index bc2f75e0ce..72ebf46887 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -103,9 +103,10 @@ FluxInpaintPipeline = object if is_diffusers_version(">=", "0.32.0"): - from diffusers import FluxFillPipeline + from diffusers import FluxFillPipeline, SanaPipeline else: FluxFillPipeline = object + SanaPipeline = object 
DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" @@ -817,9 +818,14 @@ def reshape( if self.tokenizer is None and self.tokenizer_2 is None: tokenizer_max_len = -1 else: - tokenizer_max_len = ( - self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length - ) + if self.tokenizer is not None and "Gemma" in self.tokenizer.__class__.__name__: + tokenizer_max_len = -1 + else: + tokenizer_max_len = ( + self.tokenizer.model_max_length + if self.tokenizer is not None + else self.tokenizer_2.model_max_length + ) if self.unet is not None: self.unet.model = self._reshape_unet( @@ -1041,6 +1047,7 @@ def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPi self.hidden_states_output_names = [ name for out in self.model.outputs for name in out.names if name.startswith("hidden_states") ] + self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( self, @@ -1052,6 +1059,11 @@ def forward( self._compile() model_inputs = {"input_ids": input_ids} + if "attention_mask" in self.input_names: + model_inputs["attention_mask"] = ( + attention_mask if attention_mask is not None else torch.ones(input_ids.shape, dtype=torch.long) + ) + ov_outputs = self.request(model_inputs, share_inputs=True) main_out = ov_outputs[0] model_outputs = {} @@ -1139,6 +1151,8 @@ def forward( guidance: torch.Tensor = None, block_controlnet_hidden_states: List = None, joint_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: torch.LongTensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ): self._compile() @@ -1147,9 +1161,10 @@ def forward( "hidden_states": hidden_states, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, } + if pooled_projections is not None: + model_inputs["pooled_projections"] = pooled_projections if img_ids is not None: model_inputs["img_ids"] = img_ids if txt_ids is not None: 
@@ -1157,6 +1172,9 @@ def forward( if guidance is not None: model_inputs["guidance"] = guidance + if encoder_attention_mask is not None: + model_inputs["encoder_attention_mask"] = encoder_attention_mask + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() model_outputs = {} @@ -1498,6 +1516,12 @@ class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flu auto_model_class = FluxFillPipeline +class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = SanaPipeline + + SUPPORTED_OV_PIPELINES = [ OVStableDiffusionPipeline, OVStableDiffusionImg2ImgPipeline, @@ -1569,6 +1593,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru if is_diffusers_version(">=", "0.32.0"): OV_INPAINT_PIPELINES_MAPPING["flux-fill"] = OVFluxFillPipeline SUPPORTED_OV_PIPELINES.append(OVFluxFillPipeline) + OV_TEXT2IMAGE_PIPELINES_MAPPING["sana"] = OVSanaPipeline + SUPPORTED_OV_PIPELINES.append(OVSanaPipeline) SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 2e669875fc..f6341e5435 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -222,3 +222,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVSanaPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 
477799345b..501960cc9f 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -77,8 +77,8 @@ class OVPipelineForText2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): - SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3") + SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3", "sana"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -94,6 +94,13 @@ def generate_inputs(self, height=128, width=128, batch_size=1): return inputs + def get_auto_cls(self, model_arch): + if model_arch == "sana": + from diffusers import SanaPipeline + + return SanaPipeline + return self.AUTOMODEL_CLASS + @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -104,12 +111,14 @@ def test_load_vanilla_model_which_is_not_supported(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_ov_pipeline_class_dispatch(self, model_arch: str): - auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_cls = self.get_auto_cls(model_arch) + auto_pipeline = DiffusionPipeline if model_arch != "sana" else auto_cls + auto_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) - auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipeline = auto_pipeline.from_pretrained(MODEL_NAMES[model_arch]) 
ov_pipeline = OVDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) @@ -130,30 +139,44 @@ def test_num_images_per_prompt(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - height, width, batch_size = 128, 128, 1 + height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - + auto_cls = self.get_auto_cls(model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - - for output_type in ["latent", "np", "pt"]: - inputs["output_type"] = output_type + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) - ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + with torch.no_grad(): + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + if model_arch == "sana": + if output_type == "latent": + continue + inputs["use_resolution_binning"] = False + atol = 4e-2 + else: + atol = 6e-3 - np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) # test on inputs nondivisible on 64 height, width, batch_size = 96, 96, 1 for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type + if model_arch == "sana": + if output_type == "latent": + continue + inputs["use_resolution_binning"] = False + atol = 4e-2 + else: + atol = 6e-3 ov_output = ov_pipeline(**inputs, 
generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES) @require_diffusers @@ -174,7 +197,8 @@ def __call__(self, *args, **kwargs) -> None: auto_callback = Callback() ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_cls = self.get_auto_cls(model_arch) + auto_pipe = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) # callback_steps=1 to trigger callback every step ov_pipe(**inputs, callback=ov_callback, callback_steps=1) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index f4b96ec998..6f047c0bee 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -84,7 +84,12 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_ARCHITECTURES.extend( - [("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux"), ("inpainting", "flux-fill")] + [ + ("text-to-image", "stable-diffusion-3"), + ("text-to-image", "flux"), + ("inpainting", "flux-fill"), + ("text-to-image", "sana"), + ] ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, @@ -113,6 +118,7 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56)) + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 7, 56)) SUPPORTED_QUANTIZATION_ARCHITECTURES = [ ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 3100df6159..634c5ba5e0 
100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -168,6 +168,7 @@ "open-clip-ov": "zofinka/tiny-open-clip-model", "st-bert": "sentence-transformers/all-MiniLM-L6-v2", "st-mpnet": "sentence-transformers/all-mpnet-base-v2", + "sana": "/home/ea/work/my_optimum_intel/optimum-intel/tiny-random-sana", } @@ -200,6 +201,7 @@ "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), "qwen2_vl": (30, 1, 1, 10), + "sana": (242, 34, 42, 64), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From 6b98d62a6f213576c3d1f289b93dfb462fdcb784 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 13 Jan 2025 22:31:59 +0400 Subject: [PATCH 3/8] update tests --- optimum/commands/export/openvino.py | 4 ++++ optimum/exporters/openvino/convert.py | 2 -- optimum/intel/openvino/modeling_diffusion.py | 12 +++++++++--- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_diffusion.py | 4 +++- tests/openvino/test_exporters_cli.py | 14 +++++++++++--- tests/openvino/test_quantization.py | 2 ++ tests/openvino/utils_tests.py | 4 ++-- 8 files changed, 32 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d8c4edafda..161534fad3 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -416,6 +416,10 @@ def run(self): from optimum.intel import OVFluxPipeline model_cls = OVFluxPipeline + elif class_name == "SanaPipeline": + from optimum.intel import OVSanaPipeline + + model_cls = OVSanaPipeline else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index cb5bf95181..4614d93689 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1047,7 +1047,6 @@ def get_diffusion_models_for_export_ext( sd3_pipes.append(StableDiffusion3InpaintPipeline) is_sd3 = 
isinstance(pipeline, tuple(sd3_pipes)) - logger.warn(f"IS SD3 {pipeline} {is_sd3}") else: is_sd3 = False @@ -1185,7 +1184,6 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): task="semantic-segmentation", model_type="sd3-transformer", ) - logger.warn(f"TRANSFORMER COFG {export_config_constructor}") transformer_export_config = export_config_constructor( pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype ) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 72ebf46887..4a3f7104b7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -844,17 +844,23 @@ def reshape( if self.text_encoder is not None: self.text_encoder.model = self._reshape_text_encoder( - self.text_encoder.model, batch_size, self.tokenizer.model_max_length + self.text_encoder.model, + batch_size, + self.tokenizer.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) if self.text_encoder_2 is not None: self.text_encoder_2.model = self._reshape_text_encoder( - self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length + self.text_encoder_2.model, + batch_size, + self.tokenizer_2.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder( - self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length + self.text_encoder_3.model, + batch_size, + self.tokenizer_3.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) self.clear_requests() diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index fbb108c7d8..d9df9419ad 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -125,6 +125,7 @@ "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", 
"stable-diffusion-3": "OVStableDiffusion3Pipeline", + "sana": "OVSanaPipeline", "flux": "OVFluxPipeline", "flux-fill": "OVFluxFillPipeline", "pix2struct": "OVModelForPix2Struct", diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 501960cc9f..b155353fc3 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -78,7 +78,7 @@ class OVPipelineForText2ImageTest(unittest.TestCase): NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3", "sana"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append(["stable-diffusion-3"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -215,6 +215,8 @@ def test_shape(self, model_arch: str): height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + if model_arch == "sana": + inputs["use_resolution_binning"] = False for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 6f047c0bee..7fbeb2e9e4 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -44,6 +44,7 @@ OVModelOpenCLIPForZeroShotImageClassification, OVModelOpenCLIPText, OVModelOpenCLIPVisual, + OVSanaPipeline, OVSentenceTransformer, OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, @@ -107,6 +108,7 @@ class OVCLIExportTestCase(unittest.TestCase): "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "llava": 2 if 
is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, + "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, } SUPPORTED_SD_HYBRID_ARCHITECTURES = [ @@ -118,7 +120,7 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56)) - SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 7, 56)) + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 19, 53)) SUPPORTED_QUANTIZATION_ARCHITECTURES = [ ( @@ -357,9 +359,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str): models = [model.encoder, model.decoder] if task.endswith("with-past") and not model.decoder.stateful: models.append(model.decoder_with_past) - elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"): + elif ( + model_type.startswith("stable-diffusion") + or model_type.startswith("flux") + or model_type.startswith("sana") + ): models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] - models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) + models.append( + model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2 + ) elif task.startswith("image-text-to-text"): models = [model.language_model, model.vision_embeddings] else: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 26ad44401a..4da88418b1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -59,6 +59,7 @@ OVStableDiffusionXLPipeline, OVStableDiffusion3Pipeline, OVQuantizer, + OVSanaPipeline, OVTrainer, OVQuantizationConfig, OVWeightQuantizationConfig, @@ -543,6 +544,7 @@ class OVWeightCompressionTest(unittest.TestCase): [ (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65), (OVFluxPipeline, "flux", 7, 56), + 
(OVSanaPipeline, "sana", 19, 53), ] ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 634c5ba5e0..83ea3751d6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -168,7 +168,7 @@ "open-clip-ov": "zofinka/tiny-open-clip-model", "st-bert": "sentence-transformers/all-MiniLM-L6-v2", "st-mpnet": "sentence-transformers/all-mpnet-base-v2", - "sana": "/home/ea/work/my_optimum_intel/optimum-intel/tiny-random-sana", + "sana": "katuni4ka/tiny-random-sana", } @@ -201,7 +201,7 @@ "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), "qwen2_vl": (30, 1, 1, 10), - "sana": (242, 34, 42, 64), + "sana": (58, 28, 28, 18), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From 5ae26d0c7f753fed46d6d7a1f2ce27aec6db4ad9 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 17 Jan 2025 12:08:13 +0400 Subject: [PATCH 4/8] add variant for model loading in from_transformers --- optimum/exporters/openvino/convert.py | 48 +------------------ optimum/exporters/openvino/model_configs.py | 6 +++ optimum/exporters/openvino/model_patcher.py | 38 +++++++++++++-- optimum/exporters/openvino/utils.py | 12 +++-- optimum/intel/openvino/modeling_base.py | 3 ++ .../intel/openvino/modeling_base_seq2seq.py | 2 + optimum/intel/openvino/modeling_decoder.py | 3 ++ optimum/intel/openvino/modeling_diffusion.py | 2 + .../openvino/modeling_visual_language.py | 2 + tests/openvino/test_diffusion.py | 14 ++---- 10 files changed, 66 insertions(+), 64 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4614d93689..0d6c2b4367 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1013,11 +1013,10 @@ def _get_submodels_and_export_configs( def get_diffusion_models_for_export_ext( pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" ): -<<<<<<< HEAD is_sdxl = 
pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") - is_sana = pipeline.__class__.__name__.startswith("Sana") + is_sana = pipeline.__class__.__name__.startswith("Sana") is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3 is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel") @@ -1036,51 +1035,6 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) -======= - if is_diffusers_version(">=", "0.29.0"): - from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline - - sd3_pipes = [StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline] - if is_diffusers_version(">=", "0.30.0"): - from diffusers import StableDiffusion3InpaintPipeline - - sd3_pipes.append(StableDiffusion3InpaintPipeline) - - is_sd3 = isinstance(pipeline, tuple(sd3_pipes)) - else: - is_sd3 = False - - if is_diffusers_version(">=", "0.30.0"): - from diffusers import FluxPipeline - - flux_pipes = [FluxPipeline] - - if is_diffusers_version(">=", "0.31.0"): - from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline - - flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline]) - - if is_diffusers_version(">=", "0.32.0"): - from diffusers import FluxFillPipeline - - flux_pipes.append(FluxFillPipeline) - - is_flux = isinstance(pipeline, tuple(flux_pipes)) - else: - is_flux = False - - if is_diffusers_version(">=", "0.32.0"): - from diffusers import SanaPipeline - - is_sana = isinstance(pipeline, SanaPipeline) - else: - is_sana = False - - if not any([is_sana, is_flux, is_sd3]): - return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) - if is_sd3: - models_for_export = 
get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) ->>>>>>> add pipeline elif is_sana: models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 1f7695cf82..e73039159c 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -106,6 +106,7 @@ Qwen2VLVisionEmbMergerPatcher, QwenModelPatcher, RotaryEmbPatcher, + SanaTextEncoderModelPatcher, StatefulSeq2SeqDecoderPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, @@ -1903,6 +1904,11 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "attention_mask": {0: "batch_size", 1: "sequence_length"}, } + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return SanaTextEncoderModelPatcher(self, model, model_kwargs) + class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e7a7779389..08bc149880 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -21,9 +21,11 @@ import torch import torch.nn.functional as F +from transformers import PreTrainedModel, TFPreTrainedModel from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available +from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.model_patcher import ( DecoderModelPatcher, ModelPatcher, @@ -114,9 +116,11 @@ def patch_model_with_bettertransformer(model): return model -def patch_update_causal_mask(model, transformers_version, inner_model_name="model", patch_fn=None): +def patch_update_causal_mask( + model, 
transformers_version, inner_model_name="model", patch_fn=None, patch_external_model=False +): if is_transformers_version(">=", transformers_version): - inner_model = getattr(model, inner_model_name, None) + inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model if inner_model is not None: if hasattr(inner_model, "_update_causal_mask"): inner_model._orig_update_causal_mask = inner_model._update_causal_mask @@ -124,8 +128,8 @@ def patch_update_causal_mask(model, transformers_version, inner_model_name="mode inner_model._update_causal_mask = types.MethodType(patch_fn, inner_model) -def unpatch_update_causal_mask(model, inner_model_name="model"): - inner_model = getattr(model, inner_model_name, None) +def unpatch_update_causal_mask(model, inner_model_name="model", patch_external_model=False): + inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model if inner_model is not None and hasattr(inner_model, "._orig_update_causal_mask"): inner_model._update_causal_mask = inner_model._orig_update_causal_mask @@ -3791,3 +3795,29 @@ def patched_forward(*args, **kwargs): model.forward = patched_forward super().__init__(config, model, model_kwargs) + + +class SanaTextEncoderModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + patch_update_causal_mask(self._model, "4.39.0", None, patch_external_model=True) + + if self._model.config._attn_implementation != "sdpa": + self._model.config._orig_attn_implementation = self._model.config._attn_implementation + self._model.config._attn_implementation = "sdpa" + if is_transformers_version("<", "4.47.0"): + from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES + + sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"] + for layer in self._model.layers: + layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn) + + def __exit__(self, exc_type, exc_value, 
traceback): + super().__exit__(exc_type, exc_value, traceback) + unpatch_update_causal_mask(self._model, None, True) + if hasattr(self._model.config, "_orig_attn_implementation"): + self._model.config._attn_implementation = self._model.config._orig_attn_implementation + for layer in self._model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 46b151e7de..1743dc59b1 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -257,9 +257,15 @@ def deduce_diffusers_dtype(model_name_or_path, **loading_kwargs): model_part_name = "unet" if model_part_name: directory = path / model_part_name - safetensors_files = [ - filename for filename in directory.glob("*.safetensors") if len(filename.suffixes) == 1 - ] + + pattern = "*.safetensors" + if "variant" in loading_kwargs: + variant = loading_kwargs["variant"] + pattern = f"*.{variant}.safetensors" + safetensors_files = list(directory.glob(pattern)) + else: + # filter out variant files + safetensors_files = [filename for filename in directory.glob(pattern) if len(filename.suffixes) == 1] safetensors_file = None if len(safetensors_files) > 0: safetensors_file = safetensors_files.pop(0) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 99422f1a54..3fd26a6e0d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -594,6 +594,8 @@ def _from_transformers( else: ov_config = OVConfig(dtype="fp32") + variant = kwargs.pop("variant", None) + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -607,6 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py 
b/optimum/intel/openvino/modeling_base_seq2seq.py index 11ee8f89a7..c60c0ec702 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -408,6 +408,7 @@ def _from_transformers( else: ov_config = OVConfig(dtype="fp32") stateful = kwargs.get("stateful", True) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -422,6 +423,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4897db1459..b411bf07d9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -310,6 +310,8 @@ def _from_transformers( if torch_dtype is not None: model_loading_kwargs["torch_dtype"] = torch_dtype + variant = kwargs.pop("variant", None) + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -325,6 +327,7 @@ def _from_transformers( stateful=stateful, model_loading_kwargs=model_loading_kwargs, library_name=cls._library_name, + model_variant=variant, ) if config.model_type == "phi3" and config.max_position_embeddings != getattr( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 4a3f7104b7..c2e245c5e7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -575,6 +575,7 @@ def _from_transformers( model_save_dir = TemporaryDirectory() model_save_path = Path(model_save_dir.name) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -589,6 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_visual_language.py 
b/optimum/intel/openvino/modeling_visual_language.py index 1c0e35cca2..c7cd7227f2 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -615,6 +615,7 @@ def _from_transformers( ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -629,6 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, + model_variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained( diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index b155353fc3..bff1879340 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -149,12 +149,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type if model_arch == "sana": - if output_type == "latent": - continue + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations inputs["use_resolution_binning"] = False - atol = 4e-2 - else: - atol = 6e-3 + atol = 1e-4 ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images @@ -166,12 +163,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type if model_arch == "sana": - if output_type == "latent": - continue + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations inputs["use_resolution_binning"] = False - atol = 4e-2 - else: - atol = 
6e-3 + atol = 6e-3 ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images From 0c1189ad23b183433cae772e193d48ec0da55be3 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 Jan 2025 12:11:23 +0400 Subject: [PATCH 5/8] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 59a9dc41ab..520a28559f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -369,7 +369,6 @@ class StoreAttr(object): if library_name == "open_clip": model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir) else: - logger.warn(loading_kwargs) model = TasksManager.get_model_from_task( task, model_name_or_path, From 23e298c06f0acf811478ccc13d435268dda8141b Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 17 Jan 2025 12:15:25 +0400 Subject: [PATCH 6/8] provide missed params to data-aware cli --- optimum/commands/export/openvino.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 161534fad3..695c22e985 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -452,6 +452,8 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, + variant=self.args.variant, + cache_dir=self.args.cache_dir, ) model.save_pretrained(self.args.output) From 95c3e2c5b40e3b2e548bef53bca9e3811d0bef15 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 28 Jan 2025 17:05:56 +0400 Subject: [PATCH 7/8] apply review comments --- optimum/commands/export/openvino.py | 8 ++-- optimum/exporters/openvino/__main__.py | 8 ++-- 
optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 24 +++++------- .../openvino/modeling_visual_language.py | 2 +- tests/openvino/test_diffusion.py | 38 +++++++------------ 7 files changed, 34 insertions(+), 50 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 695c22e985..6d430cc611 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -106,10 +106,10 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--variant", + "--weights-variant", type=str, default=None, - help=("Select a variant of the model to export."), + help=("If specified load weights from variant filename."), ) optional_group.add_argument( "--ratio", @@ -452,7 +452,7 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, - variant=self.args.variant, + variant=self.args.weights_variant, cache_dir=self.args.cache_dir, ) model.save_pretrained(self.args.output) @@ -475,6 +475,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, - model_variant=self.args.variant, + weights_variant=self.args.weights_variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 520a28559f..afc8718dea 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,7 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, - model_variant: Optional[str] = None, + weights_variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -238,8 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = 
model_loading_kwargs or {} - if model_variant is not None: - loading_kwargs["variant"] = model_variant + if weights_variant is not None: + loading_kwargs["variant"] = weights_variant if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -350,7 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): - _loading_kwargs = {} if model_variant is None else {"variant": model_variant} + _loading_kwargs = {} if weights_variant is None else {"variant": weights_variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 3fd26a6e0d..25064dd044 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -609,7 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, - model_variant=variant, + weights_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b411bf07d9..b74c4dc623 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -327,7 +327,7 @@ def _from_transformers( stateful=stateful, model_loading_kwargs=model_loading_kwargs, library_name=cls._library_name, - model_variant=variant, + weights_variant=variant, ) if config.model_type == "phi3" and config.max_position_embeddings != getattr( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c2e245c5e7..2613d26c68 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -590,7 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, -
model_variant=variant, + weights_variant=variant, ) return cls._from_pretrained( @@ -767,7 +767,7 @@ def _reshape_text_encoder( self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1 ): if batch_size != -1: - shapes = {model.inputs[0]: [batch_size, tokenizer_max_length]} + shapes = {input_tensor: [batch_size, tokenizer_max_length] for input_tensor in model.inputs} model.reshape(shapes) return model @@ -824,9 +824,9 @@ def reshape( tokenizer_max_len = -1 else: tokenizer_max_len = ( - self.tokenizer.model_max_length + getattr(self.tokenizer, "model_max_length", -1) if self.tokenizer is not None - else self.tokenizer_2.model_max_length + else getattr(self.tokenizer_2, "model_max_length", -1) ) if self.unet is not None: @@ -848,21 +848,19 @@ def reshape( self.text_encoder.model = self._reshape_text_encoder( self.text_encoder.model, batch_size, - self.tokenizer.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + getattr(self.tokenizer, "model_max_length", -1) + if "Gemma" not in self.tokenizer.__class__.__name__ + else -1, ) if self.text_encoder_2 is not None: self.text_encoder_2.model = self._reshape_text_encoder( - self.text_encoder_2.model, - batch_size, - self.tokenizer_2.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, "model_max_length", -1) ) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder( - self.text_encoder_3.model, - batch_size, - self.tokenizer_3.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + self.text_encoder_3.model, batch_size, getattr(self.tokenizer_3, "model_max_length", -1) ) self.clear_requests() @@ -1068,9 +1066,7 @@ def forward( model_inputs = {"input_ids": input_ids} if "attention_mask" in self.input_names: - model_inputs["attention_mask"] = ( - attention_mask if attention_mask is not None else 
torch.ones(input_ids.shape, dtype=torch.long) - ) + model_inputs["attention_mask"] = attention_mask ov_outputs = self.request(model_inputs, share_inputs=True) main_out = ov_outputs[0] diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c7cd7227f2..b89c238b14 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -630,7 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - model_variant=variant, + weights_variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained( diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index bff1879340..6ffe891c41 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -78,7 +78,7 @@ class OVPipelineForText2ImageTest(unittest.TestCase): NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append(["stable-diffusion-3"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -94,13 +94,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1): return inputs - def get_auto_cls(self, model_arch): - if model_arch == "sana": - from diffusers import SanaPipeline - - return SanaPipeline - return self.AUTOMODEL_CLASS - @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -111,9 +104,7 @@ def test_load_vanilla_model_which_is_not_supported(self): 
@parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_ov_pipeline_class_dispatch(self, model_arch: str): - auto_cls = self.get_auto_cls(model_arch) - auto_pipeline = DiffusionPipeline if model_arch != "sana" else auto_cls - auto_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) @@ -141,21 +132,19 @@ def test_num_images_per_prompt(self, model_arch: str): def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - auto_cls = self.get_auto_cls(model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - with torch.no_grad(): - for output_type in ["latent", "np", "pt"]: - inputs["output_type"] = output_type - if model_arch == "sana": - # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations - inputs["use_resolution_binning"] = False - atol = 1e-4 + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + if model_arch == "sana": + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations + inputs["use_resolution_binning"] = False + atol = 1e-4 - ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) + ov_output = ov_pipeline(**inputs, 
generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) # test on inputs nondivisible on 64 height, width, batch_size = 96, 96, 1 @@ -191,8 +180,7 @@ def __call__(self, *args, **kwargs) -> None: auto_callback = Callback() ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - auto_cls = self.get_auto_cls(model_arch) - auto_pipe = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipe = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) # callback_steps=1 to trigger callback every step ov_pipe(**inputs, callback=ov_callback, callback_steps=1) From 9fdddc0e09345925be2e2b97bb0dc5e498bc484b Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 29 Jan 2025 18:39:54 +0400 Subject: [PATCH 8/8] rename weights_variant to variant --- optimum/commands/export/openvino.py | 6 +++--- optimum/exporters/openvino/__main__.py | 8 ++++---- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/modeling_visual_language.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 6d430cc611..0c6e692a91 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -106,7 +106,7 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--weights-variant", + "--variant", type=str, default=None, help=("If specified load weights from variant filename."), @@ -452,7 +452,7 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, - variant=self.args.weights_variant, + variant=self.args.variant, cache_dir=self.args.cache_dir, ) 
model.save_pretrained(self.args.output) @@ -475,6 +475,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, - weights_variant=self.args.weights_variant, + variant=self.args.variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index afc8718dea..88c738999a 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,7 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, - weights_variant: Optional[str] = None, + variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -238,8 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = model_loading_kwargs or {} - if weights_variant is not None: - loading_kwargs["variant"] = weights_variant + if variant is not None: + loading_kwargs["variant"] = variant if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -350,7 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): - _loading_kwargs = {} if weights_variant is None else {"variant": weights_variant} + _loading_kwargs = {} if variant is None else {"variant": variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 25064dd044..3902deff4c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -609,7 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, - weights_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git 
a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index c60c0ec702..ba0d426e90 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -423,7 +423,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - model_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2613d26c68..f6c4fc37a8 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -590,7 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, - weights_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index b89c238b14..023f46896a 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -630,7 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - weights_variant=variant, + variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained(