From 89497bee74bca21ef96aaa7971d96f53a43c275d Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 27 Dec 2024 16:29:23 +0400 Subject: [PATCH 1/8] support sana text2image --- optimum/exporters/openvino/convert.py | 70 +++++++++++++++++++++ optimum/exporters/openvino/model_configs.py | 49 +++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 22a3ca884e..7677b0158f 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1016,6 +1016,7 @@ def get_diffusion_models_for_export_ext( is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") + is_sana = pipeline.__class__.__name__.startswith("Sana") is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3 is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel") @@ -1034,11 +1035,80 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) + elif is_sana: + models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: raise ValueError(f"Unsupported pipeline type `{pipeline.__class__.__name__}` provided") return None, models_for_export +def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): + DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4 + DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4 + models_for_export = {} + text_encoder = pipeline.text_encoder + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder, + exporter=exporter, + library_name="diffusers", + task="feature-extraction", + 
model_type="gemma2-text-encoder", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + text_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config) + transformer = pipeline.transformer + transformer.config.text_encoder_projection_dim = transformer.config.caption_channels + transformer.config.requires_aesthetics_score = False + transformer.config.time_cond_proj_dim = None + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="sana-transformer", + ) + transformer_export_config = export_config_constructor( + pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (transformer, transformer_export_config) + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-encoder", + ) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + vae_encoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + 
vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter=exporter, + library_name="diffusers", + task="semantic-segmentation", + model_type="vae-decoder", + ) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + vae_decoder_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"} + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) + + return models_for_export + + def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export = {} diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 4b1dbb50b8..80c2ff24a8 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -57,6 +57,7 @@ DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, + DummySeq2SeqDecoderTextInputGenerator ) from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig @@ -133,6 +134,8 @@ def init_model_configs(): if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} + TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") + TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" supported_model_types = [ "_SUPPORTED_MODEL_TYPE", @@ -1890,6 +1893,52 @@ def rename_ambiguous_inputs(self, inputs): class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): pass +@register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") +class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): + @property + def 
inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"} + } + + +class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "decoder_input_ids", + "decoder_attention_mask", + "encoder_outputs", + "encoder_hidden_states", + "encoder_attention_mask" + ) + + +class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name not in ["sample", "latent_sample"]: + return super().generate(input_name, framework, int_dtype, float_dtype) + return self.random_float_tensor( + shape=[self.batch_size, self.num_channels, self.height, self.width], + framework=framework, + dtype=float_dtype, + ) + +@register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") +class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + hidden_size="cross_attention_dim", + vocab_size="attention_head_dim", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} + return common_inputs + class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( From 6f3deaed9e4cb75a178f683ba1c93256fed087c7 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 13 Jan 2025 20:20:04 +0400 Subject: [PATCH 2/8] add pipeline --- optimum/commands/export/openvino.py | 7 ++ optimum/exporters/openvino/__main__.py | 6 ++ 
optimum/exporters/openvino/convert.py | 66 ++++++++++++++++--- optimum/exporters/openvino/model_configs.py | 56 +++++++++++----- optimum/intel/__init__.py | 2 + optimum/intel/openvino/__init__.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 36 ++++++++-- .../dummy_openvino_and_diffusers_objects.py | 11 ++++ tests/openvino/test_diffusion.py | 54 ++++++++++----- tests/openvino/test_exporters_cli.py | 8 ++- tests/openvino/utils_tests.py | 2 + 11 files changed, 203 insertions(+), 46 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 20e2d7ca33..d8c4edafda 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -105,6 +105,12 @@ def parse_args_openvino(parser: "ArgumentParser"): "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it." ), ) + optional_group.add_argument( + "--variant", + type=str, + default=None, + help=("Select a variant of the model to export."), + ) optional_group.add_argument( "--ratio", type=float, @@ -463,5 +469,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, + model_variant=self.args.variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 4047ab64aa..59a9dc41ab 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,6 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, + model_variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -237,6 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = model_loading_kwargs or {} + if model_variant is not None: + loading_kwargs["variant"] = model_variant if library_name == "transformers": config = 
AutoConfig.from_pretrained( model_name_or_path, @@ -347,6 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): + _loading_kwargs = {} if model_variant is None else {"variant": model_variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, @@ -355,6 +359,7 @@ class StoreAttr(object): local_files_only=local_files_only, force_download=force_download, trust_remote_code=trust_remote_code, + **_loading_kwargs, ) if dtype in [torch.float16, torch.bfloat16]: loading_kwargs["torch_dtype"] = dtype @@ -364,6 +369,7 @@ class StoreAttr(object): if library_name == "open_clip": model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir) else: + logger.warn(loading_kwargs) model = TasksManager.get_model_from_task( task, model_name_or_path, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 7677b0158f..cb5bf95181 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1013,6 +1013,7 @@ def _get_submodels_and_export_configs( def get_diffusion_models_for_export_ext( pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" ): +<<<<<<< HEAD is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") @@ -1035,6 +1036,52 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) +======= + if is_diffusers_version(">=", "0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline + + sd3_pipes = [StableDiffusion3Pipeline, 
StableDiffusion3Img2ImgPipeline] + if is_diffusers_version(">=", "0.30.0"): + from diffusers import StableDiffusion3InpaintPipeline + + sd3_pipes.append(StableDiffusion3InpaintPipeline) + + is_sd3 = isinstance(pipeline, tuple(sd3_pipes)) + logger.warn(f"IS SD3 {pipeline} {is_sd3}") + else: + is_sd3 = False + + if is_diffusers_version(">=", "0.30.0"): + from diffusers import FluxPipeline + + flux_pipes = [FluxPipeline] + + if is_diffusers_version(">=", "0.31.0"): + from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline + + flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline]) + + if is_diffusers_version(">=", "0.32.0"): + from diffusers import FluxFillPipeline + + flux_pipes.append(FluxFillPipeline) + + is_flux = isinstance(pipeline, tuple(flux_pipes)) + else: + is_flux = False + + if is_diffusers_version(">=", "0.32.0"): + from diffusers import SanaPipeline + + is_sana = isinstance(pipeline, SanaPipeline) + else: + is_sana = False + + if not any([is_sana, is_flux, is_sd3]): + return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) + if is_sd3: + models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) +>>>>>>> add pipeline elif is_sana: models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: @@ -1043,17 +1090,15 @@ def get_diffusion_models_for_export_ext( def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): - DEFAULT_DUMMY_SHAPES["heigh"] = DEFAULT_DUMMY_SHAPES["height"] // 4 - DEFAULT_DUMMY_SHAPES["width"] = DEFAULT_DUMMY_SHAPES["width"] // 4 models_for_export = {} text_encoder = pipeline.text_encoder text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=text_encoder, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", - model_type="gemma2-text-encoder", - ) + model=text_encoder, + exporter=exporter, + library_name="diffusers", + 
task="feature-extraction", + model_type="gemma2-text-encoder", + ) text_encoder_export_config = text_encoder_config_constructor( pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) @@ -1076,13 +1121,13 @@ def get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype): models_for_export["transformer"] = (transformer, transformer_export_config) # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 vae_encoder = copy.deepcopy(pipeline.vae) - vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} + vae_encoder.forward = lambda sample: {"latent": vae_encoder.encode(x=sample)["latent"]} vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, exporter=exporter, library_name="diffusers", task="semantic-segmentation", - model_type="vae-encoder", + model_type="dcae-encoder", ) vae_encoder_export_config = vae_config_constructor( vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype @@ -1140,6 +1185,7 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): task="semantic-segmentation", model_type="sd3-transformer", ) + logger.warn(f"TRANSFORMER COFG {export_config_constructor}") transformer_export_config = export_config_constructor( pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype ) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 80c2ff24a8..1f7695cf82 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -41,6 +41,7 @@ PhiOnnxConfig, T5OnnxConfig, UNetOnnxConfig, + VaeEncoderOnnxConfig, VisionOnnxConfig, WhisperOnnxConfig, ) @@ -57,7 +58,6 @@ DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, - DummySeq2SeqDecoderTextInputGenerator ) from optimum.utils.normalized_config import 
NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig @@ -1893,52 +1893,78 @@ def rename_ambiguous_inputs(self, inputs): class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): pass + @register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "input_ids": {0: "batch_size", 1: "sequence_length"}, - "attention_mask": {0: "batch_size", 1: "sequence_length"} + "attention_mask": {0: "batch_size", 1: "sequence_length"}, } -class DummySeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): +class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( "decoder_input_ids", "decoder_attention_mask", "encoder_outputs", "encoder_hidden_states", - "encoder_attention_mask" + "encoder_attention_mask", ) -class DummySanaTransformerVisionInputGenerator(DummyVisionInputGenerator): - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - if input_name not in ["sample", "latent_sample"]: - return super().generate(input_name, framework, int_dtype, float_dtype) - return self.random_float_tensor( - shape=[self.batch_size, self.num_channels, self.height, self.width], - framework=framework, - dtype=float_dtype, - ) +class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"] // 8, + height: int = DEFAULT_DUMMY_SHAPES["height"] // 8, + # Reduce img shape by 4 for FLUX to reduce memory usage on conversion + **kwargs, + ): + super().__init__(task, normalized_config, batch_size, num_channels, width=width, 
height=height, **kwargs) + @register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", num_channels="in_channels", - hidden_size="cross_attention_dim", + hidden_size="caption_channels", vocab_size="attention_head_dim", allow_new=True, ) - DUMMY_INPUT_GENERATOR_CLASSES = (DummySanaTransformerVisionInputGenerator, DummySeq2SeqDecoderTextWithEncMaskInputGenerator) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummySanaTransformerVisionInputGenerator, + DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator, + ) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES[1:-1] + @property def inputs(self): common_inputs = super().inputs common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} return common_inputs + def rename_ambiguous_inputs(self, inputs): + # The input name in the model signature is `x, hence the export input name is updated. 
+ hidden_states = inputs.pop("sample", None) + if hidden_states is not None: + inputs["hidden_states"] = hidden_states + return inputs + + +@register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers") +class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "latent": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + } + class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 91aaf57ae0..2c85dcc98d 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -127,6 +127,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", @@ -150,6 +151,7 @@ "OVFluxImg2ImgPipeline", "OVFluxInpaintPipeline", "OVFluxFillPipeline", + "OVSanaPipeline", "OVPipelineForImage2Image", "OVPipelineForText2Image", "OVPipelineForInpainting", diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 8945dc6382..d3142ad802 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -91,6 +91,7 @@ OVPipelineForImage2Image, OVPipelineForInpainting, OVPipelineForText2Image, + OVSanaPipeline, OVStableDiffusion3Img2ImgPipeline, OVStableDiffusion3InpaintPipeline, OVStableDiffusion3Pipeline, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index bc2f75e0ce..72ebf46887 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -103,9 +103,10 @@ FluxInpaintPipeline = object if is_diffusers_version(">=", "0.32.0"): - from diffusers import FluxFillPipeline + from diffusers import FluxFillPipeline, SanaPipeline else: FluxFillPipeline = object + SanaPipeline = object 
DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" @@ -817,9 +818,14 @@ def reshape( if self.tokenizer is None and self.tokenizer_2 is None: tokenizer_max_len = -1 else: - tokenizer_max_len = ( - self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length - ) + if self.tokenizer is not None and "Gemma" in self.tokenizer.__class__.__name__: + tokenizer_max_len = -1 + else: + tokenizer_max_len = ( + self.tokenizer.model_max_length + if self.tokenizer is not None + else self.tokenizer_2.model_max_length + ) if self.unet is not None: self.unet.model = self._reshape_unet( @@ -1041,6 +1047,7 @@ def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPi self.hidden_states_output_names = [ name for out in self.model.outputs for name in out.names if name.startswith("hidden_states") ] + self.input_names = [inp.get_any_name() for inp in self.model.inputs] def forward( self, @@ -1052,6 +1059,11 @@ def forward( self._compile() model_inputs = {"input_ids": input_ids} + if "attention_mask" in self.input_names: + model_inputs["attention_mask"] = ( + attention_mask if attention_mask is not None else torch.ones(input_ids.shape, dtype=torch.long) + ) + ov_outputs = self.request(model_inputs, share_inputs=True) main_out = ov_outputs[0] model_outputs = {} @@ -1139,6 +1151,8 @@ def forward( guidance: torch.Tensor = None, block_controlnet_hidden_states: List = None, joint_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: torch.LongTensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ): self._compile() @@ -1147,9 +1161,10 @@ def forward( "hidden_states": hidden_states, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, - "pooled_projections": pooled_projections, } + if pooled_projections is not None: + model_inputs["pooled_projections"] = pooled_projections if img_ids is not None: model_inputs["img_ids"] = img_ids if txt_ids is not None: 
@@ -1157,6 +1172,9 @@ def forward( if guidance is not None: model_inputs["guidance"] = guidance + if encoder_attention_mask is not None: + model_inputs["encoder_attention_mask"] = encoder_attention_mask + ov_outputs = self.request(model_inputs, share_inputs=True).to_dict() model_outputs = {} @@ -1498,6 +1516,12 @@ class OVFluxFillPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, Flu auto_model_class = FluxFillPipeline +class OVSanaPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, SanaPipeline): + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = SanaPipeline + + SUPPORTED_OV_PIPELINES = [ OVStableDiffusionPipeline, OVStableDiffusionImg2ImgPipeline, @@ -1569,6 +1593,8 @@ def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tru if is_diffusers_version(">=", "0.32.0"): OV_INPAINT_PIPELINES_MAPPING["flux-fill"] = OVFluxFillPipeline SUPPORTED_OV_PIPELINES.append(OVFluxFillPipeline) + OV_TEXT2IMAGE_PIPELINES_MAPPING["sana"] = OVSanaPipeline + SUPPORTED_OV_PIPELINES.append(OVSanaPipeline) SUPPORTED_OV_PIPELINES_MAPPINGS = [ OV_TEXT2IMAGE_PIPELINES_MAPPING, diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 2e669875fc..f6341e5435 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -222,3 +222,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVSanaPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 
477799345b..501960cc9f 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -77,8 +77,8 @@ class OVPipelineForText2ImageTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): - SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3") + SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3", "sana"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -94,6 +94,13 @@ def generate_inputs(self, height=128, width=128, batch_size=1): return inputs + def get_auto_cls(self, model_arch): + if model_arch == "sana": + from diffusers import SanaPipeline + + return SanaPipeline + return self.AUTOMODEL_CLASS + @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -104,12 +111,14 @@ def test_load_vanilla_model_which_is_not_supported(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_ov_pipeline_class_dispatch(self, model_arch: str): - auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_cls = self.get_auto_cls(model_arch) + auto_pipeline = DiffusionPipeline if model_arch != "sana" else auto_cls + auto_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) - auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipeline = auto_pipeline.from_pretrained(MODEL_NAMES[model_arch]) 
ov_pipeline = OVDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) @@ -130,30 +139,44 @@ def test_num_images_per_prompt(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - height, width, batch_size = 128, 128, 1 + height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - + auto_cls = self.get_auto_cls(model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - - for output_type in ["latent", "np", "pt"]: - inputs["output_type"] = output_type + diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) - ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + with torch.no_grad(): + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + if model_arch == "sana": + if output_type == "latent": + continue + inputs["use_resolution_binning"] = False + atol = 4e-2 + else: + atol = 6e-3 - np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) + ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) # test on inputs nondivisible on 64 height, width, batch_size = 96, 96, 1 for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type + if model_arch == "sana": + if output_type == "latent": + continue + inputs["use_resolution_binning"] = False + atol = 4e-2 + else: + atol = 6e-3 ov_output = ov_pipeline(**inputs, 
generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2) + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES) @require_diffusers @@ -174,7 +197,8 @@ def __call__(self, *args, **kwargs) -> None: auto_callback = Callback() ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + auto_cls = self.get_auto_cls(model_arch) + auto_pipe = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) # callback_steps=1 to trigger callback every step ov_pipe(**inputs, callback=ov_callback, callback_steps=1) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index f4b96ec998..6f047c0bee 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -84,7 +84,12 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_ARCHITECTURES.extend( - [("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux"), ("inpainting", "flux-fill")] + [ + ("text-to-image", "stable-diffusion-3"), + ("text-to-image", "flux"), + ("inpainting", "flux-fill"), + ("text-to-image", "sana"), + ] ) EXPECTED_NUMBER_OF_TOKENIZER_MODELS = { "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, @@ -113,6 +118,7 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56)) + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 7, 56)) SUPPORTED_QUANTIZATION_ARCHITECTURES = [ ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 3100df6159..634c5ba5e0 
100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -168,6 +168,7 @@ "open-clip-ov": "zofinka/tiny-open-clip-model", "st-bert": "sentence-transformers/all-MiniLM-L6-v2", "st-mpnet": "sentence-transformers/all-mpnet-base-v2", + "sana": "/home/ea/work/my_optimum_intel/optimum-intel/tiny-random-sana", } @@ -200,6 +201,7 @@ "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), "qwen2_vl": (30, 1, 1, 10), + "sana": (242, 34, 42, 64), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From 6b98d62a6f213576c3d1f289b93dfb462fdcb784 Mon Sep 17 00:00:00 2001 From: eaidova Date: Mon, 13 Jan 2025 22:31:59 +0400 Subject: [PATCH 3/8] update tests --- optimum/commands/export/openvino.py | 4 ++++ optimum/exporters/openvino/convert.py | 2 -- optimum/intel/openvino/modeling_diffusion.py | 12 +++++++++--- optimum/intel/openvino/utils.py | 1 + tests/openvino/test_diffusion.py | 4 +++- tests/openvino/test_exporters_cli.py | 14 +++++++++++--- tests/openvino/test_quantization.py | 2 ++ tests/openvino/utils_tests.py | 4 ++-- 8 files changed, 32 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index d8c4edafda..161534fad3 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -416,6 +416,10 @@ def run(self): from optimum.intel import OVFluxPipeline model_cls = OVFluxPipeline + elif class_name == "SanaPipeline": + from optimum.intel import OVSanaPipeline + + model_cls = OVSanaPipeline else: raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index cb5bf95181..4614d93689 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1047,7 +1047,6 @@ def get_diffusion_models_for_export_ext( sd3_pipes.append(StableDiffusion3InpaintPipeline) is_sd3 = 
isinstance(pipeline, tuple(sd3_pipes)) - logger.warn(f"IS SD3 {pipeline} {is_sd3}") else: is_sd3 = False @@ -1185,7 +1184,6 @@ def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype): task="semantic-segmentation", model_type="sd3-transformer", ) - logger.warn(f"TRANSFORMER COFG {export_config_constructor}") transformer_export_config = export_config_constructor( pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype ) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 72ebf46887..4a3f7104b7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -844,17 +844,23 @@ def reshape( if self.text_encoder is not None: self.text_encoder.model = self._reshape_text_encoder( - self.text_encoder.model, batch_size, self.tokenizer.model_max_length + self.text_encoder.model, + batch_size, + self.tokenizer.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) if self.text_encoder_2 is not None: self.text_encoder_2.model = self._reshape_text_encoder( - self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length + self.text_encoder_2.model, + batch_size, + self.tokenizer_2.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder( - self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length + self.text_encoder_3.model, + batch_size, + self.tokenizer_3.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, ) self.clear_requests() diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index fbb108c7d8..d9df9419ad 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -125,6 +125,7 @@ "stable-diffusion": "OVStableDiffusionPipeline", "stable-diffusion-xl": "OVStableDiffusionXLPipeline", 
"stable-diffusion-3": "OVStableDiffusion3Pipeline", + "sana": "OVSanaPipeline", "flux": "OVFluxPipeline", "flux-fill": "OVFluxFillPipeline", "pix2struct": "OVModelForPix2Struct", diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index 501960cc9f..b155353fc3 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -78,7 +78,7 @@ class OVPipelineForText2ImageTest(unittest.TestCase): NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3", "sana"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append(["stable-diffusion-3"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -215,6 +215,8 @@ def test_shape(self, model_arch: str): height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + if model_arch == "sana": + inputs["use_resolution_binning"] = False for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 6f047c0bee..7fbeb2e9e4 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -44,6 +44,7 @@ OVModelOpenCLIPForZeroShotImageClassification, OVModelOpenCLIPText, OVModelOpenCLIPVisual, + OVSanaPipeline, OVSentenceTransformer, OVStableDiffusion3Pipeline, OVStableDiffusionPipeline, @@ -107,6 +108,7 @@ class OVCLIExportTestCase(unittest.TestCase): "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "flux-fill": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, "llava": 2 if 
is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0, + "sana": 2 if is_tokenizers_version("<", "0.20.0") or is_openvino_version(">=", "2024.5") else 0, } SUPPORTED_SD_HYBRID_ARCHITECTURES = [ @@ -118,7 +120,7 @@ class OVCLIExportTestCase(unittest.TestCase): if is_transformers_version(">=", "4.45"): SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65)) SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("flux", 7, 56)) - SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 7, 56)) + SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("sana", 19, 53)) SUPPORTED_QUANTIZATION_ARCHITECTURES = [ ( @@ -357,9 +359,15 @@ def test_exporters_cli_int8(self, task: str, model_type: str): models = [model.encoder, model.decoder] if task.endswith("with-past") and not model.decoder.stateful: models.append(model.decoder_with_past) - elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"): + elif ( + model_type.startswith("stable-diffusion") + or model_type.startswith("flux") + or model_type.startswith("sana") + ): models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder] - models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2) + models.append( + model.text_encoder if model_type in ["stable-diffusion", "sana"] else model.text_encoder_2 + ) elif task.startswith("image-text-to-text"): models = [model.language_model, model.vision_embeddings] else: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 26ad44401a..4da88418b1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -59,6 +59,7 @@ OVStableDiffusionXLPipeline, OVStableDiffusion3Pipeline, OVQuantizer, + OVSanaPipeline, OVTrainer, OVQuantizationConfig, OVWeightQuantizationConfig, @@ -543,6 +544,7 @@ class OVWeightCompressionTest(unittest.TestCase): [ (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65), (OVFluxPipeline, "flux", 7, 56), + 
(OVSanaPipeline, "sana", 19, 53), ] ) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 634c5ba5e0..83ea3751d6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -168,7 +168,7 @@ "open-clip-ov": "zofinka/tiny-open-clip-model", "st-bert": "sentence-transformers/all-MiniLM-L6-v2", "st-mpnet": "sentence-transformers/all-mpnet-base-v2", - "sana": "/home/ea/work/my_optimum_intel/optimum-intel/tiny-random-sana", + "sana": "katuni4ka/tiny-random-sana", } @@ -201,7 +201,7 @@ "minicpmv": (30, 26, 1, 6), "nanollava": (30, 15, 1), "qwen2_vl": (30, 1, 1, 10), - "sana": (242, 34, 42, 64), + "sana": (58, 28, 28, 18), } TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg" From 5ae26d0c7f753fed46d6d7a1f2ce27aec6db4ad9 Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 17 Jan 2025 12:08:13 +0400 Subject: [PATCH 4/8] add variant for model loading in from_transformers --- optimum/exporters/openvino/convert.py | 48 +------------------ optimum/exporters/openvino/model_configs.py | 6 +++ optimum/exporters/openvino/model_patcher.py | 38 +++++++++++++-- optimum/exporters/openvino/utils.py | 12 +++-- optimum/intel/openvino/modeling_base.py | 3 ++ .../intel/openvino/modeling_base_seq2seq.py | 2 + optimum/intel/openvino/modeling_decoder.py | 3 ++ optimum/intel/openvino/modeling_diffusion.py | 2 + .../openvino/modeling_visual_language.py | 2 + tests/openvino/test_diffusion.py | 14 ++---- 10 files changed, 66 insertions(+), 64 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 4614d93689..0d6c2b4367 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -1013,11 +1013,10 @@ def _get_submodels_and_export_configs( def get_diffusion_models_for_export_ext( pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino" ): -<<<<<<< HEAD is_sdxl = 
pipeline.__class__.__name__.startswith("StableDiffusionXL") is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") is_flux = pipeline.__class__.__name__.startswith("Flux") - is_sana = pipeline.__class__.__name__.startswith("Sana") + is_sana = pipeline.__class__.__name__.startswith("Sana") is_sd = pipeline.__class__.__name__.startswith("StableDiffusion") and not is_sd3 is_lcm = pipeline.__class__.__name__.startswith("LatentConsistencyModel") @@ -1036,51 +1035,6 @@ def get_diffusion_models_for_export_ext( models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) elif is_flux: models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype) -======= - if is_diffusers_version(">=", "0.29.0"): - from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline - - sd3_pipes = [StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline] - if is_diffusers_version(">=", "0.30.0"): - from diffusers import StableDiffusion3InpaintPipeline - - sd3_pipes.append(StableDiffusion3InpaintPipeline) - - is_sd3 = isinstance(pipeline, tuple(sd3_pipes)) - else: - is_sd3 = False - - if is_diffusers_version(">=", "0.30.0"): - from diffusers import FluxPipeline - - flux_pipes = [FluxPipeline] - - if is_diffusers_version(">=", "0.31.0"): - from diffusers import FluxImg2ImgPipeline, FluxInpaintPipeline - - flux_pipes.extend([FluxPipeline, FluxImg2ImgPipeline, FluxInpaintPipeline]) - - if is_diffusers_version(">=", "0.32.0"): - from diffusers import FluxFillPipeline - - flux_pipes.append(FluxFillPipeline) - - is_flux = isinstance(pipeline, tuple(flux_pipes)) - else: - is_flux = False - - if is_diffusers_version(">=", "0.32.0"): - from diffusers import SanaPipeline - - is_sana = isinstance(pipeline, SanaPipeline) - else: - is_sana = False - - if not any([is_sana, is_flux, is_sd3]): - return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter) - if is_sd3: - models_for_export = 
get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype) ->>>>>>> add pipeline elif is_sana: models_for_export = get_sana_models_for_export(pipeline, exporter, int_dtype, float_dtype) else: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 1f7695cf82..e73039159c 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -106,6 +106,7 @@ Qwen2VLVisionEmbMergerPatcher, QwenModelPatcher, RotaryEmbPatcher, + SanaTextEncoderModelPatcher, StatefulSeq2SeqDecoderPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, @@ -1903,6 +1904,11 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "attention_mask": {0: "batch_size", 1: "sequence_length"}, } + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> ModelPatcher: + return SanaTextEncoderModelPatcher(self, model, model_kwargs) + class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e7a7779389..08bc149880 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -21,9 +21,11 @@ import torch import torch.nn.functional as F +from transformers import PreTrainedModel, TFPreTrainedModel from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling from transformers.utils import is_tf_available +from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.model_patcher import ( DecoderModelPatcher, ModelPatcher, @@ -114,9 +116,11 @@ def patch_model_with_bettertransformer(model): return model -def patch_update_causal_mask(model, transformers_version, inner_model_name="model", patch_fn=None): +def patch_update_causal_mask( + model, 
transformers_version, inner_model_name="model", patch_fn=None, patch_external_model=False +): if is_transformers_version(">=", transformers_version): - inner_model = getattr(model, inner_model_name, None) + inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model if inner_model is not None: if hasattr(inner_model, "_update_causal_mask"): inner_model._orig_update_causal_mask = inner_model._update_causal_mask @@ -124,8 +128,8 @@ def patch_update_causal_mask(model, transformers_version, inner_model_name="mode inner_model._update_causal_mask = types.MethodType(patch_fn, inner_model) -def unpatch_update_causal_mask(model, inner_model_name="model"): - inner_model = getattr(model, inner_model_name, None) +def unpatch_update_causal_mask(model, inner_model_name="model", patch_external_model=False): + inner_model = getattr(model, inner_model_name, None) if not patch_external_model else model if inner_model is not None and hasattr(inner_model, "._orig_update_causal_mask"): inner_model._update_causal_mask = inner_model._orig_update_causal_mask @@ -3791,3 +3795,29 @@ def patched_forward(*args, **kwargs): model.forward = patched_forward super().__init__(config, model, model_kwargs) + + +class SanaTextEncoderModelPatcher(ModelPatcher): + def __enter__(self): + super().__enter__() + patch_update_causal_mask(self._model, "4.39.0", None, patch_external_model=True) + + if self._model.config._attn_implementation != "sdpa": + self._model.config._orig_attn_implementation = self._model.config._attn_implementation + self._model.config._attn_implementation = "sdpa" + if is_transformers_version("<", "4.47.0"): + from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES + + sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"] + for layer in self._model.layers: + layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn) + + def __exit__(self, exc_type, exc_value, 
traceback): + super().__exit__(exc_type, exc_value, traceback) + unpatch_update_causal_mask(self._model, None, True) + if hasattr(self._model.config, "_orig_attn_implementation"): + self._model.config._attn_implementation = self._model.config._orig_attn_implementation + for layer in self._model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 46b151e7de..1743dc59b1 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -257,9 +257,15 @@ def deduce_diffusers_dtype(model_name_or_path, **loading_kwargs): model_part_name = "unet" if model_part_name: directory = path / model_part_name - safetensors_files = [ - filename for filename in directory.glob("*.safetensors") if len(filename.suffixes) == 1 - ] + + pattern = "*.safetensors" + if "variant" in loading_kwargs: + variant = loading_kwargs["variant"] + pattern = f"*.{variant}.safetensors" + safetensors_files = list(directory.glob(pattern)) + else: + # filter out variant files + safetensors_files = [filename for filename in directory.glob(pattern) if len(filename.suffixes) == 1] safetensors_file = None if len(safetensors_files) > 0: safetensors_file = safetensors_files.pop(0) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 99422f1a54..3fd26a6e0d 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -594,6 +594,8 @@ def _from_transformers( else: ov_config = OVConfig(dtype="fp32") + variant = kwargs.pop("variant", None) + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -607,6 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py 
b/optimum/intel/openvino/modeling_base_seq2seq.py index 11ee8f89a7..c60c0ec702 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -408,6 +408,7 @@ def _from_transformers( else: ov_config = OVConfig(dtype="fp32") stateful = kwargs.get("stateful", True) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -422,6 +423,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4897db1459..b411bf07d9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -310,6 +310,8 @@ def _from_transformers( if torch_dtype is not None: model_loading_kwargs["torch_dtype"] = torch_dtype + variant = kwargs.pop("variant", None) + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -325,6 +327,7 @@ def _from_transformers( stateful=stateful, model_loading_kwargs=model_loading_kwargs, library_name=cls._library_name, + model_variant=variant, ) if config.model_type == "phi3" and config.max_position_embeddings != getattr( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 4a3f7104b7..c2e245c5e7 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -575,6 +575,7 @@ def _from_transformers( model_save_dir = TemporaryDirectory() model_save_path = Path(model_save_dir.name) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -589,6 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, + model_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_visual_language.py 
b/optimum/intel/openvino/modeling_visual_language.py index 1c0e35cca2..c7cd7227f2 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -615,6 +615,7 @@ def _from_transformers( ov_config = OVConfig(dtype="fp32" if load_in_8bit is False else "auto") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) + variant = kwargs.pop("variant", None) main_export( model_name_or_path=model_id, @@ -629,6 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, + model_variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained( diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index b155353fc3..bff1879340 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -149,12 +149,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type if model_arch == "sana": - if output_type == "latent": - continue + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations inputs["use_resolution_binning"] = False - atol = 4e-2 - else: - atol = 6e-3 + atol = 1e-4 ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images @@ -166,12 +163,9 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type if model_arch == "sana": - if output_type == "latent": - continue + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations inputs["use_resolution_binning"] = False - atol = 4e-2 - else: - atol = 
6e-3 + atol = 6e-3 ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images From 0c1189ad23b183433cae772e193d48ec0da55be3 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 Jan 2025 12:11:23 +0400 Subject: [PATCH 5/8] Update optimum/exporters/openvino/__main__.py --- optimum/exporters/openvino/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 59a9dc41ab..520a28559f 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -369,7 +369,6 @@ class StoreAttr(object): if library_name == "open_clip": model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir) else: - logger.warn(loading_kwargs) model = TasksManager.get_model_from_task( task, model_name_or_path, From 23e298c06f0acf811478ccc13d435268dda8141b Mon Sep 17 00:00:00 2001 From: eaidova Date: Fri, 17 Jan 2025 12:15:25 +0400 Subject: [PATCH 6/8] provide missed params to data-aware cli --- optimum/commands/export/openvino.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 161534fad3..695c22e985 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -452,6 +452,8 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, + variant=self.args.variant, + cache_dir=self.args.cache_dir, ) model.save_pretrained(self.args.output) From 95c3e2c5b40e3b2e548bef53bca9e3811d0bef15 Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 28 Jan 2025 17:05:56 +0400 Subject: [PATCH 7/8] apply review comments --- optimum/commands/export/openvino.py | 8 ++-- optimum/exporters/openvino/__main__.py | 8 ++-- 
optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 24 +++++------- .../openvino/modeling_visual_language.py | 2 +- tests/openvino/test_diffusion.py | 38 +++++++------------ 7 files changed, 34 insertions(+), 50 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 695c22e985..6d430cc611 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -106,10 +106,10 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--variant", + "--weights-variant", type=str, default=None, - help=("Select a variant of the model to export."), + help=("If specified load weights from variant filename."), ) optional_group.add_argument( "--ratio", @@ -452,7 +452,7 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, - variant=self.args.variant, + variant=self.args.weights_variant, cache_dir=self.args.cache_dir, ) model.save_pretrained(self.args.output) @@ -475,6 +475,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, - model_variant=self.args.variant, + weights_variant=self.args.weights_variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 520a28559f..afc8718dea 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,7 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, - model_variant: Optional[str] = None, + weights_variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -238,8 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = 
model_loading_kwargs or {} - if model_variant is not None: - loading_kwargs["variant"] = model_variant + if weights_variant is not None: + loading_kwargs["variant"] = weights_variant if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -350,7 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): - _loading_kwargs = {} if model_variant is None else {"variant": model_variant} + _loading_kwargs = {} if weights_variant is None else {"variant": weights_variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 3fd26a6e0d..25064dd044 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -609,7 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, - model_variant=variant, + weights_variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index b411bf07d9..b74c4dc623 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -327,7 +327,7 @@ def _from_transformers( stateful=stateful, model_loading_kwargs=model_loading_kwargs, library_name=cls._library_name, - model_variant=variant, + weights_variant=variant, ) if config.model_type == "phi3" and config.max_position_embeddings != getattr( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c2e245c5e7..2613d26c68 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -590,7 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, -
model_variant=variant, + weights_variant=variant, ) return cls._from_pretrained( @@ -767,7 +767,7 @@ def _reshape_text_encoder( self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1 ): if batch_size != -1: - shapes = {model.inputs[0]: [batch_size, tokenizer_max_length]} + shapes = {input_tensor: [batch_size, tokenizer_max_length] for input_tensor in model.inputs} model.reshape(shapes) return model @@ -824,9 +824,9 @@ def reshape( tokenizer_max_len = -1 else: tokenizer_max_len = ( - self.tokenizer.model_max_length + getattr(self.tokenizer, "model_max_length", -1) if self.tokenizer is not None - else self.tokenizer_2.model_max_length + else getattr(self.tokenizer_2, "model_max_length", -1) ) if self.unet is not None: @@ -848,21 +848,19 @@ def reshape( self.text_encoder.model = self._reshape_text_encoder( self.text_encoder.model, batch_size, - self.tokenizer.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + getattr(self.tokenizer, "model_max_length", -1) + if "Gemma" not in self.tokenizer.__class__.__name__ + else -1, ) if self.text_encoder_2 is not None: self.text_encoder_2.model = self._reshape_text_encoder( - self.text_encoder_2.model, - batch_size, - self.tokenizer_2.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + self.text_encoder_2.model, batch_size, getattr(self.tokenizer_2, "model_max_length", -1) ) if self.text_encoder_3 is not None: self.text_encoder_3.model = self._reshape_text_encoder( - self.text_encoder_3.model, - batch_size, - self.tokenizer_3.model_max_length if "Gemma" not in self.tokenizer.__class__.__name__ else -1, + self.text_encoder_3.model, batch_size, getattr(self.tokenizer_3, "model_max_length", -1) ) self.clear_requests() @@ -1068,9 +1066,7 @@ def forward( model_inputs = {"input_ids": input_ids} if "attention_mask" in self.input_names: - model_inputs["attention_mask"] = ( - attention_mask if attention_mask is not None else 
torch.ones(input_ids.shape, dtype=torch.long) - ) + model_inputs["attention_mask"] = attention_mask ov_outputs = self.request(model_inputs, share_inputs=True) main_out = ov_outputs[0] diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index c7cd7227f2..b89c238b14 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -630,7 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - model_variant=variant, + weights_variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained( diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py index bff1879340..6ffe891c41 100644 --- a/tests/openvino/test_diffusion.py +++ b/tests/openvino/test_diffusion.py @@ -78,7 +78,7 @@ class OVPipelineForText2ImageTest(unittest.TestCase): NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux", "sana"]) - NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append(["stable-diffusion-3"]) + NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.extend(["stable-diffusion-3"]) CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] OVMODEL_CLASS = OVPipelineForText2Image @@ -94,13 +94,6 @@ def generate_inputs(self, height=128, width=128, batch_size=1): return inputs - def get_auto_cls(self, model_arch): - if model_arch == "sana": - from diffusers import SanaPipeline - - return SanaPipeline - return self.AUTOMODEL_CLASS - @require_diffusers def test_load_vanilla_model_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -111,9 +104,7 @@ def test_load_vanilla_model_which_is_not_supported(self): 
@parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_ov_pipeline_class_dispatch(self, model_arch: str): - auto_cls = self.get_auto_cls(model_arch) - auto_pipeline = DiffusionPipeline if model_arch != "sana" else auto_cls - auto_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__) @@ -141,21 +132,19 @@ def test_num_images_per_prompt(self, model_arch: str): def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - auto_cls = self.get_auto_cls(model_arch) ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_pipeline = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - with torch.no_grad(): - for output_type in ["latent", "np", "pt"]: - inputs["output_type"] = output_type - if model_arch == "sana": - # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations - inputs["use_resolution_binning"] = False - atol = 1e-4 + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type + if model_arch == "sana": + # resolution binning will lead to resize output to standard resolution and back that can interpolate floating-point deviations + inputs["use_resolution_binning"] = False + atol = 1e-4 - ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) + ov_output = ov_pipeline(**inputs, 
generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + np.testing.assert_allclose(ov_output, diffusers_output, atol=atol, rtol=1e-2) # test on inputs nondivisible on 64 height, width, batch_size = 96, 96, 1 @@ -191,8 +180,7 @@ def __call__(self, *args, **kwargs) -> None: auto_callback = Callback() ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - auto_cls = self.get_auto_cls(model_arch) - auto_pipe = auto_cls.from_pretrained(MODEL_NAMES[model_arch]) + auto_pipe = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) # callback_steps=1 to trigger callback every step ov_pipe(**inputs, callback=ov_callback, callback_steps=1) From 9fdddc0e09345925be2e2b97bb0dc5e498bc484b Mon Sep 17 00:00:00 2001 From: eaidova Date: Wed, 29 Jan 2025 18:39:54 +0400 Subject: [PATCH 8/8] rename weights_variant to variant --- optimum/commands/export/openvino.py | 6 +++--- optimum/exporters/openvino/__main__.py | 8 ++++---- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_base_seq2seq.py | 2 +- optimum/intel/openvino/modeling_diffusion.py | 2 +- optimum/intel/openvino/modeling_visual_language.py | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 6d430cc611..0c6e692a91 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -106,7 +106,7 @@ def parse_args_openvino(parser: "ArgumentParser"): ), ) optional_group.add_argument( - "--weights-variant", + "--variant", type=str, default=None, help=("If specified load weights from variant filename."), @@ -452,7 +452,7 @@ def run(self): quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, - variant=self.args.weights_variant, + variant=self.args.variant, cache_dir=self.args.cache_dir, ) 
model.save_pretrained(self.args.output) @@ -475,6 +475,6 @@ def run(self): stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, - weights_variant=self.args.weights_variant, + variant=self.args.variant, # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index afc8718dea..88c738999a 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -122,7 +122,7 @@ def main_export( convert_tokenizer: bool = False, library_name: Optional[str] = None, model_loading_kwargs: Optional[Dict[str, Any]] = None, - weights_variant: Optional[str] = None, + variant: Optional[str] = None, **kwargs_shapes, ): """ @@ -238,8 +238,8 @@ def main_export( custom_architecture = False patch_16bit = False loading_kwargs = model_loading_kwargs or {} - if weights_variant is not None: - loading_kwargs["variant"] = weights_variant + if variant is not None: + loading_kwargs["variant"] = variant if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -350,7 +350,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): - _loading_kwargs = {} if weights_variant is None else {"variant": weights_variant} + _loading_kwargs = {} if variant is None else {"variant": variant} dtype = deduce_diffusers_dtype( model_name_or_path, revision=revision, diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 25064dd044..3902deff4c 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -609,7 +609,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, library_name=cls._library_name, - weights_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git 
a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index c60c0ec702..ba0d426e90 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -423,7 +423,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - model_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2613d26c68..f6c4fc37a8 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -590,7 +590,7 @@ def _from_transformers( force_download=force_download, ov_config=ov_config, library_name=cls._library_name, - weights_variant=variant, + variant=variant, ) return cls._from_pretrained( diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index b89c238b14..023f46896a 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -630,7 +630,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_config, stateful=stateful, - weights_variant=variant, + variant=variant, ) config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code) return cls._from_pretrained(