diff --git a/examples/openvino/multithreading/README.md b/examples/openvino/multithreading/README.md new file mode 100644 index 0000000000..19759a1f30 --- /dev/null +++ b/examples/openvino/multithreading/README.md @@ -0,0 +1,49 @@ +# Execution in a multi-threaded environment + +## Overview + +This example demonstrates how to execute Hugging Face Transformers pipelines concurrently. +A typical scenario is a multi-threaded application that avoids duplicating the model allocation in host memory. + +By default, the execution of transformers with the OpenVINO Runtime backend is single-threaded. Running the generation process in parallel can cause the error +`RuntimeError: Infer Request is busy`. + +A simple technique overcomes this limitation: call the `clone` method on the model or the pipeline. It duplicates the execution object while sharing the OpenVINO compiled model in host memory. The cloned object must not modify the model by reshaping, changing precision or recompiling. +The snippet below applies this concept: + +```python +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True +) +def thread(prompt): + pipe_exec = pipe.clone() + images = pipe_exec(prompt).images + # Do something with images + +T1 = threading.Thread(target=thread, args=("my prompt",)) +T1.start() +``` +Note that the `clone` operation is quick and does not duplicate memory usage. It only creates a new execution context for the generation algorithm. + +Check the simple examples below to see how this can be applied in practice. + +## Preparing the Python environment +```bash +pip install -r examples/openvino/multithreading/requirements.txt +``` + +## Text generation + +```bash +python examples/openvino/multithreading/gen_text.py +``` +## Image generation +```bash +python examples/openvino/multithreading/gen_image.py +``` + +## Text translation with seq2seq + +```bash +python examples/openvino/multithreading/gen_seq2seq.py +``` diff --git a/examples/openvino/multithreading/gen_image.py b/examples/openvino/multithreading/gen_image.py new file mode 100644 index 0000000000..21f8737445 --- /dev/null +++ b/examples/openvino/multithreading/gen_image.py @@ -0,0 +1,62 @@ +import datetime +import threading + +from optimum.intel.openvino import OVStableDiffusionPipeline + + +MODEL_PATH = "runwayml/stable-diffusion-v1-5" +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"} + + +pipe = OVStableDiffusionPipeline.from_pretrained( + MODEL_PATH, ov_config=OV_CONFIG, compile=True, dynamic_shapes=True, export=True +) + +vae_decoder_clon = pipe.vae_decoder.clone() +unet_clon = pipe.unet.clone() + +prompt1 = [" Zebras in space "] +prompt2 = [" The statue of liberty in New York", " Big Ben in London "] +prompt3 = [" pigs on the grass field", "beach in the storm", "sail yacht on the ocean"] + +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def save_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("IMG:", i) + r[i].save("img_" + str(t) + "_" + str(i) + ".png", format="PNG") + + +def gen_thread(prompt, results, i): + start = datetime.datetime.now() + pipe_exec = pipe.clone() + end = datetime.datetime.now() + print("Cloning time [s]", ((end - start).total_seconds())) + text = prompt + images = pipe_exec(text).images + results[i] = images + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], 
results, i)) + threads[i].start() +nu_img = 0 +for i in range(len(threads)): + threads[i].join() + nu_img += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + save_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "images:", nu_img) +print("Throughput:", nu_img * 60 / ((end - start).total_seconds()), "images/min") diff --git a/examples/openvino/multithreading/gen_seq2seq.py b/examples/openvino/multithreading/gen_seq2seq.py new file mode 100644 index 0000000000..27d3ed2a45 --- /dev/null +++ b/examples/openvino/multithreading/gen_seq2seq.py @@ -0,0 +1,51 @@ +import datetime +import threading + +from transformers import AutoTokenizer, pipeline + +from optimum.intel import OVModelForSeq2SeqLM + + +model_id = "echarlaix/t5-small-openvino" +model = OVModelForSeq2SeqLM.from_pretrained(model_id) +tokenizer = AutoTokenizer.from_pretrained(model_id) +pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + +prompt1 = ["I live in Europe"] +prompt2 = ["What is your name?", "The dog is very happy"] +prompt3 = ["It's a beautiful weather today", "Yes", "Good morning"] +prompts = [prompt1, prompt2, prompt3] + +NUM_THREADS = 3 + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for i in range(len(r)): + print("TRANSLATION", i, ":", r[i]["translation_text"]) + + +def gen_thread(prompt, results, i): + translations = pipe(prompt) + results[i] = translations + + +start = datetime.datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() +nu_trans = 0 +for i in range(len(threads)): + threads[i].join() + nu_trans += len(results[i]) +end = datetime.datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i]) + +print("Generation time [s]", ((end - start).total_seconds()), "translations:", nu_trans) +print("Throughput:", nu_trans / ((end - start).total_seconds()), "translations/s") diff --git a/examples/openvino/multithreading/gen_text.py b/examples/openvino/multithreading/gen_text.py new file mode 100644 index 0000000000..717f20cfc6 --- /dev/null +++ b/examples/openvino/multithreading/gen_text.py @@ -0,0 +1,83 @@ +import threading +from datetime import datetime + +from transformers import AutoConfig, AutoTokenizer, set_seed + +from optimum.intel import OVModelForCausalLM + + +set_seed(10) +model_id = "togethercomputer/RedPajama-INCITE-Chat-3B-v1" +tokenizer = AutoTokenizer.from_pretrained(model_id) +tokenizer.pad_token = "[PAD]" +tokenizer.padding_side = "left" +NUM_THREADS = 3 +prompt1 = [": Question: What is the weather like now? Answer: "] +prompt2 = [": Question: What is Openvino?", ": Question: What the the relativity theory? Answer: "] +prompt3 = [ + ": Question: Are cats smarter that dogs? Answer: ", + ": Question: How big is an elephant? Answer: ", + ": Question: The water in the ocean is much hotter than before? 
Answer: ", +] +prompts = [prompt1, prompt2, prompt3] + +OV_CONFIG = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": "", "NUM_STREAMS": "2"} +model = OVModelForCausalLM.from_pretrained( + model_id, + config=AutoConfig.from_pretrained(model_id, trust_remote_code=True), + ov_config=OV_CONFIG, + compile=True, + export=True, +) + +threads = [None] * NUM_THREADS +results = [None] * NUM_THREADS + + +def print_response(t, p, r): + print("THREAD", t) + print("PROMPT:", p) + for answer in r: + print("Answer:") + print(tokenizer.decode(answer, skip_special_tokens=True)) + + +def gen_thread(prompt, results, i): + inputs = tokenizer(prompt, return_tensors="pt", padding=True) + generate_kwargs = { + "input_ids": inputs.input_ids, + "max_new_tokens": 200, + "temperature": 1.0, + "do_sample": True, + "top_p": 1.0, + "top_k": 50, + "num_beams": 5, + "repetition_penalty": 1.1, + } + start = datetime.now() + model_exec = model.clone() + end = datetime.now() + print("cloning model duration", (end - start).total_seconds() * 1000000, "us") + outputs = model_exec.generate(**generate_kwargs) + num_tok = 0 + for x in range(len(prompt)): + num_tok += outputs[x].numel() - inputs.get("input_ids")[x].numel() + results[i] = outputs, num_tok + + +start = datetime.now() +for i in range(len(threads)): + threads[i] = threading.Thread(target=gen_thread, args=(prompts[i], results, i)) + threads[i].start() + +total_tok = 0 +for i in range(len(threads)): + threads[i].join() + total_tok += results[i][1] +end = datetime.now() + +for i in range(len(threads)): + print_response(i, prompts[i], results[i][0]) + +print("Generation time [s]", ((end - start).total_seconds()), "tokens:", total_tok) +print("Throughput:", total_tok * 60 / ((end - start).total_seconds()), "tokens/min") diff --git a/examples/openvino/multithreading/requirements.txt b/examples/openvino/multithreading/requirements.txt new file mode 100644 index 0000000000..b05f68f799 --- /dev/null +++ b/examples/openvino/multithreading/requirements.txt @@ -0,0 +1,2 @@ +optimum-intel[openvino, nncf]"@git+https://github.com/huggingface/optimum-intel.git +transformers \ No newline at end of file diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index 558cc3b904..595853d303 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -134,6 +134,7 @@ def to(self, device: str): """ if isinstance(device, str): self._device = device.upper() + self.compiled_model = None self.request = None else: logger.warning(f"device must be of type {str} but got {type(device)} instead") @@ -187,6 +188,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -204,8 +206,16 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return SequenceClassifierOutput(logits=logits) @@ -253,6 +263,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -270,12 +281,21 @@ def forward( inputs["token_type_ids"] = token_type_ids # 
Run inference - outputs = self.request(inputs) + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + start_logits = ( - torch.from_numpy(outputs["start_logits"]).to(self.device) if not np_inputs else outputs["start_logits"] + torch.from_numpy(self.infer_request.get_tensor("start_logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("start_logits").data ) end_logits = ( - torch.from_numpy(outputs["end_logits"]).to(self.device) if not np_inputs else outputs["end_logits"] + torch.from_numpy(self.infer_request.get_tensor("end_logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("end_logits").data ) return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits) @@ -323,6 +343,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -340,8 +361,16 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) @@ -388,6 +417,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -405,11 +435,15 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) last_hidden_state = ( - torch.from_numpy(outputs["last_hidden_state"]).to(self.device) + torch.from_numpy(self.infer_request.get_tensor("last_hidden_state").data).to(self.device) if not np_inputs - else outputs["last_hidden_state"] + else self.infer_request.get_tensor("last_hidden_state").data ) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -500,6 +534,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_ids, np.ndarray) if not np_inputs: @@ -517,8 +552,16 @@ def forward( inputs["token_type_ids"] = token_type_ids # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return MaskedLMOutput(logits=logits) @@ -634,6 +677,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(pixel_values, np.ndarray) if not np_inputs: @@ -644,8 +688,16 @@ def forward( } # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( 
+ torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return ImageClassifierOutput(logits=logits) @@ -694,6 +746,7 @@ def forward( **kwargs, ): self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: @@ -709,8 +762,16 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return SequenceClassifierOutput(logits=logits) @@ -767,6 +828,8 @@ def forward( attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None, **kwargs, ): + self.compile() + self.create_infer_request() np_inputs = isinstance(input_values, np.ndarray) if not np_inputs: input_values = np.array(input_values) @@ -781,8 +844,16 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + if self.async_exec: + self.infer_request.start_async(inputs) + self.infer_request.wait() + else: + self.infer_request.infer(inputs) + logits = ( + torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else self.infer_request.get_tensor("logits").data + ) return CausalLMOutput(logits=logits) @@ -862,12 +933,19 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) embeddings = ( - torch.from_numpy(outputs["embeddings"]).to(self.device) if not np_inputs else outputs["embeddings"] + torch.from_numpy(infer_request.get_tensor("embeddings").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("embeddings").data ) - return XVectorOutput(logits=logits, embeddings=embeddings) @@ -939,7 +1017,13 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - outputs = self.request(inputs) - logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"] + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs) + infer_request.wait() + logits = ( + torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) + if not np_inputs + else infer_request.get_tensor("logits").data + ) return TokenClassifierOutput(logits=logits) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 094840c297..0892736e07 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -65,12 +65,10 @@ def __init__( self.ov_config = ov_config if ov_config is not None else {"PERFORMANCE_HINT": "LATENCY"} self.preprocessors = kwargs.get("preprocessors", 
[]) enable_compilation = kwargs.get("compile", True) - if self.is_dynamic: height = -1 if self.export_feature == "image-classification" else None width = -1 if self.export_feature == "image-classification" else None model = self._reshape(model, -1, -1, height, width) - input_names = {} for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) @@ -82,9 +80,11 @@ def __init__( names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx self.output_names = output_names - self.model = model - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None + self.async_exec = False + self.compiled_model = None if enable_compilation: self.compile() @@ -336,7 +336,7 @@ def _to_load( ) def compile(self): - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} if ( @@ -348,11 +348,16 @@ def compile(self): cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") - self.request = core.compile_model(self.model, self._device, ov_config) + self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) + + def create_infer_request(self): + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() def _reshape( self, @@ -390,7 +395,9 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid """ self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False self.model = self._reshape(self.model, batch_size, sequence_length, height, width) - self.request = None + self.compiled_model = None + self.infer_request = None + self.request = None # Deprecated attribute, use compiled_model instead return self def half(self): @@ -399,12 +406,22 @@ def half(self): """ apply_moc_transformations(self.model, cf=False) compress_model_transformation(self.model) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.compiled_model = None + self.infer_request = None return self def forward(self, *args, **kwargs): raise NotImplementedError + def clone(self): + self.compile() + model_cloned = self.__class__(self.model, config=self.config, compile=False, dynamic_shapes=False) + model_cloned.compiled_model = self.compiled_model + model_cloned.async_exec = True + model_cloned._device = self._device + return model_cloned + def can_generate(self) -> bool: """ Returns whether this model can generate sequences with `.generate()`. diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 08165578f0..e7ac6817e8 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -107,7 +107,6 @@ def __init__( "`dynamic_shapes` was set to `False` but static shapes are not supported for causal language model. Please set `dynamic_shapes=True`." 
) - enable_compilation = kwargs.get("compile", True) kwargs["compile"] = False # avoid extra compilation in the base class super().__init__( @@ -130,21 +129,14 @@ def __init__( self.num_pkv = 2 self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] - self._original_model = self.model.clone() # keep original model for serialization - self._pkv_precision = Type.f32 self.next_beam_idx = None - self.update_pkv_precision() - if self.is_dynamic: - self.model = self._reshape(self.model, -1, -1) is_stateful_supported = ensure_stateful_is_available(warn=False) - if self.use_cache and not self.stateful: logger.warn( "Provided model does not contain state. It may lead to sub-optimal performance." "Please reexport model with updated OpenVINO version >= 2023.3.0 calling the `from_pretrained` method with original model " "and `export=True` parameter" ) - if self.stateful: if stateful is None: stateful = is_stateful_supported @@ -171,7 +163,13 @@ def raise_error(model_prop, user_prop, name): if use_cache ^ self.use_cache: raise_error(self.use_cache, use_cache, "use_cache") - if enable_compilation: + def init_ov_model(self, compile=True): + self._pkv_precision = Type.f32 + self.update_pkv_precision(force_fp32=False) + if self.is_dynamic: + self.model = self._reshape(self.model, -1, -1) + self._original_model = self.model.clone() # keep original model for serialization + if compile: self.compile() def update_pkv_precision(self, force_fp32=False): @@ -209,7 +207,8 @@ def update_pkv_precision(self, force_fp32=False): self.model = self._original_model.clone() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.compiled_model = None def _save_pretrained(self, save_directory: Union[str, Path]): """ @@ -280,6 +279,7 @@ def _from_transformers( config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) + return cls._from_pretrained( model_id=save_dir_path, config=config, @@ -333,14 +333,19 @@ def normalized_config(self): return NormalizedConfigManager.get_normalized_config_class(self.config.model_type)(self.config) def compile(self): - if self.request is None: + if self.compiled_model is None: super().compile() - self.request = self.request.create_infer_request() def _make_stateful(self): patch_stateful(self.config, self.model) self.stateful = True + def create_infer_request(self): + if self.compiled_model is None: + self.compile() + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() + @add_start_docstrings( """ @@ -419,7 +424,7 @@ def prepare_inputs( # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: # This is the first iteration in a sequence, reset all states - self.request.reset_state() + self.infer_request.reset_state() # Set initial value for the next beam_idx input that will be used at the current iteration # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used self.next_beam_idx = np.arange(batch_size, dtype=int) @@ -464,6 +469,7 @@ def forward( **kwargs, ) -> CausalLMOutputWithPast: self.compile() + self.create_infer_request() inputs = self.prepare_inputs( input_ids=input_ids, @@ -474,9 +480,9 @@ def forward( ) # Run inference - self.request.start_async(inputs, share_inputs=True) - 
self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + logits = torch.from_numpy(self.infer_request.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in # the first condition at the function beginning above. @@ -486,7 +492,7 @@ def forward( if not self.stateful: if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) - past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) + past_key_values = tuple(self.infer_request.get_tensor(key).data for key in self.key_value_output_names) if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( @@ -593,7 +599,7 @@ def _from_pretrained( init_cls = cls causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - + causal_model.init_ov_model(compile=kwargs.get("compile", True)) if load_in_4bit: if not is_nncf_available(): raise ImportError( @@ -611,6 +617,13 @@ def _from_pretrained( _weight_only_quantization(causal_model, quantization_config) return causal_model + def clone(self): + model_instance = self.__class__(model=self.model, config=self.config, compile=False, use_cache=self.use_cache) + model_instance.compiled_model = self.compiled_model + model_instance._pkv_precision = self._pkv_precision + model_instance.request = None + return model_instance + class OVBloomForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 867354a543..ae81e4f8cc 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import importlib import logging import os @@ -95,7 +96,7 @@ def __init__( self._model_save_dir = ( Path(model_save_dir.name) if isinstance(model_save_dir, TemporaryDirectory) else model_save_dir ) - self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) + self.vae_decoder = OVModelVaeDecoder(vae_decoder, self) if vae_decoder is not None else None self.unet = OVModelUnet(unet, self) self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None self.text_encoder_2 = ( @@ -105,13 +106,12 @@ def __init__( ) self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - + if vae_decoder is not None: + if "block_out_channels" in self.vae_decoder.config: + self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) + else: + self.vae_scale_factor = 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 self.scheduler = scheduler @@ -271,8 +271,8 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - - return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + pipe = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) + return pipe @classmethod def _from_transformers( @@ -517,6 +517,41 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + def clone(self): + self.compile() + config = self._internal_dict + scheduler = self.scheduler + unet = self.unet.model + model_save_dir = self._model_save_dir + pipe_cloned = self.__class__( + unet=unet, + config=config, + scheduler=scheduler, + compile=False, + dynamic_shapes=False, + model_save_dir=model_save_dir, + ) + pipe_cloned.unet = self.unet.clone() + if self.vae_decoder is not None: + pipe_cloned.vae_decoder = self.vae_decoder.clone() + if self.text_encoder is not None: + pipe_cloned.text_encoder = self.text_encoder.clone() + if self.text_encoder_2 is not None: + pipe_cloned.text_encoder_2 = self.text_encoder_2.clone() + if self.vae_encoder is not None: + pipe_cloned.vae_encoder = self.vae_encoder.clone() + pipe_cloned.vae_scale_factor = self.vae_scale_factor + pipe_cloned.image_processor = self.image_processor + pipe_cloned.tokenizer = self.tokenizer + pipe_cloned.tokenizer_2 = self.tokenizer_2 + pipe_cloned.is_dynamic = self.is_dynamic + # Default PNDMscheduler is not working in HF with multithreading + # https://github.com/huggingface/diffusers/issues/3672 + # Full copy scheduler as a WA + if isinstance(self.scheduler, PNDMScheduler): + pipe_cloned.scheduler = copy.deepcopy(self.scheduler) + return pipe_cloned + class OVModelPart: CONFIG_NAME = "config.json" @@ -537,14 +572,16 @@ def __init__( for inputs in self.model.inputs } self.ov_config = ov_config or {**self.parent_model.ov_config} - self.request = None + self.compiled_model = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None self._model_name = model_name self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME 
self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} def _compile(self): - if self.request is None: + if self.compiled_model is None: if ( "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()) @@ -552,12 +589,24 @@ def _compile(self): ): self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") - logger.info(f"Compiling the {self._model_name} to {self.device} ...") - self.request = core.compile_model(self.model, self.device, self.ov_config) + logger.info(f"Compiling the {self._model_name} to {self.device} with config {self.ov_config} ... ") + self.compiled_model = core.compile_model(self.model, self.device, self.ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self.device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) + + def create_infer_request(self): + if self.infer_request is None: + self.infer_request = self.compiled_model.create_infer_request() + + def clone(self): + model_cloned = self.__class__(self.model, self.parent_model, ov_config=self.ov_config) + model_cloned.model = self.model + model_cloned.compiled_model = self.compiled_model + model_cloned.config = self.config + return model_cloned @property def device(self): @@ -576,12 +625,15 @@ def __init__( def __call__(self, input_ids: np.ndarray): self._compile() + self.create_infer_request() inputs = { "input_ids": input_ids, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] + return outputs class OVModelUnet(OVModelPart): @@ -600,6 +652,7 @@ def __call__( timestep_cond: Optional[np.ndarray] = None, ): self._compile() + self.create_infer_request() inputs = { "sample": sample, @@ -614,8 +667,10 @@ def __call__( if timestep_cond is not None: inputs["timestep_cond"] = timestep_cond - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.get_tensor(output).data for output in self.infer_request.results] + return outputs class OVModelVaeDecoder(OVModelPart): @@ -626,12 +681,15 @@ def __init__( def __call__(self, latent_sample: np.ndarray): self._compile() + self.create_infer_request() inputs = { "latent_sample": latent_sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + self.infer_request.start_async(inputs, share_inputs=True) + self.infer_request.wait() + outputs = [self.infer_request.results[output].data for output in self.infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: @@ -651,8 +709,11 @@ def __call__(self, sample: np.ndarray): inputs = { "sample": sample, } - outputs = self.request(inputs, share_inputs=True) - return list(outputs.values()) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + outputs = 
[infer_request.get_tensor(output).data for output in infer_request.results] + return outputs def _compile(self): if "GPU" in self.device: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 617d898be5..98b40866a6 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -418,7 +418,9 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.device = torch.device("cpu") self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)} self.main_input_name = self.parent_model.main_input_name or "input_ids" - self.request = None + self.compiled_model = None + self.request = None # Deprecated attribute, use compiled_model instead + self.infer_request = None @add_start_docstrings_to_model_forward(ENCODER_INPUTS_DOCSTRING) def forward( @@ -437,9 +439,10 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - last_hidden_state = torch.from_numpy( - self.request(inputs, share_inputs=True, share_outputs=True)["last_hidden_state"] - ).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + last_hidden_state = torch.from_numpy(infer_request.get_tensor("last_hidden_state").data).to(self.device) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -456,13 +459,14 @@ def _compile(self): cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the encoder to {self._device} ...") - self.request = core.compile_model(self.model, self._device, ov_config) + self.compiled_model = core.compile_model(self.model, self._device, ov_config) + self.request = self.compiled_model # Deprecated attribute, use compiled_model instead # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(self.request) + _print_compiled_model_properties(self.compiled_model) class OVDecoder: @@ -494,7 +498,9 @@ def __init__(self, model: openvino.runtime.Model, parent_model: OVModelForSeq2Se self.use_past = False self.num_pkv = 4 - self.request = None + self.request = None # Deprecated attribute, use compiled_model instead + self.compiled_model = None + self.infer_request = None @add_start_docstrings_to_model_forward(DECODER_INPUTS_DOCSTRING) def forward( @@ -531,13 +537,14 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) + infer_request = self.compiled_model.create_infer_request() + infer_request.start_async(inputs, share_inputs=True) + infer_request.wait() + logits = torch.from_numpy(infer_request.get_tensor("logits").data).to(self.device) # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) - out_past_key_values = tuple(self.request.get_tensor(key).data for key in 
self.key_value_output_names) + out_past_key_values = tuple(infer_request.get_tensor(key).data for key in self.key_value_output_names) # Tuple of tuple of length `n_layers`, with each tuple of length equal to: # * 4 for the decoder without cache (k/v of self-attention + k/v of cross-attention) @@ -568,14 +575,13 @@ def _compile(self): cache_dir = Path(self.parent_model.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - if self.request is None: + if self.compiled_model is None: logger.info(f"Compiling the decoder to {self._device} ...") - compiled_model = core.compile_model(self.model, self._device, ov_config) - self.request = compiled_model.create_infer_request() + self.compiled_model = core.compile_model(self.model, self._device, ov_config) # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2: logger.info(f"{self._device} SUPPORTED_PROPERTIES:") - _print_compiled_model_properties(compiled_model) + _print_compiled_model_properties(self.compiled_model) @add_start_docstrings( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 5f3208fd58..84e33d7f7b 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -50,7 +50,7 @@ set_seed, ) from transformers.onnx.utils import get_preprocessor -from utils_tests import MODEL_NAMES +from utils_tests import MODEL_NAMES, run_on_multiple_threads from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( @@ -125,9 +125,9 @@ def test_load_from_hub_and_save_model(self): self.assertTrue(manual_openvino_cache_dir.is_dir()) self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) if is_openvino_version("<", "2023.3"): - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") else: - self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT"), "THROUGHPUT") + self.assertEqual(loaded_model.compiled_model.get_property("PERFORMANCE_HINT"), "THROUGHPUT") with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) @@ -253,7 +253,7 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(ov_model.config, PretrainedConfig) transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" + inputs = "This is sample input." tokens = tokenizer(inputs, return_tensors="pt") with torch.no_grad(): transformers_outputs = transformers_model(**tokens) @@ -268,6 +268,49 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + transformers_model = AutoModelForSequenceClassification.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = [ + [ + "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." 
+ ], + [ + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen" + ], + [ + "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three." + ], + [ + "This was a tragedy. Completely different story than presented in the books. Weak writing, a lot of plot wholes, trivial characters. Might be the worst thing I've seen", + ], + ] + + def run_ov_model(inputs, transformers_model, ov_model): + tokens = tokenizer(inputs, return_tensors="pt") + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + ov_model_instance = ov_model.clone() + for input_type in ["pt", "np"]: + tokens = tokenizer(inputs, return_tensors=input_type) + ov_outputs = ov_model_instance(**tokens) + self.assertIn("logits", ov_outputs) + self.assertIsInstance(ov_outputs.logits, TENSOR_ALIAS_TO_TYPE[input_type]) + # Compare tensor outputs + close_enough = torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-4) + self.assertTrue(close_enough) + + run_on_multiple_threads(run_ov_model, inputs_list, (transformers_model, ov_model)) + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -530,6 +573,52 @@ def test_compare_to_transformers(self, model_arch): del ov_model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + if "llama_gptq" in model_arch: + self.skipTest("Not supported without gpu and disable_exllama=True option") + set_seed(SEED) + ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + self.assertIsInstance(ov_model.config, PretrainedConfig) + self.assertTrue(ov_model.use_cache) + self.assertEqual(ov_model.stateful, self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode") + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs_list = ["This is a sample", "Here is another sample", "That's the thrid one", "This is the last sample"] + tokens_list = [ + tokenizer(inputs, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) + for inputs in inputs_list + ] + + def run_ov_model(tokens, transformers_model, ov_model): + # global ov_model, transformers_model + ov_model_instance = ov_model.clone() + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = ( + torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ) + ov_outputs = ov_model_instance(**tokens, position_ids=position_ids) + + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in ov_outputs) + self.assertIsInstance(ov_outputs.past_key_values, tuple) + if self.IS_SUPPORT_STATEFUL and model_arch != "gpt_bigcode": + self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens) + # Compare tensor outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + 
run_on_multiple_threads(run_ov_model, tokens_list, (transformers_model, ov_model)) + + del transformers_model + del ov_model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -547,6 +636,30 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False, compile=False) + model.config.encoder_no_repeat_ngram_size = 0 + model.to("cpu") + model.half() + model.compile() + + def run_ov_model(input_text, model): + # Tokenizer is not supposed to be shared by multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model.clone(), tokenizer=tokenizer) + outputs = pipe(input_text, max_length=10) + self.assertEqual(pipe.device, model.device) + for i in range(len(outputs)): + self.assertTrue(all(input_text[i] in item["generated_text"] for item in outputs[i])) + del pipe + + inputs_list = [["This is a sample"], ["This is a second sample"], ["This is a third sample"]] + run_on_multiple_threads(run_ov_model, inputs_list, [model]) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_multiple_inputs(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -563,6 +676,29 @@ def test_multiple_inputs(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_multiple_inputs_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"] + tokens = tokenizer(texts, padding=True, return_tensors="pt") + generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2) + + def run_ov_model(tokens, model): + model_instance = model.clone() + # self.assertEqual(False) + outputs = model_instance.generate(**tokens, generation_config=generation_config) + self.assertIsInstance(outputs, torch.Tensor) + self.assertEqual(outputs.shape[0], 3) + + tokens_list = [tokens, tokens, tokens, tokens] # running in 4 threads + run_on_multiple_threads(run_ov_model, tokens_list, [model]) + del model + gc.collect() + def test_model_and_decoder_same_device(self): model_id = MODEL_NAMES["gpt2"] model = OVModelForCausalLM.from_pretrained(model_id, export=True) @@ -607,7 +743,7 @@ def test_print_model_properties(self): if openvino_log_level is not None: os.environ["OPENVINO_LOG_LEVEL"] = openvino_log_level # test calling function directly - _print_compiled_model_properties(model.request) + _print_compiled_model_properties(model.compiled_model) def test_auto_device_loading(self): OV_MODEL_ID = "echarlaix/distilbert-base-uncased-finetuned-sst-2-english-openvino" @@ -618,7 +754,7 @@ def test_auto_device_loading(self): if device == "AUTO:CPU": model = OVModelForSequenceClassification.from_pretrained(OV_MODEL_ID, device=device) message = "Model should not be loaded from cache without explicitly setting CACHE_DIR" - self.assertFalse(model.request.get_property("LOADED_FROM_CACHE"), message) + self.assertFalse(model.compiled_model.get_property("LOADED_FROM_CACHE"), message) del 
model gc.collect() @@ -787,7 +923,7 @@ def test_pipeline(self, model_arch): @parameterized.expand(TIMM_MODELS) def test_compare_to_timm(self, model_id): ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) - self.assertEqual(ov_model.request.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") + self.assertEqual(ov_model.compiled_model.get_property("INFERENCE_PRECISION_HINT").to_string(), "f32") self.assertIsInstance(ov_model.config, PretrainedConfig) timm_model = timm.create_model(model_id, pretrained=True) preprocessor = TimmImageProcessor.from_pretrained(model_id) @@ -865,6 +1001,52 @@ def test_compare_to_transformers(self, model_arch): gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_compare_to_transformers_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG) + + self.assertIsInstance(ov_model.encoder, OVEncoder) + self.assertIsInstance(ov_model.decoder, OVDecoder) + self.assertIsInstance(ov_model.decoder_with_past, OVDecoder) + self.assertIsInstance(ov_model.config, PretrainedConfig) + + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inputs_list = [ + "This is a sample input for the first thread", + "Input sample for another thread", + "This is a third sample input", + "This last sample is for the last thread", + ] + args_list = [] + for inputs in inputs_list: + tokens = tokenizer(inputs, return_tensors="pt") + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + args_list.append([tokens, decoder_inputs]) + + def run_ov_model(arg, transformers_model, ov_model): + tokens = arg[0] + decoder_inputs = arg[1] + # global ov_model, transformers_model + ov_outputs = ov_model(**tokens, **decoder_inputs) + self.assertTrue("logits" in ov_outputs) + self.assertIsInstance(ov_outputs.logits, torch.Tensor) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) + # Compare tensor outputs + self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4)) + + run_on_multiple_threads(run_ov_model, args_list, (transformers_model, ov_model)) + del transformers_model + del ov_model + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -898,6 +1080,42 @@ def test_pipeline(self, model_arch): del model gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + # This works the old way - infer request per inference, no cloning + def test_pipeline_multithreading(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, compile=False) + model.half() + model.to("cpu") + model.compile() + + def run_ov_model(text, model): + # Tokenizer is not supposed to be shared between multiple threads + tokenizer = AutoTokenizer.from_pretrained(model_id) + # Text2Text generation + pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["generated_text"], str) + + # Summarization 
+ pipe = pipeline("summarization", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["summary_text"], str) + + # Translation + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + outputs = pipe(text) + self.assertEqual(pipe.device, model.device) + self.assertIsInstance(outputs[0]["translation_text"], str) + del pipe + + texts_list = [["This is a test"], ["This is a test, but for another thread"], ["Yet another test"]] + run_on_multiple_threads(run_ov_model, texts_list, [model]) + del model + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_generate_utils(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -925,20 +1143,23 @@ def test_compare_with_and_without_past_key_values(self): text = "This is a sample input" tokens = tokenizer(text, return_tensors="pt") - model_with_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=True) + model_with_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**tokens) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForSeq2SeqLM.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**tokens) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) @@ -1179,7 +1400,7 @@ class OVModelForPix2StructIntegrationTest(unittest.TestCase): TASK = "image-to-text" # is it fine as well with visual-question-answering? GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 + SPEEDUP_CACHE = 1.01 IMAGE = Image.open( requests.get( @@ -1240,20 +1461,23 @@ def test_compare_with_and_without_past_key_values(self): question = "Who am I?" 
inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - model_with_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=True) + model_with_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=True, ov_config=F32_CONFIG + ) _ = model_with_pkv.generate(**inputs) # warmup with Timer() as with_pkv_timer: outputs_model_with_pkv = model_with_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - model_without_pkv = OVModelForPix2Struct.from_pretrained(model_id, export=True, use_cache=False) + model_without_pkv = OVModelForPix2Struct.from_pretrained( + model_id, export=True, use_cache=False, ov_config=F32_CONFIG + ) _ = model_without_pkv.generate(**inputs) # warmup with Timer() as without_pkv_timer: outputs_model_without_pkv = model_without_pkv.generate( **inputs, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index d8cef2e027..65044339f2 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -21,6 +21,7 @@ import PIL import torch from diffusers import ( + DDIMScheduler, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline, @@ -30,7 +31,7 @@ from openvino.runtime.ie_api import CompiledModel from packaging.version import Version, parse from parameterized import parameterized -from utils_tests import MODEL_NAMES, SEED +from utils_tests import MODEL_NAMES, SEED, run_on_multiple_threads from optimum.intel import ( OVLatentConsistencyModelPipeline, @@ -251,6 +252,64 @@ def test_compare_to_diffusers(self, model_arch: str): # Compare model devices self.assertEqual(pipeline.device.type, ov_pipeline.device) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_diffusers_multithreading(self, model_arch: str): + model_id = MODEL_NAMES[model_arch] + ov_pipeline = self.MODEL_CLASS.from_pretrained( + model_id, export=True, ov_config=F32_CONFIG, compile=True, trust_remote_code=True + ) + ov_pipeline.scheduler = DDIMScheduler.from_config(ov_pipeline.scheduler.config) + self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) + self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) + self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) + self.assertIsInstance(ov_pipeline.unet, OVModelUnet) + self.assertIsInstance(ov_pipeline.config, Dict) + pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + # Default PNDMscheduler is not working in HF with multithreading + # https://github.com/huggingface/diffusers/issues/3672 + pipeline.safety_checker = None + batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 + + def run_ov_model(prompt, ov_pipeline): + ov_pipeline_instance = ov_pipeline.clone() + latents = ov_pipeline_instance.prepare_latents( + batch_size * num_images_per_prompt, + ov_pipeline_instance.unet.config["in_channels"], + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), + ) + + kwargs = { + "prompt": prompt, + "num_inference_steps": 1, + "num_images_per_prompt": 
num_images_per_prompt, + "height": height, + "width": width, + "guidance_rescale": 0.1, + } + + for output_type in ["latent", "np"]: + ov_outputs = ov_pipeline_instance(latents=latents, output_type=output_type, **kwargs).images + self.assertIsInstance(ov_outputs, np.ndarray) + with torch.no_grad(): + outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + # Compare model outputs + self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) + + # Compare model devices + self.assertEqual(pipeline.device.type, ov_pipeline.device) + + prompt_list = [ + ["sailing ship in storm by Leonardo da Vinci"], + ["central park during christmas"], + ["zebras in space"], + ] + prompt_list = [["sailing ship in storm by Leonardo da Vinci"], ["central park during christmas"]] + run_on_multiple_threads(run_ov_model, prompt_list, [ov_pipeline]) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_image_reproducibility(self, model_arch: str): model_id = MODEL_NAMES[model_arch] @@ -410,7 +469,7 @@ def test_image_reproducibility(self, model_arch: str): # Verify every subcomponent is compiled by default for component in {"unet", "vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"}: - self.assertIsInstance(getattr(pipeline, component).request, CompiledModel) + self.assertIsInstance(getattr(pipeline, component).compiled_model, CompiledModel) batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 inputs = _generate_inputs(batch_size) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 8fabb34e38..5a1f65663a 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import threading + import numpy as np import torch @@ -132,3 +134,37 @@ def get_num_quantized_nodes(ov_model): if "4" in elem.get_output_element_type(i).get_type_name(): num_int4 += 1 return num_fake_quantize, num_int8, num_int4 + + +### Multithreading + + +class OVThread(threading.Thread): + def __init__(self, target, args): + super().__init__() + self.target = target + self.args = args + + def run(self): + self.exception = None + try: + self.target(*self.args) + except Exception as e: + self.exception = e + + def join(self): + super().join() + if self.exception: + raise self.exception + + +# Each set of args is run in a separate thread. +# Amount of such sets define how many threads are spawned. +def run_on_multiple_threads(target, list, extra_args): + threads = [] + for input in list: + threads.append(OVThread(target=target, args=(input, *extra_args))) + for thread in threads: + thread.start() + for thread in threads: + thread.join()
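For orientation, the `run_on_multiple_threads` helper added above is meant to be used the way the new tests use it: one thread per entry of the input list, with `extra_args` shared by every thread, and any thread exception re-raised in `join()`. A minimal sketch tying it to the new `clone()` API follows; the model id and the `worker` function here are illustrative assumptions, not part of the patch:

```python
from transformers import AutoTokenizer

from optimum.intel import OVModelForCausalLM

MODEL_ID = "hf-internal-testing/tiny-random-gpt2"  # assumed tiny test model
model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, compile=True)


def worker(prompt, shared_model):
    # Tokenizers are instantiated per thread; they are not meant to be shared across threads.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # clone() shares the compiled OpenVINO model but gives this thread its own infer request.
    local_model = shared_model.clone()
    tokens = tokenizer(prompt, return_tensors="pt")
    local_model.generate(**tokens, max_new_tokens=5)


# One thread per prompt; `model` is passed to every thread via extra_args.
run_on_multiple_threads(worker, ["first prompt", "second prompt", "third prompt"], [model])
```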