diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea9c494bb8..475f5433a4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,8 @@ repos: - id: check-json - id: check-yaml - id: debug-statements + - id: mixed-line-ending + args: [--fix=lf] - id: requirements-txt-fixer - id: trailing-whitespace files: (.*\.(py|rst|cmake|yaml|yml|json|ts|js|html|svelte|sh))$ diff --git a/comps/asr/whisper/whisper_model.py b/comps/asr/whisper/whisper_model.py index c5f16e1121..85d4126cdc 100644 --- a/comps/asr/whisper/whisper_model.py +++ b/comps/asr/whisper/whisper_model.py @@ -148,7 +148,7 @@ def audio2text(self, audio_path): return_tensors="pt", sampling_rate=16000, ) - elif self.device == "hpu": + elif self.device == "hpu" and processed_inputs.input_features.shape[-1] > 3000: processed_inputs["input_features"] = torch.nn.functional.pad( processed_inputs.input_features, (0, self.hpu_max_len - processed_inputs.input_features.size(-1)), diff --git a/comps/dataprep/redis/llama_index/requirements.txt b/comps/dataprep/redis/llama_index/requirements.txt index 9e8dbaa9f7..ad75869c18 100644 --- a/comps/dataprep/redis/llama_index/requirements.txt +++ b/comps/dataprep/redis/llama_index/requirements.txt @@ -12,6 +12,7 @@ opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator python-bidi==0.4.2 +python-multipart redis sentence_transformers shortuuid diff --git a/comps/embeddings/README.md b/comps/embeddings/README.md index ce4b4fa461..edf164b486 100644 --- a/comps/embeddings/README.md +++ b/comps/embeddings/README.md @@ -45,7 +45,7 @@ First, you need to start a TEI service. your_port=8090 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision ``` Then you need to test your TEI service using the following commands: @@ -66,9 +66,6 @@ cd langchain cd llama_index export TEI_EMBEDDING_ENDPOINT="http://localhost:$yourport" export TEI_EMBEDDING_MODEL_NAME="BAAI/bge-large-en-v1.5" -export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY=${your_langchain_api_key} -export LANGCHAIN_PROJECT="opea/gen-ai-comps:embeddings" python embedding_tei.py ``` @@ -92,7 +89,7 @@ First, you need to start a TEI service. your_port=8090 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision ``` Then you need to test your TEI service using the following commands: @@ -124,13 +121,16 @@ docker build -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy - ```bash cd ../../ -docker build -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . 
+docker build -t opea/embedding-tei-llama-index:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . ``` ## 2.3 Run Docker with CLI ```bash +# run with langchain docker docker run -d --name="embedding-tei-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e TEI_EMBEDDING_MODEL_NAME=$TEI_EMBEDDING_MODEL_NAME opea/embedding-tei:latest +# run with llama-index docker +docker run -d --name="embedding-tei-llama-index-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e TEI_EMBEDDING_MODEL_NAME=$TEI_EMBEDDING_MODEL_NAME opea/embedding-tei-llama-index:latest ``` ## 2.4 Run Docker with Docker Compose diff --git a/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml b/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml index 62f5870b7b..152f5030b0 100644 --- a/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml +++ b/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml @@ -5,7 +5,7 @@ version: "3.8" services: embedding: - image: opea/embedding-tei:latest + image: opea/embedding-tei-llama-index:latest container_name: embedding-tei-server ports: - "6000:6000" @@ -16,7 +16,6 @@ services: https_proxy: ${https_proxy} TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} TEI_EMBEDDING_MODEL_NAME: ${TEI_EMBEDDING_MODEL_NAME} - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} restart: unless-stopped networks: diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index 9ca5116dab..e9bb7ba663 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -20,6 +20,7 @@ prometheus-fastapi-instrumentator pyarrow pymupdf python-docx +python-multipart ray redis scikit-learn diff --git a/comps/llms/text-generation/native/Dockerfile b/comps/llms/text-generation/native/Dockerfile deleted file mode 100644 index 9d7d1e0945..0000000000 --- a/comps/llms/text-generation/native/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ - - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# HABANA environment -FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu - -ENV LANG=en_US.UTF-8 -ARG REPO=https://github.com/huggingface/optimum-habana.git -ARG REPO_VER=v1.11.1 - -RUN apt-get update && \ - apt-get install git-lfs && \ - git-lfs install && \ - apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev \ - vim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps -COPY comps/llm/text-generation/qwen2/qwen2.patch /home/user/qwen2.patch - -SHELL ["/bin/bash", "--login", "-c"] -RUN git clone --single-branch -b ${REPO_VER} ${REPO} /optimum-habana - -ENV PYTHONPATH=/root:/home/user - -RUN cd /optimum-habana && git apply /qwen2.patch && \ - cd /optimum-habana/examples/text-generation && pip install -r requirements.txt && \ - cd /optimum-habana && python setup.py install - -WORKDIR /home/user/comps/llms/text-generation/qwen2 - -ENTRYPOINT ["python", "llm.py"] diff --git a/comps/llms/text-generation/native/README.md b/comps/llms/text-generation/native/README.md new file mode 100644 index 0000000000..a4fcc74c33 --- /dev/null +++ 
b/comps/llms/text-generation/native/README.md @@ -0,0 +1,61 @@ +# LLM Native Microservice + +LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware. + +## 🚀1. Start Microservice + +If you start the LLM microservice with Docker, the `docker_compose_llm.yaml` file will automatically start a Native LLM service in a container. + +### 1.1 Setup Environment Variables + +To start the Native LLM service, you need to set up the following environment variables first. + +```bash +export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" +``` + +### 1.2 Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/docker/Dockerfile . +``` + +To start a Docker container, you have two options: + +- A. Run Docker with CLI +- B. Run Docker with Docker Compose + +You can choose one as needed. + +### 1.3 Run Docker with CLI (Option A) + +```bash +docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} opea/llm-native:latest +``` + +### 1.4 Run Docker with Docker Compose (Option B) + +```bash +cd docker +docker compose -f docker_compose_llm.yaml up -d +``` + +## 🚀2. Consume LLM Service + +### 2.1 Check Service Status + +```bash +curl http://${your_ip}:9000/v1/health_check\ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 2.2 Consume LLM Service + +```bash +curl http://${your_ip}:9000/v1/chat/completions\ + -X POST \ + -d '{"query":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' +``` diff --git a/comps/llms/text-generation/native/docker/Dockerfile b/comps/llms/text-generation/native/docker/Dockerfile new file mode 100644 index 0000000000..3dacf52114 --- /dev/null +++ b/comps/llms/text-generation/native/docker/Dockerfile @@ -0,0 +1,42 @@ + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as hpu + +ENV LANG=en_US.UTF-8 +ARG REPO=https://github.com/huggingface/optimum-habana.git +ARG REPO_VER=v1.12.1 + +RUN apt-get update && \ + apt-get install git-lfs && \ + git-lfs install && \ + apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --upgrade-strategy eager optimum[habana] && \ + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 + +RUN git clone ${REPO} /home/user/optimum-habana && \ + cd /home/user/optimum-habana && git checkout ${REPO_VER} && \ + cd examples/text-generation && pip install -r requirements.txt && \ + cd /home/user/comps/llms/text-generation/native && pip install -r requirements.txt && \ + pip install --upgrade --force-reinstall pydantic + +ENV PYTHONPATH=/root:/home/user + +WORKDIR 
/home/user/comps/llms/text-generation/native + +ENTRYPOINT ["python", "llm.py"] diff --git a/comps/llms/text-generation/native/docker/docker_compose_llm.yaml b/comps/llms/text-generation/native/docker/docker_compose_llm.yaml new file mode 100644 index 0000000000..f3a36e5bb8 --- /dev/null +++ b/comps/llms/text-generation/native/docker/docker_compose_llm.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + llm: + image: opea/llm-native:latest + container_name: llm-native-server + ports: + - "9000:9000" + runtime: habana + cap_add: + - SYS_NICE + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_NATIVE_MODEL: ${LLM_NATIVE_MODEL} + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + TOKENIZERS_PARALLELISM: false + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/text-generation/native/llm.py b/comps/llms/text-generation/native/llm.py index 4f407ccd65..43348670d5 100644 --- a/comps/llms/text-generation/native/llm.py +++ b/comps/llms/text-generation/native/llm.py @@ -11,87 +11,156 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import sys -import os -from datetime import datetime +sys.path.append("/test/GenAIComps/") + +import logging +import threading +import time import torch -from fastapi.responses import StreamingResponse -from langsmith import traceable +from langchain_core.prompts import PromptTemplate +from template import ChatTemplate, args_dict, input_sentences from utils import initialize_model -from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice +from comps import ( + GeneratedDoc, + LLMParamsDoc, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, +) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) -def warmup(): - input_sentences = ["DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all"] - input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("hpu") - for i in range(3): - print(f"Current time: {datetime.now()}") - print(f"Warming up {i+1}...") - outputs = model.generate( - **input_tokens, - generation_config=generation_config, - lazy_mode=True, - hpu_graphs=True, - profiling_steps=0, - profiling_warmup_steps=0, - ).cpu() - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - print(f"res: {res}") +class Args: + def __init__(self, **entries): + self.__dict__.update(entries) -@register_microservice( - name="opea_service@llm_qwen", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=8000, -) -@traceable(run_type="llm") -def llm_generate(input: LLMParamsDoc): - input_query = input.query - input_tokens = tokenizer.batch_encode_plus([input_query], return_tensors="pt", padding=True) + +model = None +assistant_model = None +tokenizer = None +generation_config = None +args = Args(**args_dict) +initialization_lock = threading.Lock() +initialized = False + + +def generate( + input_query: list, + device="hpu", + 
use_lazy_mode=True, + use_hpu_graphs=True, + profiling_steps=0, + profiling_warmup_steps=0, + ignore_eos=True, + profiling_record_shapes=False, +): + """Generates sequences from the input sentences and returns them.""" + logger.info(f"[llm - generate] starting to inference with prompt {input_query}") + encode_t0 = time.perf_counter() + + # Tokenization + input_tokens = tokenizer.batch_encode_plus(input_query, return_tensors="pt", padding=True) + encode_duration = time.perf_counter() - encode_t0 + logger.info(f"[llm - generate] input tokenized: {input_tokens}") + + # Move inputs to target device(s) for t in input_tokens: + logger.info(f"[llm - generate] t: {t}") if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("hpu") + logger.info("[llm - generate] input[t] is tensor") + logger.info(f"[llm - generate] device: {model.device}") + input_tokens[t] = input_tokens[t].to(model.device) - print(f"[llm - qwen] Current time: {datetime.now()}") - output = model.generate( + logger.info("[llm - generate] inputs transferred.") + + iteration_times = [] + outputs = model.generate( **input_tokens, generation_config=generation_config, - lazy_mode=True, - hpu_graphs=True, - profiling_steps=0, - profiling_warmup_steps=0, + assistant_model=assistant_model, + lazy_mode=use_lazy_mode, + hpu_graphs=use_hpu_graphs, + profiling_steps=profiling_steps, + profiling_warmup_steps=profiling_warmup_steps, + ignore_eos=ignore_eos, + iteration_times=iteration_times, + profiling_record_shapes=profiling_record_shapes, ).cpu() - res = tokenizer.batch_decode(output, skip_special_tokens=True)[0] - print(f"[llm - qwen] res: {res}") - return res + logger.info("[llm - generate] result generated") + first_token_time = iteration_times[0] + encode_duration + result = tokenizer.batch_decode(outputs, skip_special_tokens=True) + logger.info(f"[llm - generate] result: {result}") + logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms") + return result -if __name__ == "__main__": - model, tokenizer, generation_config = initialize_model( - model_name_or_path="Qwen/Qwen1.5-7B-Chat", max_new_tokens=128 - ) - import habana_frameworks.torch.hpu as torch_hpu +def initialize(): + global model, assistant_model, tokenizer, generation_config, initialized + with initialization_lock: + if not initialized: + # initialize model and tokenizer + import habana_frameworks.torch.hpu as torch_hpu + from optimum.habana.utils import HabanaProfile + + model, assistant_model, tokenizer, generation_config = initialize_model(args, logger) + logger.info("[llm] model and tokenizer initialized.") + + # compilation and model warmup + HabanaProfile.disable() + logger.info("[llm - native] Graph compilation...") + for _ in range(args.warmup): + generate(input_sentences) + logger.info("[llm - native] model warm up finished.") + torch_hpu.synchronize() + HabanaProfile.enable() + logger.info("[llm - native] Ready to inference") + res = generate(["What is Deep Learning?"]) + logger.info(f"[llm - native] test result: {res}") + initialized = True + - print("[llm - qwen] model and tokenizer initialized.") +@register_microservice( + name="opea_service@llm_native", + service_type=ServiceType.LLM, + endpoint="/v1/chat/completions", + host="0.0.0.0", + port=9000, +) +@register_statistics(names=["opea_service@llm_native"]) +def llm_generate(input: LLMParamsDoc): + initialize() - from optimum.habana.utils import HabanaProfile + prompt = input.query + prompt_template = None + if input.chat_template: + prompt_template = 
PromptTemplate.from_template(input.chat_template) + input_variables = prompt_template.input_variables + if prompt_template: + if sorted(input_variables) == ["context", "question"]: + prompt = prompt_template.format(question=input.query, context="\n".join(input.documents)) + elif input_variables == ["question"]: + prompt = prompt_template.format(question=input.query) + else: + logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + else: + if input.documents: + prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents) + res = generate([prompt]) - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - print("Graph compilation...") - warmup() - print("[llm - qwen] model warm up finished.") + logger.info(f"[llm - native] inference result: {res}") + return GeneratedDoc(text=res[0], prompt=input.query) - torch_hpu.synchronize() - HabanaProfile.enable() - print("[llm - qwen] Ready to inference") - opea_microservices["opea_service@llm_qwen"].start() +if __name__ == "__main__": + opea_microservices["opea_service@llm_native"].start() diff --git a/comps/llms/text-generation/native/qwen2.patch b/comps/llms/text-generation/native/qwen2.patch deleted file mode 100644 index 9b5d935670..0000000000 --- a/comps/llms/text-generation/native/qwen2.patch +++ /dev/null @@ -1,127 +0,0 @@ -diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py -index b086c80..e0e5a9f 100644 ---- a/examples/text-generation/run_lm_eval.py -+++ b/examples/text-generation/run_lm_eval.py -@@ -75,13 +75,13 @@ class HabanaModelAdapter(lm_eval.base.BaseLM): - self.options = options - self._device = args.device - self.model_inputs = {"use_cache": self.options.use_cache} -- if self.model.config.model_type in ["llama", "falcon"]: -+ if self.model.config.model_type in ["llama", "falcon", "qwen2"]: - self.model_inputs.update( - { - "reuse_cache": self.options.reuse_cache, - } - ) -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama","mistral","qwen2"]: - self.model_inputs.update( - { - "attn_softmax_bf16": self.options.attn_softmax_bf16, -diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py -index 8bce0ae..c29f458 100644 ---- a/examples/text-generation/utils.py -+++ b/examples/text-generation/utils.py -@@ -234,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): - - model = deepspeed.init_inference(model, **ds_inference_kwargs) - model = model.module -- if model.config.model_type in ["llama", "falcon"]: -+ if model.config.model_type in ["llama", "falcon","qwen2"]: - patch_scoped_linear_all_reduce(model) - - if args.quant_config: -diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py -index 0d50470..94cc7eb 100755 ---- a/optimum/habana/transformers/generation/utils.py -+++ b/optimum/habana/transformers/generation/utils.py -@@ -740,7 +740,7 @@ class GaudiGenerationMixin(GenerationMixin): - ) - model_kwargs["kv_cache_len"] = calculated_max_length - -- if self.config.model_type in ["llama", "falcon"]: -+ if self.config.model_type in ["llama", "falcon","qwen2"]: - if self.config.max_position_embeddings < calculated_max_length: - unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) - -diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py -index 6dc40a7..b5044af 100644 ---- 
a/optimum/habana/transformers/modeling_utils.py -+++ b/optimum/habana/transformers/modeling_utils.py -@@ -55,6 +55,9 @@ from .models import ( - GaudiOPTForCausalLM, - GaudiOPTLearnedPositionalEmbedding, - GaudiPhiForCausalLM, -+ GaudiQwen2Model, -+ GaudiQwen2Attention, -+ GaudiQwen2MLP, - _gaudi_wav2vec2_compute_mask_indices, - _gaudi_wav2vec2_mask_hidden_states, - gaudi_albert_forward, -@@ -118,6 +121,7 @@ from .models import ( - gaudi_phi_attention_forward, - gaudi_phi_decoder_layer_forward, - gaudi_phi_model_forward, -+ gaudi_qwen2_rmsnorm_forward, - gaudi_rot_matmul, - gaudi_rot_vec_mul, - gaudi_SpeechT5Attention_forward, -@@ -367,3 +371,11 @@ def adapt_transformers_to_gaudi(): - transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = ( - gaudi_SpeechT5SpeechDecoderPrenet_forward - ) -+ -+ # Optimization for qwen2 on Gaudi -+ transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = GaudiQwen2ForCausalLM -+ transformers.models.qwen2.modeling_qwen2.Qwen2Model = GaudiQwen2Model -+ transformers.models.qwen2.modeling_qwen2.Qwen2Attention = GaudiQwen2Attention -+ transformers.models.qwen2.modeling_qwen2.Qwen2MLP = GaudiQwen2MLP -+ transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GaudiQwen2DecoderLayer -+ transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm.forward = gaudi_qwen2_rmsnorm_forward -diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py -index 1582d3f..41fdfdc 100644 ---- a/optimum/habana/transformers/models/__init__.py -+++ b/optimum/habana/transformers/models/__init__.py -@@ -122,6 +122,14 @@ from .phi import ( - gaudi_phi_decoder_layer_forward, - gaudi_phi_model_forward, - ) -+from .qwen2 import ( -+ GaudiQwen2Attention, -+ GaudiQwen2DecoderLayer, -+ GaudiQwen2ForCausalLM, -+ GaudiQwen2MLP, -+ GaudiQwen2Model, -+ gaudi_qwen2_rmsnorm_forward, -+) - from .speecht5 import ( - gaudi_generate_speech, - gaudi_SpeechT5Attention_forward, -diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py -index dc6e136..7dfebaa 100644 ---- a/optimum/habana/transformers/trainer.py -+++ b/optimum/habana/transformers/trainer.py -@@ -916,9 +916,9 @@ class GaudiTrainer(Trainer): - if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - -- # attn_softmax_bf16 and use_flash_attention is enabled only for llama -+ # attn_softmax_bf16 and use_flash_attention is enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: -@@ -1799,9 +1799,9 @@ class GaudiTrainer(Trainer): - if batch_size is None: - batch_size = observed_batch_size - -- # attn_softmax_bf16 and use_flash_attention are enabled only for llama -+ # attn_softmax_bf16 and use_flash_attention are enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: diff --git a/comps/llms/text-generation/native/requirements.txt 
b/comps/llms/text-generation/native/requirements.txt index e8473a80c4..806f2d29fa 100644 --- a/comps/llms/text-generation/native/requirements.txt +++ b/comps/llms/text-generation/native/requirements.txt @@ -1,10 +1,10 @@ -docarray[full] +docarray fastapi -langsmith +httpx +langchain_core opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator shortuuid -transformers uvicorn diff --git a/comps/llms/text-generation/native/template.py b/comps/llms/text-generation/native/template.py new file mode 100644 index 0000000000..c43205a0ae --- /dev/null +++ b/comps/llms/text-generation/native/template.py @@ -0,0 +1,99 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import re + + +class ChatTemplate: + @staticmethod + def generate_rag_prompt(question, documents): + context_str = "\n".join(documents) + if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3: + # chinese context + template = """ +### 你将扮演一个乐于助人、尊重他人并诚实的助手,你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案,请避免分享不准确的信息。 +### 搜索结果:{context} +### 问题:{question} +### 回答: +""" + else: + template = """ +### You are a helpful, respectful and honest assistant to help the user with questions. \ +Please refer to the search results obtained from the local knowledge base. \ +But be careful to not incorporate the information that you think is not relevant to the question. \ +If you don't know the answer to a question, please don't share false information. \n +### Search results: {context} \n +### Question: {question} \n +### Answer: +""" + return template.format(context=context_str, question=question) + + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way", +] + + +llm_model = os.getenv("LLM_NATIVE_MODEL", "Qwen/Qwen2-7B-Instruct") +args_dict = { + "device": "hpu", + "model_name_or_path": llm_model, + "bf16": True, + "max_new_tokens": 100, + "max_input_tokens": 0, + "batch_size": 1, + "warmup": 3, + "n_iterations": 5, + "local_rank": 0, + "use_kv_cache": True, + "use_hpu_graphs": True, + "dataset_name": None, + "column_name": None, + "do_sample": False, + "num_beams": 1, + "trim_logits": False, + "seed": 27, + "profiling_warmup_steps": 0, + "profiling_steps": 0, + "profiling_record_shapes": False, + "prompt": None, + "bad_words": None, + "force_words": None, + "assistant_model": None, + "peft_model": None, + "num_return_sequences": 1, + "token": None, + "model_revision": "main", + "attn_softmax_bf16": False, + "output_dir": None, + "bucket_size": -1, + "bucket_internal": False, + "dataset_max_samples": -1, + "limit_hpu_graphs": False, + "reuse_cache": False, + "verbose_workers": False, + "simulate_dyn_prompt": None, + "reduce_recompile": False, + "use_flash_attention": False, + "flash_attention_recompute": False, + "flash_attention_causal_mask": False, + "flash_attention_fast_softmax": False, + "book_source": False, + "torch_compile": False, + "ignore_eos": True, + "temperature": 1.0, + "top_p": 1.0, + "const_serialization_path": None, + "disk_offload": False, + "trust_remote_code": False, + "quant_config": "", + "world_size": 0, +} diff --git a/comps/llms/text-generation/native/utils.py b/comps/llms/text-generation/native/utils.py index 3eef7a6e24..04cebfbd49 100644 --- 
a/comps/llms/text-generation/native/utils.py +++ b/comps/llms/text-generation/native/utils.py @@ -1,10 +1,11 @@ -# Copyright (c) 2024 Intel Corporation +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,11 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################### +# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company +############################################################################### import copy +import glob import os import shutil +import tempfile import time +from pathlib import Path import torch from optimum.habana.checkpoint_utils import ( @@ -26,66 +33,376 @@ model_on_meta, write_checkpoints_json, ) -from optimum.habana.utils import check_habana_frameworks_version, check_optimum_habana_min_version, set_seed +from optimum.habana.utils import ( + check_habana_frameworks_version, + check_optimum_habana_min_version, + get_habana_frameworks_version, + set_seed, +) from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.utils import check_min_version -def setup_env(): +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch.keys(): + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_inference(args, model): + import habana_frameworks.torch.core as htcore + + habana_version = get_habana_frameworks_version() + + print("Initializing inference mode") + # Keeping the if-else here for back compat. 
TODO remove later + if habana_version.major >= 1 and habana_version.minor >= 16: + htcore.hpu_initialize(model, mark_only_scales_as_const=True) + else: + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + return model + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, True) + + +def setup_env(args): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.34.0") check_optimum_habana_min_version("1.9.0.dev0") # TODO: SW-167588 - WA for memory issue in hqt prep_model os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + # Based upon above conditions and below env variable, + # we can call HPU graphs clear_inputs(). + os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") + # Tweak generation so that it runs faster on Gaudi from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() -def setup_device(): - import habana_frameworks.torch.core as htcore +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.quant_config: + htcore.hpu_set_env() + return torch.device(args.device) + - return torch.device("hpu") +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) def get_torch_compiled_model(model): - model.model = torch.compile(model.model, backend="hpu_backend") + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) return model -def setup_model(model_name_or_path, model_dtype, model_kwargs): - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model = model.eval().to("hpu") +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + if args.disk_offload: + from accelerate import infer_auto_device_map, init_empty_weights + + config = AutoConfig.from_pretrained(args.model_name_or_path) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + max_memory = {"cpu": "10GiB"} + device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + model = 
AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + device_map=device_map, + offload_folder="/tmp/offload_folder/", + offload_state_dict=True, + torch_dtype=model_dtype, + **model_kwargs, + ) + else: + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ) + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.quantize_model(assistant_model) + + model = model.eval().to(args.device) + if args.assistant_model is not None: + assistant_model = assistant_model.eval().to(args.device) + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + from optimum.habana.transformers.trainer import _is_peft_model + + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + if args.assistant_model is not None: + assistant_model = wrap_in_hpu_graph(assistant_model) + if _is_peft_model(model): + model.base_model = wrap_in_hpu_graph(model.base_model) - from habana_frameworks.torch.hpu import wrap_in_hpu_graph + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed - if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": - model = wrap_in_hpu_graph(model, hash_with_views=False) + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + load_to_meta = model_on_meta(config) + + if args.assistant_model is None: + assistant_model = None else: - model = wrap_in_hpu_graph(model) + logger.info(f"Using asssitant model {args.assistant_model}.") - if model.config.model_type == "llama": + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir if args.peft_model is not None else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, 
device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ).eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon", "qwen2"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.prep_model(assistant_model) + + if args.torch_compile and model.config.model_type == "llama": model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model - return model +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." 
+ ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + if hasattr(model, "merge_and_unload"): + model = model.merge_and_unload() + if model_dtype == torch.bfloat16: + model = model.to(torch.bfloat16) + return model + else: + from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + + model.__class__.generate = gaudi_generate + model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + return model -def setup_tokenizer(model_name_or_path, model): + +def setup_tokenizer(args, model, assistant_model): tokenizer_kwargs = { - "revision": "main", - "token": None, + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, } - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs) + if args.bad_words is not None or args.force_words is not None: + tokenizer_kwargs["add_prefix_space"] = True + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) if not model.config.is_encoder_decoder: tokenizer.padding_side = "left" - # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if model.config.model_type == "llama": # unwind broken decapoda-research config model.generation_config.pad_token_id = 0 model.generation_config.bos_token_id = 1 model.generation_config.eos_token_id = 2 + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = 0 + assistant_model.generation_config.bos_token_id = 1 + assistant_model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + if model.config.model_type == "persimmon": + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id tokenizer.bos_token_id = model.generation_config.bos_token_id tokenizer.eos_token_id = model.generation_config.eos_token_id tokenizer.pad_token_id = model.generation_config.pad_token_id @@ -93,54 +410,112 @@ def setup_tokenizer(model_name_or_path, model): tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + # Some models like GPT2 do not have a PAD token so we have to set it if necessary if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model.generation_config.pad_token_id = model.generation_config.eos_token_id - return tokenizer, model + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + return tokenizer, model, assistant_model -def setup_generation_config(model, tokenizer, max_new_tokens): + +def setup_generation_config(args, model, assistant_model, tokenizer): bad_words_ids = None force_words_ids = None + if args.bad_words is not None: + bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + if 
args.force_words is not None: + force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] is_optimized = model_is_optimized(model.config) + # Generation configuration generation_config = copy.deepcopy(model.generation_config) - generation_config.max_new_tokens = max_new_tokens - generation_config.use_cache = True - generation_config.static_shapes = is_optimized - generation_config.bucket_size = -1 - generation_config.bucket_internal = True - generation_config.do_sample = True - generation_config.num_beams = 1 + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = is_optimized and assistant_model is None + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + generation_config.bucket_internal = args.bucket_internal + generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams generation_config.bad_words_ids = bad_words_ids generation_config.force_words_ids = force_words_ids - generation_config.num_return_sequences = 1 - generation_config.trim_logits = True - generation_config.attn_softmax_bf16 = True - generation_config.limit_hpu_graphs = True - generation_config.reuse_cache = False - generation_config.reduce_recompile = False - generation_config.use_flash_attention = False - generation_config.flash_attention_recompute = True - generation_config.flash_attention_causal_mask = True + generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.reuse_cache = args.reuse_cache + generation_config.reduce_recompile = args.reduce_recompile + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + generation_config.use_flash_attention = args.use_flash_attention + generation_config.flash_attention_recompute = args.flash_attention_recompute + generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask + generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax + generation_config.trust_remote_code = args.trust_remote_code + return generation_config -def initialize_model(model_name_or_path, max_new_tokens=128): +def exclude_hpu_graph_configs(args): + # Excluded configs for batch size 1 for hpu graph + if args.batch_size == 1 and args.limit_hpu_graphs: + if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + return False + if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: + if args.quant_config: + if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: + return False + else: + if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: + return False + return True + else: + return False + + +def initialize_model(args, logger): init_start = time.perf_counter() - setup_env() - setup_device() - set_seed(17) - get_repo_root(model_name_or_path, local_rank=0, token=None) - model_dtype = torch.bfloat16 + setup_distributed(args) + if exclude_hpu_graph_configs(args): + args.limit_hpu_graphs = False + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(args.seed) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) + if args.assistant_model is not None: + 
get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + use_deepspeed = args.world_size > 0 + if use_deepspeed or args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False - model_kwargs = {"revision": "main", "token": None, "device_map": "auto", "offload_folder": "/tmp/offload_folder/"} + model_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.trust_remote_code: + logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") - model = setup_model(model_name_or_path, model_dtype, model_kwargs) - tokenizer, model = setup_tokenizer(model_name_or_path, model) - generation_config = setup_generation_config(model, tokenizer, max_new_tokens) + model, assistant_model = ( + setup_model(args, model_dtype, model_kwargs, logger) + if not use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) + generation_config = setup_generation_config(args, model, assistant_model, tokenizer) + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) + if args.quant_config: + model = setup_inference(args, model) init_end = time.perf_counter() - print(f"Model initialization took {(init_end - init_start):.3f}s") - return model, tokenizer, generation_config + logger.info(f"Args: {args}") + logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, assistant_model, tokenizer, generation_config diff --git a/comps/ragas/tgi/llm.py b/comps/ragas/tgi/llm.py index f31c666576..895705703c 100644 --- a/comps/ragas/tgi/llm.py +++ b/comps/ragas/tgi/llm.py @@ -1,86 +1,86 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from datasets import Dataset -from langchain_community.embeddings import ( - HuggingFaceBgeEmbeddings, - HuggingFaceEmbeddings, - HuggingFaceHubEmbeddings, - HuggingFaceInstructEmbeddings, -) -from langchain_community.llms import HuggingFaceEndpoint -from langsmith import traceable -from ragas import evaluate -from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness - -from comps import GeneratedDoc, RAGASParams, RAGASScores, ServiceType, opea_microservices, register_microservice - -tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") -EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") - - -@register_microservice( - name="opea_service@ragas_tgi_llm", - service_type=ServiceType.RAGAS, - endpoint="/v1/ragas", - host="0.0.0.0", - port=9050, - input_datatype=RAGASParams, - output_datatype=RAGASScores, -) -@traceable(run_type="llm") -def llm_generate(input: RAGASParams): - llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") - - # Create vectorstore - if tei_embedding_endpoint: - # create embeddings using TEI endpoint service - embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) - else: - # create embeddings using local embedding model - embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - - llm = HuggingFaceEndpoint( - endpoint_url=llm_endpoint, - max_new_tokens=input.max_new_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - 
repetition_penalty=input.repetition_penalty, - streaming=input.streaming, - timeout=600, - ) - - data_collections = { - "question": input.questions, - "answer": input.answers, - "docs": input.docs, - "ground_truth": input.groundtruths, - } - dataset = Dataset.from_dict(data_collections) - - score = evaluate( - dataset, - metrics=[answer_relevancy, faithfulness, context_recall, context_precision], - llm=llm, - embeddings=embedder, - ) - df = score.to_pandas() - answer_relevancy_average = df["answer_relevancy"][:].mean() - faithfulness_average = df["faithfulness"][:].mean() - context_recall_average = df["context_recall"][:].mean() - context_precision_average = df["context_precision"][:].mean() - - return RAGASScores( - answer_relevancy=answer_relevancy_average, - faithfulness=faithfulness_average, - context_recallL=context_recall_average, - context_precision=context_precision_average, - ) - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_tgi"].start() +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from datasets import Dataset +from langchain_community.embeddings import ( + HuggingFaceBgeEmbeddings, + HuggingFaceEmbeddings, + HuggingFaceHubEmbeddings, + HuggingFaceInstructEmbeddings, +) +from langchain_community.llms import HuggingFaceEndpoint +from langsmith import traceable +from ragas import evaluate +from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness + +from comps import GeneratedDoc, RAGASParams, RAGASScores, ServiceType, opea_microservices, register_microservice + +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + + +@register_microservice( + name="opea_service@ragas_tgi_llm", + service_type=ServiceType.RAGAS, + endpoint="/v1/ragas", + host="0.0.0.0", + port=9050, + input_datatype=RAGASParams, + output_datatype=RAGASScores, +) +@traceable(run_type="llm") +def llm_generate(input: RAGASParams): + llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") + + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + llm = HuggingFaceEndpoint( + endpoint_url=llm_endpoint, + max_new_tokens=input.max_new_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.streaming, + timeout=600, + ) + + data_collections = { + "question": input.questions, + "answer": input.answers, + "docs": input.docs, + "ground_truth": input.groundtruths, + } + dataset = Dataset.from_dict(data_collections) + + score = evaluate( + dataset, + metrics=[answer_relevancy, faithfulness, context_recall, context_precision], + llm=llm, + embeddings=embedder, + ) + df = score.to_pandas() + answer_relevancy_average = df["answer_relevancy"][:].mean() + faithfulness_average = df["faithfulness"][:].mean() + context_recall_average = df["context_recall"][:].mean() + context_precision_average = df["context_precision"][:].mean() + + return RAGASScores( + answer_relevancy=answer_relevancy_average, + faithfulness=faithfulness_average, + context_recallL=context_recall_average, + context_precision=context_precision_average, + ) + + +if __name__ == "__main__": + 
opea_microservices["opea_service@llm_tgi"].start() diff --git a/comps/ragas/tgi/requirements.txt b/comps/ragas/tgi/requirements.txt index 3fa49150ec..2c8fad29f2 100644 --- a/comps/ragas/tgi/requirements.txt +++ b/comps/ragas/tgi/requirements.txt @@ -1,14 +1,14 @@ -datasets -docarray[full] -fastapi -huggingface_hub -langchain==0.1.16 -langsmith -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -ragas -shortuuid -transformers -uvicorn +datasets +docarray[full] +fastapi +huggingface_hub +langchain==0.1.16 +langsmith +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +ragas +shortuuid +transformers +uvicorn diff --git a/tests/test_agent_langchain.sh b/tests/test_agent_langchain.sh index e44d882921..c08024847e 100644 --- a/tests/test_agent_langchain.sh +++ b/tests/test_agent_langchain.sh @@ -12,7 +12,7 @@ function build_docker_images() { echo "Building the docker images" cd $WORKPATH echo $WORKPATH - docker build -t opea/comps-agent-langchain:comps -f comps/agent/langchain/docker/Dockerfile . + docker build --no-cache -t opea/comps-agent-langchain:comps -f comps/agent/langchain/docker/Dockerfile . if $? ; then echo "opea/comps-agent-langchain built fail" exit 1 @@ -34,7 +34,7 @@ function start_service() { docker logs test-comps-tgi-gaudi-service echo "Starting agent microservice" - docker run -d --runtime=runc --name="test-comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9090:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e strategy=react -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/comps-agent-langchain:comps + docker run -d --runtime=runc --name="test-comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 5042:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e strategy=react -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/comps-agent-langchain:comps sleep 5s docker logs test-comps-langchain-agent-endpoint @@ -69,7 +69,7 @@ function validate() { function validate_microservice() { echo "Testing agent service" - local CONTENT=$(curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + local CONTENT=$(curl http://${ip_address}:5042/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ "query": "What is Intel OPEA project?" }' | tee ${LOG_PATH}/test-agent-langchain.log) local EXIT_CODE=$(validate "$CONTENT" "OPEA" "test-agent-langchain") diff --git a/tests/test_dataprep_pgvector.sh b/tests/test_dataprep_pgvector.sh index 1f2b93ab40..3ea3df9f8b 100755 --- a/tests/test_dataprep_pgvector.sh +++ b/tests/test_dataprep_pgvector.sh @@ -16,7 +16,7 @@ function build_docker_images() { docker pull pgvector/pgvector:0.7.0-pg16 # build dataprep image for pgvector - docker build -t opea/dataprep-pgvector:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile . 
+ docker build --no-cache -t opea/dataprep-pgvector:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile . if $? ; then echo "opea/dataprep-pgvector built fail" exit 1 diff --git a/tests/test_dataprep_pinecone.sh b/tests/test_dataprep_pinecone.sh index 55e42073d8..1930d27981 100755 --- a/tests/test_dataprep_pinecone.sh +++ b/tests/test_dataprep_pinecone.sh @@ -10,7 +10,7 @@ function build_docker_images() { cd $WORKPATH # build dataprep image for pinecone - docker build -t opea/dataprep-pinecone:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pinecone/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-pinecone:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pinecone/docker/Dockerfile . if $? ; then echo "opea/dataprep-pinecone built fail" exit 1 @@ -24,20 +24,33 @@ function start_service() { export PINECONE_INDEX_NAME="test-index" export HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN - docker run -d --name="test-comps-dataprep-pinecone" -p 6007:6007 -p 6008:6008 -p 6009:6009 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:comps + docker run -d --name="test-comps-dataprep-pinecone" -p 5039:6007 -p 5040:6008 -p 5041:6009 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:comps sleep 1m } function validate_microservice() { - URL="http://$ip_address:6007/v1/dataprep" + URL="http://$ip_address:5039/v1/dataprep" echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > ./dataprep_file.txt result=$(curl --noproxy $ip_address --location --request POST \ --form 'files=@./dataprep_file.txt' $URL) - - DELETE_URL="http://$ip_address:6009/v1/dataprep/delete_file" - result_2=$(curl --noproxy $ip_address --location --request POST \ + if [[ $result == *"200"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-dataprep-pinecone + exit 1 + fi + DELETE_URL="http://$ip_address:5041/v1/dataprep/delete_file" + result=$(curl --noproxy $ip_address --location --request POST \ -d '{"file_path": "all"}' -H 'Content-Type: application/json' $DELETE_URL) + if [[ $result == *"true"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-dataprep-pinecone + exit 1 + fi } function stop_docker() { diff --git a/tests/test_dataprep_qdrant_langchain.sh b/tests/test_dataprep_qdrant_langchain.sh index 410c08903b..e112438fd3 100644 --- a/tests/test_dataprep_qdrant_langchain.sh +++ b/tests/test_dataprep_qdrant_langchain.sh @@ -57,6 +57,9 @@ function validate_services() { # check response status if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-qdrant-langchain + docker logs test-comps-dataprep-qdrant-langchain-tei + docker logs test-comps-dataprep-qdrant-langchain-server exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
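
The new upload check in test_dataprep_pinecone.sh above greps the response body for the substring "200"; if the dataprep service ever stops echoing a status code in its body, the test would pass or fail for the wrong reason. A stricter variant, sketched below, compares the HTTP status code that curl reports instead. It reuses the 5039 port mapping and container name from this patch but is not part of it.

```bash
# Hypothetical stricter check: use curl's reported HTTP status instead of grepping the body.
http_status=$(curl --noproxy $ip_address --silent --output /dev/null --write-out "%{http_code}" \
    --location --request POST --form 'files=@./dataprep_file.txt' "http://$ip_address:5039/v1/dataprep")
if [ "$http_status" -ne 200 ]; then
    echo "Upload failed with HTTP status $http_status"
    docker logs test-comps-dataprep-pinecone
    exit 1
fi
```
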
@@ -64,6 +67,9 @@ function validate_services() { # check response body if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-qdrant-langchain + docker logs test-comps-dataprep-qdrant-langchain-tei + docker logs test-comps-dataprep-qdrant-langchain-server exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." diff --git a/tests/test_dataprep_redis_langchain.sh b/tests/test_dataprep_redis_langchain.sh index f09e08a488..1a9831dc58 100644 --- a/tests/test_dataprep_redis_langchain.sh +++ b/tests/test_dataprep_redis_langchain.sh @@ -39,16 +39,17 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - upload - file" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." @@ -60,16 +61,18 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - upload - link" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." @@ -81,16 +84,17 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - get" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *'{"name":'* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." 
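
The hunks above and below for test_dataprep_redis_langchain.sh repeat the same fail-then-capture-logs branch for every request. If the pattern keeps spreading, a small shared helper along these lines (hypothetical, not introduced by this patch) would keep the log capture in one place:

```bash
# Hypothetical helper: report the failure, capture container logs, and stop the test.
fail_with_logs() {
    local service_name=$1 container=$2 log_file=$3 message=$4
    echo "[ $service_name ] $message"
    docker logs "$container" >> "$log_file"
    exit 1
}

# Possible usage inside validate_microservice():
#   if [ "$HTTP_STATUS" -ne "200" ]; then
#       fail_with_logs "dataprep - upload - file" test-comps-dataprep-redis-langchain-server \
#           ${LOG_PATH}/dataprep_upload_file.log "HTTP status is not 200. Received status was $HTTP_STATUS"
#   fi
```
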
@@ -102,11 +106,11 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - del" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log # check response status if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." @@ -114,6 +118,7 @@ function validate_microservice() { # check response body if [[ "$RESPONSE_BODY" != *'{"status":true}'* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." diff --git a/tests/test_dataprep_redis_langchain_ray.sh b/tests/test_dataprep_redis_langchain_ray.sh index 220609b135..0980eafc19 100644 --- a/tests/test_dataprep_redis_langchain_ray.sh +++ b/tests/test_dataprep_redis_langchain_ray.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { echo "Building the docker images" cd $WORKPATH - docker build -t opea/dataprep-on-ray-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain_ray/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-on-ray-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain_ray/docker/Dockerfile . if $? ; then echo "opea/dataprep-on-ray-redis built fail" exit 1 diff --git a/tests/test_embeddings_langchain.sh b/tests/test_embeddings_langchain.sh index b9f1beefba..caa7301eaa 100644 --- a/tests/test_embeddings_langchain.sh +++ b/tests/test_embeddings_langchain.sh @@ -37,6 +37,14 @@ function validate_microservice() { -X POST \ -d '{"text":"What is Deep Learning?"}' \ -H 'Content-Type: application/json') + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-embedding-tei-endpoint + docker logs test-comps-embedding-tei-server + exit 1 + fi } function stop_docker() { diff --git a/tests/test_guardrails_pii_detection.sh b/tests/test_guardrails_pii_detection.sh index 783fa0d992..fef024a3d1 100644 --- a/tests/test_guardrails_pii_detection.sh +++ b/tests/test_guardrails_pii_detection.sh @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -set -xe +set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { echo "Start building docker images for microservice" cd $WORKPATH - docker build -t opea/guardrails-pii-detection:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . + docker build --no-cache -t opea/guardrails-pii-detection:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . if $? 
; then echo "opea/guardrails-pii-detection built fail" exit 1 @@ -31,15 +31,42 @@ function validate_microservice() { export PATH="${HOME}/miniforge3/bin:$PATH" source activate echo "test 1 - single task - ner" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ner + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ner) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 2 - 20 tasks in parallel - ner" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ner + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ner) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 3 - single task - ml" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ml + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ml) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 4 - 20 tasks in parallel - ml" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ml + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ml) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "Validate microservice completed" - docker logs test-comps-guardrails-pii-detection-endpoint } function stop_docker() { diff --git a/tests/test_llms_text-generation_native.sh b/tests/test_llms_text-generation_native.sh new file mode 100644 index 0000000000..69517327e5 --- /dev/null +++ b/tests/test_llms_text-generation_native.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache \ + --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ + -t opea/llm-native:comps \ + -f comps/llms/text-generation/native/docker/Dockerfile . + if $? 
; then + echo "opea/llm-native built fail" + exit 1 + else + echo "opea/llm-native built successful" + fi +} + +function start_service() { + LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" + llm_native_service_port=5070 + docker run -d \ + --name="test-comps-llm-native-server" \ + -p ${llm_native_service_port}:9000 \ + --runtime=habana \ + --cap-add=SYS_NICE \ + --ipc=host \ + -e http_proxy=${http_proxy} \ + -e https_proxy=${https_proxy} \ + -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e TOKENIZERS_PARALLELISM=false \ + --restart unless-stopped \ + --network bridge \ + opea/llm-native:comps + + sleep 5s +} + +function validate_microservice() { + llm_native_service_port=5070 + URL="http://${ip_address}:${llm_native_service_port}/v1/chat/completions" + INPUT_DATA='{"query":"What is Deep Learning?"}' + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="llm-native" + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-llm-native-server >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *'"text":"What'* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-llm-native-server >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-llm-native*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + build_docker_images + start_service + validate_microservice + stop_docker + + echo y | docker system prune + +} + +main diff --git a/tests/test_llms_text-generation_vllm-openvino.sh b/tests/test_llms_text-generation_vllm-openvino.sh index 7e4e40e986..dc460d0aa5 100755 --- a/tests/test_llms_text-generation_vllm-openvino.sh +++ b/tests/test_llms_text-generation_vllm-openvino.sh @@ -16,7 +16,7 @@ function build_container() { cd $WORKPATH git clone https://github.com/vllm-project/vllm.git vllm-openvino cd ./vllm-openvino/ - docker build -t $DOCKER_IMAGE \ + docker build --no-cache -t $DOCKER_IMAGE \ -f Dockerfile.openvino \ . \ --build-arg https_proxy=$https_proxy \ @@ -102,6 +102,7 @@ function test_api_endpoint { else echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)" docker logs $CONTAINER_NAME + exit 1 fi } # Main function diff --git a/tests/test_llms_text-generation_vllm-ray.sh b/tests/test_llms_text-generation_vllm-ray.sh index 3b1c606814..ae9a427280 100644 --- a/tests/test_llms_text-generation_vllm-ray.sh +++ b/tests/test_llms_text-generation_vllm-ray.sh @@ -12,7 +12,7 @@ function build_docker_images() { cd $WORKPATH docker build \ -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.vllmray \ - -t opea/vllm_ray-habana:comps --network=host . + --no-cache -t opea/vllm_ray-habana:comps --network=host . if $? 
; then echo "opea/vllm_ray-habana built fail" exit 1 @@ -23,7 +23,7 @@ function build_docker_images() { ## Build OPEA microservice docker cd $WORKPATH docker build \ - -t opea/llm-vllm-ray:comps \ + --no-cache -t opea/llm-vllm-ray:comps \ -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice . if $? ; then echo "opea/llm-vllm-ray built fail" @@ -76,12 +76,26 @@ function validate_microservice() { result=$(http_proxy="" curl http://${ip_address}:5031/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "facebook/opt-125m", "messages": [{"role": "user", "content": "How are you?"}]}') - result_2=$(http_proxy="" curl http://${ip_address}:5032/v1/chat/completions \ + if [[ $result == *"message"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-ray-service + docker logs test-comps-vllm-ray-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5032/v1/chat/completions \ -X POST \ -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ -H 'Content-Type: application/json') - docker logs test-comps-vllm-ray-service - docker logs test-comps-vllm-ray-microservice + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-ray-service + docker logs test-comps-vllm-ray-microservice + exit 1 + fi } function stop_docker() { diff --git a/tests/test_llms_text-generation_vllm.sh b/tests/test_llms_text-generation_vllm.sh index 05f249cc8a..1aae114e83 100644 --- a/tests/test_llms_text-generation_vllm.sh +++ b/tests/test_llms_text-generation_vllm.sh @@ -12,7 +12,7 @@ function build_docker_images() { cd $WORKPATH/comps/llms/text-generation/vllm docker build \ -f docker/Dockerfile.hpu \ - -t opea/vllm-hpu:comps \ + --no-cache -t opea/vllm-hpu:comps \ --shm-size=128g . if $? ; then echo "opea/vllm-hpu built fail" @@ -24,7 +24,7 @@ function build_docker_images() { ## Build OPEA microservice docker cd $WORKPATH docker build \ - -t opea/llm-vllm:comps \ + --no-cache -t opea/llm-vllm:comps \ -f comps/llms/text-generation/vllm/docker/Dockerfile.microservice . if $? ; then echo "opea/llm-vllm built fail" @@ -74,7 +74,7 @@ function start_service() { } function validate_microservice() { - result=$(http_proxy="" curl http://${ip_address}:8008/v1/completions \ + result=$(http_proxy="" curl http://${ip_address}:5025/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "facebook/opt-125m", @@ -82,12 +82,26 @@ function validate_microservice() { "max_tokens": 32, "temperature": 0 }') - result_2=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ -H 'Content-Type: application/json') - docker logs test-comps-vllm-service - docker logs test-comps-vllm-microservice + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. 
Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi } function stop_docker() { diff --git a/tests/test_lvms_llava.sh b/tests/test_lvms_llava.sh index 9282298475..2e8f3cbd55 100644 --- a/tests/test_lvms_llava.sh +++ b/tests/test_lvms_llava.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build -t opea/llava:comps -f comps/lvms/llava/Dockerfile . + docker build --no-cache -t opea/llava:comps -f comps/lvms/llava/Dockerfile . if $? ; then echo "opea/llava built fail" exit 1 diff --git a/tests/test_lvms_tgi_llava_next.sh b/tests/test_lvms_tgi_llava_next.sh index 1824df0152..5a654b4ec7 100644 --- a/tests/test_lvms_tgi_llava_next.sh +++ b/tests/test_lvms_tgi_llava_next.sh @@ -11,13 +11,14 @@ function build_docker_images() { cd $WORKPATH echo $(pwd) git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 - docker build -t opea/llava-tgi:comps . + docker build --no-cache -t opea/llava-tgi:comps . if $? ; then echo "opea/llava-tgi built fail" exit 1 else echo "opea/llava-tgi built successful" fi + cd .. docker build --no-cache -t opea/lvm-tgi:comps -f comps/lvms/Dockerfile_tgi . if $? ; then diff --git a/tests/test_reranks_fastrag.sh b/tests/test_reranks_fastrag.sh index 3438be280d..0be7489aff 100644 --- a/tests/test_reranks_fastrag.sh +++ b/tests/test_reranks_fastrag.sh @@ -31,7 +31,13 @@ function validate_microservice() { -X POST \ -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ -H 'Content-Type: application/json') - docker logs test-comps-reranking-fastrag-server + if [[ $result == *"reranked_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-reranking-fastrag-server + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_haystack_qdrant.sh b/tests/test_retrievers_haystack_qdrant.sh index bc196efff2..364f63450d 100644 --- a/tests/test_retrievers_haystack_qdrant.sh +++ b/tests/test_retrievers_haystack_qdrant.sh @@ -49,8 +49,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-retriever-qdrant-server - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-retriever-qdrant-server + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_langchain_pgvector.sh b/tests/test_retrievers_langchain_pgvector.sh index 481a389521..b28fb632e5 100755 --- a/tests/test_retrievers_langchain_pgvector.sh +++ b/tests/test_retrievers_langchain_pgvector.sh @@ -47,8 +47,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-vectorstore-postgres - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. 
Received was $result" + docker logs test-comps-vectorstore-postgres + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_langchain_pinecone.sh b/tests/test_retrievers_langchain_pinecone.sh index 40ae67942a..7fb105a94c 100755 --- a/tests/test_retrievers_langchain_pinecone.sh +++ b/tests/test_retrievers_langchain_pinecone.sh @@ -44,8 +44,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-retriever-pinecone-server - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-retriever-pinecone-server + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_vectorstores_langchain_milvus.sh b/tests/test_vectorstores_langchain_milvus.sh index 60303017d4..86124baa55 100644 --- a/tests/test_vectorstores_langchain_milvus.sh +++ b/tests/test_vectorstores_langchain_milvus.sh @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -set -xe +set -x WORKPATH=$(dirname "$PWD") LOG_PATH="$WORKPATH/tests" @@ -30,6 +30,7 @@ function validate_vectorstore() { echo "[ test create ] create collection succeed" else echo "[ test create ] create collection failed" + docker logs milvus-standalone exit 1 fi @@ -41,6 +42,7 @@ function validate_vectorstore() { echo "[ test insert ] insert data succeed" else echo "[ test insert ] insert data failed" + docker logs milvus-standalone exit 1 fi @@ -52,6 +54,7 @@ function validate_vectorstore() { echo "[ test search ] search data succeed" else echo "[ test search ] search data failed" + docker logs milvus-standalone exit 1 fi }
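
For local verification of the new Gaudi-native LLM test added earlier in this patch, the flow below is what the script expects. This is a sketch, assuming a Gaudi host with the habana container runtime available (the test passes --runtime=habana) and network access to build the images.

```bash
# Run the new test end to end: it builds opea/llm-native:comps and exposes the service on host port 5070.
cd tests
bash test_llms_text-generation_native.sh

# Or, with the container already running, probe the same endpoint the test validates:
curl http://localhost:5070/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"query":"What is Deep Learning?"}'
```
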