diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea9c494bb8..475f5433a4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,6 +14,8 @@ repos: - id: check-json - id: check-yaml - id: debug-statements + - id: mixed-line-ending + args: [--fix=lf] - id: requirements-txt-fixer - id: trailing-whitespace files: (.*\.(py|rst|cmake|yaml|yml|json|ts|js|html|svelte|sh))$ diff --git a/comps/asr/whisper/whisper_model.py b/comps/asr/whisper/whisper_model.py index c5f16e1121..85d4126cdc 100644 --- a/comps/asr/whisper/whisper_model.py +++ b/comps/asr/whisper/whisper_model.py @@ -148,7 +148,7 @@ def audio2text(self, audio_path): return_tensors="pt", sampling_rate=16000, ) - elif self.device == "hpu": + elif self.device == "hpu" and processed_inputs.input_features.shape[-1] > 3000: processed_inputs["input_features"] = torch.nn.functional.pad( processed_inputs.input_features, (0, self.hpu_max_len - processed_inputs.input_features.size(-1)), diff --git a/comps/dataprep/redis/llama_index/requirements.txt b/comps/dataprep/redis/llama_index/requirements.txt index 9e8dbaa9f7..ad75869c18 100644 --- a/comps/dataprep/redis/llama_index/requirements.txt +++ b/comps/dataprep/redis/llama_index/requirements.txt @@ -12,6 +12,7 @@ opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator python-bidi==0.4.2 +python-multipart redis sentence_transformers shortuuid diff --git a/comps/embeddings/README.md b/comps/embeddings/README.md index ce4b4fa461..edf164b486 100644 --- a/comps/embeddings/README.md +++ b/comps/embeddings/README.md @@ -45,7 +45,7 @@ First, you need to start a TEI service. your_port=8090 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision ``` Then you need to test your TEI service using the following commands: @@ -66,9 +66,6 @@ cd langchain cd llama_index export TEI_EMBEDDING_ENDPOINT="http://localhost:$yourport" export TEI_EMBEDDING_MODEL_NAME="BAAI/bge-large-en-v1.5" -export LANGCHAIN_TRACING_V2=true -export LANGCHAIN_API_KEY=${your_langchain_api_key} -export LANGCHAIN_PROJECT="opea/gen-ai-comps:embeddings" python embedding_tei.py ``` @@ -92,7 +89,7 @@ First, you need to start a TEI service. your_port=8090 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" -docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --revision $revision ``` Then you need to test your TEI service using the following commands: @@ -124,13 +121,16 @@ docker build -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy - ```bash cd ../../ -docker build -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . 
+docker build -t opea/embedding-tei-llama-index:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/llama_index/docker/Dockerfile . ``` ## 2.3 Run Docker with CLI ```bash +# run with langchain docker docker run -d --name="embedding-tei-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e TEI_EMBEDDING_MODEL_NAME=$TEI_EMBEDDING_MODEL_NAME opea/embedding-tei:latest +# run with llama-index docker +docker run -d --name="embedding-tei-llama-index-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e TEI_EMBEDDING_MODEL_NAME=$TEI_EMBEDDING_MODEL_NAME opea/embedding-tei-llama-index:latest ``` ## 2.4 Run Docker with Docker Compose diff --git a/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml b/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml index 62f5870b7b..152f5030b0 100644 --- a/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml +++ b/comps/embeddings/llama_index/docker/docker_compose_embedding.yaml @@ -5,7 +5,7 @@ version: "3.8" services: embedding: - image: opea/embedding-tei:latest + image: opea/embedding-tei-llama-index:latest container_name: embedding-tei-server ports: - "6000:6000" @@ -16,7 +16,6 @@ services: https_proxy: ${https_proxy} TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} TEI_EMBEDDING_MODEL_NAME: ${TEI_EMBEDDING_MODEL_NAME} - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} restart: unless-stopped networks: diff --git a/comps/guardrails/pii_detection/requirements.txt b/comps/guardrails/pii_detection/requirements.txt index 9ca5116dab..e9bb7ba663 100644 --- a/comps/guardrails/pii_detection/requirements.txt +++ b/comps/guardrails/pii_detection/requirements.txt @@ -20,6 +20,7 @@ prometheus-fastapi-instrumentator pyarrow pymupdf python-docx +python-multipart ray redis scikit-learn diff --git a/comps/llms/text-generation/native/Dockerfile b/comps/llms/text-generation/native/Dockerfile deleted file mode 100644 index 9d7d1e0945..0000000000 --- a/comps/llms/text-generation/native/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ - - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# HABANA environment -FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest as hpu - -ENV LANG=en_US.UTF-8 -ARG REPO=https://github.com/huggingface/optimum-habana.git -ARG REPO_VER=v1.11.1 - -RUN apt-get update && \ - apt-get install git-lfs && \ - git-lfs install && \ - apt-get install -y --no-install-recommends --fix-missing \ - libgl1-mesa-glx \ - libjemalloc-dev \ - vim - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -USER user - -COPY comps /home/user/comps -COPY comps/llm/text-generation/qwen2/qwen2.patch /home/user/qwen2.patch - -SHELL ["/bin/bash", "--login", "-c"] -RUN git clone --single-branch -b ${REPO_VER} ${REPO} /optimum-habana - -ENV PYTHONPATH=/root:/home/user - -RUN cd /optimum-habana && git apply /qwen2.patch && \ - cd /optimum-habana/examples/text-generation && pip install -r requirements.txt && \ - cd /optimum-habana && python setup.py install - -WORKDIR /home/user/comps/llms/text-generation/qwen2 - -ENTRYPOINT ["python", "llm.py"] diff --git a/comps/llms/text-generation/native/README.md b/comps/llms/text-generation/native/README.md new file mode 100644 index 0000000000..a4fcc74c33 --- /dev/null +++ 
b/comps/llms/text-generation/native/README.md @@ -0,0 +1,61 @@ +# LLM Native Microservice + +LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware. + +## 🚀1. Start Microservice + +If you start the LLM microservice with Docker, the `docker_compose_llm.yaml` file will automatically start a Native LLM service in a container. + +### 1.1 Setup Environment Variables + +To start the Native LLM service, you need to set up the following environment variables first. + +```bash +export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" +``` + +### 1.2 Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/docker/Dockerfile . +``` + +To start a Docker container, you have two options: + +- A. Run Docker with CLI +- B. Run Docker with Docker Compose + +You can choose one as needed. + +### 1.3 Run Docker with CLI (Option A) + +```bash +docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} opea/llm-native:latest +``` + +### 1.4 Run Docker with Docker Compose (Option B) + +```bash +cd docker +docker compose -f docker_compose_llm.yaml up -d +``` + +## 🚀2. Consume LLM Service + +### 2.1 Check Service Status + +```bash +curl http://${your_ip}:9000/v1/health_check\ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 2.2 Consume LLM Service + +```bash +curl http://${your_ip}:9000/v1/chat/completions\ + -X POST \ + -d '{"query":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' +``` diff --git a/comps/llms/text-generation/native/docker/Dockerfile b/comps/llms/text-generation/native/docker/Dockerfile new file mode 100644 index 0000000000..3dacf52114 --- /dev/null +++ b/comps/llms/text-generation/native/docker/Dockerfile @@ -0,0 +1,42 @@ + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as hpu + +ENV LANG=en_US.UTF-8 +ARG REPO=https://github.com/huggingface/optimum-habana.git +ARG REPO_VER=v1.12.1 + +RUN apt-get update && \ + apt-get install git-lfs && \ + git-lfs install && \ + apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --upgrade-strategy eager optimum[habana] && \ + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0 + +RUN git clone ${REPO} /home/user/optimum-habana && \ + cd /home/user/optimum-habana && git checkout ${REPO_VER} && \ + cd examples/text-generation && pip install -r requirements.txt && \ + cd /home/user/comps/llms/text-generation/native && pip install -r requirements.txt && \ + pip install --upgrade --force-reinstall pydantic + +ENV PYTHONPATH=/root:/home/user + +WORKDIR 
/home/user/comps/llms/text-generation/native + +ENTRYPOINT ["python", "llm.py"] diff --git a/comps/llms/text-generation/native/docker/docker_compose_llm.yaml b/comps/llms/text-generation/native/docker/docker_compose_llm.yaml new file mode 100644 index 0000000000..f3a36e5bb8 --- /dev/null +++ b/comps/llms/text-generation/native/docker/docker_compose_llm.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + llm: + image: opea/llm-native:latest + container_name: llm-native-server + ports: + - "9000:9000" + runtime: habana + cap_add: + - SYS_NICE + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + LLM_NATIVE_MODEL: ${LLM_NATIVE_MODEL} + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + TOKENIZERS_PARALLELISM: false + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/text-generation/native/llm.py b/comps/llms/text-generation/native/llm.py index 4f407ccd65..43348670d5 100644 --- a/comps/llms/text-generation/native/llm.py +++ b/comps/llms/text-generation/native/llm.py @@ -11,87 +11,156 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import sys -import os -from datetime import datetime +sys.path.append("/test/GenAIComps/") + +import logging +import threading +import time import torch -from fastapi.responses import StreamingResponse -from langsmith import traceable +from langchain_core.prompts import PromptTemplate +from template import ChatTemplate, args_dict, input_sentences from utils import initialize_model -from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice +from comps import ( + GeneratedDoc, + LLMParamsDoc, + ServiceType, + opea_microservices, + register_microservice, + register_statistics, +) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) -def warmup(): - input_sentences = ["DeepSpeed is a machine learning framework", "He is working on", "He has a", "He got all"] - input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("hpu") - for i in range(3): - print(f"Current time: {datetime.now()}") - print(f"Warming up {i+1}...") - outputs = model.generate( - **input_tokens, - generation_config=generation_config, - lazy_mode=True, - hpu_graphs=True, - profiling_steps=0, - profiling_warmup_steps=0, - ).cpu() - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - print(f"res: {res}") +class Args: + def __init__(self, **entries): + self.__dict__.update(entries) -@register_microservice( - name="opea_service@llm_qwen", - service_type=ServiceType.LLM, - endpoint="/v1/chat/completions", - host="0.0.0.0", - port=8000, -) -@traceable(run_type="llm") -def llm_generate(input: LLMParamsDoc): - input_query = input.query - input_tokens = tokenizer.batch_encode_plus([input_query], return_tensors="pt", padding=True) + +model = None +assistant_model = None +tokenizer = None +generation_config = None +args = Args(**args_dict) +initialization_lock = threading.Lock() +initialized = False + + +def generate( + input_query: list, + device="hpu", + 
use_lazy_mode=True, + use_hpu_graphs=True, + profiling_steps=0, + profiling_warmup_steps=0, + ignore_eos=True, + profiling_record_shapes=False, +): + """Generates sequences from the input sentences and returns them.""" + logger.info(f"[llm - generate] starting to inference with prompt {input_query}") + encode_t0 = time.perf_counter() + + # Tokenization + input_tokens = tokenizer.batch_encode_plus(input_query, return_tensors="pt", padding=True) + encode_duration = time.perf_counter() - encode_t0 + logger.info(f"[llm - generate] input tokenized: {input_tokens}") + + # Move inputs to target device(s) for t in input_tokens: + logger.info(f"[llm - generate] t: {t}") if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("hpu") + logger.info("[llm - generate] input[t] is tensor") + logger.info(f"[llm - generate] device: {model.device}") + input_tokens[t] = input_tokens[t].to(model.device) - print(f"[llm - qwen] Current time: {datetime.now()}") - output = model.generate( + logger.info("[llm - generate] inputs transferred.") + + iteration_times = [] + outputs = model.generate( **input_tokens, generation_config=generation_config, - lazy_mode=True, - hpu_graphs=True, - profiling_steps=0, - profiling_warmup_steps=0, + assistant_model=assistant_model, + lazy_mode=use_lazy_mode, + hpu_graphs=use_hpu_graphs, + profiling_steps=profiling_steps, + profiling_warmup_steps=profiling_warmup_steps, + ignore_eos=ignore_eos, + iteration_times=iteration_times, + profiling_record_shapes=profiling_record_shapes, ).cpu() - res = tokenizer.batch_decode(output, skip_special_tokens=True)[0] - print(f"[llm - qwen] res: {res}") - return res + logger.info("[llm - generate] result generated") + first_token_time = iteration_times[0] + encode_duration + result = tokenizer.batch_decode(outputs, skip_special_tokens=True) + logger.info(f"[llm - generate] result: {result}") + logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms") + return result -if __name__ == "__main__": - model, tokenizer, generation_config = initialize_model( - model_name_or_path="Qwen/Qwen1.5-7B-Chat", max_new_tokens=128 - ) - import habana_frameworks.torch.hpu as torch_hpu +def initialize(): + global model, assistant_model, tokenizer, generation_config, initialized + with initialization_lock: + if not initialized: + # initialize model and tokenizer + import habana_frameworks.torch.hpu as torch_hpu + from optimum.habana.utils import HabanaProfile + + model, assistant_model, tokenizer, generation_config = initialize_model(args, logger) + logger.info("[llm] model and tokenizer initialized.") + + # compilation and model warmup + HabanaProfile.disable() + logger.info("[llm - native] Graph compilation...") + for _ in range(args.warmup): + generate(input_sentences) + logger.info("[llm - native] model warm up finished.") + torch_hpu.synchronize() + HabanaProfile.enable() + logger.info("[llm - native] Ready to inference") + res = generate(["What is Deep Learning?"]) + logger.info(f"[llm - native] test result: {res}") + initialized = True + - print("[llm - qwen] model and tokenizer initialized.") +@register_microservice( + name="opea_service@llm_native", + service_type=ServiceType.LLM, + endpoint="/v1/chat/completions", + host="0.0.0.0", + port=9000, +) +@register_statistics(names=["opea_service@llm_native"]) +def llm_generate(input: LLMParamsDoc): + initialize() - from optimum.habana.utils import HabanaProfile + prompt = input.query + prompt_template = None + if input.chat_template: + prompt_template = 
PromptTemplate.from_template(input.chat_template) + input_variables = prompt_template.input_variables + if prompt_template: + if sorted(input_variables) == ["context", "question"]: + prompt = prompt_template.format(question=input.query, context="\n".join(input.documents)) + elif input_variables == ["question"]: + prompt = prompt_template.format(question=input.query) + else: + logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + else: + if input.documents: + prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents) + res = generate([prompt]) - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - print("Graph compilation...") - warmup() - print("[llm - qwen] model warm up finished.") + logger.info(f"[llm - native] inference result: {res}") + return GeneratedDoc(text=res[0], prompt=input.query) - torch_hpu.synchronize() - HabanaProfile.enable() - print("[llm - qwen] Ready to inference") - opea_microservices["opea_service@llm_qwen"].start() +if __name__ == "__main__": + opea_microservices["opea_service@llm_native"].start() diff --git a/comps/llms/text-generation/native/qwen2.patch b/comps/llms/text-generation/native/qwen2.patch deleted file mode 100644 index 9b5d935670..0000000000 --- a/comps/llms/text-generation/native/qwen2.patch +++ /dev/null @@ -1,127 +0,0 @@ -diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py -index b086c80..e0e5a9f 100644 ---- a/examples/text-generation/run_lm_eval.py -+++ b/examples/text-generation/run_lm_eval.py -@@ -75,13 +75,13 @@ class HabanaModelAdapter(lm_eval.base.BaseLM): - self.options = options - self._device = args.device - self.model_inputs = {"use_cache": self.options.use_cache} -- if self.model.config.model_type in ["llama", "falcon"]: -+ if self.model.config.model_type in ["llama", "falcon", "qwen2"]: - self.model_inputs.update( - { - "reuse_cache": self.options.reuse_cache, - } - ) -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama","mistral","qwen2"]: - self.model_inputs.update( - { - "attn_softmax_bf16": self.options.attn_softmax_bf16, -diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py -index 8bce0ae..c29f458 100644 ---- a/examples/text-generation/utils.py -+++ b/examples/text-generation/utils.py -@@ -234,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): - - model = deepspeed.init_inference(model, **ds_inference_kwargs) - model = model.module -- if model.config.model_type in ["llama", "falcon"]: -+ if model.config.model_type in ["llama", "falcon","qwen2"]: - patch_scoped_linear_all_reduce(model) - - if args.quant_config: -diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py -index 0d50470..94cc7eb 100755 ---- a/optimum/habana/transformers/generation/utils.py -+++ b/optimum/habana/transformers/generation/utils.py -@@ -740,7 +740,7 @@ class GaudiGenerationMixin(GenerationMixin): - ) - model_kwargs["kv_cache_len"] = calculated_max_length - -- if self.config.model_type in ["llama", "falcon"]: -+ if self.config.model_type in ["llama", "falcon","qwen2"]: - if self.config.max_position_embeddings < calculated_max_length: - unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) - -diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py -index 6dc40a7..b5044af 100644 ---- 
a/optimum/habana/transformers/modeling_utils.py -+++ b/optimum/habana/transformers/modeling_utils.py -@@ -55,6 +55,9 @@ from .models import ( - GaudiOPTForCausalLM, - GaudiOPTLearnedPositionalEmbedding, - GaudiPhiForCausalLM, -+ GaudiQwen2Model, -+ GaudiQwen2Attention, -+ GaudiQwen2MLP, - _gaudi_wav2vec2_compute_mask_indices, - _gaudi_wav2vec2_mask_hidden_states, - gaudi_albert_forward, -@@ -118,6 +121,7 @@ from .models import ( - gaudi_phi_attention_forward, - gaudi_phi_decoder_layer_forward, - gaudi_phi_model_forward, -+ gaudi_qwen2_rmsnorm_forward, - gaudi_rot_matmul, - gaudi_rot_vec_mul, - gaudi_SpeechT5Attention_forward, -@@ -367,3 +371,11 @@ def adapt_transformers_to_gaudi(): - transformers.models.speecht5.modeling_speecht5.SpeechT5SpeechDecoderPrenet.forward = ( - gaudi_SpeechT5SpeechDecoderPrenet_forward - ) -+ -+ # Optimization for qwen2 on Gaudi -+ transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM = GaudiQwen2ForCausalLM -+ transformers.models.qwen2.modeling_qwen2.Qwen2Model = GaudiQwen2Model -+ transformers.models.qwen2.modeling_qwen2.Qwen2Attention = GaudiQwen2Attention -+ transformers.models.qwen2.modeling_qwen2.Qwen2MLP = GaudiQwen2MLP -+ transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer = GaudiQwen2DecoderLayer -+ transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm.forward = gaudi_qwen2_rmsnorm_forward -diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py -index 1582d3f..41fdfdc 100644 ---- a/optimum/habana/transformers/models/__init__.py -+++ b/optimum/habana/transformers/models/__init__.py -@@ -122,6 +122,14 @@ from .phi import ( - gaudi_phi_decoder_layer_forward, - gaudi_phi_model_forward, - ) -+from .qwen2 import ( -+ GaudiQwen2Attention, -+ GaudiQwen2DecoderLayer, -+ GaudiQwen2ForCausalLM, -+ GaudiQwen2MLP, -+ GaudiQwen2Model, -+ gaudi_qwen2_rmsnorm_forward, -+) - from .speecht5 import ( - gaudi_generate_speech, - gaudi_SpeechT5Attention_forward, -diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py -index dc6e136..7dfebaa 100644 ---- a/optimum/habana/transformers/trainer.py -+++ b/optimum/habana/transformers/trainer.py -@@ -916,9 +916,9 @@ class GaudiTrainer(Trainer): - if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - -- # attn_softmax_bf16 and use_flash_attention is enabled only for llama -+ # attn_softmax_bf16 and use_flash_attention is enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: -@@ -1799,9 +1799,9 @@ class GaudiTrainer(Trainer): - if batch_size is None: - batch_size = observed_batch_size - -- # attn_softmax_bf16 and use_flash_attention are enabled only for llama -+ # attn_softmax_bf16 and use_flash_attention are enabled only for llama and qwen2 - if hasattr(self.model, "generation_config") and self.model.generation_config is not None: -- if self.model.config.model_type == "llama": -+ if self.model.config.model_type in ["llama", "qwen2"]: - if self.model.generation_config.attn_softmax_bf16: - inputs["attn_softmax_bf16"] = True - if self.model.generation_config.use_flash_attention: diff --git a/comps/llms/text-generation/native/requirements.txt 
b/comps/llms/text-generation/native/requirements.txt index e8473a80c4..806f2d29fa 100644 --- a/comps/llms/text-generation/native/requirements.txt +++ b/comps/llms/text-generation/native/requirements.txt @@ -1,10 +1,10 @@ -docarray[full] +docarray fastapi -langsmith +httpx +langchain_core opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk prometheus-fastapi-instrumentator shortuuid -transformers uvicorn diff --git a/comps/llms/text-generation/native/template.py b/comps/llms/text-generation/native/template.py new file mode 100644 index 0000000000..c43205a0ae --- /dev/null +++ b/comps/llms/text-generation/native/template.py @@ -0,0 +1,99 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import re + + +class ChatTemplate: + @staticmethod + def generate_rag_prompt(question, documents): + context_str = "\n".join(documents) + if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3: + # chinese context + template = """ +### 你将扮演一个乐于助人、尊重他人并诚实的助手,你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案,请避免分享不准确的信息。 +### 搜索结果:{context} +### 问题:{question} +### 回答: +""" + else: + template = """ +### You are a helpful, respectful and honest assistant to help the user with questions. \ +Please refer to the search results obtained from the local knowledge base. \ +But be careful to not incorporate the information that you think is not relevant to the question. \ +If you don't know the answer to a question, please don't share false information. \n +### Search results: {context} \n +### Question: {question} \n +### Answer: +""" + return template.format(context=context_str, question=question) + + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way", +] + + +llm_model = os.getenv("LLM_NATIVE_MODEL", "Qwen/Qwen2-7B-Instruct") +args_dict = { + "device": "hpu", + "model_name_or_path": llm_model, + "bf16": True, + "max_new_tokens": 100, + "max_input_tokens": 0, + "batch_size": 1, + "warmup": 3, + "n_iterations": 5, + "local_rank": 0, + "use_kv_cache": True, + "use_hpu_graphs": True, + "dataset_name": None, + "column_name": None, + "do_sample": False, + "num_beams": 1, + "trim_logits": False, + "seed": 27, + "profiling_warmup_steps": 0, + "profiling_steps": 0, + "profiling_record_shapes": False, + "prompt": None, + "bad_words": None, + "force_words": None, + "assistant_model": None, + "peft_model": None, + "num_return_sequences": 1, + "token": None, + "model_revision": "main", + "attn_softmax_bf16": False, + "output_dir": None, + "bucket_size": -1, + "bucket_internal": False, + "dataset_max_samples": -1, + "limit_hpu_graphs": False, + "reuse_cache": False, + "verbose_workers": False, + "simulate_dyn_prompt": None, + "reduce_recompile": False, + "use_flash_attention": False, + "flash_attention_recompute": False, + "flash_attention_causal_mask": False, + "flash_attention_fast_softmax": False, + "book_source": False, + "torch_compile": False, + "ignore_eos": True, + "temperature": 1.0, + "top_p": 1.0, + "const_serialization_path": None, + "disk_offload": False, + "trust_remote_code": False, + "quant_config": "", + "world_size": 0, +} diff --git a/comps/llms/text-generation/native/utils.py b/comps/llms/text-generation/native/utils.py index 3eef7a6e24..04cebfbd49 100644 --- 
a/comps/llms/text-generation/native/utils.py +++ b/comps/llms/text-generation/native/utils.py @@ -1,10 +1,11 @@ -# Copyright (c) 2024 Intel Corporation +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,11 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +############################################################################### +# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company +############################################################################### import copy +import glob import os import shutil +import tempfile import time +from pathlib import Path import torch from optimum.habana.checkpoint_utils import ( @@ -26,66 +33,376 @@ model_on_meta, write_checkpoints_json, ) -from optimum.habana.utils import check_habana_frameworks_version, check_optimum_habana_min_version, set_seed +from optimum.habana.utils import ( + check_habana_frameworks_version, + check_optimum_habana_min_version, + get_habana_frameworks_version, + set_seed, +) from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.utils import check_min_version -def setup_env(): +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch.keys(): + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_inference(args, model): + import habana_frameworks.torch.core as htcore + + habana_version = get_habana_frameworks_version() + + print("Initializing inference mode") + # Keeping the if-else here for back compat. 
TODO remove later + if habana_version.major >= 1 and habana_version.minor >= 16: + htcore.hpu_initialize(model, mark_only_scales_as_const=True) + else: + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + return model + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print("Serializing const params to {}".format(const_serialization_path)) + enable_const_section_serialization(const_serialization_path, True) + + +def setup_env(args): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.34.0") check_optimum_habana_min_version("1.9.0.dev0") # TODO: SW-167588 - WA for memory issue in hqt prep_model os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + if args.use_hpu_graphs and args.limit_hpu_graphs and not args.reuse_cache and args.bucket_internal: + # Based upon above conditions and below env variable, + # we can call HPU graphs clear_inputs(). + os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") + # Tweak generation so that it runs faster on Gaudi from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() -def setup_device(): - import habana_frameworks.torch.core as htcore +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.quant_config: + htcore.hpu_set_env() + return torch.device(args.device) + - return torch.device("hpu") +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) def get_torch_compiled_model(model): - model.model = torch.compile(model.model, backend="hpu_backend") + model.model = torch.compile(model.model, backend="hpu_backend", options={"keep_input_mutations": True}) return model -def setup_model(model_name_or_path, model_dtype, model_kwargs): - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=model_dtype, **model_kwargs) - model = model.eval().to("hpu") +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + if args.disk_offload: + from accelerate import infer_auto_device_map, init_empty_weights + + config = AutoConfig.from_pretrained(args.model_name_or_path) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + max_memory = {"cpu": "10GiB"} + device_map = infer_auto_device_map(model, max_memory=max_memory, dtype=model_dtype) + model = 
AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + device_map=device_map, + offload_folder="/tmp/offload_folder/", + offload_state_dict=True, + torch_dtype=model_dtype, + **model_kwargs, + ) + else: + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ) + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.quantize_model(assistant_model) + + model = model.eval().to(args.device) + if args.assistant_model is not None: + assistant_model = assistant_model.eval().to(args.device) + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + from optimum.habana.transformers.trainer import _is_peft_model + + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + if args.assistant_model is not None: + assistant_model = wrap_in_hpu_graph(assistant_model) + if _is_peft_model(model): + model.base_model = wrap_in_hpu_graph(model.base_model) - from habana_frameworks.torch.hpu import wrap_in_hpu_graph + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed - if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": - model = wrap_in_hpu_graph(model, hash_with_views=False) + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + load_to_meta = model_on_meta(config) + + if args.assistant_model is None: + assistant_model = None else: - model = wrap_in_hpu_graph(model) + logger.info(f"Using asssitant model {args.assistant_model}.") - if model.config.model_type == "llama": + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained(merged_model_dir) + torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir if args.peft_model is not None else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, 
device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ).eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon", "qwen2"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.prep_model(assistant_model) + + if args.torch_compile and model.config.model_type == "llama": model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model - return model +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError("The `peft` package is not installed, please run: `pip install peft`.") + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained(args.peft_model, torch_dtype=model_dtype, **model_kwargs) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." 
+ ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs) + model = PeftModel.from_pretrained(model, args.peft_model, torch_dtype=model_dtype, **model_kwargs) + if hasattr(model, "merge_and_unload"): + model = model.merge_and_unload() + if model_dtype == torch.bfloat16: + model = model.to(torch.bfloat16) + return model + else: + from optimum.habana.peft.peft_model import gaudi_generate, gaudi_prepare_inputs_for_generation + + model.__class__.generate = gaudi_generate + model.__class__.prepare_inputs_for_generation = gaudi_prepare_inputs_for_generation + return model -def setup_tokenizer(model_name_or_path, model): + +def setup_tokenizer(args, model, assistant_model): tokenizer_kwargs = { - "revision": "main", - "token": None, + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, } - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs) + if args.bad_words is not None or args.force_words is not None: + tokenizer_kwargs["add_prefix_space"] = True + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, **tokenizer_kwargs) if not model.config.is_encoder_decoder: tokenizer.padding_side = "left" - # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if model.config.model_type == "llama": # unwind broken decapoda-research config model.generation_config.pad_token_id = 0 model.generation_config.bos_token_id = 1 model.generation_config.eos_token_id = 2 + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = 0 + assistant_model.generation_config.bos_token_id = 1 + assistant_model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + if model.config.model_type == "persimmon": + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id tokenizer.bos_token_id = model.generation_config.bos_token_id tokenizer.eos_token_id = model.generation_config.eos_token_id tokenizer.pad_token_id = model.generation_config.pad_token_id @@ -93,54 +410,112 @@ def setup_tokenizer(model_name_or_path, model): tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + # Some models like GPT2 do not have a PAD token so we have to set it if necessary if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model.generation_config.pad_token_id = model.generation_config.eos_token_id - return tokenizer, model + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = assistant_model.generation_config.eos_token_id + return tokenizer, model, assistant_model -def setup_generation_config(model, tokenizer, max_new_tokens): + +def setup_generation_config(args, model, assistant_model, tokenizer): bad_words_ids = None force_words_ids = None + if args.bad_words is not None: + bad_words_ids = [tokenizer.encode(bad_word, add_special_tokens=False) for bad_word in args.bad_words] + if 
args.force_words is not None: + force_words_ids = [tokenizer.encode(force_word, add_special_tokens=False) for force_word in args.force_words] is_optimized = model_is_optimized(model.config) + # Generation configuration generation_config = copy.deepcopy(model.generation_config) - generation_config.max_new_tokens = max_new_tokens - generation_config.use_cache = True - generation_config.static_shapes = is_optimized - generation_config.bucket_size = -1 - generation_config.bucket_internal = True - generation_config.do_sample = True - generation_config.num_beams = 1 + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = is_optimized and assistant_model is None + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + generation_config.bucket_internal = args.bucket_internal + generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams generation_config.bad_words_ids = bad_words_ids generation_config.force_words_ids = force_words_ids - generation_config.num_return_sequences = 1 - generation_config.trim_logits = True - generation_config.attn_softmax_bf16 = True - generation_config.limit_hpu_graphs = True - generation_config.reuse_cache = False - generation_config.reduce_recompile = False - generation_config.use_flash_attention = False - generation_config.flash_attention_recompute = True - generation_config.flash_attention_causal_mask = True + generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.reuse_cache = args.reuse_cache + generation_config.reduce_recompile = args.reduce_recompile + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + generation_config.use_flash_attention = args.use_flash_attention + generation_config.flash_attention_recompute = args.flash_attention_recompute + generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask + generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax + generation_config.trust_remote_code = args.trust_remote_code + return generation_config -def initialize_model(model_name_or_path, max_new_tokens=128): +def exclude_hpu_graph_configs(args): + # Excluded configs for batch size 1 for hpu graph + if args.batch_size == 1 and args.limit_hpu_graphs: + if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: + return False + if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: + if args.quant_config: + if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: + return False + else: + if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: + return False + return True + else: + return False + + +def initialize_model(args, logger): init_start = time.perf_counter() - setup_env() - setup_device() - set_seed(17) - get_repo_root(model_name_or_path, local_rank=0, token=None) - model_dtype = torch.bfloat16 + setup_distributed(args) + if exclude_hpu_graph_configs(args): + args.limit_hpu_graphs = False + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(args.seed) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) + if args.assistant_model is not None: + 
get_repo_root(args.assistant_model, local_rank=args.local_rank, token=args.token) + use_deepspeed = args.world_size > 0 + if use_deepspeed or args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False - model_kwargs = {"revision": "main", "token": None, "device_map": "auto", "offload_folder": "/tmp/offload_folder/"} + model_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.trust_remote_code: + logger.warning("`trust_remote_code` is set, there is no guarantee this model works properly and it may fail") - model = setup_model(model_name_or_path, model_dtype, model_kwargs) - tokenizer, model = setup_tokenizer(model_name_or_path, model) - generation_config = setup_generation_config(model, tokenizer, max_new_tokens) + model, assistant_model = ( + setup_model(args, model_dtype, model_kwargs, logger) + if not use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) + generation_config = setup_generation_config(args, model, assistant_model, tokenizer) + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) + if args.quant_config: + model = setup_inference(args, model) init_end = time.perf_counter() - print(f"Model initialization took {(init_end - init_start):.3f}s") - return model, tokenizer, generation_config + logger.info(f"Args: {args}") + logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, assistant_model, tokenizer, generation_config diff --git a/comps/ragas/tgi/llm.py b/comps/ragas/tgi/llm.py index f31c666576..895705703c 100644 --- a/comps/ragas/tgi/llm.py +++ b/comps/ragas/tgi/llm.py @@ -1,86 +1,86 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from datasets import Dataset -from langchain_community.embeddings import ( - HuggingFaceBgeEmbeddings, - HuggingFaceEmbeddings, - HuggingFaceHubEmbeddings, - HuggingFaceInstructEmbeddings, -) -from langchain_community.llms import HuggingFaceEndpoint -from langsmith import traceable -from ragas import evaluate -from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness - -from comps import GeneratedDoc, RAGASParams, RAGASScores, ServiceType, opea_microservices, register_microservice - -tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") -EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") - - -@register_microservice( - name="opea_service@ragas_tgi_llm", - service_type=ServiceType.RAGAS, - endpoint="/v1/ragas", - host="0.0.0.0", - port=9050, - input_datatype=RAGASParams, - output_datatype=RAGASScores, -) -@traceable(run_type="llm") -def llm_generate(input: RAGASParams): - llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") - - # Create vectorstore - if tei_embedding_endpoint: - # create embeddings using TEI endpoint service - embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) - else: - # create embeddings using local embedding model - embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) - - llm = HuggingFaceEndpoint( - endpoint_url=llm_endpoint, - max_new_tokens=input.max_new_tokens, - top_k=input.top_k, - top_p=input.top_p, - typical_p=input.typical_p, - temperature=input.temperature, - 
repetition_penalty=input.repetition_penalty, - streaming=input.streaming, - timeout=600, - ) - - data_collections = { - "question": input.questions, - "answer": input.answers, - "docs": input.docs, - "ground_truth": input.groundtruths, - } - dataset = Dataset.from_dict(data_collections) - - score = evaluate( - dataset, - metrics=[answer_relevancy, faithfulness, context_recall, context_precision], - llm=llm, - embeddings=embedder, - ) - df = score.to_pandas() - answer_relevancy_average = df["answer_relevancy"][:].mean() - faithfulness_average = df["faithfulness"][:].mean() - context_recall_average = df["context_recall"][:].mean() - context_precision_average = df["context_precision"][:].mean() - - return RAGASScores( - answer_relevancy=answer_relevancy_average, - faithfulness=faithfulness_average, - context_recallL=context_recall_average, - context_precision=context_precision_average, - ) - - -if __name__ == "__main__": - opea_microservices["opea_service@llm_tgi"].start() +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from datasets import Dataset +from langchain_community.embeddings import ( + HuggingFaceBgeEmbeddings, + HuggingFaceEmbeddings, + HuggingFaceHubEmbeddings, + HuggingFaceInstructEmbeddings, +) +from langchain_community.llms import HuggingFaceEndpoint +from langsmith import traceable +from ragas import evaluate +from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness + +from comps import GeneratedDoc, RAGASParams, RAGASScores, ServiceType, opea_microservices, register_microservice + +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + + +@register_microservice( + name="opea_service@ragas_tgi_llm", + service_type=ServiceType.RAGAS, + endpoint="/v1/ragas", + host="0.0.0.0", + port=9050, + input_datatype=RAGASParams, + output_datatype=RAGASScores, +) +@traceable(run_type="llm") +def llm_generate(input: RAGASParams): + llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") + + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + llm = HuggingFaceEndpoint( + endpoint_url=llm_endpoint, + max_new_tokens=input.max_new_tokens, + top_k=input.top_k, + top_p=input.top_p, + typical_p=input.typical_p, + temperature=input.temperature, + repetition_penalty=input.repetition_penalty, + streaming=input.streaming, + timeout=600, + ) + + data_collections = { + "question": input.questions, + "answer": input.answers, + "docs": input.docs, + "ground_truth": input.groundtruths, + } + dataset = Dataset.from_dict(data_collections) + + score = evaluate( + dataset, + metrics=[answer_relevancy, faithfulness, context_recall, context_precision], + llm=llm, + embeddings=embedder, + ) + df = score.to_pandas() + answer_relevancy_average = df["answer_relevancy"][:].mean() + faithfulness_average = df["faithfulness"][:].mean() + context_recall_average = df["context_recall"][:].mean() + context_precision_average = df["context_precision"][:].mean() + + return RAGASScores( + answer_relevancy=answer_relevancy_average, + faithfulness=faithfulness_average, + context_recallL=context_recall_average, + context_precision=context_precision_average, + ) + + +if __name__ == "__main__": + 
opea_microservices["opea_service@llm_tgi"].start() diff --git a/comps/ragas/tgi/requirements.txt b/comps/ragas/tgi/requirements.txt index 3fa49150ec..2c8fad29f2 100644 --- a/comps/ragas/tgi/requirements.txt +++ b/comps/ragas/tgi/requirements.txt @@ -1,14 +1,14 @@ -datasets -docarray[full] -fastapi -huggingface_hub -langchain==0.1.16 -langsmith -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -ragas -shortuuid -transformers -uvicorn +datasets +docarray[full] +fastapi +huggingface_hub +langchain==0.1.16 +langsmith +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +ragas +shortuuid +transformers +uvicorn diff --git a/tests/test_agent_langchain.sh b/tests/test_agent_langchain.sh index e44d882921..c08024847e 100644 --- a/tests/test_agent_langchain.sh +++ b/tests/test_agent_langchain.sh @@ -12,7 +12,7 @@ function build_docker_images() { echo "Building the docker images" cd $WORKPATH echo $WORKPATH - docker build -t opea/comps-agent-langchain:comps -f comps/agent/langchain/docker/Dockerfile . + docker build --no-cache -t opea/comps-agent-langchain:comps -f comps/agent/langchain/docker/Dockerfile . if $? ; then echo "opea/comps-agent-langchain built fail" exit 1 @@ -34,7 +34,7 @@ function start_service() { docker logs test-comps-tgi-gaudi-service echo "Starting agent microservice" - docker run -d --runtime=runc --name="test-comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 9090:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e strategy=react -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/comps-agent-langchain:comps + docker run -d --runtime=runc --name="test-comps-langchain-agent-endpoint" -v $WORKPATH/comps/agent/langchain/tools:/home/user/comps/agent/langchain/tools -p 5042:9090 --ipc=host -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${model} -e strategy=react -e llm_endpoint_url=http://${ip_address}:8080 -e llm_engine=tgi -e recursion_limit=5 -e require_human_feedback=false -e tools=/home/user/comps/agent/langchain/tools/custom_tools.yaml opea/comps-agent-langchain:comps sleep 5s docker logs test-comps-langchain-agent-endpoint @@ -69,7 +69,7 @@ function validate() { function validate_microservice() { echo "Testing agent service" - local CONTENT=$(curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + local CONTENT=$(curl http://${ip_address}:5042/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ "query": "What is Intel OPEA project?" }' | tee ${LOG_PATH}/test-agent-langchain.log) local EXIT_CODE=$(validate "$CONTENT" "OPEA" "test-agent-langchain") diff --git a/tests/test_dataprep_pgvector.sh b/tests/test_dataprep_pgvector.sh index 1f2b93ab40..3ea3df9f8b 100755 --- a/tests/test_dataprep_pgvector.sh +++ b/tests/test_dataprep_pgvector.sh @@ -16,7 +16,7 @@ function build_docker_images() { docker pull pgvector/pgvector:0.7.0-pg16 # build dataprep image for pgvector - docker build -t opea/dataprep-pgvector:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile . 
+ docker build --no-cache -t opea/dataprep-pgvector:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile . if $? ; then echo "opea/dataprep-pgvector built fail" exit 1 diff --git a/tests/test_dataprep_pinecone.sh b/tests/test_dataprep_pinecone.sh index 55e42073d8..1930d27981 100755 --- a/tests/test_dataprep_pinecone.sh +++ b/tests/test_dataprep_pinecone.sh @@ -10,7 +10,7 @@ function build_docker_images() { cd $WORKPATH # build dataprep image for pinecone - docker build -t opea/dataprep-pinecone:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pinecone/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-pinecone:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pinecone/docker/Dockerfile . if $? ; then echo "opea/dataprep-pinecone built fail" exit 1 @@ -24,20 +24,33 @@ function start_service() { export PINECONE_INDEX_NAME="test-index" export HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN - docker run -d --name="test-comps-dataprep-pinecone" -p 6007:6007 -p 6008:6008 -p 6009:6009 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:comps + docker run -d --name="test-comps-dataprep-pinecone" -p 5039:6007 -p 5040:6008 -p 5041:6009 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:comps sleep 1m } function validate_microservice() { - URL="http://$ip_address:6007/v1/dataprep" + URL="http://$ip_address:5039/v1/dataprep" echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > ./dataprep_file.txt result=$(curl --noproxy $ip_address --location --request POST \ --form 'files=@./dataprep_file.txt' $URL) - - DELETE_URL="http://$ip_address:6009/v1/dataprep/delete_file" - result_2=$(curl --noproxy $ip_address --location --request POST \ + if [[ $result == *"200"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-dataprep-pinecone + exit 1 + fi + DELETE_URL="http://$ip_address:5041/v1/dataprep/delete_file" + result=$(curl --noproxy $ip_address --location --request POST \ -d '{"file_path": "all"}' -H 'Content-Type: application/json' $DELETE_URL) + if [[ $result == *"true"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-dataprep-pinecone + exit 1 + fi } function stop_docker() { diff --git a/tests/test_dataprep_qdrant_langchain.sh b/tests/test_dataprep_qdrant_langchain.sh index 410c08903b..e112438fd3 100644 --- a/tests/test_dataprep_qdrant_langchain.sh +++ b/tests/test_dataprep_qdrant_langchain.sh @@ -57,6 +57,9 @@ function validate_services() { # check response status if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-qdrant-langchain + docker logs test-comps-dataprep-qdrant-langchain-tei + docker logs test-comps-dataprep-qdrant-langchain-server exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
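
The new upload check in test_dataprep_pinecone.sh above greps the response body for the substring "200"; if the dataprep service ever stops echoing a status code in its body, the test would pass or fail for the wrong reason. A stricter variant, sketched below, compares the HTTP status code that curl reports instead. It reuses the 5039 port mapping and container name from this patch but is not part of it.

```bash
# Hypothetical stricter check: use curl's reported HTTP status instead of grepping the body.
http_status=$(curl --noproxy $ip_address --silent --output /dev/null --write-out "%{http_code}" \
    --location --request POST --form 'files=@./dataprep_file.txt' "http://$ip_address:5039/v1/dataprep")
if [ "$http_status" -ne 200 ]; then
    echo "Upload failed with HTTP status $http_status"
    docker logs test-comps-dataprep-pinecone
    exit 1
fi
```
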
@@ -64,6 +67,9 @@ function validate_services() { # check response body if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-qdrant-langchain + docker logs test-comps-dataprep-qdrant-langchain-tei + docker logs test-comps-dataprep-qdrant-langchain-server exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." diff --git a/tests/test_dataprep_redis_langchain.sh b/tests/test_dataprep_redis_langchain.sh index f09e08a488..1a9831dc58 100644 --- a/tests/test_dataprep_redis_langchain.sh +++ b/tests/test_dataprep_redis_langchain.sh @@ -39,16 +39,17 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - upload - file" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." @@ -60,16 +61,18 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - upload - link" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log + if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." @@ -81,16 +84,17 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - get" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." fi if [[ "$RESPONSE_BODY" != *'{"name":'* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." 
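
The hunks above and below for test_dataprep_redis_langchain.sh repeat the same fail-then-capture-logs branch for every request. If the pattern keeps spreading, a small shared helper along these lines (hypothetical, not introduced by this patch) would keep the log capture in one place:

```bash
# Hypothetical helper: report the failure, capture container logs, and stop the test.
fail_with_logs() {
    local service_name=$1 container=$2 log_file=$3 message=$4
    echo "[ $service_name ] $message"
    docker logs "$container" >> "$log_file"
    exit 1
}

# Possible usage inside validate_microservice():
#   if [ "$HTTP_STATUS" -ne "200" ]; then
#       fail_with_logs "dataprep - upload - file" test-comps-dataprep-redis-langchain-server \
#           ${LOG_PATH}/dataprep_upload_file.log "HTTP status is not 200. Received status was $HTTP_STATUS"
#   fi
```
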
@@ -102,11 +106,11 @@ function validate_microservice() { HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') SERVICE_NAME="dataprep - del" - docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log # check response status if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." @@ -114,6 +118,7 @@ function validate_microservice() { # check response body if [[ "$RESPONSE_BODY" != *'{"status":true}'* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." diff --git a/tests/test_dataprep_redis_langchain_ray.sh b/tests/test_dataprep_redis_langchain_ray.sh index 220609b135..0980eafc19 100644 --- a/tests/test_dataprep_redis_langchain_ray.sh +++ b/tests/test_dataprep_redis_langchain_ray.sh @@ -11,7 +11,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { echo "Building the docker images" cd $WORKPATH - docker build -t opea/dataprep-on-ray-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain_ray/docker/Dockerfile . + docker build --no-cache -t opea/dataprep-on-ray-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain_ray/docker/Dockerfile . if $? ; then echo "opea/dataprep-on-ray-redis built fail" exit 1 diff --git a/tests/test_embeddings_langchain.sh b/tests/test_embeddings_langchain.sh index b9f1beefba..caa7301eaa 100644 --- a/tests/test_embeddings_langchain.sh +++ b/tests/test_embeddings_langchain.sh @@ -37,6 +37,14 @@ function validate_microservice() { -X POST \ -d '{"text":"What is Deep Learning?"}' \ -H 'Content-Type: application/json') + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-embedding-tei-endpoint + docker logs test-comps-embedding-tei-server + exit 1 + fi } function stop_docker() { diff --git a/tests/test_guardrails_pii_detection.sh b/tests/test_guardrails_pii_detection.sh index 783fa0d992..fef024a3d1 100644 --- a/tests/test_guardrails_pii_detection.sh +++ b/tests/test_guardrails_pii_detection.sh @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -set -xe +set -x WORKPATH=$(dirname "$PWD") ip_address=$(hostname -I | awk '{print $1}') @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { echo "Start building docker images for microservice" cd $WORKPATH - docker build -t opea/guardrails-pii-detection:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . + docker build --no-cache -t opea/guardrails-pii-detection:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/pii_detection/docker/Dockerfile . if $? 
; then echo "opea/guardrails-pii-detection built fail" exit 1 @@ -31,15 +31,42 @@ function validate_microservice() { export PATH="${HOME}/miniforge3/bin:$PATH" source activate echo "test 1 - single task - ner" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ner + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ner) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 2 - 20 tasks in parallel - ner" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ner + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ner) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 3 - single task - ml" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ml + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 1 --ip_addr $ip_address --strategy ml) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "test 4 - 20 tasks in parallel - ml" - python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ml + result=$(python comps/guardrails/pii_detection/test.py --test_text --batch_size 20 --ip_addr $ip_address --strategy ml) + if [[ $result == *"An error occurred"* ]]; then + echo "Result wrong. Received was $result" + docker logs test-comps-guardrails-pii-detection-endpoint + exit 1 + else + echo "Result correct." + fi echo "Validate microservice completed" - docker logs test-comps-guardrails-pii-detection-endpoint } function stop_docker() { diff --git a/tests/test_llms_text-generation_native.sh b/tests/test_llms_text-generation_native.sh new file mode 100644 index 0000000000..69517327e5 --- /dev/null +++ b/tests/test_llms_text-generation_native.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + docker build --no-cache \ + --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \ + -t opea/llm-native:comps \ + -f comps/llms/text-generation/native/docker/Dockerfile . + if $? 
; then + echo "opea/llm-native built fail" + exit 1 + else + echo "opea/llm-native built successful" + fi +} + +function start_service() { + LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" + llm_native_service_port=5070 + docker run -d \ + --name="test-comps-llm-native-server" \ + -p ${llm_native_service_port}:9000 \ + --runtime=habana \ + --cap-add=SYS_NICE \ + --ipc=host \ + -e http_proxy=${http_proxy} \ + -e https_proxy=${https_proxy} \ + -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e TOKENIZERS_PARALLELISM=false \ + --restart unless-stopped \ + --network bridge \ + opea/llm-native:comps + + sleep 5s +} + +function validate_microservice() { + llm_native_service_port=5070 + URL="http://${ip_address}:${llm_native_service_port}/v1/chat/completions" + INPUT_DATA='{"query":"What is Deep Learning?"}' + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="llm-native" + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-llm-native-server >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *'"text":"What'* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-llm-native-server >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-llm-native*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + build_docker_images + start_service + validate_microservice + stop_docker + + echo y | docker system prune + +} + +main diff --git a/tests/test_llms_text-generation_vllm-openvino.sh b/tests/test_llms_text-generation_vllm-openvino.sh index 7e4e40e986..dc460d0aa5 100755 --- a/tests/test_llms_text-generation_vllm-openvino.sh +++ b/tests/test_llms_text-generation_vllm-openvino.sh @@ -16,7 +16,7 @@ function build_container() { cd $WORKPATH git clone https://github.com/vllm-project/vllm.git vllm-openvino cd ./vllm-openvino/ - docker build -t $DOCKER_IMAGE \ + docker build --no-cache -t $DOCKER_IMAGE \ -f Dockerfile.openvino \ . \ --build-arg https_proxy=$https_proxy \ @@ -102,6 +102,7 @@ function test_api_endpoint { else echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)" docker logs $CONTAINER_NAME + exit 1 fi } # Main function diff --git a/tests/test_llms_text-generation_vllm-ray.sh b/tests/test_llms_text-generation_vllm-ray.sh index 3b1c606814..ae9a427280 100644 --- a/tests/test_llms_text-generation_vllm-ray.sh +++ b/tests/test_llms_text-generation_vllm-ray.sh @@ -12,7 +12,7 @@ function build_docker_images() { cd $WORKPATH docker build \ -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.vllmray \ - -t opea/vllm_ray-habana:comps --network=host . + --no-cache -t opea/vllm_ray-habana:comps --network=host . if $? 
; then echo "opea/vllm_ray-habana built fail" exit 1 @@ -23,7 +23,7 @@ function build_docker_images() { ## Build OPEA microservice docker cd $WORKPATH docker build \ - -t opea/llm-vllm-ray:comps \ + --no-cache -t opea/llm-vllm-ray:comps \ -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.microservice . if $? ; then echo "opea/llm-vllm-ray built fail" @@ -76,12 +76,26 @@ function validate_microservice() { result=$(http_proxy="" curl http://${ip_address}:5031/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model": "facebook/opt-125m", "messages": [{"role": "user", "content": "How are you?"}]}') - result_2=$(http_proxy="" curl http://${ip_address}:5032/v1/chat/completions \ + if [[ $result == *"message"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-ray-service + docker logs test-comps-vllm-ray-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5032/v1/chat/completions \ -X POST \ -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ -H 'Content-Type: application/json') - docker logs test-comps-vllm-ray-service - docker logs test-comps-vllm-ray-microservice + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-ray-service + docker logs test-comps-vllm-ray-microservice + exit 1 + fi } function stop_docker() { diff --git a/tests/test_llms_text-generation_vllm.sh b/tests/test_llms_text-generation_vllm.sh index 05f249cc8a..1aae114e83 100644 --- a/tests/test_llms_text-generation_vllm.sh +++ b/tests/test_llms_text-generation_vllm.sh @@ -12,7 +12,7 @@ function build_docker_images() { cd $WORKPATH/comps/llms/text-generation/vllm docker build \ -f docker/Dockerfile.hpu \ - -t opea/vllm-hpu:comps \ + --no-cache -t opea/vllm-hpu:comps \ --shm-size=128g . if $? ; then echo "opea/vllm-hpu built fail" @@ -24,7 +24,7 @@ function build_docker_images() { ## Build OPEA microservice docker cd $WORKPATH docker build \ - -t opea/llm-vllm:comps \ + --no-cache -t opea/llm-vllm:comps \ -f comps/llms/text-generation/vllm/docker/Dockerfile.microservice . if $? ; then echo "opea/llm-vllm built fail" @@ -74,7 +74,7 @@ function start_service() { } function validate_microservice() { - result=$(http_proxy="" curl http://${ip_address}:8008/v1/completions \ + result=$(http_proxy="" curl http://${ip_address}:5025/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "facebook/opt-125m", @@ -82,12 +82,26 @@ function validate_microservice() { "max_tokens": 32, "temperature": 0 }') - result_2=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ -H 'Content-Type: application/json') - docker logs test-comps-vllm-service - docker logs test-comps-vllm-microservice + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. 
Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi } function stop_docker() { diff --git a/tests/test_lvms_llava.sh b/tests/test_lvms_llava.sh index 9282298475..2e8f3cbd55 100644 --- a/tests/test_lvms_llava.sh +++ b/tests/test_lvms_llava.sh @@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH echo $(pwd) - docker build -t opea/llava:comps -f comps/lvms/llava/Dockerfile . + docker build --no-cache -t opea/llava:comps -f comps/lvms/llava/Dockerfile . if $? ; then echo "opea/llava built fail" exit 1 diff --git a/tests/test_lvms_tgi_llava_next.sh b/tests/test_lvms_tgi_llava_next.sh index 1824df0152..5a654b4ec7 100644 --- a/tests/test_lvms_tgi_llava_next.sh +++ b/tests/test_lvms_tgi_llava_next.sh @@ -11,13 +11,14 @@ function build_docker_images() { cd $WORKPATH echo $(pwd) git clone https://github.com/yuanwu2017/tgi-gaudi.git && cd tgi-gaudi && git checkout v2.0.4 - docker build -t opea/llava-tgi:comps . + docker build --no-cache -t opea/llava-tgi:comps . if $? ; then echo "opea/llava-tgi built fail" exit 1 else echo "opea/llava-tgi built successful" fi + cd .. docker build --no-cache -t opea/lvm-tgi:comps -f comps/lvms/Dockerfile_tgi . if $? ; then diff --git a/tests/test_reranks_fastrag.sh b/tests/test_reranks_fastrag.sh index 3438be280d..0be7489aff 100644 --- a/tests/test_reranks_fastrag.sh +++ b/tests/test_reranks_fastrag.sh @@ -31,7 +31,13 @@ function validate_microservice() { -X POST \ -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ -H 'Content-Type: application/json') - docker logs test-comps-reranking-fastrag-server + if [[ $result == *"reranked_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-reranking-fastrag-server + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_haystack_qdrant.sh b/tests/test_retrievers_haystack_qdrant.sh index bc196efff2..364f63450d 100644 --- a/tests/test_retrievers_haystack_qdrant.sh +++ b/tests/test_retrievers_haystack_qdrant.sh @@ -49,8 +49,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-retriever-qdrant-server - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-retriever-qdrant-server + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_langchain_pgvector.sh b/tests/test_retrievers_langchain_pgvector.sh index 481a389521..b28fb632e5 100755 --- a/tests/test_retrievers_langchain_pgvector.sh +++ b/tests/test_retrievers_langchain_pgvector.sh @@ -47,8 +47,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-vectorstore-postgres - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. 
Received was $result" + docker logs test-comps-vectorstore-postgres + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_retrievers_langchain_pinecone.sh b/tests/test_retrievers_langchain_pinecone.sh index 40ae67942a..7fb105a94c 100755 --- a/tests/test_retrievers_langchain_pinecone.sh +++ b/tests/test_retrievers_langchain_pinecone.sh @@ -44,8 +44,14 @@ function validate_microservice() { -X POST \ -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ -H 'Content-Type: application/json') - docker logs test-comps-retriever-pinecone-server - docker logs test-comps-retriever-tei-endpoint + if [[ $result == *"retrieved_docs"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-retriever-pinecone-server + docker logs test-comps-retriever-tei-endpoint + exit 1 + fi } function stop_docker() { diff --git a/tests/test_vectorstores_langchain_milvus.sh b/tests/test_vectorstores_langchain_milvus.sh index 60303017d4..86124baa55 100644 --- a/tests/test_vectorstores_langchain_milvus.sh +++ b/tests/test_vectorstores_langchain_milvus.sh @@ -2,7 +2,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -set -xe +set -x WORKPATH=$(dirname "$PWD") LOG_PATH="$WORKPATH/tests" @@ -30,6 +30,7 @@ function validate_vectorstore() { echo "[ test create ] create collection succeed" else echo "[ test create ] create collection failed" + docker logs milvus-standalone exit 1 fi @@ -41,6 +42,7 @@ function validate_vectorstore() { echo "[ test insert ] insert data succeed" else echo "[ test insert ] insert data failed" + docker logs milvus-standalone exit 1 fi @@ -52,6 +54,7 @@ function validate_vectorstore() { echo "[ test search ] search data succeed" else echo "[ test search ] search data failed" + docker logs milvus-standalone exit 1 fi }
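
For local verification of the new Gaudi-native LLM test added earlier in this patch, the flow below is what the script expects. This is a sketch, assuming a Gaudi host with the habana container runtime available (the test passes --runtime=habana) and network access to build the images.

```bash
# Run the new test end to end: it builds opea/llm-native:comps and exposes the service on host port 5070.
cd tests
bash test_llms_text-generation_native.sh

# Or, with the container already running, probe the same endpoint the test validates:
curl http://localhost:5070/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"query":"What is Deep Learning?"}'
```
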