Merge pull request #47 from ansible/aap_38439
PR to apply the E2E OLS evaluation framework to the AAP chatbot
justjais authored Jan 31, 2025
2 parents 8d092ac + e346349 commit 265d1c6
Showing 6 changed files with 65 additions and 1 deletion.
6 changes: 6 additions & 0 deletions scripts/evaluation/README.md
@@ -11,6 +11,7 @@ Currently we have 2 types of evaluations.
- QnAs were generated from OCP docs by LLMs. It is possible that some of the questions/answers are not entirely correct. We are continually verifying both questions and answers manually. If you find any QnA pair that should be modified or removed, please create a PR.
- The OLS API should be up and running with all the required provider+model combinations configured.
- It is possible that we want to run both consistency and model evaluation together. To avoid multiple API calls for the same query, *model* evaluation first checks the .csv file generated by the *consistency* evaluation; the API is called only if the response is not present in the csv file (see the sketch after this list).
- Users need to install the Python packages `matplotlib` and `rouge_score` before running the evaluation.
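
The cache-first lookup mentioned above works roughly like this (a minimal sketch; the file name, column names, and `query_api` stub are illustrative placeholders, not the framework's actual names):

```
import csv
import os


def query_api(query: str) -> str:
    """Stand-in for a live call to the OLS API (placeholder only)."""
    return "<response fetched from the OLS API>"


def get_response(query: str, cache_file: str = "consistency_responses.csv") -> str:
    """Reuse the response recorded by the consistency evaluation when present;
    call the API only if the query is missing from the csv."""
    if os.path.exists(cache_file):
        with open(cache_file, newline="") as f:
            for row in csv.DictReader(f):
                # Column names are illustrative only.
                if row.get("question") == query:
                    return row["response"]
    return query_api(query)
```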

### e2e test case

@@ -21,6 +22,11 @@ These evaluations are also part of **e2e test cases**. Currently *consistency* e
python -m scripts.evaluation.driver
```

### Sample run command
```
OPENAI_API_KEY=IGNORED python -m scripts.evaluation.driver --qna_pool_file ./scripts/evaluation/eval_data/aap-sample.parquet --eval_provider_model_id my_rhoai+granite3-8b --eval_metrics answer_relevancy answer_similarity_llm cos_score rougeL_precision --eval_modes vanilla --judge_model granite3-8b --judge_provider my_rhoai3 --eval_query_ids qna1
```

### Input Data/QnA pool
[Json file](eval_data/question_answer_pair.json)

Binary file added scripts/evaluation/eval_data/aap-sample.parquet
Binary file not shown.
Binary file added scripts/evaluation/eval_data/aap.parquet
Binary file not shown.
56 changes: 56 additions & 0 deletions scripts/evaluation/olsconfig.yaml
@@ -0,0 +1,56 @@
# olsconfig.yaml sample for local ollama server
#
# 1. install local ollama server from https://ollama.com/
# 2. install llama3.1:latest model with:
# ollama pull llama3.1:latest
# 3. Copy this file to the project root of cloned lightspeed-service repo
# 4. Install dependencies with:
# make install-deps
# 5. Start lightspeed-service with:
# OPENAI_API_KEY=IGNORED make run
# 6. Open https://localhost:8080/ui in your web browser
#
llm_providers:
  - name: ollama
    type: openai
    url: "http://localhost:11434/v1/"
    models:
      - name: "mistral"
      - name: 'llama3.2:latest'
  - name: my_rhoai
    type: openai
    url: "https://granite3-8b-wisdom-model-staging.apps.stage2-west.v2dz.p1.openshiftapps.com/v1"
    credentials_path: ols_api_key.txt
    models:
      - name: granite3-8b
ols_config:
  # max_workers: 1
  reference_content:
    # product_docs_index_path: "./vector_db/vector_db/aap_product_docs/2.5"
    # product_docs_index_id: aap-product-docs-2_5
    # embeddings_model_path: "./vector_db/embeddings_model"
  conversation_cache:
    type: memory
    memory:
      max_entries: 1000
  logging_config:
    app_log_level: info
    lib_log_level: warning
    uvicorn_log_level: info
  default_provider: ollama
  default_model: 'llama3.2:latest'
  query_validation_method: llm
  user_data_collection:
    feedback_disabled: false
    feedback_storage: "/tmp/data/feedback"
    transcripts_disabled: false
    transcripts_storage: "/tmp/data/transcripts"
dev_config:
  # config options specific to dev environment - launching OLS in local
  enable_dev_ui: true
  disable_auth: true
  disable_tls: true
  pyroscope_url: "https://pyroscope.pyroscope.svc.cluster.local:4040"
  # llm_params:
  #   temperature_override: 0
  # k8s_auth_token: optional_token_when_no_available_kube_config
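
To sanity-check the providers and models defined above, the file can be loaded with a few lines of Python (a minimal sketch assuming PyYAML is installed and that the config was copied to the current directory as described in the header comments):

```
import yaml  # PyYAML

with open("olsconfig.yaml") as f:
    config = yaml.safe_load(f)

# List every configured provider and its models.
for provider in config["llm_providers"]:
    models = ", ".join(m["name"] for m in provider.get("models", []))
    print(f"{provider['name']} ({provider['type']}): {models}")

print("default:", config["ols_config"]["default_provider"],
      "/", config["ols_config"]["default_model"])
```
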
2 changes: 2 additions & 0 deletions scripts/evaluation/utils/constants.py
@@ -11,6 +11,8 @@
"azure_openai+gpt-4o": ("azure_openai", "gpt-4o"),
"ollama+llama3.1:latest": ("ollama", "llama3.1:latest"),
"ollama+mistral": ("ollama", "mistral"),
"my_rhoai+granite3-8b": ("my_rhoai", "granite3-8b"),
"my_rhoai3+granite3-1-8b": ("my_rhoai3", "granite3-1-8b"),
}

NON_LLM_EVALS = {
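
These new entries let `--eval_provider_model_id my_rhoai+granite3-8b` (as used in the sample run command above) resolve to a provider/model pair. A rough illustration of the lookup (the dict name `PROVIDER_MODEL_MAP` is a stand-in for the actual constant, which is not fully shown in this hunk):

```
PROVIDER_MODEL_MAP = {
    "ollama+mistral": ("ollama", "mistral"),
    "my_rhoai+granite3-8b": ("my_rhoai", "granite3-8b"),
    "my_rhoai3+granite3-1-8b": ("my_rhoai3", "granite3-1-8b"),
}

provider, model = PROVIDER_MODEL_MAP["my_rhoai+granite3-8b"]
print(provider, model)  # -> my_rhoai granite3-8b
```
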
2 changes: 1 addition & 1 deletion scripts/evaluation/utils/relevancy_score.py
Expand Up @@ -42,7 +42,7 @@ def get_score(
# raise
sleep(time_to_breath)

- if out:
+ if out and isinstance(out, dict):
valid_flag = out["Valid"]
gen_questions = out["Question"]
score = 0
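
The added `isinstance` check means the judge output is only indexed when it really is a dict; if the judge returned a raw string or `None`, the `out["Valid"]` access would otherwise fail. A simplified illustration of the pattern (the function name and fallback values here are placeholders, not the module's actual API):

```
def extract_judge_fields(out):
    """Index into the judge output only when it is a parsed dict."""
    if out and isinstance(out, dict):
        return out["Valid"], out["Question"]
    # Malformed or non-dict output: fall back instead of raising.
    return None, []
```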
