
Commit

Clean up notebooks for blog chunking eval
Signed-off-by: Christy Bergman <[email protected]>
christy committed Jul 9, 2024
1 parent e7f9f26 commit 9c6756f
Showing 6 changed files with 3,614 additions and 2,573 deletions.
49 changes: 49 additions & 0 deletions bootcamp/Evaluation/data/blog_eval_answers.csv

Large diffs are not rendered by default.

12 changes: 0 additions & 12 deletions bootcamp/Evaluation/data/ground_truth_answers.csv

This file was deleted.

@@ -16,7 +16,7 @@ def assemble_ragas_dataset(input_df):
     truth_list = input_df.ground_truth_answer.to_list()
 
     # Get all the Milvus Retrieval Contexts as list[list[str]]
-    context_list = input_df.Custom_RAG_context.to_list()
+    context_list = input_df.recursive_context_512_k_2.to_list()
     context_list = [[context] for context in context_list]
 
     # Get all the RAG answers based on contexts.
@@ -34,7 +34,7 @@ def assemble_ragas_dataset(input_df):
 def evaluate_ragas_model(pandas_eval_df,
                          ragas_eval_metrics,
                          what_to_evaluate='CONTEXTS',
-                         cols_to_evaluate=['Custom_RAG_context', 'simple_context']):
+                         cols_to_evaluate=['recursive_context_512_k_2', 'html_context_512_k_2']):
     """Evaluate the RAGAS model using the input pandas df."""
 
     temp_df = pandas_eval_df.copy()
@@ -48,7 +48,7 @@ def evaluate_ragas_model(pandas_eval_df,
         if what_to_evaluate == "CONTEXTS":
             # Keep the Custom_RAG_answer as is.
             # Replace the Custom_RAG_context with the col context.
-            temp_df['Custom_RAG_context'] = temp_df[col]
+            temp_df['recursive_context_512_k_2'] = temp_df[col]
 
         # Replace the Custom_RAG_answer with the LLM answer to evaluate.
         elif what_to_evaluate == "ANSWERS":
@@ -80,7 +80,8 @@ def evaluate_ragas_model(pandas_eval_df,
         elif what_to_evaluate == "ANSWERS":
             print(f"Evaluate LLM: {col}, ",end="")
             # Calculate avg LLM answer scores across all floating point number scores between 0 and 1.
-            temp['avg_answer_score'] = (temp.answer_relevancy + temp.answer_similarity + temp.answer_correctness) / 3
+            # temp['avg_answer_score'] = (temp.answer_relevancy + temp.answer_similarity + temp.answer_correctness) / 3
+            temp['avg_answer_score'] = temp.answer_correctness
             avg_answer_score = np.round(temp.avg_answer_score.mean(),4)
             temp_score = avg_answer_score
             print(f"avg_score: {temp_score}")
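
For reference, a minimal, hypothetical sketch of how the updated helper might be called from the evaluation notebook after this change. The CSV path is the file added in this commit; the metric selection, the presence of these columns in that CSV, and the import location of evaluate_ragas_model are assumptions, not part of the diff.

    # Hypothetical usage sketch, not part of this commit.
    # evaluate_ragas_model is the helper modified above; import it from the
    # repository's evaluation utility module (module name not shown in this excerpt).
    import pandas as pd
    from ragas.metrics import context_precision, context_recall  # assumed metric choice

    # Eval data added in this commit; assumed to contain the new context columns.
    eval_df = pd.read_csv("bootcamp/Evaluation/data/blog_eval_answers.csv")

    # Compare retrieval contexts from the two chunking strategies. The column names
    # suggest 512-token chunks with top-k=2 retrieval, recursive vs. HTML splitting.
    evaluate_ragas_model(
        eval_df,
        ragas_eval_metrics=[context_precision, context_recall],
        what_to_evaluate="CONTEXTS",
        cols_to_evaluate=["recursive_context_512_k_2", "html_context_512_k_2"],
    )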
