Merge branch 'tag-bench-dev' of https://github.com/minmin-intel/GenAIEval into tag-bench-dev
opea-project · Feb 4, 2025 · 69a1dec · 69a1dec
2 parents 2efa96b + b51697f
commit 69a1dec
Show file tree

Hide file tree

Showing 20 changed files with 89 additions and 55 deletions.
diff --git a/evals/evaluation/agent_eval/README.md b/evals/evaluation/agent_eval/README.md
@@ -5,4 +5,4 @@ We collected two benchmarks for evaluating agentic applications:
 
 These agent benchmarks are enabled on Intel Gaudi systems using vllm as the LLM serving framework. You can choose to serve the models on other hardware with vllm too.
 
-We will add more benchmarks for agents in the future. Stay tuned.
+We will add more benchmarks for agents in the future. Stay tuned.
diff --git a/evals/evaluation/agent_eval/TAG-Bench/README.md b/evals/evaluation/agent_eval/TAG-Bench/README.md
@@ -79,7 +79,7 @@ docker build --no-cache -t $agent_image --build-arg http_proxy=$http_proxy --bui
 export GOOGLE_CSE_ID=<your-GOOGLE_CSE_ID>
 export GOOGLE_API_KEY=<your-GOOGLE_API_KEY>
 ```
-For intructions on how to obtain your `GOOGLE_CSE_ID` and `your-GOOGLE_API_KEY`, refer to instructions [here](https://python.langchain.com/docs/integrations/tools/google_search/).
+For instructions on how to obtain your `GOOGLE_CSE_ID` and `your-GOOGLE_API_KEY`, refer to instructions [here](https://python.langchain.com/docs/integrations/tools/google_search/).
 
 5. Launch SQL agent
 ```bash
@@ -128,4 +128,4 @@ Human grading criteria:
 |**Text2SQL (TAG-Bench paper)**|0.17||
 |**Human performance (TAG-Bench paper)**|0.55||
 
-We can see that our SQL agent achieved much higher accuracy than Text2SQL, although still lower than human experts.
+We can see that our SQL agent achieved much higher accuracy than Text2SQL, although still lower than human experts.
diff --git a/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/launch_sql_agent.sh b/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/launch_sql_agent.sh
@@ -21,7 +21,7 @@ export temperature=0.01
 export max_new_tokens=4096
 
 # Tools
-export TOOLSET_PATH=${EVALDIR}/opea_sql_agent_llama/tools/ 
+export TOOLSET_PATH=${EVALDIR}/opea_sql_agent_llama/tools/
 ls ${TOOLSET_PATH}
 # for using Google search API
 export GOOGLE_CSE_ID=${GOOGLE_CSE_ID}

diff --git a/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/sql_agent_llama.yaml b/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/sql_agent_llama.yaml
@@ -35,4 +35,4 @@ services:
       https_proxy: ${https_proxy}
       port: 9095
       GOOGLE_CSE_ID: ${GOOGLE_CSE_ID} #delete
-      GOOGLE_API_KEY: ${GOOGLE_API_KEY} # delete
+      GOOGLE_API_KEY: ${GOOGLE_API_KEY} # delete
diff --git a/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/tools/sql_agent_tools.py b/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/tools/sql_agent_tools.py
@@ -16,4 +16,4 @@ def search_web(query: str) -> str:
     )
 
     response = tool.run(query)
-    return response
+    return response
diff --git a/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/tools/sql_agent_tools.yaml b/evals/evaluation/agent_eval/TAG-Bench/opea_sql_agent_llama/tools/sql_agent_tools.yaml
@@ -8,4 +8,4 @@ search_web:
     query:
       type: str
       description: query
-  return_output: retrieved_data
+  return_output: retrieved_data
diff --git a/evals/evaluation/agent_eval/TAG-Bench/preprocess_data/generate_hints.py b/evals/evaluation/agent_eval/TAG-Bench/preprocess_data/generate_hints.py
@@ -52,4 +52,4 @@ def generate_column_descriptions(db_name):
     for db_name in subfolders:
         print("Generating hints for database: ", db_name)
         generate_column_descriptions(db_name)
-        print("="*30)
+        print("=" * 30)
diff --git a/evals/evaluation/agent_eval/TAG-Bench/preprocess_data/run_data_split.sh b/evals/evaluation/agent_eval/TAG-Bench/preprocess_data/run_data_split.sh
@@ -3,4 +3,4 @@
 
 DATAPATH=$WORKDIR/TAG-Bench/tag_queries.csv
 OUTFOLDER=$WORKDIR/TAG-Bench/query_by_db
-python3 split_data.py --path $DATAPATH --output $OUTFOLDER
+python3 split_data.py --path $DATAPATH --output $OUTFOLDER
diff --git a/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/combine_scores.py b/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/combine_scores.py
@@ -1,14 +1,18 @@
-import pandas as pd
-import os
-import glob
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
+import glob
+import os
+
+import pandas as pd
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--filedir', type=str, required=True, help='Directory containing the csv files')
+parser.add_argument("--filedir", type=str, required=True, help="Directory containing the csv files")
 args = parser.parse_args()
 
 filedir = args.filedir
-csv_files = glob.glob(os.path.join(filedir, '*_graded.csv'))
+csv_files = glob.glob(os.path.join(filedir, "*_graded.csv"))
 print("Number of score files found: ", len(csv_files))
 print(csv_files)
 

diff --git a/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/generate_answers.py b/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/generate_answers.py
@@ -10,10 +10,7 @@
 
 def generate_answer_agent_api(url, prompt):
     proxies = {"http": ""}
-    payload = {
-        "messages": prompt,
-        "stream": False
-    }
+    payload = {"messages": prompt, "stream": False}
     response = requests.post(url, json=payload, proxies=proxies)
     answer = response.json()["text"]
     return answer
@@ -54,10 +51,10 @@ def save_json_lines(json_lines, args):
         res = generate_answer_agent_api(url, query)
         answers.append(res)
         print("******Answer:\n", res)
-        json_lines.append({"query": query,"ref_answer":ref_answer, "answer": res})
+        json_lines.append({"query": query, "ref_answer": ref_answer, "answer": res})
         save_json_lines(json_lines, args)
         print("=" * 20)
 
-    df.rename(columns={"Answer": "ref_answer", "Query":"query"}, inplace=True)
-    df['answer'] = answers
-    df.to_csv(os.path.join(args.output_dir, args.output_file), index=False)
+    df.rename(columns={"Answer": "ref_answer", "Query": "query"}, inplace=True)
+    df["answer"] = answers
+    df.to_csv(os.path.join(args.output_dir, args.output_file), index=False)
diff --git a/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/grade_answers.py
@@ -37,6 +37,7 @@ def make_list_of_test_cases(data):
             )
     return output
 
+
 def read_data(args):
     data = pd.read_csv(os.path.join(args.filedir, args.filename))
     if "query" not in data.columns:
@@ -46,13 +47,21 @@ def read_data(args):
             raise ValueError("The query column is missing in the data")
     return data
 
+
 def grade_answers(args, test_case):
     from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 
     print("==============getting embeddings==============")
     embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model)
     print("==============initiating metric==============")
-    metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, model_name=args.model_name,embeddings=embeddings, use_vllm=args.use_vllm)
+    metric = RagasMetric(
+        threshold=0.5,
+        metrics=["answer_correctness"],
+        model=args.llm_endpoint,
+        model_name=args.model_name,
+        embeddings=embeddings,
+        use_vllm=args.use_vllm,
+    )
     print("==============start grading==============")
 
     if args.batch_grade:
@@ -66,7 +75,7 @@ def grade_answers(args, test_case):
             score = metric.score["answer_correctness"][0]
             print(score)
             scores.append(score)
-                
+
             print("-" * 50)
         return scores
 

diff --git a/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/run_all_databases.sh b/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/run_all_databases.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 # This script is used to run benchmarks on all databases
 
 EVALDIR=$WORKDIR/GenAIEval/evals/evaluation/agent_eval/TAG-Bench
@@ -33,4 +36,4 @@ done
 combine all scores
 python3 combine_scores.py --filedir $WORKDIR/sql_agent_output/
 
-echo "All done!"
+echo "All done!"
diff --git a/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/run_generate_answer.sh b/evals/evaluation/agent_eval/TAG-Bench/run_benchmark/run_generate_answer.sh
@@ -9,4 +9,4 @@ query_file=${WORKDIR}/TAG-Bench/query_by_db/query_${db_name}.csv
 outdir=$WORKDIR/sql_agent_output
 outfile=${db_name}_agent_test_result.csv
 port=9096
-python3 generate_answers.py --query_file $query_file --output_dir $outdir --output_file $outfile --db_name $db_name --port $port
+python3 generate_answers.py --query_file $query_file --output_dir $outdir --output_file $outfile --db_name $db_name --port $port
diff --git a/evals/evaluation/agent_eval/crag_eval/opea_rag_agent/tools/tools.py b/evals/evaluation/agent_eval/crag_eval/opea_rag_agent/tools/tools.py
@@ -8,7 +8,10 @@
 
 
 def search_knowledge_base(query: str) -> str:
-    """Search a knowledge base about music and singers for a given query. Returns text related to the query."""
+    """Search a knowledge base about music and singers for a given query.
+
+    Returns text related to the query.
+    """
     url = os.environ.get("WORKER_AGENT_URL")
     print(url)
     proxies = {"http": ""}
@@ -20,120 +23,119 @@ def search_knowledge_base(query: str) -> str:
     return response.json()["text"]
 
 
-def search_artist_entity_by_name(artist_name: str) -> dict: 
-    '''Search for music artists by name.'''
+def search_artist_entity_by_name(artist_name: str) -> dict:
+    """Search for music artists by name."""
     api = CRAG()
     return api.music_search_artist_entity_by_name(artist_name)
 
 
 def search_song_entity_by_name(song_name: str) -> dict:
-    '''Search for songs by name.'''
+    """Search for songs by name."""
     api = CRAG()
     return api.music_search_song_entity_by_name(song_name)
 
 
 def get_billboard_rank_date(rank: int, date: str = None) -> dict:
-    '''Get Billboard ranking for a specific rank and date.'''
+    """Get Billboard ranking for a specific rank and date."""
     api = CRAG()
     rank = int(rank)
     return api.music_get_billboard_rank_date(rank, date)
 
 
 def get_billboard_attributes(date: str, attribute: str, song_name: str) -> dict:
-    '''Get attributes of a song from Billboard rankings.'''
+    """Get attributes of a song from Billboard rankings."""
     api = CRAG()
     return api.music_get_billboard_attributes(date, attribute, song_name)
 
 
 def get_grammy_best_artist_by_year(year: int) -> dict:
-    '''Get the Grammy Best New Artist for a specific year.'''
+    """Get the Grammy Best New Artist for a specific year."""
     api = CRAG()
     year = int(year)
     return api.music_grammy_get_best_artist_by_year(year)
 
 
 def get_grammy_award_count_by_artist(artist_name: str) -> dict:
-    '''Get the total Grammy awards won by an artist.'''
+    """Get the total Grammy awards won by an artist."""
     api = CRAG()
     return api.music_grammy_get_award_count_by_artist(artist_name)
 
 
 def get_grammy_award_count_by_song(song_name: str) -> dict:
-    '''Get the total Grammy awards won by a song.'''
+    """Get the total Grammy awards won by a song."""
     api = CRAG()
     return api.music_grammy_get_award_count_by_song(song_name)
 
 
 def get_grammy_best_song_by_year(year: int) -> dict:
-    '''Get the Grammy Song of the Year for a specific year.'''
+    """Get the Grammy Song of the Year for a specific year."""
     api = CRAG()
     year = int(year)
     return api.music_grammy_get_best_song_by_year(year)
 
 
-
 def get_grammy_award_date_by_artist(artist_name: str) -> dict:
-    '''Get the years an artist won a Grammy award.'''
+    """Get the years an artist won a Grammy award."""
     api = CRAG()
     return api.music_grammy_get_award_date_by_artist(artist_name)
 
 
 def get_grammy_best_album_by_year(year: int) -> dict:
-    '''Get the Grammy Album of the Year for a specific year.'''
+    """Get the Grammy Album of the Year for a specific year."""
     api = CRAG()
     year = int(year)
     return api.music_grammy_get_best_album_by_year(year)
 
 
 def get_all_awarded_artists() -> dict:
-    '''Get all artists awarded the Grammy Best New Artist.'''
+    """Get all artists awarded the Grammy Best New Artist."""
     api = CRAG()
     return api.music_grammy_get_all_awarded_artists()
 
 
 def get_artist_birth_place(artist_name: str) -> dict:
-    '''Get the birthplace of an artist.'''
+    """Get the birthplace of an artist."""
     api = CRAG()
     return api.music_get_artist_birth_place(artist_name)
 
 
 def get_artist_birth_date(artist_name: str) -> dict:
-    '''Get the birth date of an artist.'''
+    """Get the birth date of an artist."""
     api = CRAG()
     return api.music_get_artist_birth_date(artist_name)
 
 
 def get_members(band_name: str) -> dict:
-    '''Get the member list of a band.'''
+    """Get the member list of a band."""
     api = CRAG()
     return api.music_get_members(band_name)
 
 
 def get_lifespan(artist_name: str) -> dict:
-    '''Get the lifespan of an artist.'''
+    """Get the lifespan of an artist."""
     api = CRAG()
     return api.music_get_lifespan(artist_name)
 
 
 def get_song_author(song_name: str) -> dict:
-    '''Get the author of a song.'''
+    """Get the author of a song."""
     api = CRAG()
     return api.music_get_song_author(song_name)
 
 
 def get_song_release_country(song_name: str) -> dict:
-    '''Get the release country of a song.'''
+    """Get the release country of a song."""
     api = CRAG()
     return api.music_get_song_release_country(song_name)
 
 
 def get_song_release_date(song_name: str) -> dict:
-    '''Get the release date of a song.'''
+    """Get the release date of a song."""
     api = CRAG()
     return api.music_get_song_release_date(song_name)
 
 
 def get_artist_all_works(artist_name: str) -> dict:
-    '''Get all works by an artist.'''
+    """Get all works by an artist."""
     api = CRAG()
-    return api.music_get_artist_all_works(artist_name)
+    return api.music_get_artist_all_works(artist_name)
diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py
@@ -37,6 +37,7 @@ def make_list_of_test_cases(data):
             )
     return output
 
+
 def read_data(args):
     data = pd.read_csv(os.path.join(args.filedir, args.filename))
     if "query" not in data.columns:
@@ -46,13 +47,21 @@ def read_data(args):
             raise ValueError("The query column is missing in the data")
     return data
 
+
 def grade_answers(args, test_case):
     from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 
     print("==============getting embeddings==============")
     embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model)
     print("==============initiating metric==============")
-    metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, model_name=args.model_name,embeddings=embeddings, use_vllm=args.use_vllm)
+    metric = RagasMetric(
+        threshold=0.5,
+        metrics=["answer_correctness"],
+        model=args.llm_endpoint,
+        model_name=args.model_name,
+        embeddings=embeddings,
+        use_vllm=args.use_vllm,
+    )
     print("==============start grading==============")
 
     if args.batch_grade:
@@ -66,7 +75,7 @@ def grade_answers(args, test_case):
             score = metric.score["answer_correctness"][0]
             print(score)
             scores.append(score)
-                
+
             print("-" * 50)
         return scores
 

diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh
@@ -11,4 +11,4 @@ python3 grade_answers.py \
 --filename $FILENAME \
 --llm_endpoint $LLM_ENDPOINT \
 --model_name $MODEL_NAME \
---use_vllm
+--use_vllm
diff --git a/evals/evaluation/agent_eval/vllm-gaudi/build_image.sh b/evals/evaluation/agent_eval/vllm-gaudi/build_image.sh
@@ -1,4 +1,7 @@
 
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 echo "Building the vllm docker image"
 cd $WORKDIR
 echo $WORKDIR

diff --git a/evals/evaluation/agent_eval/vllm-gaudi/launch_vllm_gaudi.sh b/evals/evaluation/agent_eval/vllm-gaudi/launch_vllm_gaudi.sh
@@ -1,3 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 model="meta-llama/Meta-Llama-3.1-70B-Instruct"
 vllm_port=8085
 vllm_volume=${HF_CACHE_DIR}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,4 +5,4 @@ We collected two benchmarks for evaluating agentic applications:

		These agent benchmarks are enabled on Intel Gaudi systems using vllm as the LLM serving framework. You can choose to serve the models on other hardware with vllm too.

		We will add more benchmarks for agents in the future. Stay tuned.
		We will add more benchmarks for agents in the future. Stay tuned.