Skip to content

Commit

Permalink
Merge branch 'tag-bench-dev' of https://github.com/minmin-intel/GenAI…
Browse files Browse the repository at this point in the history
…Eval into tag-bench-dev
  • Loading branch information
minmin-intel committed Feb 4, 2025
2 parents 2efa96b + b51697f commit 69a1dec
Show file tree
Hide file tree
Showing 20 changed files with 89 additions and 55 deletions.
2 changes: 1 addition & 1 deletion evals/evaluation/agent_eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ We collected two benchmarks for evaluating agentic applications:

These agent benchmarks are enabled on Intel Gaudi systems using vllm as the LLM serving framework. You can choose to serve the models on other hardware with vllm too.

We will add more benchmarks for agents in the future. Stay tuned.
We will add more benchmarks for agents in the future. Stay tuned.
4 changes: 2 additions & 2 deletions evals/evaluation/agent_eval/TAG-Bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ docker build --no-cache -t $agent_image --build-arg http_proxy=$http_proxy --bui
export GOOGLE_CSE_ID=<your-GOOGLE_CSE_ID>
export GOOGLE_API_KEY=<your-GOOGLE_API_KEY>
```
For intructions on how to obtain your `GOOGLE_CSE_ID` and `your-GOOGLE_API_KEY`, refer to instructions [here](https://python.langchain.com/docs/integrations/tools/google_search/).
For instructions on how to obtain your `GOOGLE_CSE_ID` and `GOOGLE_API_KEY`, refer to the instructions [here](https://python.langchain.com/docs/integrations/tools/google_search/).

5. Launch SQL agent
```bash
Expand Down Expand Up @@ -128,4 +128,4 @@ Human grading criteria:
|**Text2SQL (TAG-Bench paper)**|0.17||
|**Human performance (TAG-Bench paper)**|0.55||

We can see that our SQL agent achieved much higher accuracy than Text2SQL, although still lower than human experts.
We can see that our SQL agent achieved much higher accuracy than Text2SQL, although still lower than human experts.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export temperature=0.01
export max_new_tokens=4096

# Tools
export TOOLSET_PATH=${EVALDIR}/opea_sql_agent_llama/tools/
export TOOLSET_PATH=${EVALDIR}/opea_sql_agent_llama/tools/
ls ${TOOLSET_PATH}
# for using Google search API
export GOOGLE_CSE_ID=${GOOGLE_CSE_ID}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ services:
https_proxy: ${https_proxy}
port: 9095
GOOGLE_CSE_ID: ${GOOGLE_CSE_ID} #delete
GOOGLE_API_KEY: ${GOOGLE_API_KEY} # delete
GOOGLE_API_KEY: ${GOOGLE_API_KEY} # delete
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ def search_web(query: str) -> str:
)

response = tool.run(query)
return response
return response
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ search_web:
query:
type: str
description: query
return_output: retrieved_data
return_output: retrieved_data
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,4 @@ def generate_column_descriptions(db_name):
for db_name in subfolders:
print("Generating hints for database: ", db_name)
generate_column_descriptions(db_name)
print("="*30)
print("=" * 30)
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

DATAPATH=$WORKDIR/TAG-Bench/tag_queries.csv
OUTFOLDER=$WORKDIR/TAG-Bench/query_by_db
python3 split_data.py --path $DATAPATH --output $OUTFOLDER
python3 split_data.py --path $DATAPATH --output $OUTFOLDER
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import pandas as pd
import os
import glob
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import glob
import os

import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('--filedir', type=str, required=True, help='Directory containing the csv files')
parser.add_argument("--filedir", type=str, required=True, help="Directory containing the csv files")
args = parser.parse_args()

filedir = args.filedir
csv_files = glob.glob(os.path.join(filedir, '*_graded.csv'))
csv_files = glob.glob(os.path.join(filedir, "*_graded.csv"))
print("Number of score files found: ", len(csv_files))
print(csv_files)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@

def generate_answer_agent_api(url, prompt):
    """Send ``prompt`` to the agent endpoint at ``url`` and return its answer.

    Args:
        url: Endpoint that receives the POST request.
        prompt: Value sent as the ``messages`` field of the JSON payload.

    Returns:
        The ``text`` field of the JSON response body.

    Raises:
        requests.HTTPError: If the endpoint replies with a 4xx/5xx status.
    """
    # NOTE(review): the empty "http" entry presumably bypasses any configured
    # HTTP proxy for the agent endpoint — confirm against the deployment.
    proxies = {"http": ""}
    payload = {"messages": prompt, "stream": False}
    response = requests.post(url, json=payload, proxies=proxies)
    # Fail with the real HTTP error instead of an opaque JSON/KeyError when
    # the agent service returns an error page.
    response.raise_for_status()
    return response.json()["text"]
Expand Down Expand Up @@ -54,10 +51,10 @@ def save_json_lines(json_lines, args):
res = generate_answer_agent_api(url, query)
answers.append(res)
print("******Answer:\n", res)
json_lines.append({"query": query,"ref_answer":ref_answer, "answer": res})
json_lines.append({"query": query, "ref_answer": ref_answer, "answer": res})
save_json_lines(json_lines, args)
print("=" * 20)

df.rename(columns={"Answer": "ref_answer", "Query":"query"}, inplace=True)
df['answer'] = answers
df.to_csv(os.path.join(args.output_dir, args.output_file), index=False)
df.rename(columns={"Answer": "ref_answer", "Query": "query"}, inplace=True)
df["answer"] = answers
df.to_csv(os.path.join(args.output_dir, args.output_file), index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def make_list_of_test_cases(data):
)
return output


def read_data(args):
data = pd.read_csv(os.path.join(args.filedir, args.filename))
if "query" not in data.columns:
Expand All @@ -46,13 +47,21 @@ def read_data(args):
raise ValueError("The query column is missing in the data")
return data


def grade_answers(args, test_case):
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

print("==============getting embeddings==============")
embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model)
print("==============initiating metric==============")
metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, model_name=args.model_name,embeddings=embeddings, use_vllm=args.use_vllm)
metric = RagasMetric(
threshold=0.5,
metrics=["answer_correctness"],
model=args.llm_endpoint,
model_name=args.model_name,
embeddings=embeddings,
use_vllm=args.use_vllm,
)
print("==============start grading==============")

if args.batch_grade:
Expand All @@ -66,7 +75,7 @@ def grade_answers(args, test_case):
score = metric.score["answer_correctness"][0]
print(score)
scores.append(score)

print("-" * 50)
return scores

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/bin/bash
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# This script is used to run benchmarks on all databases

EVALDIR=$WORKDIR/GenAIEval/evals/evaluation/agent_eval/TAG-Bench
Expand Down Expand Up @@ -33,4 +36,4 @@ done
# combine all scores
python3 combine_scores.py --filedir $WORKDIR/sql_agent_output/

echo "All done!"
echo "All done!"
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ query_file=${WORKDIR}/TAG-Bench/query_by_db/query_${db_name}.csv
outdir=$WORKDIR/sql_agent_output
outfile=${db_name}_agent_test_result.csv
port=9096
python3 generate_answers.py --query_file $query_file --output_dir $outdir --output_file $outfile --db_name $db_name --port $port
python3 generate_answers.py --query_file $query_file --output_dir $outdir --output_file $outfile --db_name $db_name --port $port
48 changes: 25 additions & 23 deletions evals/evaluation/agent_eval/crag_eval/opea_rag_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@


def search_knowledge_base(query: str) -> str:
"""Search a knowledge base about music and singers for a given query. Returns text related to the query."""
"""Search a knowledge base about music and singers for a given query.
Returns text related to the query.
"""
url = os.environ.get("WORKER_AGENT_URL")
print(url)
proxies = {"http": ""}
Expand All @@ -20,120 +23,119 @@ def search_knowledge_base(query: str) -> str:
return response.json()["text"]


def search_artist_entity_by_name(artist_name: str) -> dict:
    """Search for music artists by name.

    Forwards ``artist_name`` to ``CRAG.music_search_artist_entity_by_name`` on
    a newly created client and returns the result as-is.
    """
    return CRAG().music_search_artist_entity_by_name(artist_name)


def search_song_entity_by_name(song_name: str) -> dict:
    """Search for songs by name.

    Forwards ``song_name`` to ``CRAG.music_search_song_entity_by_name`` on a
    newly created client and returns the result as-is.
    """
    return CRAG().music_search_song_entity_by_name(song_name)


def get_billboard_rank_date(rank: int, date: str = None) -> dict:
    """Get Billboard ranking for a specific rank and date.

    ``rank`` is coerced with ``int()`` before the lookup; ``date`` is passed
    through unchanged (``None`` when omitted).
    """
    client = CRAG()
    return client.music_get_billboard_rank_date(int(rank), date)


def get_billboard_attributes(date: str, attribute: str, song_name: str) -> dict:
    """Get attributes of a song from Billboard rankings.

    All three arguments are forwarded, in order, to
    ``CRAG.music_get_billboard_attributes``.
    """
    return CRAG().music_get_billboard_attributes(date, attribute, song_name)


def get_grammy_best_artist_by_year(year: int) -> dict:
    """Get the Grammy Best New Artist for a specific year.

    ``year`` is coerced with ``int()`` before the lookup.
    """
    client = CRAG()
    return client.music_grammy_get_best_artist_by_year(int(year))


def get_grammy_award_count_by_artist(artist_name: str) -> dict:
    """Get the total Grammy awards won by an artist.

    Forwards ``artist_name`` to ``CRAG.music_grammy_get_award_count_by_artist``
    and returns the result as-is.
    """
    return CRAG().music_grammy_get_award_count_by_artist(artist_name)


def get_grammy_award_count_by_song(song_name: str) -> dict:
    """Get the total Grammy awards won by a song.

    Forwards ``song_name`` to ``CRAG.music_grammy_get_award_count_by_song``
    and returns the result as-is.
    """
    return CRAG().music_grammy_get_award_count_by_song(song_name)


def get_grammy_best_song_by_year(year: int) -> dict:
    """Get the Grammy Song of the Year for a specific year.

    ``year`` is coerced with ``int()`` before the lookup.
    """
    client = CRAG()
    return client.music_grammy_get_best_song_by_year(int(year))



def get_grammy_award_date_by_artist(artist_name: str) -> dict:
    """Get the years an artist won a Grammy award.

    Forwards ``artist_name`` to ``CRAG.music_grammy_get_award_date_by_artist``
    and returns the result as-is.
    """
    return CRAG().music_grammy_get_award_date_by_artist(artist_name)


def get_grammy_best_album_by_year(year: int) -> dict:
    """Get the Grammy Album of the Year for a specific year.

    ``year`` is coerced with ``int()`` before the lookup.
    """
    client = CRAG()
    return client.music_grammy_get_best_album_by_year(int(year))


def get_all_awarded_artists() -> dict:
    """Get all artists awarded the Grammy Best New Artist.

    Takes no arguments; returns whatever
    ``CRAG.music_grammy_get_all_awarded_artists`` returns.
    """
    return CRAG().music_grammy_get_all_awarded_artists()


def get_artist_birth_place(artist_name: str) -> dict:
    """Get the birthplace of an artist.

    Forwards ``artist_name`` to ``CRAG.music_get_artist_birth_place`` and
    returns the result as-is.
    """
    return CRAG().music_get_artist_birth_place(artist_name)


def get_artist_birth_date(artist_name: str) -> dict:
    """Get the birth date of an artist.

    Forwards ``artist_name`` to ``CRAG.music_get_artist_birth_date`` and
    returns the result as-is.
    """
    return CRAG().music_get_artist_birth_date(artist_name)


def get_members(band_name: str) -> dict:
    """Get the member list of a band.

    Forwards ``band_name`` to ``CRAG.music_get_members`` and returns the
    result as-is.
    """
    return CRAG().music_get_members(band_name)


def get_lifespan(artist_name: str) -> dict:
    """Get the lifespan of an artist.

    Forwards ``artist_name`` to ``CRAG.music_get_lifespan`` and returns the
    result as-is.
    """
    return CRAG().music_get_lifespan(artist_name)


def get_song_author(song_name: str) -> dict:
    """Get the author of a song.

    Forwards ``song_name`` to ``CRAG.music_get_song_author`` and returns the
    result as-is.
    """
    return CRAG().music_get_song_author(song_name)


def get_song_release_country(song_name: str) -> dict:
    """Get the release country of a song.

    Forwards ``song_name`` to ``CRAG.music_get_song_release_country`` and
    returns the result as-is.
    """
    return CRAG().music_get_song_release_country(song_name)


def get_song_release_date(song_name: str) -> dict:
    """Get the release date of a song.

    Forwards ``song_name`` to ``CRAG.music_get_song_release_date`` and returns
    the result as-is.
    """
    return CRAG().music_get_song_release_date(song_name)


def get_artist_all_works(artist_name: str) -> dict:
    """Get all works by an artist.

    Forwards ``artist_name`` to ``CRAG.music_get_artist_all_works`` and
    returns the result as-is.
    """
    return CRAG().music_get_artist_all_works(artist_name)
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def make_list_of_test_cases(data):
)
return output


def read_data(args):
data = pd.read_csv(os.path.join(args.filedir, args.filename))
if "query" not in data.columns:
Expand All @@ -46,13 +47,21 @@ def read_data(args):
raise ValueError("The query column is missing in the data")
return data


def grade_answers(args, test_case):
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

print("==============getting embeddings==============")
embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model)
print("==============initiating metric==============")
metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, model_name=args.model_name,embeddings=embeddings, use_vllm=args.use_vllm)
metric = RagasMetric(
threshold=0.5,
metrics=["answer_correctness"],
model=args.llm_endpoint,
model_name=args.model_name,
embeddings=embeddings,
use_vllm=args.use_vllm,
)
print("==============start grading==============")

if args.batch_grade:
Expand All @@ -66,7 +75,7 @@ def grade_answers(args, test_case):
score = metric.score["answer_correctness"][0]
print(score)
scores.append(score)

print("-" * 50)
return scores

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ python3 grade_answers.py \
--filename $FILENAME \
--llm_endpoint $LLM_ENDPOINT \
--model_name $MODEL_NAME \
--use_vllm
--use_vllm
3 changes: 3 additions & 0 deletions evals/evaluation/agent_eval/vllm-gaudi/build_image.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@

# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

echo "Building the vllm docker image"
cd $WORKDIR
echo $WORKDIR
Expand Down
3 changes: 3 additions & 0 deletions evals/evaluation/agent_eval/vllm-gaudi/launch_vllm_gaudi.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

model="meta-llama/Meta-Llama-3.1-70B-Instruct"
vllm_port=8085
vllm_volume=${HF_CACHE_DIR}
Expand Down
Loading

0 comments on commit 69a1dec

Please sign in to comment.