Merge pull request #64 from yeonir/yeonirhee/sotopia-task
Sotopia Task Submission
XuhuiZhou authored May 1, 2024
2 parents e140a93 + aa9be2a commit e2fad4e
Showing 4 changed files with 16 additions and 253 deletions.
45 changes: 6 additions & 39 deletions sotopia_space/benchmark.py
@@ -1,61 +1,29 @@
import gradio as gr # type: ignore
import pandas as pd
from sotopia_space.constants import MODEL_OPTIONS
from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty
from sotopia_space.utils import post_processing

LP_MODE = "v2"
original_df, ablation_df = None, None
LP_original_dfs = {}
DEFAULT_LP = 0.5

available_models = [] # to be filled in later
original_df, ablation_df = None, None

def slider_change_main(length_penalty):
global original_df, ablation_df, LP_MODE
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
# adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
# adjusted_df = adjusted_df.drop(columns=["Length"])
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
return adjusted_df

def slider_change_full(length_penalty, show_winrate):
global original_df, ablation_df, LP_MODE
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
# sort the model by the "Task-Avg Elo" column
adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
if show_winrate == "none":
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
return adjusted_df
elif show_winrate == "gpt-3.5":
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
elif show_winrate == "gpt-4":
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
return adjusted_df
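The two `slider_change_*` helpers above are Gradio event callbacks: a length-penalty slider's `.change` event re-scores the leaderboard frame and writes it back into the table component (the wiring itself only survives as the commented-out `length_penlty_slider.change(...)` line near the end of `benchmark_table`). A minimal, self-contained sketch of that slider-to-dataframe pattern, using toy scores in place of the real Elo frames and `apply_length_penalty`:

```python
# Generic slider -> dataframe refresh pattern (toy data; the real callbacks
# ran apply_length_penalty over the Elo frames instead of this stand-in).
import gradio as gr
import pandas as pd

scores = pd.DataFrame({"Model": ["model-a", "model-b"], "Score": [7.5, 6.9]})

def on_length_penalty_change(length_penalty: float) -> pd.DataFrame:
    adjusted = scores.copy()
    adjusted["Score"] = adjusted["Score"] - length_penalty  # stand-in adjustment
    adjusted = adjusted.sort_values(by="Score", ascending=False)
    adjusted.insert(0, "Rank", range(1, 1 + len(adjusted)))
    return adjusted

with gr.Blocks() as demo:
    slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Length Penalty")
    table = gr.Dataframe(value=scores, interactive=False)
    slider.change(fn=on_length_penalty_change, inputs=[slider], outputs=[table])

if __name__ == "__main__":
    demo.launch()
```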

def benchmark_table():
global original_df, ablation_df
global LP_original_dfs, LP_MODE

gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")

with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
# original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
default_main_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
default_main_df = post_processing(default_main_df, None)
# add a Rank column as the first column (starting from 1)
default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))

with gr.Row():
with gr.Column(scale=4):
gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
with gr.Column(scale=1):
length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
# checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
gr.Markdown("<h3>**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
TYPES = ["number", "markdown", "number"]
leaderboard_table = gr.components.Dataframe(
value=default_main_df,
@@ -66,5 +34,4 @@ def benchmark_table():
interactive=False,
visible=True,
min_width=60,
)
#length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
)
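The reworked `benchmark_table` builds the leaderboard directly from `data_dir/models_vs_gpt35.jsonl`: sort by the `GOAL [0, 10]` score, run `post_processing`, prepend a `Rank` column, and display the frame in a read-only `gr.components.Dataframe`. A minimal sketch of that flow as a standalone app; the `gr.Blocks`/`launch()` scaffolding is illustrative and not part of this commit:

```python
# Sketch of the leaderboard construction above as a standalone Gradio app.
# Assumes the data file and the post_processing helper from this repo exist.
import gradio as gr
import pandas as pd
from sotopia_space.utils import post_processing

def build_leaderboard(path: str = "data_dir/models_vs_gpt35.jsonl") -> pd.DataFrame:
    df = pd.read_json(path, lines=True)                  # one JSON record per line
    df = df.sort_values(by="GOAL [0, 10]", ascending=False)
    df = post_processing(df, None)                       # clickable names, rounded scores
    df.insert(0, "Rank", range(1, 1 + len(df)))          # rank starts at 1
    return df

with gr.Blocks() as demo:
    gr.Markdown("**Vs GPT-3.5**: the interlocutors are compared against GPT-3.5, the baseline model.")
    gr.components.Dataframe(value=build_leaderboard(), interactive=False, min_width=60)

if __name__ == "__main__":
    demo.launch()
```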
2 changes: 0 additions & 2 deletions sotopia_space/chat.py
@@ -91,8 +91,6 @@ def chat_introduction():
🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
"""
)
# with gr.Column(scale=1):
# toggle_dark = gr.Button(value="Toggle Dark")

def create_user_agent_dropdown(environment_id):
_, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
26 changes: 4 additions & 22 deletions sotopia_space/constants.py
@@ -14,26 +14,8 @@
]

MODEL_INFO = {
"Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
"Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
"Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
"Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
"Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
"Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
"Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
"Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
"Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
"gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
"gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
"gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
"gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
"tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
"vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
"zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
"mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
"claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
"claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
"zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
"Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
"dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
"GPT-4": {"pretty_name": "GPT-4", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
"GPT-3.5": {"pretty_name": "GPT-3.5", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
"Llama-2": {"pretty_name": "Llama-2", "hf_model_id": "https://llama.meta.com/llama2/"},
"MPT": {"pretty_name": "MPT", "hf_model_id": "https://huggingface.co/docs/transformers/main/en/model_doc/mpt"}
}
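The trimmed `MODEL_INFO` now maps a coarse model key to a display name plus a link target, which is either a Hugging Face repo id or a plain URL. The `make_clickable_model` helper in `utils.py` consumes this table, but its body is mostly collapsed in this diff, so the link-building below is an illustrative guess rather than the repository's exact logic:

```python
# Illustrative reconstruction only: how MODEL_INFO might be rendered as a
# markdown link for the leaderboard (the real make_clickable_model body is
# largely hidden by the collapsed diff context).
from sotopia_space.constants import MODEL_INFO

def clickable_model_sketch(model_name: str) -> str:
    info = MODEL_INFO.get(model_name)
    if info is None:
        return model_name                      # unknown models pass through unchanged
    target = info["hf_model_id"]
    url = target if target.startswith("http") else f"https://huggingface.co/{target}"
    return f"[{info['pretty_name']}]({url})"

print(clickable_model_sketch("Llama-2"))           # [Llama-2](https://llama.meta.com/llama2/)
print(clickable_model_sketch("some-other-model"))  # some-other-model
```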
196 changes: 6 additions & 190 deletions sotopia_space/utils.py
@@ -1,17 +1,6 @@
from datasets import load_dataset, Dataset
import os
import json
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar # type: ignore
from ui_constants import column_names, all_task_types
import random
disable_progress_bar()
import math
from ui_constants import column_names
from sotopia_space.constants import MODEL_INFO

id_to_data = None
model_len_info = None


def make_clickable_model(model_name):
global MODEL_INFO
@@ -25,199 +14,26 @@ def make_clickable_model(model_name):
else:
return model_name


def styled_error(error):
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

def styled_warning(warn):
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"

def styled_message(message):
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def estimated_win_rate(elo_a, elo_b, LP=0):
"""
Calculate the estimated win rate for player A against player B using their Elo ratings.
:param elo_a: Elo rating of player A
:param elo_b: Elo rating of player B
:return: Estimated win rate for player A
"""
exponent = (elo_b - elo_a)*(10**LP) / 400
probability_a_wins = 1 / (1 + 10 ** exponent)
return (1-probability_a_wins)*100
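A quick numerical check of the formula: with equal ratings the exponent is 0 and the result is 50%, while a 400-point gap at `LP=0` gives roughly a 91%/9% split. Note that the returned value is `100 * (1 - P(A wins))`, i.e. the second player's estimated win rate, which is how `add_winrates` below uses it (the first argument is the reference model's Elo). The snippet re-derives the same arithmetic with a local copy of the function so it runs on its own:

```python
# Standalone copy of the Elo win-rate arithmetic above, for a quick check.
def _estimated_win_rate(elo_a, elo_b, LP=0):
    exponent = (elo_b - elo_a) * (10 ** LP) / 400
    probability_a_wins = 1 / (1 + 10 ** exponent)
    return (1 - probability_a_wins) * 100      # percentage for the *second* player

print(_estimated_win_rate(1000, 1000))         # 50.0  -- equal ratings
print(_estimated_win_rate(1400, 1000))         # ~9.1  -- B trails A by 400 Elo
print(_estimated_win_rate(1000, 1400))         # ~90.9 -- B leads A by 400 Elo
```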



# Formats the columns
def formatter(x):
if type(x) is str:
x = x
else:
x = round(x, 1)
x = round(x, 2)
return x


def add_winrates(current_df, LP=0):
df = current_df.copy()
elo_column = "Task-Avg Elo"

# Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]

# Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125"
model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]


# Calculate the win rate of "gpt-4-0125-preview" against all models
df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
# apply the formatter for the two new columns
cols = list(df.columns)
cols.remove("# battles"); cols.append("# battles")
cols.remove("Length"); cols.append("Length")
df = df[cols]
return df

def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
new_df = current_df.copy()
for t in all_task_types:
column = column_names[t]
model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
return new_df


def post_processing(df, model_len_info):
if model_len_info:
df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])
df["Length"] = df["model_name"].apply(lambda x: model_len_info[x]["avg_len"])

for col in df.columns:
if col == "model name ":
if col == "model_name":
df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
else:
df[col] = df[col].apply(formatter) # For numerical values
df.rename(columns=column_names, inplace=True)
df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
df.sort_values(by="GOAL [0, 10]", inplace=True, ascending=False)
# put the "Overall Elo" and "Task-Avg Elo" column to the front
# add the length info
df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
df = df[["model_name", "GOAL [0, 10]"] + [col for col in df.columns if col not in ["model_name", "GOAL [0, 10]"]]]
return df

def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
"""
Temporarily disable the length penalty feature
if mode == 'v2' and LP_original_dfs is not None:
L = f"{length_penalty:.1f}"
return LP_original_dfs[L]
original_df = original_df.copy()
ablation_df = ablation_df.copy()
# replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
# except for the "Model" column and the "# battles" column
# do not assume the order of the rows are the same in both dataframes
for i, row in original_df.iterrows():
for col in original_df.columns:
if col == "Model" or col == "# battles" or col == "Length":
continue
# assert that the model names are the same in both dataframes
assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
original_df[col] = original_df[col].astype(float)
if mode == "v1":
original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
elif mode == "v1.1":
diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
# post_processing
original_df = post_processing(original_df, model_len_info=None)
"""
return original_df
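The docstring above keeps the disabled logic for reference: in `v1` each score becomes `x - y * length_penalty` (where `y` is the matching score from `ablation_df`), and `v1.1` blends the raw score with the score-minus-ablation difference. A scalar illustration of both modes with toy numbers, not repository data; note that algebraically the two reduce to the same expression, `x - y * LP`:

```python
# Scalar illustration of the (currently disabled) length-penalty modes.
def lp_v1(x: float, y: float, LP: float) -> float:
    return x - y * LP                          # subtract a fraction of the ablation score

def lp_v1_1(x: float, y: float, LP: float) -> float:
    diff = x - y
    return x * (1 - LP) + diff * LP            # blend score with (score - ablation)

print(lp_v1(7.5, 2.0, 0.5))    # 6.5
print(lp_v1_1(7.5, 2.0, 0.5))  # 6.5 -- expands to the same x - y*LP
```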

def load_benchdata():
print("Loading sotopia data...")
bench_data = load_dataset("cmu-lti/sotopia", split="test")
return bench_data

def load_benchdata_dict():
print("Loading sotopia data....")
bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
id_to_data = {}
for item in bench_data:
id_to_data[item["session_id"]] = item
return id_to_data

def load_eval_results():
print("Loading sotopia Evaluation data...")
eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
return eval_results

def load_infer_results(model_name):
print(f"Loading sotopia Results for {model_name}...")
infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
return infer_results

def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
global id_to_data
eval_results = list(eval_results)
random.shuffle(eval_results)
for eval_item in eval_results:
# print(json.dumps(eval_item, indent=2))
# print(f"## Session ID: {eval_item['session_id']}")
# eval_item["eval_id"]
assignment = eval_item['assignment']
model_1, model_2 = eval_item['model_1'], eval_item['model_2']
model_A = model_1 if assignment['A'] == model_1 else model_2
model_B = model_2 if assignment['B'] == model_2 else model_1
if len(model_list) >= 2:
if model_A not in model_list or model_B not in model_list:
continue
elif len(model_list) == 1:
if model_A != model_list[0] and model_B != model_list[0]:
continue
else:
pass
if tag_list:
if set(tag_list).isdisjoint(set(eval_item['tags'])):
continue
winner = eval_item['winner']
# print(f"## Model A: {model_A} | Model B: {model_B} | Winner: {winner}")
task_type = eval_item['tags'][0] # primary task type
chat_history = eval_item['history']
last_query = eval_item['last_query']
# print(f"## Task Type: {task_type}")
# print(f"## Chat History: {chat_history}")
# print(f"## Last Query --> USER: {last_query}")

model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']

if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
continue

conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]
# print(f"\n\n\n## Model A ({model_A}) Output ##\n{model_A_output}")
# print(f"\n\n\n## Model B ({model_B}) Output ##\n{model_B_output}")

# print(f"\n\n\n## Winner ##\n{winner}")
# print(f"\n\n\n## GPT-4 Judgement ##\n{eval_item['parsed_result']}")

result_dict = {
"session_id": eval_item['session_id'],
"model_A": model_A,
"model_B": model_B,
"winner": winner,
"intent": id_to_data[eval_item['session_id']]["intent"],
"task_type": task_type,
"all_tags": eval_item['tags'],
"chat_history": chat_history,
"last_query": last_query,
"conversation_input": conversation_input,
"model_A_output": model_A_output,
"model_B_output": model_B_output,
"reason": eval_item['parsed_result']["reason"],
"choice": eval_item['parsed_result']["choice"],
"checklist": id_to_data[eval_item['session_id']]["checklist"],
}
break
return result_dict

#id_to_data = load_benchdata_dict()
