-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmake_table_of_results.py
43 lines (37 loc) · 1.93 KB
/
make_table_of_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import argparse
import json
from pathlib import Path
import pandas as pd
# Folder scanned for results; files are looked up as <RESULTS_DIR>/<sub-directory>/<name>.json.
RESULTS_DIR = Path("outputs")
# Columns used to order the per-run table and shown as its leading columns.
RUN_KEY_COLUMNS = ["cfg_model_name", "cfg_dataset", "cfg_data_idx", "cfg_discrete_optimizer"]
# Columns the summary table is grouped by.
SUMMARY_GROUP_COLUMNS = ["cfg_model_name", "cfg_dataset", "cfg_discrete_optimizer"]


def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv[1:]``.

    Returns:
        The parsed namespace. ``experiment_name`` is a list of strings, or ``None``
        when the flag is not given (meaning "include every experiment").
    """
    parser = argparse.ArgumentParser(description="Create a table from JSON files")
    parser.add_argument("--experiment_name", default=None, type=str, nargs="+", help="list of experiments to include.")
    return parser.parse_args(argv)


def load_results(folder_path, experiment_names=None):
    """Load every ``*/*.json`` file under *folder_path*, optionally filtered by experiment.

    Args:
        folder_path: Directory whose immediate sub-directories contain the JSON files.
        experiment_names: Iterable of experiment names to keep, or ``None`` to keep all.

    Returns:
        A ``(records, seen_names)`` tuple: the parsed JSON objects that passed the
        filter, and the set of all ``cfg_experiment_name`` values encountered
        (including those that were filtered out).

    Raises:
        KeyError: If a JSON file has no ``cfg_experiment_name`` entry.
    """
    wanted = None if experiment_names is None else set(experiment_names)
    records = []
    seen_names = set()
    for json_file in Path(folder_path).glob("*/*.json"):
        with open(json_file, "r", encoding="utf-8") as f:
            content = json.load(f)
        name = content["cfg_experiment_name"]
        # Track every name, even filtered-out ones, so the user can see what is available.
        seen_names.add(name)
        if wanted is None or name in wanted:
            records.append(content)
    return records, seen_names


def build_results_table(records):
    """Build the per-run DataFrame (one column per JSON key) with derived columns.

    Adds ``ratio`` (``target_length / num_free_tokens``) and ``memorized``
    (``ratio > 1``) and sorts the rows by ``RUN_KEY_COLUMNS``.

    Args:
        records: List of dicts as returned by ``load_results``.

    Returns:
        The sorted DataFrame, or an empty DataFrame when *records* is empty
        (sorting a column-less frame would otherwise raise ``KeyError``).
    """
    df = pd.DataFrame(records)
    if df.empty:
        return df
    df = df.sort_values(by=RUN_KEY_COLUMNS)
    df["ratio"] = df["target_length"] / df["num_free_tokens"]
    df["memorized"] = df["ratio"] > 1
    return df


def summarize_results(df):
    """Aggregate the per-run table per model / dataset / discrete optimizer.

    Args:
        df: DataFrame produced by ``build_results_table``.

    Returns:
        A DataFrame indexed by ``SUMMARY_GROUP_COLUMNS`` holding the mean ``ratio``,
        the mean of ``memorized`` (i.e. the fraction of memorized runs) and, in the
        ``success`` column, the number of non-null ``success`` entries (a count,
        not a rate), all rounded to two decimals.
    """
    aggregations = {"ratio": "mean", "memorized": "mean", "success": "count"}
    return df.groupby(SUMMARY_GROUP_COLUMNS).agg(aggregations).round(2)


def main(argv=None, folder_path=RESULTS_DIR):
    """Print the per-run table and the grouped summary as markdown.

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.
        folder_path: Directory to scan for result files.

    Raises:
        SystemExit: With an explanatory message when no result matches, instead of
            failing later with an opaque ``KeyError`` from pandas.
    """
    args = parse_args(argv)
    records, seen_names = load_results(folder_path, args.experiment_name)
    print(f"The set of experiment names in the folder is: {seen_names}")
    if not records:
        raise SystemExit(
            f"No results found under '{folder_path}' for experiment_name={args.experiment_name}."
        )

    df = build_results_table(records)
    print(df[RUN_KEY_COLUMNS + ["ratio", "memorized", "success"]].round(2).to_markdown())
    # Summary of the average ratio and memorized fraction per group, plus run counts.
    summary = summarize_results(df)
    print(summary.to_markdown())
    print(f"dataframe shape: {df.shape}")


if __name__ == "__main__":
    main()