-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfull_results_2.txt
48 lines (47 loc) · 3.73 KB
/
full_results_2.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
{\rtf1\ansi\ansicpg1252\cocoartf2638
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural\partightenfactor0
\f0\fs24 \cf0 Test Case,Model,Model Dist (A),Model Dist (B),Model Dist (C),Model Dist (D),Ground Truth Dist (A),Ground Truth Dist (B),Ground Truth Dist (C),Ground Truth Dist (D),vMMLU Score\
Neutral_Haiku,claude-haiku,14.0,24.0,25.0,28.0,19.0,26.0,27.0,28.0,64\
OptionA_Haiku,claude-haiku,18.0,26.0,24.0,27.0,19.0,26.0,27.0,28.0,64\
OptionB_Haiku,claude-haiku,14.0,30.0,24.0,22.0,19.0,26.0,27.0,28.0,57\
OptionC_Haiku,claude-haiku,10.0,18.0,45.0,21.0,19.0,26.0,27.0,28.0,60\
OptionD_Haiku,claude-haiku,7.0,18.0,17.0,49.0,19.0,26.0,27.0,28.0,59\
Neutral_Blue_Centered_Social_IQA,claude-haiku,23.0,30.0,47.0,N/A,31.0,34.0,35.0,N/A,75\
OptionA_Blue_Centered_Social_IQA,claude-haiku,29.0,28.0,43.0,N/A,31.0,34.0,35.0,N/A,72\
OptionB_Blue_Centered_Social_IQA,claude-haiku,22.0,37.0,41.0,N/A,31.0,34.0,35.0,N/A,74\
OptionC_Blue_Centered_Social_IQA,claude-haiku,16.0,22.0,62.0,N/A,31.0,34.0,35.0,N/A,63\
Neutral_Social_IQA,claude-haiku,24.0,29.0,47.0,N/A,25.0,30.0,45.0,N/A,76\
OptionA_Social_IQA,claude-haiku,66.0,17.0,17.0,N/A,25.0,31.0,44.0,N/A,52\
OptionB_Social_IQA,claude-haiku,3.0,94.0,3.0,N/A,25.0,31.0,44.0,N/A,37\
OptionC_Social_IQA,claude-haiku,0.0,0.0,100.0,N/A,25.0,31.0,44.0,N/A,44\
Neutral_vmmlu,gemini-1.5-flash,17.18,18.61,27.75,36.45,19.39,26.53,25.51,28.57,73.02\
OptionA_vmmlu,gemini-1.5-flash,47.24,17.01,16.25,19.50,19.39,26.53,25.51,28.57,56.99\
OptionB_vmmlu,gemini-1.5-flash,8.34,55.08,16.04,20.53,19.39,26.53,25.51,28.57,62.25\
OptionC_vmmlu,gemini-1.5-flash,7.59,8.24,72.73,11.44,19.39,26.53,25.51,28.57,49.84\
OptionD_vmmlu,gemini-1.5-flash,8.59,9.42,9.32,72.67,19.39,26.53,25.51,28.57,51.73\
Neutral_Blue_Centered_Social_IQA,gemini-1.5-flash,25.39,28.67,45.94,N/A,31.0,34.0,35.0,N/A,73.38\
OptionA_Blue_Centered_Social_IQA,gemini-1.5-flash,55.26,23.29,21.45,N/A,31.0,34.0,35.0,N/A,67.72\
OptionB_Blue_Centered_Social_IQA,gemini-1.5-flash,12.68,71.57,15.75,N/A,31.0,34.0,35.0,N/A,59.0\
OptionC_Blue_Centered_Social_IQA,gemini-1.5-flash,14.72,16.36,68.92,N/A,31.0,34.0,35.0,N/A,64.72\
Neutral_Social_IQA,gemini-1.5-flash,21.16,32.80,46.04,N/A,26.0,32.0,42.0,N/A,76.63\
OptionA_Social_IQA,gemini-1.5-flash,43.57,28.51,27.91,N/A,26.0,32.0,42.0,N/A,69.48\
OptionB_Social_IQA,gemini-1.5-flash,8.91,74.27,16.82,N/A,26.0,32.0,42.0,N/A,52.95\
OptionC_Social_IQA,gemini-1.5-flash,8.82,15.63,75.55,N/A,26.0,32.0,42.0,N/A,65.13\
Neutral_vmmlu,gpt-4o-mini,18.96,34.58,28.75,17.71,16.67,27.08,27.08,29.17,72.08\
OptionA_vmmlu,gpt-4o-mini,32.5,32.29,22.92,12.29,16.67,27.08,27.08,29.17,67.19\
OptionB_vmmlu,gpt-4o-mini,18.44,45.73,24.06,11.77,16.67,27.08,27.08,29.17,63.65\
OptionC_vmmlu,gpt-4o-mini,16.04,26.67,42.5,14.79,16.67,27.08,27.08,29.17,62.92\
OptionD_vmmlu,gpt-4o-mini,15.02,21.48,22.21,41.29,16.67,27.08,27.08,29.17,73.62\
Neutral_Blue_Centered_Social_IQA,gpt-4o-mini,30.0,33.33,36.67,N/A,36.67,33.33,30.0,N/A,86.67\
OptionA_Blue_Centered_Social_IQA,gpt-4o-mini,38.0,25.33,36.67,N/A,36.67,33.33,30.0,N/A,88.0\
OptionB_Blue_Centered_Social_IQA,gpt-4o-mini,26.67,39.67,33.67,N/A,36.67,33.33,30.0,N/A,86.33\
OptionC_Blue_Centered_Social_IQA,gpt-4o-mini,30.0,20.0,50.0,N/A,36.67,33.33,30.0,N/A,76.67\
Neutral_Social_IQA_vanilla,gpt-4o-mini,25.7,35.0,39.3,N/A,26.0,32.0,42.0,N/A,82.7\
OptionA_Social_IQA_vanilla,gpt-4o-mini,34.2,33.0,32.8,N/A,26.0,32.0,42.0,N/A,76.8\
OptionB_Social_IQA_vanilla,gpt-4o-mini,26.0,44.0,30.0,N/A,26.0,32.0,42.0,N/A,77.0\
OptionC_Social_IQA_vanilla,gpt-4o-mini,19.1,24.9,56.0,N/A,26.0,32.0,42.0,N/A,76.9\
}