-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathE1_evaluator_qna_eval_pool.py
111 lines (98 loc) · 2.78 KB
/
E1_evaluator_qna_eval_pool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Create the Q&A eval pool
import pandas as pd
from tqdm import tqdm
from utils import db_conn
import os, sys
from dotenv import load_dotenv
# Load settings from config.env. A falsy return from load_dotenv() is treated
# as "configuration unavailable" and aborts the script.
if not load_dotenv("config.env"):
    # Report on stderr and exit non-zero so callers (shell scripts, CI) can
    # detect the failure; a bare sys.exit() would exit with status 0.
    print("ERROR: Missing config file.", file=sys.stderr)
    sys.exit(1)

# When truthy, the script prints every sampled question and per-topic
# DataFrame details while building the pool.
DEBUG = False
def qna_pool(conn, title='', order_by='RANDOM()', num=100):
    """
    Fetch Q&A rows from the `qna_pool_hf` table as a list of dicts.

    Args:
        conn: open database connection; only `conn.cursor()` is used.
        title: pattern matched against `doc_title` with the `~` operator
            (a regular-expression match in PostgreSQL), so regex
            metacharacters in it are interpreted, not matched literally.
            When title='' it will select from all topics.
        order_by: SQL ORDER BY expression. It cannot be sent as a bound
            parameter, so it is placed into the statement verbatim and must
            only ever come from trusted code, never from user input.
        num: maximum number of rows to return; coerced with int().

    Returns:
        A list with one dict per row, with the keys ID, Question, Answer,
        doc_source, doc_title, doc_page, ocp_version, human_validated and
        model_validated. Text fields are stripped of surrounding whitespace;
        a NULL text field is returned as None. The two validation flags are
        converted with bool().

    Raises:
        TypeError, ValueError: if `num` cannot be converted to an int.
    """
    def _clean(text):
        # NULL columns arrive as None; pass them through instead of crashing.
        return text.strip() if text is not None else None

    limit = int(num)
    # Literal '%' must be doubled because the statement goes through the
    # driver's parameter substitution below.
    order_clause = order_by.replace('%', '%%')

    # NOTE(review): assumes the driver behind `conn` uses the `%s`
    # (format/pyformat) paramstyle, as psycopg2 does - confirm against
    # utils.db_conn. Binding `title` and `num` keeps quotes in a title from
    # breaking (or injecting into) the statement.
    query = f"""
        SELECT
            id,
            question,
            answer,
            doc_source,
            doc_title,
            doc_page_num,
            ocp_version,
            h_validated,
            m_validated
        FROM
            qna_pool_hf
        WHERE
            doc_title ~ %s
        ORDER BY
            {order_clause}
        LIMIT
            %s
        ;
    """

    cur = conn.cursor()
    try:
        cur.execute(query, (title, limit))
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    return [
        {
            "ID": row_id,
            "Question": _clean(q),
            "Answer": _clean(a),
            "doc_source": _clean(d_src),
            "doc_title": _clean(d_title),
            "doc_page": d_page,
            "ocp_version": _clean(ocp_ver),
            "human_validated": bool(h_valid),
            "model_validated": bool(m_valid),
        }
        for row_id, q, a, d_src, d_title, d_page, ocp_ver, h_valid, m_valid
        in tqdm(rows)
    ]
def main():
    """
    Build the Q&A evaluation pool and write it to a parquet file.

    For every distinct `doc_title` in `qna_pool_hf`, draw up to 30 random
    Q&A pairs with `qna_pool`, combine them into one DataFrame indexed by
    `ID`, and write it to the path given by the QNA_EVAL_POOL environment
    variable (default: "unk_qna_eval_pool.parquet").

    Exits with an error message if no Q&A pairs were found at all.
    """
    conn = db_conn.conn()
    try:
        # fetch unique titles
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT DISTINCT
                    doc_title
                FROM
                    qna_pool_hf
                ;
            """)
            # A NULL title can be neither stripped nor matched, so skip it.
            titles = [
                row[0].strip() for row in cur.fetchall() if row[0] is not None
            ]
        finally:
            cur.close()

        # print title
        print(f"Topics: {titles}")

        frames = []
        for t in tqdm(titles):
            print(f"""\n{'#'*40}""")
            print(f"Processing: {t}")
            # select 30 random Q&A pairs per topic
            content = qna_pool(conn, title=t, order_by='RANDOM()', num=30)
            if not content:
                # An empty result yields a DataFrame without an 'ID' column,
                # and set_index('ID') would raise KeyError.
                continue
            if DEBUG:
                for entry in content:
                    print(f"{entry['Question']}")
            df = pd.DataFrame(content).set_index('ID')
            if DEBUG:
                print(f"""{df.head()}\n{df.dtypes}""")
            frames.append(df)  # append to the list of dataframes

        if not frames:
            # pd.concat() raises ValueError on an empty list; fail with a
            # clear message and a non-zero exit status instead.
            sys.exit("ERROR: No Q&A pairs found; nothing to write.")

        df_full = pd.concat(frames)
        # qna_pool matches the title as a pattern, so a row whose title
        # contains another topic's title can be drawn for both topics.
        # Keep each ID only once.
        df_full = df_full[~df_full.index.duplicated(keep='first')]
        df_full = df_full.sort_index(ascending=True)
        print(df_full.head(), df_full.shape)

        out_path = os.environ.get("QNA_EVAL_POOL", "unk_qna_eval_pool.parquet")
        print(f"Writing {out_path}...")
        df_full.to_parquet(out_path)
    finally:
        # Close DB connection, even when a query or the write fails.
        conn.close()


if __name__ == '__main__':
    main()
####################################################################################
# END OF FILE
####################################################################################