-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.py
284 lines (230 loc) · 9.95 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import json
from numpy.lib.twodim_base import fliplr
import requests
from tqdm import tqdm
import jieba
import jsonlines
# http://shuyantech.com/api/cndbpedia/ment2ent?q= #! 根据单词返回实体列表
# http://shuyantech.com/api/cndbpedia/avpair?q= #! 输入实体名返回三元知识(word)
# http://shuyantech.com/api/cnprobase/ment2ent?q= #! 输入单词返回实体列表
# http://shuyantech.com/api/cnprobase/concept?q= #! 输入实体返回实体概念列表
# http://shuyantech.com/api/entitylinking/cutsegment?q= #!分词筛选出现的概念列表
# 输入单词或短语,返回实体列表列表(取代http://shuyantech.com/api/entitylinking/cutsegment?q=的更精确功能)
def word2entityL(w):
url = "http://shuyantech.com/api/cndbpedia/ment2ent?q=" + \
w + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
e_list = r.json()["ret"]
if len(e_list):
return e_list
return []
def entity2desc(e):
# ensure the entity can be find in KG
url = "http://shuyantech.com/api/cndbpedia/avpair?q=" + \
e + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
rw_list = r.json()["ret"]
for rw in rw_list:
if rw[0] == "DESC":
return rw[1]
# 没有的话.需要统计比例,并用另一种内容替代
return "无"
# filter 字词,语言,名词
stopConcept = ["字词", "词语"]
def get_hint(e):
url = "http://shuyantech.com/api/cnprobase/concept?q=" + \
e + "&apikey=6474c03336e13f7d130ab3eadff84b8"
r = requests.get(url)
cnt = 0
while (r.status_code != 200 and cnt < 5):
r = requests.get(url)
cnt += 1
e_list = r.json()["ret"]
e_return = []
if len(e_list):
for x in e_list:
if x[0] not in stopConcept:
e_return.append(x[0])
return e_return
return []
def get_hint_complex(ce) :
hl = get_hint(ce)
if len(hl) > 0:
return hl
else:
return ["无"]
n = 500
with open("valid.jsonl", "r", encoding= 'utf-8') as f1,\
open("valid_hint_why.json", "w",encoding= 'utf-8') as f2:
for i in tqdm(range(n)):
line = f1.readline()
d_line = json.loads(line) # json按照dict形式存储
# 对于choice,需要对于每一个list获得entity
choices = d_line["choice"] # choice列表
label = d_line["label"] # lab
hint = d_line["hint"]
entity_final = [] # choice对应的entity列表
desc_final = []
hc_final = []
in_hint = []
# get hint concept,会有一个或者多个,但是如果hint直接能够搜索到concept,视为hint已经被选择出
flag_hint = -1 # 选定hint的角标
hint_concepts = [] # 最终根据hint_flag 选择元素
hint_concept = get_hint(hint)
# 已经可以确定
if len(hint_concept) > 0:
hint_concepts.append(hint_concept)
flag_hint = 0
# 不可以确定,需要分词后检索
else:
f = 0
tmp = []
hw_l = jieba.lcut_for_search(hint)
he_l = []
for hw in hw_l:
# 如果所有都加入的话,那么在check的时候需要选择出最合适的hint,方法是什么呢?[如果得分根据某两个最高,那么直接认为是这样的][如果没有利用到这一条规则的choice,则选择label]
# 一维列表
he_l += word2entityL(hw)
for he in he_l:
if len(get_hint(he)) > 0:
tmp.append(get_hint(he))
f = 1
if f == 0:
hint_concepts.append(["无"])
else:
hint_concepts = tmp
entityconcept_final = []
for c_idx, c in tqdm(enumerate(choices)):
# get entities for each choice
flag_entity = 0 # 选择哪一个实体作为结果
flag_in = -1 # 判断是否属于hint
c_entities = []
tmp = word2entityL(c)
if len(tmp) > 0:
c_entities = tmp
else:
# cut word for null entites try to get extra entities
cw_l = jieba.lcut_for_search(c)
for cw in cw_l:
c_entities += word2entityL(cw)
if len(c_entities) == 0:
c_entities += ["无"] # 专门处理,可以预计到后面的结果了,设置为"无"
# 获得实体的DESC
c_descs = [entity2desc(ce) for ce in c_entities]
# 获得实体的concept,即使为空,也进行保留,因为实体是可以查询的最小单元,不应再次分词
c_concepts = [get_hint_complex(ce) for ce in c_entities] # 是二维数组
# 选择,分为flag_hint == -1 与 flag_hint >= 0 两种情况
if flag_hint == -1:
hw_l = jieba.lcut_for_search(hint)
hw_l.append(hint)
for hw in hw_l:
if hw in c:
flag_in = 1
for cc_idx, c_concept in enumerate(c_concepts):
# hint lv1 hint及其分词在出现在entity上位概念中
# entity lv1 entity的上位概念出现在hint及其分词中
if len(set(hw_l).intersection(set(c_concept))) > 0:
flag_in = 1
flag_entity = cc_idx
break
# hint lv2 hint及其分词在出现在desc上位概念中
# entity lv2 entity的desc中出现了hint及其分词
desc = c_descs[cc_idx]
fl = 0
for hw in hw_l:
if hw in desc:
flag_in = 1
flag_entity = cc_idx
fl = 1
break
if fl:
break
# hint lv3 hint某一上位概念和desc中出现
# entity lv3 desc中出现hint描述的上位概念
for hc_idx, h_concept in enumerate(hint_concepts):
fl = 0
for hc in h_concept:
if hc in desc:
flag_in = 1
flag_entity = cc_idx
flag_hint = hc_idx
fl = 1
break
if fl:
break
# hint/entity lv4 上位概念重合
if len(set(h_concept).intersection(set(c_concept))) > 0:
flag_in = 1
flag_entity = cc_idx
flag_hint = hc_idx
break
if fl:
break
if flag_in == -1:
flag_in = 0
elif flag_hint >= 0:
hw_l = jieba.lcut_for_search(hint)
hw_l.append(hint)
for hw in hw_l:
if hw in c:
flag_in = 1
for cc_idx, c_concept in enumerate(c_concepts):
# hint lv1 hint及其分词在出现在entity上位概念中
# entity lv1 entity的上位概念出现在hint及其分词中
if len(set(hw_l).intersection(set(c_concept))) > 0:
flag_in = 1
flag_entity = cc_idx
break
# hint lv2 hint及其分词在出现在desc上位概念中
# entity lv2 entity的desc中出现了hint及其分词
desc = c_descs[cc_idx]
fl = 0
for hw in hw_l:
if hw in desc:
flag_in = 1
flag_entity = cc_idx
fl = 1
break
if fl:
break
# hint lv3 hint某一上位概念和desc中出现
# entity lv3 desc中出现hint描述的上位概念
# 因为已经确定了hint,那么可以直接决定了
fl = 0
for hc in hint_concepts[flag_hint]:
if hc in desc:
flag_in = 1
flag_entity = cc_idx
fl = 1
break
if fl:
break
# hint/entity lv4 上位概念重合
if len(set(hint_concepts[flag_hint]).intersection(set(c_concept))) > 0:
flag_in = 1
flag_entity = cc_idx
break
if flag_in == -1:
flag_in = 0
flag_entity = 0
entity_final.append(c_entities[flag_entity])
entityconcept_final.append(c_concepts[flag_entity])
desc_final.append(c_descs[flag_entity])
in_hint.append(flag_in)
if flag_hint == -1:
flag_hint = 0 # 默认hint下标为0
d_line["entities"] = entity_final
d_line["hint_concept"] = hint_concepts[flag_hint]
d_line["entity_concept"] = entityconcept_final
d_line["in_hint"] = in_hint
d_line["desc"] = desc_final
json.dump(d_line, f2, ensure_ascii=False)
f2.write('\n')