forked from Emma1066/Zero-Shot-NER-with-ChatGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbatch_hanlp_tok_pos_con_dep.py
69 lines (55 loc) · 1.91 KB
/
batch_hanlp_tok_pos_con_dep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import hanlp
import json
from tqdm import tqdm
def get_parse_per_sentence(sample, parser):
    """Parse ``sample["sentence"]`` and store the results on ``sample``.

    Args:
        sample: Mapping with a ``"sentence"`` entry. It is updated in place
            with the keys ``"tok/fine"``, ``"tok/coarse"``, ``"pos/ctb"``,
            ``"dep"``, ``"tok_pos_pair"``, ``"tok_pos_pair_str"``,
            ``"trip_dep"`` and ``"con_str"``. Every value is a string; list
            results are stored as their ``str()`` representation.
        parser: Callable invoked as ``parser(sentence, tasks=[...])`` that
            returns a mapping with the entries ``"tok/fine"``,
            ``"tok/coarse"``, ``"pos/ctb"``, ``"dep"`` and ``"con"``.

    Returns:
        The same ``sample`` object, after the update.
    """
    doc = parser(
        sample["sentence"],
        tasks=["tok*", "pos", "con", "dep"],
    )
    tok = doc["tok/fine"]
    pos = doc["pos/ctb"]
    dep = doc["dep"]
    con = doc["con"]
    tok_coarse = doc["tok/coarse"]

    # Arrange POS output format: one "token/tag" string per token.
    tok_pos_pair = [f"{t}/{p}" for t, p in zip(tok, pos)]
    tok_pos_pair_str = " ".join(tok_pos_pair)

    # Arrange dependency tree output format: [token, head token, relation].
    trip_dep = []
    for i_item, item in enumerate(dep):
        # Each item is [head_idx, dep_rel]; head_idx is 1-based.
        head_idx = item[0]
        dep_rel = item[1]
        # A head index of 0 marks the root, which has no head token.
        # Indexing tok[head_idx - 1] there would wrap around to tok[-1] and
        # wrongly report the last token as the head, so use a placeholder.
        head = tok[head_idx - 1] if head_idx > 0 else "ROOT"
        trip_dep.append([tok[i_item], head, dep_rel])

    sample["tok/fine"] = str(tok)
    sample["tok/coarse"] = str(tok_coarse)
    sample["pos/ctb"] = str(pos)
    sample["dep"] = str(dep)
    sample["tok_pos_pair"] = str(tok_pos_pair)
    sample["tok_pos_pair_str"] = tok_pos_pair_str
    sample["trip_dep"] = str(trip_dep)
    sample["con_str"] = str(con)
    return sample
# Driver script: read the dataset, parse every sample with HanLP, and write
# the annotated samples to a sibling JSON file.

# load data
DATANAME = "msra_5_samples"
MODE = "test"
datafolder = DATANAME
datamode = MODE
indata_path = f"data/{datafolder}/{datamode}.json"
outdata_path = f"data/{datafolder}/{datamode}_parse_hanlp.json"

# Read inside a context manager so the input file handle is closed promptly.
with open(indata_path, "r", encoding="utf-8") as rf:
    indata = json.load(rf)

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)

print("========== HANLP parsing ===========")
print(f"save path: {outdata_path}")

data_parse = []
for item in tqdm(indata, desc="get parse"):
    item_parse = get_parse_per_sentence(item, HanLP)
    data_parse.append(item_parse)

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open(outdata_path, "w", encoding="utf-8") as wf:
    wf.write(json.dumps(data_parse, indent=4, ensure_ascii=False))
print(f"file saved to: {outdata_path}")