forked from wengong-jin/hgraph2graph
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_vocab.py
executable file
·33 lines (27 loc) · 908 Bytes
/
get_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import sys
from hgraph import *
from rdkit import Chem
from multiprocessing import Pool
def process(data):
    """Collect the vocabulary entries contributed by a batch of molecules.

    Each entry of ``data`` is stripped of surrounding spaces and line
    terminators and handed to ``MolGraph``.  For every node of the resulting
    ``mol_tree`` the node's ``'label'`` attribute is recorded, plus one
    ``(smiles, x)`` pair for each ``(_, x)`` element of the node's
    ``'inter_label'`` attribute, where ``smiles`` is the node's ``'smiles'``
    attribute.

    Args:
        data: Iterable of strings, one molecule per entry (presumably SMILES;
            they reach ``MolGraph`` unchanged apart from the stripping).

    Returns:
        A ``set`` holding every collected entry; empty when ``data`` is empty
        or contains only blank entries.
    """
    vocab = set()
    for line in data:
        mol_str = line.strip("\r\n ")
        if not mol_str:
            # A blank entry describes no molecule; skip it instead of passing
            # an empty string to MolGraph.
            continue
        hmol = MolGraph(mol_str)
        # Only the attribute dict is needed; the node key itself is unused.
        for _, attr in hmol.mol_tree.nodes(data=True):
            smiles = attr['smiles']
            vocab.add(attr['label'])
            # The first element of each inter_label pair is ignored; a
            # distinct name avoids rebinding the molecule string above.
            for _, inter_smiles in attr['inter_label']:
                vocab.add((smiles, inter_smiles))
    return vocab
if __name__ == "__main__":
    # Read molecules from stdin. Only the first two whitespace-separated
    # fields of each line are used, and duplicates are dropped before the
    # work is split up.
    data = list({mol for line in sys.stdin for mol in line.split()[:2]})

    ncpu = 15
    # The "+ 1" keeps batch_size >= 1 (range() rejects a zero step) and
    # guarantees that no more than ncpu batches are produced.
    batch_size = len(data) // ncpu + 1
    batches = [data[i : i + batch_size] for i in range(0, len(data), batch_size)]

    # map() blocks until every batch is done, so the workers can be shut
    # down as soon as the with-block exits instead of lingering until
    # interpreter teardown.
    with Pool(ncpu) as pool:
        vocab_list = pool.map(process, batches)

    # Merge the per-batch sets; each entry is unpacked as a 2-item pair.
    vocab = {(x, y) for partial in vocab_list for x, y in partial}

    # Sorted output keeps the result independent of set/batch ordering.
    for x, y in sorted(vocab):
        print(x, y)