-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlm_dataset.py
41 lines (33 loc) · 1.46 KB
/
lm_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""Class for working with Language modeling datasets."""
import tensorflow as tf
class LMDataset(object):
    """Language-model dataset read from TFRecord files.

    Each record is parsed as a serialized ``SequenceExample`` (see
    ``get_instance``), optionally shuffled, and grouped into padded batches.
    The resulting dataset and an initializable iterator over it are stored
    on the instance as ``data_set`` and ``data_iter``.
    """

    # Default shuffle buffer. Also exposed as a class attribute so that a
    # subclass overriding ``__init__`` without setting it still gets the
    # historical value when ``create_iterator`` runs.
    _DEFAULT_SHUFFLE_BUFFER_SIZE = 10000
    shuffle_buffer_size = _DEFAULT_SHUFFLE_BUFFER_SIZE

    def __init__(self, filenames, batch_size,
                 shuffle_buffer_size=_DEFAULT_SHUFFLE_BUFFER_SIZE):
        """Build the input pipeline.

        Args:
            filenames: TFRecord file name(s), passed unchanged to
                ``tf.data.TFRecordDataset``.
            batch_size: Number of instances per padded batch.
            shuffle_buffer_size: Size of the shuffle buffer. Defaults to
                10000 (the previously hard-coded value). Pass ``None`` or
                ``0`` to keep the records in file order, e.g. for
                evaluation.
        """
        self.batch_size = batch_size
        # Must be set before create_iterator(), which reads it.
        self.shuffle_buffer_size = shuffle_buffer_size
        self.data_set, self.data_iter = self.create_iterator(filenames)

    def get_instance(self, proto):
        """Parse one serialized ``SequenceExample`` into an instance dict.

        Args:
            proto: Scalar string tensor holding the serialized example.

        Returns:
            A dict with:
              * ``"char"``: the int64 sequence stored under the ``"cint"``
                sequence feature (presumably character ids; confirm against
                the code that writes the records).
              * ``"char_len"``: the scalar int64 stored under the
                ``"cint_len"`` context feature.
        """
        context_features = {
            "cint_len": tf.FixedLenFeature([], tf.int64),
        }
        sequence_features = {
            "cint": tf.FixedLenSequenceFeature(shape=[], dtype=tf.int64),
        }
        # Parse a sequence example given the structure declared above.
        context, sequence = tf.parse_single_sequence_example(
            serialized=proto,
            context_features=context_features,
            sequence_features=sequence_features,
        )
        return {"char": sequence["cint"], "char_len": context["cint_len"]}

    def create_iterator(self, data_files):
        """Create the batched dataset and an iterator over it.

        Args:
            data_files: TFRecord file name(s) to read.

        Returns:
            A ``(data_set, data_iter)`` tuple. ``data_iter`` is an
            initializable iterator, so its ``initializer`` op has to be run
            before elements are fetched.
        """
        data_set = tf.data.TFRecordDataset(data_files)
        data_set = data_set.map(self.get_instance)
        # Shuffle individual instances (before batching) unless disabled.
        if self.shuffle_buffer_size:
            data_set = data_set.shuffle(buffer_size=self.shuffle_buffer_size)
        # 'char' is variable-length and is padded to the longest sequence in
        # each batch; 'char_len' is a scalar and needs no padding.
        data_set = data_set.padded_batch(
            self.batch_size,
            padded_shapes={'char': [None], 'char_len': []})
        data_iter = data_set.make_initializable_iterator()
        return data_set, data_iter