forked from tensorflow/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_squad_data.py
108 lines (90 loc) · 3.95 KB
/
preprocess_squad_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=utf-8
"""Script to pre-process SQUAD data into tfrecords."""
import os
import random
# Import libraries
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import sentencepiece as spm
from official.nlp.xlnet import squad_utils
# Sharding flags. `preprocess` selects its shard of the training examples with
# `train_examples[proc_id::num_proc]`, so both values are validated at flag
# parsing time: a zero `num_proc` would raise `ValueError` (slice step cannot
# be zero) only after the whole training file has been read, and a negative
# value for either flag would silently select the wrong examples.
flags.DEFINE_integer(
    "num_proc",
    default=1,
    lower_bound=1,
    help="Number of preprocessing processes.")
flags.DEFINE_integer(
    "proc_id",
    default=0,
    lower_bound=0,
    help="Process id for preprocessing.")

# I/O paths
flags.DEFINE_string("output_dir", default="", help="Output dir for TF records.")
flags.DEFINE_string(
    "spiece_model_file", default="", help="Sentence Piece model path.")
flags.DEFINE_string("train_file", default="", help="Path of train file.")
flags.DEFINE_string("predict_file", default="", help="Path of prediction file.")

# Data preprocessing config
flags.DEFINE_integer("max_seq_length", default=512, help="Max sequence length")
flags.DEFINE_integer("max_query_length", default=64, help="Max query length")
flags.DEFINE_integer("doc_stride", default=128, help="Doc stride")
flags.DEFINE_bool("uncased", default=False, help="Use uncased data.")
flags.DEFINE_bool(
    "create_train_data", default=True, help="Whether to create training data.")
flags.DEFINE_bool(
    "create_eval_data", default=False, help="Whether to create eval data.")

# Global flag registry; values are only readable after `app.run` has parsed
# the command line.
FLAGS = flags.FLAGS
def preprocess():
  """Preprocesses SQUAD data into TFRecord files.

  Loads the SentencePiece model given by `--spiece_model_file`, then:

  * If `--create_train_data` is set, reads `--train_file`, keeps every
    `--num_proc`-th example starting at index `--proc_id`, shuffles that
    shard and writes its features to a file in `--output_dir` named
    `<spm_basename>.<proc_id>.slen-<max_seq_length>.qlen-<max_query_length>`
    followed by `.train.tf_record`.
  * If `--create_eval_data` is set, reads `--predict_file` and passes the
    examples to `squad_utils.create_eval_data`.

  All configuration is read from the module-level `FLAGS`; nothing is
  returned.
  """
  sp_model = spm.SentencePieceProcessor()
  sp_model.Load(FLAGS.spiece_model_file)
  spm_basename = os.path.basename(FLAGS.spiece_model_file)

  if FLAGS.create_train_data:
    train_rec_file = os.path.join(
        FLAGS.output_dir,
        "{}.{}.slen-{}.qlen-{}.train.tf_record".format(
            spm_basename, FLAGS.proc_id, FLAGS.max_seq_length,
            FLAGS.max_query_length))

    logging.info("Read examples from %s", FLAGS.train_file)
    train_examples = squad_utils.read_squad_examples(
        FLAGS.train_file, is_training=True)
    # Each worker handles a strided shard of the examples.
    train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    random.shuffle(train_examples)

    logging.info("Write to %s", train_rec_file)
    train_writer = squad_utils.FeatureWriter(
        filename=train_rec_file, is_training=True)
    # Close the writer even when feature conversion fails, so the writer is
    # not leaked on the error path.
    try:
      squad_utils.convert_examples_to_features(
          examples=train_examples,
          sp_model=sp_model,
          max_seq_length=FLAGS.max_seq_length,
          doc_stride=FLAGS.doc_stride,
          max_query_length=FLAGS.max_query_length,
          is_training=True,
          output_fn=train_writer.process_feature,
          uncased=FLAGS.uncased)
    finally:
      train_writer.close()

  if FLAGS.create_eval_data:
    eval_examples = squad_utils.read_squad_examples(
        FLAGS.predict_file, is_training=False)
    squad_utils.create_eval_data(spm_basename, sp_model, eval_examples,
                                 FLAGS.max_seq_length, FLAGS.max_query_length,
                                 FLAGS.doc_stride, FLAGS.uncased,
                                 FLAGS.output_dir)
def main(_):
  """Ensures the output directory exists, then runs preprocessing.

  Args:
    _: Unused; `app.run` passes the remaining command-line arguments here.
  """
  logging.set_verbosity(logging.INFO)
  # `makedirs` succeeds if the directory already exists and creates missing
  # parents. A separate exists()/mkdir() pair is racy when several `proc_id`
  # workers start at the same time and fails for nested output paths.
  tf.io.gfile.makedirs(FLAGS.output_dir)
  preprocess()


if __name__ == "__main__":
  app.run(main)