-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_augmenter.py
104 lines (85 loc) · 4.02 KB
/
data_augmenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from __future__ import annotations
from logic.code_data import CodeData
from logic.text_data import TextData
from logic.text_code_data import TextCodeData
import os, yaml, time, argparse, warnings
import transformations.text.utils.initialize as text_helper
def get_argparser() -> argparse.ArgumentParser:
"""
Get the configured argument parser
"""
parser = argparse.ArgumentParser(description='Augment text and code data from CSV file')
parser.add_argument('--output-filepath', '-o',
metavar='FILEPATH',
dest='output',
required=False,
type=str,
default='output.csv',
help='Output filepath where to store the CSV file')
parser.add_argument('--workers', '-w',
metavar='N_WORKERS',
dest='workers',
required=False,
type=int,
default=1,
help='Number of threads to be used during the process. Default value = 1')
parser.add_argument('--chunk-size', '-cs',
metavar='C_SIZE',
dest='chunk_size',
required=False,
type=int,
default=1,
help='Size of the portion of the dataset to be processed by each thread.. Default value = 100')
parser.add_argument('--verbose', '-v',
action='store_true',
dest='verbose',
required=False,
default=False,
help='If selected it shows some code execution messages on the screen')
required = parser.add_argument_group('required arguments')
required.add_argument('--input-file', '-i',
metavar='FILEPATH',
dest='data_filepath',
required=True,
type=str,
help='The CSV filepath where to retrieve the data to augment.')
required.add_argument('--type', '-t',
metavar='TYPE',
dest='type',
required=True,
choices={"text", "code", "text-code"},
default="text",
type=str,
help='Select the type of augmentation to perform. Possible choices: \'text\', \'code\' or \'text-code\'"')
return parser
if __name__ == '__main__':
warnings.filterwarnings("ignore")
# Read arg parameters
parser = get_argparser()
args = parser.parse_args()
# Read csv
filepath = args.data_filepath
if not os.path.isfile(filepath):
print('Input file not found. Enter the path of the CSV file containing the data to be augmented.')
exit()
# Load config file
if not os.path.isfile(filepath):
print('File \'config.yaml\' not found. Please insert the config file under the root directory.')
exit()
with open('config.yaml', 'r') as file:
yml = yaml.safe_load(file)
# Retrieve active rules
txt_yml, code_yml = yml['text'], yml['code']
start_time = time.time()
print(f'Augmenting the dataset located on {filepath} ...')
if args.type == 'text':
text_helper.initialize_models()
data = TextData(filepath, txt_yml, verbose=args.verbose)
elif args.type == 'code':
data = CodeData(filepath, code_yml, verbose=args.verbose)
elif args.type == 'text-code':
text_helper.initialize_models()
data = TextCodeData(filepath, yml, verbose=args.verbose)
data.compute(out_fp = args.output, workers = args.workers, chunk_size = args.chunk_size)
print(f'Total execution time: {time.time() - start_time}')
print(f'Dataset augmented successfully. You can find the new dataset at the following location: {args.output} .')