# .gitlab-ci.yml (forked from NVIDIA/FasterTransformer)

stages:
  - build
  - test
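
# Build the PyTorch ops and the GPT example in Release mode. -DSM=xx appears to
# be a placeholder for the target GPU compute capability (e.g. 70 or 80).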
build_pyt_release:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: build
  only:
    - main
    - merge_requests
  artifacts:
    paths:
      - ${CI_PROJECT_DIR}/build/
    expire_in: 1 week
  script:
    - cd ${CI_PROJECT_DIR} && mkdir build && cd build
    - git submodule init && git submodule update
    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_GPT=ON ..
    - make -j12
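
# Same PyTorch build plus sparsity: fetch cuSPARSELt 0.1.0 and configure with
# -DSPARSITY_SUPPORT=ON pointing at the extracted library.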
build_pyt_release_sparse:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: build
  only:
    - main
    - merge_requests
  artifacts:
    paths:
      - ${CI_PROJECT_DIR}/build/
    expire_in: 1 week
  script:
    - cd ${CI_PROJECT_DIR} && mkdir build && cd build
    - git submodule init && git submodule update
    - wget https://developer.download.nvidia.com/compute/libcusparse-lt/0.1.0/local_installers/libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz
    - tar -xzvf libcusparse_lt-linux-x86_64-0.1.0.2.tar.gz
    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DSPARSITY_SUPPORT=ON -DCUSPARSELT_PATH=${CI_PROJECT_DIR}/build/libcusparse_lt/ ..
    - make -j12
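
# Build the TensorFlow 1.x ops against the TF libs bundled in the NGC image;
# bc is installed here presumably for scripts that do numeric comparisons.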
build_tf_release:
  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
  tags:
    - fastertransformer
  stage: build
  only:
    - main
    - merge_requests
  artifacts:
    paths:
      - ${CI_PROJECT_DIR}/build/
    expire_in: 1 week
  script:
    - cd ${CI_PROJECT_DIR} && mkdir build && cd build
    - git submodule init && git submodule update
    - cmake -DSM=xx -DCMAKE_BUILD_TYPE=Release -DBUILD_TF=ON -DTF_PATH=/usr/local/lib/python3.8/dist-packages/tensorflow_core/ -DBUILD_GPT=ON ..
    - make -j12
    - apt-get update && apt-get install bc

# 1. Get accuracy on the LAMBADA dataset
# 2. Run the PyTorch GPT op as the baseline
# 3. Run PyTorch pipeline parallelism and compare the difference with the baseline
# 4. Run PyTorch tensor parallelism and compare the difference with the baseline
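# The diff commands below must produce no output: diff exits nonzero on any
# mismatch, which fails the job, so parallel runs must match the 1-GPU baseline.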
pyt_gpt_test:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_pyt_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/build/
    - git submodule init && git submodule update
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P ../models
    - wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P ../models
    - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
    - wget https://github.com/cybertronai/bflm/raw/master/lambada_test.jsonl -P ../models/megatron-models
    - unzip megatron_lm_345m_v0.0.zip -d ../models/megatron-models/345m
    - python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 1
    - bash ../examples/pytorch/gpt/scripts/evaluate_zeroshot_gpt.sh
    - python ../examples/pytorch/gpt/gpt_example.py --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file single-gpu-out.txt
    - export CUDA_VISIBLE_DEVICES=0,1
    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=1 --pipeline_para_size=2 --ckpt_path=../models/megatron-models/c-model/345m/1-gpu/ --top_p 0.5 --sample_output_file pipeline-parallel-2-gpu-out.txt
    - diff single-gpu-out.txt pipeline-parallel-2-gpu-out.txt
    - python ../examples/pytorch/gpt/utils/megatron_ckpt_convert.py -head_num 16 -i ../models/megatron-models/345m/release/ -o ../models/megatron-models/c-model/345m/ -t_g 1 -i_g 2
    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/gpt/multi_gpu_gpt_example.py --tensor_para_size=2 --pipeline_para_size=1 --ckpt_path=../models/megatron-models/c-model/345m/2-gpu/ --top_p 0.5 --sample_output_file tensor-parallel-2-gpu-out.txt
    - diff single-gpu-out.txt tensor-parallel-2-gpu-out.txt
  timeout: 4h 30m
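
# TensorFlow BERT, encoder, and decoding unit tests; the translation checkpoint
# is converted to FP16 before the decoding test runs.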
tf_test:
  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_tf_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/build/
    - apt-get update && apt-get install bc
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - bash ${CI_PROJECT_DIR}/examples/tensorflow/decoding/utils/translation/download_model_data.sh
    - mkdir -p ${CI_PROJECT_DIR}/translation/ckpt_fp16
    - python ${CI_PROJECT_DIR}/tests/bert/tf_bert_unit_test.py
    - python ${CI_PROJECT_DIR}/tests/bert/tf_encoder_unit_test.py
    - python ${CI_PROJECT_DIR}/examples/tensorflow/ckpt_type_convert.py --init_checkpoint=${CI_PROJECT_DIR}/translation/ckpt/model.ckpt-500000 --fp16_checkpoint=${CI_PROJECT_DIR}/translation/ckpt_fp16/model.ckpt-500000
    - python ${CI_PROJECT_DIR}/tests/decoding/tf_decoding_unit_test.py
  timeout: 4h 30m
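
# XLNet test: download the pretrained model, then run the repo's
# correctness-verification script on the FP32 model.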
tf_xlnet_test:
  image: nvcr.io/nvidia/tensorflow:21.02-tf1-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - master
    - v4.1
    - main
    - merge_requests
  needs:
    - job: build_tf_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/examples/tensorflow/xlnet
    - bash downloadModel.sh
    - bash verifyCorrectness.sh # For the FP32 model
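
# Dense-only BERT smoke tests for the sparse build (see the in-script comment
# on the missing Ampere GPU). The bert_gemm arguments look like: batch size,
# sequence length, head count, size per head, FP16 flag, INT8 mode.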
pyt_sp_test:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_pyt_release_sparse
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/build/
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - pip install transformers==2.5.1
    # GOS has no Ampere GPU, so no sparse tests can be run; only test some dense cases.
    - ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 0
    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16
    - ${CI_PROJECT_DIR}/build/bin/bert_gemm 32 64 12 64 1 1
    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 1
    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 2
    - python ${CI_PROJECT_DIR}/examples/pytorch/bert/bert_example.py 32 12 64 12 64 --fp16 --int8_mode 3
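
# Longformer QA test: git-lfs fetches only pytorch_model.bin and config.json
# from the allenai TriviaQA checkpoint before running the unit test.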
pyt_longformer_test:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_pyt_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/examples/pytorch/longformer
    - apt-get update && apt-get install git-lfs
    - git lfs install
    - git config lfs.fetchinclude "pytorch_model.bin,config.json"
    - git clone https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa
    - cd ${CI_PROJECT_DIR}
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - pip install transformers==4.8.2
    - python3 tests/longformer/py_longformer_unit_test.py
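
# Translation quality check for the decoding/decoder extensions: each variant's
# output is de-BPE'd and scored with sacrebleu against the reference.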
pyt_decoding_test:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_pyt_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/build/
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - apt-get update && apt-get install bc
    - pip install sacrebleu
    - pip install opennmt-py==1.1.1
    - bash ../examples/pytorch/decoding/utils/download_model.sh
    - mkdir -p pytorch/translation/data
    - cp ../examples/tensorflow/decoding/utils/translation/test* pytorch/translation/data
    - python ../examples/pytorch/decoding/utils/recover_bpe.py pytorch/translation/data/test.de debpe_ref.txt
    - echo "Run decoding fp32" # decoding fp32 testing
    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp32 --output_file output.txt
    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
    - cat debpe_output.txt | sacrebleu debpe_ref.txt
    - echo "Run decoder fp32" # decoder fp32 testing
    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp32 --output_file output.txt
    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
    - cat debpe_output.txt | sacrebleu debpe_ref.txt
    - echo "Run decoding fp16" # decoding fp16 testing
    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type decoding_ext --decoding_ths_path ./lib/libth_decoding.so --data_type fp16 --output_file output.txt
    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
    - cat debpe_output.txt | sacrebleu debpe_ref.txt
    - echo "Run decoder fp16" # decoder fp16 testing
    - python ../examples/pytorch/decoding/translate_example.py --batch_size 128 --beam_size 4 --model_type torch_decoding_with_decoder_ext --decoder_ths_path ./lib/libth_decoder.so --data_type fp16 --output_file output.txt
    - python ../examples/pytorch/decoding/utils/recover_bpe.py output.txt debpe_output.txt
    - cat debpe_output.txt | sacrebleu debpe_ref.txt
  timeout: 4h
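
# T5 translation in FP32 and FP16, plus t5-3b with 2-way tensor and pipeline
# parallelism (note CUDA_VISIBLE_DEVICES=0,2 for the multi-GPU runs).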
t5_test:
  image: nvcr.io/nvidia/pytorch:21.02-py3
  tags:
    - fastertransformer
  stage: test
  only:
    - main
    - merge_requests
  needs:
    - job: build_pyt_release
      artifacts: true
  script:
    - cd ${CI_PROJECT_DIR}/build/
    - export PYTHONPATH="${CI_PROJECT_DIR}/:$PYTHONPATH"
    - export NVIDIA_TF32_OVERRIDE=0 # Disable TF32
    - export CUDA_VISIBLE_DEVICES=0
    - apt-get update && apt-get install bc
    - pip install transformers huggingface_hub tokenizers sacrebleu SentencePiece
    - python ../examples/pytorch/t5/translate_example.py -batch 32 -time 0123
    - python ../examples/pytorch/t5/translate_example.py -batch 32 -time 0123 -d fp16
    - python ../examples/pytorch/t5/translate_example.py -batch 4 -time 0123 -d fp16 --model t5-3b
    - export CUDA_VISIBLE_DEVICES=0,2
    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/t5/translate_example.py -batch 4 -time 13 -d fp16 --model t5-3b --tensor_para_size 2
    - mpirun -n 2 --allow-run-as-root python ../examples/pytorch/t5/translate_example.py -batch 4 -time 13 -d fp16 --model t5-3b --pipeline_para_size 2
  timeout: 4h