
Commit

Merge branch 'restruct_52' of https://github.com/RichardWooSJTU/PaddleNLP into restruct_52
RichardWooSJTU committed Dec 12, 2023
2 parents 1ecd7e2 + 7a2a0c8 commit a098300
Showing 12 changed files with 178 additions and 94 deletions.
llm/.gitignore (3 changes: 3 additions & 0 deletions)
@@ -15,3 +15,6 @@ checkpoints/
# inference_model
inference*/

# log
log/

llm/benchmark.sh (18 changes: 14 additions & 4 deletions)
@@ -23,14 +23,24 @@ export FLAGS_use_autotune=1
export FLAGS_cublaslt_exhaustive_search_times=10
export FLAGS_cache_inference_while_scope=1

model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift_mp2"}
src_len=${2:-300}
dec_len=${3:-100}

python predictor.py \
--model_name_or_path ./llama7b-inference_model_fp16 \
total_len=`expr ${src_len} + ${dec_len}`


python -m paddle.distributed.launch \
--gpus "6,7" \
predictor.py \
--model_name_or_path ./inference_model/${model_dir} \
--dtype float16 \
--src_length 300 \
--max_length 100 \
--src_length ${total_len} \
--max_length ${dec_len} \
--output_file "infer.json" \
--mode "static" \
--batch_size 1 \
--benchmark \
--block_attn \
--block_size 64 \
--inference_model
llm/export.sh (13 changes: 7 additions & 6 deletions)
@@ -14,22 +14,23 @@

export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH

model_dir=${1:-"meta-llama/Llama-2-7b-chat"}
model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift"}
src_len=${2:-1024}
dec_len=${3:-1024}
quant_type=${4:-"weight_only_int8"}
quant_type=${4:-"a8w8"}

total_len=`expr ${src_len} + ${dec_len}`


python \

python -m paddle.distributed.launch \
--gpus "6, 7" \
export_model.py \
--model_name_or_path ${model_dir} \
--output_path ./inference_model/${model_dir}_C8 \
--output_path ./inference_model/${model_dir}_mp2 \
--dtype float16 \
--inference_model \
--block_size 64 \
--src_length ${total_len} \
--block_attn \
--quant_type ${quant_type} \
--use_cachekv_int8
--quant_type ${quant_type}
llm/export_model.py (6 changes: 3 additions & 3 deletions)
@@ -89,9 +89,9 @@ def main():
predictor.tokenizer.save_pretrained(export_args.output_path)
generate_rank_mapping(os.path.join(export_args.output_path, "rank_mapping.csv"))

if tensor_parallel_degree > 1:
export_args.output_path = os.path.join(export_args.output_path, f"rank_{tensor_parallel_rank}")
validate_pdmodel(export_args.output_path, predictor_args.model_prefix)
# if tensor_parallel_degree > 1:
# export_args.output_path = os.path.join(export_args.output_path, f"rank_{tensor_parallel_rank}")
# validate_pdmodel(export_args.output_path, predictor_args.model_prefix)


if __name__ == "__main__":
llm/llama/ptq_argument.json (8 changes: 5 additions & 3 deletions)
@@ -1,5 +1,5 @@
{
"model_name_or_path": "./checkpoints/llama_sft_ckpts",
"model_name_or_path": "meta-llama/Llama-2-13b-chat",
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
@@ -8,14 +8,16 @@
"fp16": true,
"fp16_opt_level": "O2",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_ptq_ckpts",
"do_eval": true,
"output_dir": "./checkpoints/llama_ptq_ckpts_shi",
"do_eval": false,
"eval_with_do_generation": false,
"do_ptq": true,
"ptq_step": 16,
"smooth": true,
"smooth_step": 16,
"smooth_all_linears": true,
"shift": true,
"shift_all_linears": true,
"smooth_piecewise_search": true,
"smooth_k_piece": 3,
"smooth_search_piece": true
llm/predictor.py (23 changes: 14 additions & 9 deletions)
@@ -1116,7 +1116,7 @@ def predict(self, input_texts: str | list[str]):
def _preprocess(self, source):
for i, text in enumerate(source):
print("text: ", text)
tokens = self.tokenizer(text, return_tensors="np", padding=False, max_length=self.config.src_length)
tokens = self.tokenizer(text, return_tensors="np", padding=False, max_length=(self.config.src_length - self.config.max_length))
input_ids = tokens["input_ids"][0]
length = len(input_ids)
print("input_ids: ", input_ids)
@@ -1135,6 +1135,7 @@ def _preprocess(self, source):
self.inputs["stop_flags"][i : i + 1] = False
reset_stop_value(self.inputs["not_need_stop"])
need_block_nums = (length + self.config.max_length + self.pre_cache_length + self.block_size - 1) // self.block_size
print("self.free_list", self.free_list)
for bi in range(need_block_nums):
bi_now = self.free_list.pop()
self.used_list[i].append(bi_now)
@@ -1241,7 +1242,9 @@ def create_predictor(
LlamaForCausalLMInferenceModel as LlamaInferenceModel,
)
model = LlamaInferenceModel.from_pretrained(
predictor_args.model_name_or_path, config=config, dtype=predictor_args.dtype
predictor_args.model_name_or_path, config=config, dtype=predictor_args.dtype,
tensor_parallel_degree=tensor_parallel_degree,
tensor_parallel_rank=tensor_parallel_rank,
)
model.eval()

@@ -1432,27 +1435,29 @@ def predict():

def benchmark(predictor, predictor_args, model_args):
# Just construct a simple benchmark input. We pad input to the src_length.
test_texts = "hello world, how are you?"
benchmark_texts = [test_texts + "<pad>" * predictor_args.src_length for _ in range(predictor_args.batch_size)]
test_texts = ""
benchmark_texts = [test_texts + "<pad>" * (predictor_args.src_length - predictor_args.max_length) for _ in range(predictor_args.batch_size)]

batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
print("***********Start Benchmark**********")

warmup_time = 10
test_time = 100
warmup_time = 3
test_time = 20

print("***********Start Warmup**********")
for _ in range(warmup_time):
for i in range(warmup_time):
print("warm up ", i)
for bs, batch_source_text in enumerate(batch_benchmark_texts):
outputs = predictor.predict(batch_source_text)

print("***********Start Speed Test**********")
start = time.perf_counter()
output_tokens = 0
for _ in range(test_time):
for i in range(test_time):
print("test ", i)
for bs, batch_source_text in enumerate(batch_benchmark_texts):
outputs = predictor.predict(batch_source_text)
output_tokens += sum([len(output) for output in outputs])
output_tokens += predictor_args.max_length * predictor_args.batch_size
end = time.perf_counter()
print("Avg Elapse time is: ", (end - start) / test_time)
print("Output tokens is: ", output_tokens)
llm/run_dygraph.sh (12 changes: 7 additions & 5 deletions)
@@ -24,17 +24,18 @@ export FLAGS_control_flow_use_new_executor=1
export FLAGS_new_executor_serial_run=1
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.92
export CUDA_VISIBLE_DEVICES=5

model_dir=${1:-"linly-ai/chinese-llama-2-7b"}
model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift"}
src_len=${2:-1024}
dec_len=${3:-1024}
quant_type=${4:-"weight_only_int8"}
quant_type=${4:-"a8w8"}
# quant_type=${4:-"None"}

total_len=`expr ${src_len} + ${dec_len}`

python predictor.py \
python -m paddle.distributed.launch \
--gpus "6, 7" \
predictor.py \
--model_name_or_path ${model_dir} \
--dtype float16 \
--src_length ${total_len} \
@@ -43,5 +44,6 @@ python predictor.py \
--mode "dynamic" \
--batch_size 2 \
--inference_model \
--quant_type ${quant_type}
--quant_type ${quant_type} \
--block_attn

llm/run_dygraph_block.sh (48 changes: 0 additions & 48 deletions)

This file was deleted.

llm/run_static.sh (7 changes: 4 additions & 3 deletions)
@@ -15,7 +15,7 @@
model_dir=${1:-"meta-llama/Llama-2-7b-chat"}
src_len=${2:-1024}
dec_len=${3:-1024}
quant_type=${4:-"weight_only_int8"}
quant_type=${4:-"a8w8"}


export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
@@ -29,14 +29,15 @@ export FLAGS_new_executor_serial_run=1
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.92

model_dir=${1:-"meta-llama/Llama-2-7b-chat"}

model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift_mp2"}
src_len=${2:-1024}
dec_len=${3:-1024}

total_len=`expr ${src_len} + ${dec_len}`

python -m paddle.distributed.launch \
--gpus "1" \
--gpus "6,7" \
predictor.py \
--model_name_or_path ./inference_model/${model_dir} \
--dtype float16 \
paddlenlp/experimental/transformers/fused_transformer_layers.py (21 changes: 16 additions & 5 deletions)
@@ -838,10 +838,14 @@ def forward(
**kwargs
)
# print("out_linear_out", out_linear_out)
# exit(0)
# all_reduce
if self.nranks > 1:
dist.all_reduce(out_linear_out)


# print("out_linear_out", out_linear_out)

# ffn layernorm
tmp_out, residual_input = self.compute_ffn_layernorm(out_linear_out, residual_input, i)
# print("ln_out", tmp_out)
@@ -855,10 +859,11 @@ def forward(
# ffn2 matmul
ffn2_out = self.compute_ffn2(ffn1_out, i)
# print("ffn2_out", ffn2_out)
# exit(0)
# all_reduce
if self.nranks > 1:
dist.all_reduce(ffn2_out)
# print("ffn2_out", ffn2_out)
# exit(0)

# norm + residual_add_bias
tmp_out, residual_input = self.compute_bias_residual_layernorm(ffn2_out, residual_input, i, self.num_layers)
@@ -1194,7 +1199,7 @@ def compute_fmha(
):
# print("compute_fmha")
qkv_out = dequant_int8(qkv_out, self.qkv_out_scales[i], self._dtype)
print("dequant_int8", qkv_out)
# print("dequant_int8", qkv_out)
# print("compute_fmha dequant")
if self.qkv_biases[i] is not None:
qkv_out = paddle.add(qkv_out, self.qkv_biases[i])
@@ -1208,7 +1213,7 @@ def compute_fmha(
q_out, k_out, v_out = qkv_transpose_split(
qkv_out, padding_offset, seq_lens, input_ids, self.num_heads, self.head_dim
)
print("qkv_transpose_split", q_out)
# print("qkv_transpose_split", q_out)
# print("compute_fmha qkv_transpose_split")
# rotary emb (inplace)
if rotary_embs is not None:
@@ -1220,7 +1225,7 @@
rotary_emb_dims=rotary_emb_dims,
use_neox=self.use_neox_rotary_style,
)
print("compute_fmha rotary_embs", q_out, k_out)
# print("compute_fmha rotary_embs", q_out, k_out)
if pre_caches is not None:
k_out = paddle.concat([pre_caches[i][0, :bsz], k_out], axis=2)
v_out = paddle.concat([pre_caches[i][1, :bsz], v_out], axis=2)
@@ -1241,7 +1246,7 @@
)
# print("compute_fmha fmha")
fmha_out = transpose_remove_padding(qktv_out, seq_lens, padding_offset)
print("before quant", fmha_out)
# print("before quant", fmha_out)
# print("compute_fmha transpose_remove_padding")
fmha_out = quant_int8(
fmha_out,
@@ -1513,7 +1518,13 @@ def compute_attn(
compute_dtype=self._fuse_kernel_compute_dtype
)[0]
# print('self.act_scales["out_linear_in_scale"][i]', self.act_scales["out_linear_in_scale"][i])
# print("self.qkv_out_scales[i]", self.qkv_out_scales[i])
# print("self.qkv_biases[i]", self.qkv_biases[i])
# print("self.linear_shifts[i]", self.linear_shifts[i])
# print("self.linear_smooths[i]", self.linear_smooths[i])
# print("layer", i, fmha_out)



out_linear_out = self.compute_out_linear(fmha_out, i)
