From f8861928b67f6bbde2177f4fc4670a5264002589 Mon Sep 17 00:00:00 2001
From: wufeisheng
Date: Tue, 12 Dec 2023 20:58:23 +0800
Subject: [PATCH] code refine

---
 llm/benchmark.sh   | 13 +++++++------
 llm/export.sh      |  7 +++----
 llm/predictor.py   | 17 ++++++++++-------
 llm/run_dygraph.sh |  4 +---
 llm/run_static.sh  |  8 +++-----
 5 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/llm/benchmark.sh b/llm/benchmark.sh
index 6baed2e0f0ff..0b3d4bcd3a28 100644
--- a/llm/benchmark.sh
+++ b/llm/benchmark.sh
@@ -23,15 +23,15 @@
 export FLAGS_use_autotune=1
 export FLAGS_cublaslt_exhaustive_search_times=10
 export FLAGS_cache_inference_while_scope=1
 
-model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift_mp2"}
-src_len=${2:-300}
-dec_len=${3:-100}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 
 total_len=`expr ${src_len} + ${dec_len}`
 
 python -m paddle.distributed.launch \
-    --gpus "6,7" \
+    --gpus "0,1,2,3,4,5,6,7" \
     predictor.py \
     --model_name_or_path ./inference_model/${model_dir} \
     --dtype float16 \
@@ -39,8 +39,9 @@ python -m paddle.distributed.launch \
     --max_length ${dec_len} \
     --output_file "infer.json" \
     --mode "static" \
-    --batch_size 1 \
+    --batch_size 128 \
     --benchmark \
     --block_attn \
     --block_size 64 \
-    --inference_model
\ No newline at end of file
+    --inference_model \
+    --use_cachekv_int8 static
\ No newline at end of file
diff --git a/llm/export.sh b/llm/export.sh
index 94097dbfef9d..592b4a4b7fab 100644
--- a/llm/export.sh
+++ b/llm/export.sh
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
 
-model_dir=${1:-"checkpoints/llama65b_ptq"}
-src_len=${2:-1024}
-dec_len=${3:-1024}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 quant_type=${4:-"a8w8"}
 
 total_len=`expr ${src_len} + ${dec_len}`
 
diff --git a/llm/predictor.py b/llm/predictor.py
index 63e19d63eefe..61f5154066f3 100644
--- a/llm/predictor.py
+++ b/llm/predictor.py
@@ -1115,11 +1115,11 @@ def predict(self, input_texts: str | list[str]):
 
     def _preprocess(self, source):
         for i, text in enumerate(source):
-            print("text: ", text)
+            # print("text: ", text)
             tokens = self.tokenizer(text, return_tensors="np", padding=False, max_length=(self.config.src_length - self.config.max_length))
             input_ids = tokens["input_ids"][0]
             length = len(input_ids)
-            print("input_ids: ", input_ids)
+            # print("input_ids: ", input_ids)
             print("length: ", length)
             self.inputs["input_ids"][i : i + 1, :length] = input_ids
             self.inputs["penalty_score"][i : i + 1] = self.config.repetition_penalty
@@ -1135,7 +1135,7 @@
             self.inputs["stop_flags"][i : i + 1] = False
             reset_stop_value(self.inputs["not_need_stop"])
             need_block_nums = (length + self.config.max_length + self.pre_cache_length + self.block_size - 1) // self.block_size
-            print("self.free_list", self.free_list)
+            # print("self.free_list", self.free_list)
             for bi in range(need_block_nums):
                 bi_now = self.free_list.pop()
                 self.used_list[i].append(bi_now)
@@ -1423,10 +1423,13 @@ def predict():
 
     source_texts = []
     data_file = open("humaneval_solution.json", 'r')
+
+    dataset = []
+    for line in data_file.readlines():
+        dataset.append(json.loads(line))
     for i in range(predictor_args.batch_size):
-        line = data_file.readline()
-        data = json.loads(line)
+        data = dataset[i % 164]
         source_texts.append(data["prompt"])
 
@@ -1463,8 +1466,8 @@ def benchmark(predictor, predictor_args, model_args):
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
 
     print("***********Start Benchmark**********")
-    warmup_time = 3
-    test_time = 20
+    warmup_time = 2
+    test_time = 10
 
     print("***********Start Warmup**********")
     for i in range(warmup_time):
diff --git a/llm/run_dygraph.sh b/llm/run_dygraph.sh
index c1fe635a5c5f..5ad18e3dd976 100644
--- a/llm/run_dygraph.sh
+++ b/llm/run_dygraph.sh
@@ -25,9 +25,7 @@
 export FLAGS_new_executor_serial_run=1
 export FLAGS_allocator_strategy=naive_best_fit
 export FLAGS_fraction_of_gpu_memory_to_use=0.92
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
-
-model_dir=${1:-"checkpoints/llama65b_ptq"}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
 src_len=${2:-1024}
 dec_len=${3:-1024}
 quant_type=${4:-"a8w8"}
diff --git a/llm/run_static.sh b/llm/run_static.sh
index 9a08abb0d83f..fc05c5bc1775 100644
--- a/llm/run_static.sh
+++ b/llm/run_static.sh
@@ -21,13 +21,11 @@
 export FLAGS_control_flow_use_new_executor=1
 export FLAGS_new_executor_serial_run=1
 export FLAGS_allocator_strategy=naive_best_fit
 export FLAGS_fraction_of_gpu_memory_to_use=0.92
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
-
-model_dir=${1:-"checkpoints/llama65b_ptq_mp8"}
-src_len=${2:-1024}
-dec_len=${3:-1024}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 quant_type=${4:-"a8w8"}
 
 total_len=`expr ${src_len} + ${dec_len}`
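
A minimal standalone sketch of the prompt-loading pattern the predictor.py hunk
introduces, assuming humaneval_solution.json is a JSON-lines file whose records
carry a "prompt" field; the literal 164 in the patch matches the HumanEval
problem count, which len(dataset) expresses without the magic number:

import json

# Read every HumanEval record once, instead of re-reading the file per batch slot.
with open("humaneval_solution.json", "r") as data_file:
    dataset = [json.loads(line) for line in data_file]

batch_size = 128  # matches the new --batch_size default in benchmark.sh
# Cycle through the dataset so a batch larger than the dataset still fills up.
source_texts = [dataset[i % len(dataset)]["prompt"] for i in range(batch_size)]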
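
The --block_attn/--block_size flags pair with the need_block_nums line visible in
the _preprocess context above. A worked example of that ceiling division under the
new defaults, assuming a prompt tokenized to the full src_len of 1100 tokens and a
pre_cache_length of 0 (both hypothetical values for illustration):

length = 1100            # tokenized prompt length, here at the src_len cap
max_length = 330         # dec_len default from benchmark.sh
pre_cache_length = 0     # assumed: no prefix cache
block_size = 64          # --block_size flag

# Integer ceiling division, exactly as written in _preprocess:
need_block_nums = (length + max_length + pre_cache_length + block_size - 1) // block_size
print(need_block_nums)   # (1100 + 330 + 0 + 63) // 64 = 23 blocks per sequence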