From f8861928b67f6bbde2177f4fc4670a5264002589 Mon Sep 17 00:00:00 2001
From: wufeisheng
Date: Tue, 12 Dec 2023 20:58:23 +0800
Subject: [PATCH] code refine

---
 llm/benchmark.sh   | 13 +++++++------
 llm/export.sh      |  7 +++----
 llm/predictor.py   | 17 ++++++++++-------
 llm/run_dygraph.sh |  4 +---
 llm/run_static.sh  |  8 +++-----
 5 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/llm/benchmark.sh b/llm/benchmark.sh
index 6baed2e0f0ff..0b3d4bcd3a28 100644
--- a/llm/benchmark.sh
+++ b/llm/benchmark.sh
@@ -23,15 +23,15 @@
 export FLAGS_use_autotune=1
 export FLAGS_cublaslt_exhaustive_search_times=10
 export FLAGS_cache_inference_while_scope=1
 
-model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift_mp2"}
-src_len=${2:-300}
-dec_len=${3:-100}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 
 total_len=`expr ${src_len} + ${dec_len}`
 
 python -m paddle.distributed.launch \
-    --gpus "6,7" \
+    --gpus "0,1,2,3,4,5,6,7" \
     predictor.py \
     --model_name_or_path ./inference_model/${model_dir} \
     --dtype float16 \
@@ -39,8 +39,9 @@ python -m paddle.distributed.launch \
     --max_length ${dec_len} \
     --output_file "infer.json" \
     --mode "static" \
-    --batch_size 1 \
+    --batch_size 128 \
     --benchmark \
     --block_attn \
     --block_size 64 \
-    --inference_model
\ No newline at end of file
+    --inference_model \
+    --use_cachekv_int8 static
\ No newline at end of file
diff --git a/llm/export.sh b/llm/export.sh
index 94097dbfef9d..592b4a4b7fab 100644
--- a/llm/export.sh
+++ b/llm/export.sh
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
 
-model_dir=${1:-"checkpoints/llama65b_ptq"}
-src_len=${2:-1024}
-dec_len=${3:-1024}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 quant_type=${4:-"a8w8"}
 
 total_len=`expr ${src_len} + ${dec_len}`
 
diff --git a/llm/predictor.py b/llm/predictor.py
index 63e19d63eefe..61f5154066f3 100644
--- a/llm/predictor.py
+++ b/llm/predictor.py
@@ -1115,11 +1115,11 @@ def predict(self, input_texts: str | list[str]):
 
     def _preprocess(self, source):
         for i, text in enumerate(source):
-            print("text: ", text)
+            # print("text: ", text)
             tokens = self.tokenizer(text, return_tensors="np", padding=False, max_length=(self.config.src_length - self.config.max_length))
             input_ids = tokens["input_ids"][0]
             length = len(input_ids)
-            print("input_ids: ", input_ids)
+            # print("input_ids: ", input_ids)
             print("length: ", length)
             self.inputs["input_ids"][i : i + 1, :length] = input_ids
             self.inputs["penalty_score"][i : i + 1] = self.config.repetition_penalty
@@ -1135,7 +1135,7 @@
             self.inputs["stop_flags"][i : i + 1] = False
             reset_stop_value(self.inputs["not_need_stop"])
             need_block_nums = (length + self.config.max_length + self.pre_cache_length + self.block_size - 1) // self.block_size
-            print("self.free_list", self.free_list)
+            # print("self.free_list", self.free_list)
             for bi in range(need_block_nums):
                 bi_now = self.free_list.pop()
                 self.used_list[i].append(bi_now)
@@ -1423,10 +1423,13 @@ def predict():
 
     source_texts = []
     data_file = open("humaneval_solution.json", 'r')
+
+    dataset = []
+    for line in data_file.readlines():
+        dataset.append(json.loads(line))
     for i in range(predictor_args.batch_size):
-        line = data_file.readline()
-        data = json.loads(line)
+        data = dataset[i % 164]
         source_texts.append(data["prompt"])
 
@@ -1463,8 +1466,8 @@ def benchmark(predictor, predictor_args, model_args):
     batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
 
     print("***********Start Benchmark**********")
-    warmup_time = 3
-    test_time = 20
+    warmup_time = 2
+    test_time = 10
 
     print("***********Start Warmup**********")
     for i in range(warmup_time):
diff --git a/llm/run_dygraph.sh b/llm/run_dygraph.sh
index c1fe635a5c5f..5ad18e3dd976 100644
--- a/llm/run_dygraph.sh
+++ b/llm/run_dygraph.sh
@@ -25,9 +25,7 @@
 export FLAGS_new_executor_serial_run=1
 export FLAGS_allocator_strategy=naive_best_fit
 export FLAGS_fraction_of_gpu_memory_to_use=0.92
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
-
-model_dir=${1:-"checkpoints/llama65b_ptq"}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
 src_len=${2:-1024}
 dec_len=${3:-1024}
 quant_type=${4:-"a8w8"}
diff --git a/llm/run_static.sh b/llm/run_static.sh
index 9a08abb0d83f..fc05c5bc1775 100644
--- a/llm/run_static.sh
+++ b/llm/run_static.sh
@@ -21,13 +21,11 @@
 export FLAGS_control_flow_use_new_executor=1
 export FLAGS_new_executor_serial_run=1
 export FLAGS_allocator_strategy=naive_best_fit
 export FLAGS_fraction_of_gpu_memory_to_use=0.92
-export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800
-
-model_dir=${1:-"checkpoints/llama65b_ptq_mp8"}
-src_len=${2:-1024}
-dec_len=${3:-1024}
+model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
+src_len=${2:-1100}
+dec_len=${3:-330}
 quant_type=${4:-"a8w8"}
 
 total_len=`expr ${src_len} + ${dec_len}`
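
A minimal standalone sketch of the prompt-loading pattern the predictor.py hunk
introduces, assuming humaneval_solution.json is a JSON-lines file whose records
carry a "prompt" field; the literal 164 in the patch matches the HumanEval
problem count, which len(dataset) expresses without the magic number:

import json

# Read every HumanEval record once, instead of re-reading the file per batch slot.
with open("humaneval_solution.json", "r") as data_file:
    dataset = [json.loads(line) for line in data_file]

batch_size = 128  # matches the new --batch_size default in benchmark.sh
# Cycle through the dataset so a batch larger than the dataset still fills up.
source_texts = [dataset[i % len(dataset)]["prompt"] for i in range(batch_size)]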
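
The --block_attn/--block_size flags pair with the need_block_nums line visible in
the _preprocess context above. A worked example of that ceiling division under the
new defaults, assuming a prompt tokenized to the full src_len of 1100 tokens and a
pre_cache_length of 0 (both hypothetical values for illustration):

length = 1100            # tokenized prompt length, here at the src_len cap
max_length = 330         # dec_len default from benchmark.sh
pre_cache_length = 0     # assumed: no prefix cache
block_size = 64          # --block_size flag

# Integer ceiling division, exactly as written in _preprocess:
need_block_nums = (length + max_length + pre_cache_length + block_size - 1) // block_size
print(need_block_nums)   # (1100 + 330 + 0 + 63) // 64 = 23 blocks per sequence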