Skip to content

Commit

Permalink
Refine code
Browse files Browse the repository at this point in the history
  • Loading branch information
RichardWooSJTU committed Dec 12, 2023
1 parent 58108c6 commit f886192
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 25 deletions.
13 changes: 7 additions & 6 deletions llm/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,25 @@ export FLAGS_use_autotune=1
export FLAGS_cublaslt_exhaustive_search_times=10
export FLAGS_cache_inference_while_scope=1

model_dir=${1:-"checkpoints/llama_ptq_ckpts_smooth_all_shift_mp2"}
src_len=${2:-300}
dec_len=${3:-100}
model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
src_len=${2:-1100}
dec_len=${3:-330}

total_len=`expr ${src_len} + ${dec_len}`


python -m paddle.distributed.launch \
--gpus "6,7" \
--gpus "0,1,2,3,4,5,6,7" \
predictor.py \
--model_name_or_path ./inference_model/${model_dir} \
--dtype float16 \
--src_length ${total_len} \
--max_length ${dec_len} \
--output_file "infer.json" \
--mode "static" \
--batch_size 1 \
--batch_size 128 \
--benchmark \
--block_attn \
--block_size 64 \
--inference_model
--inference_model \
--use_cachekv_int8 static
7 changes: 3 additions & 4 deletions llm/export.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
# limitations under the License.

export PYTHONPATH=$(dirname $(pwd)):$PYTHONPATH
export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800


model_dir=${1:-"checkpoints/llama65b_ptq"}
src_len=${2:-1024}
dec_len=${3:-1024}
model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
src_len=${2:-1100}
dec_len=${3:-330}
quant_type=${4:-"a8w8"}

total_len=`expr ${src_len} + ${dec_len}`
Expand Down
17 changes: 10 additions & 7 deletions llm/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1115,11 +1115,11 @@ def predict(self, input_texts: str | list[str]):

def _preprocess(self, source):
for i, text in enumerate(source):
print("text: ", text)
# print("text: ", text)
tokens = self.tokenizer(text, return_tensors="np", padding=False, max_length=(self.config.src_length - self.config.max_length))
input_ids = tokens["input_ids"][0]
length = len(input_ids)
print("input_ids: ", input_ids)
# print("input_ids: ", input_ids)
print("length: ", length)
self.inputs["input_ids"][i : i + 1, :length] = input_ids
self.inputs["penalty_score"][i : i + 1] = self.config.repetition_penalty
Expand All @@ -1135,7 +1135,7 @@ def _preprocess(self, source):
self.inputs["stop_flags"][i : i + 1] = False
reset_stop_value(self.inputs["not_need_stop"])
need_block_nums = (length + self.config.max_length + self.pre_cache_length + self.block_size - 1) // self.block_size
print("self.free_list", self.free_list)
# print("self.free_list", self.free_list)
for bi in range(need_block_nums):
bi_now = self.free_list.pop()
self.used_list[i].append(bi_now)
Expand Down Expand Up @@ -1423,10 +1423,13 @@ def predict():
source_texts = []

data_file = open("humaneval_solution.json", 'r')

dataset = []
for line in data_file.readlines():
dataset.append(json.loads(line))

for i in range(predictor_args.batch_size):
line = data_file.readline()
data = json.loads(line)
data = dataset[i % 164]
source_texts.append(data["prompt"])


Expand Down Expand Up @@ -1463,8 +1466,8 @@ def benchmark(predictor, predictor_args, model_args):
batch_benchmark_texts = batchfy_text(benchmark_texts, predictor_args.batch_size)
print("***********Start Benchmark**********")

warmup_time = 3
test_time = 20
warmup_time = 2
test_time = 10

print("***********Start Warmup**********")
for i in range(warmup_time):
Expand Down
4 changes: 1 addition & 3 deletions llm/run_dygraph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ export FLAGS_new_executor_serial_run=1
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.92

export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800

model_dir=${1:-"checkpoints/llama65b_ptq"}
model_dir=${1:-"checkpoints/llama65b_ptq_smooth"}
src_len=${2:-1024}
dec_len=${3:-1024}
quant_type=${4:-"a8w8"}
Expand Down
8 changes: 3 additions & 5 deletions llm/run_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,11 @@ export FLAGS_control_flow_use_new_executor=1
export FLAGS_new_executor_serial_run=1
export FLAGS_allocator_strategy=naive_best_fit
export FLAGS_fraction_of_gpu_memory_to_use=0.92
export DISTRIBUTED_TRAINER_ENDPOINTS=10.174.140.213:60105,10.174.140.213:60122,10.174.140.213:60196,10.174.140.213:60232,10.174.140.213:60257,10.174.140.213:60317,10.174.140.213:60458,10.174.140.213:60800



model_dir=${1:-"checkpoints/llama65b_ptq_mp8"}
src_len=${2:-1024}
dec_len=${3:-1024}
model_dir=${1:-"checkpoints/llama65b_ptq_smooth_mp8"}
src_len=${2:-1100}
dec_len=${3:-330}
quant_type=${4:-"a8w8"}

total_len=`expr ${src_len} + ${dec_len}`
Expand Down

0 comments on commit f886192

Please sign in to comment.