Commit f8ebc84: merge develop

RichardWooSJTU committed Dec 13, 2023
2 parents: 057b6e1 + a73a7bf

Showing 168 changed files with 9,180 additions and 2,573 deletions.
14 changes: 10 additions & 4 deletions .github/codecov.yml
@@ -1,8 +1,14 @@
+codecov:
+  notify:
+    require_ci_to_pass: yes
+
coverage:
  status:
    project:
-      default:
-        informational: true
+      default:
+        target: 75% # overall project Coverage < 75% CI will fail
+        informational: true
    patch:
-      default:
-        informational: true
+      default:
+        target: 90% # lines adjusted Coverage < 90% CI will fail
+        informational: true
1 change: 0 additions & 1 deletion csrc/setup_cuda.py
@@ -41,7 +41,6 @@ def get_gencode_flags():
    "./generation/transpose_removing_padding.cu",
    "./generation/write_cache_kv.cu",
    "./generation/encode_rotary_qk.cu",
-    "./generation/top_p_sampling.cu",
    "./generation/set_alibi_mask_value.cu",
    "./generation/get_padding_offset_v2.cu",
    "./generation/rebuild_padding_v2.cu",
6 changes: 2 additions & 4 deletions examples/benchmark/glue/run_glue_trainer.py
@@ -214,10 +214,8 @@ def main():
    if model_args.qat:
        from paddle import nn
        from paddle.quantization import QAT, QuantConfig
-        from paddle.quantization.quanters import (
-            FakeQuanterChannelWiseAbsMaxObserver,
-            FakeQuanterWithAbsMaxObserver,
-        )
+        from paddle.quantization.quanters import FakeQuanterWithAbsMaxObserver
+        from paddleslim.quant.quanters import FakeQuanterChannelWiseAbsMaxObserver

        from paddlenlp.peft.lora import LoRALinear
        from paddlenlp.peft.lora.lora_quant_layers import QuantedLoRALinear
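For context, a rough sketch of how the two imported quanters can be wired into a QAT configuration. It follows the general paddle.quantization API rather than the script's exact settings; the no-argument constructors and the choice of nn.Linear as the target layer type are assumptions made for illustration.

import paddle
from paddle import nn
from paddle.quantization import QAT, QuantConfig
from paddle.quantization.quanters import FakeQuanterWithAbsMaxObserver
from paddleslim.quant.quanters import FakeQuanterChannelWiseAbsMaxObserver

# Per-channel abs-max fake quantization for weights, moving abs-max for activations.
q_config = QuantConfig(activation=None, weight=None)
q_config.add_type_config(
    nn.Linear,
    activation=FakeQuanterWithAbsMaxObserver(moving_rate=0.9),
    weight=FakeQuanterChannelWiseAbsMaxObserver(),
)
qat = QAT(q_config)
# model = qat.quantize(model, inplace=True)  # would insert fake-quant layers into an existing model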
14 changes: 13 additions & 1 deletion examples/language_model/moe/dygraph/run_moe_pretrain.py
@@ -626,8 +626,18 @@ def do_train(args):
                avg_loss -= bal_loss
            else:
                bal_loss = -1
+            max_mem_reserved_msg = ""
+            max_mem_allocated_msg = ""
+            if paddle.device.is_compiled_with_cuda():
+                max_mem_reserved_msg = (
+                    f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() >> 20} MB,"
+                )
+                max_mem_allocated_msg = (
+                    f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() >> 20} MB"
+                )
            logger.info(
-                "global step %d, epoch: %d, batch: %d, loss: %.9f, bal_loss: %.9f, speed: %.2f step/s, ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e"
+                "global step %d, epoch: %d, batch: %d, loss: %.9f, bal_loss: %.9f, speed: %.2f step/s, "
+                "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e, %s %s"
                % (
                    global_step,
                    epoch,
@@ -638,6 +648,8 @@
                    speed * default_global_tokens_num,
                    speed * default_global_tokens_num / nranks,
                    learning_rate,
+                    max_mem_reserved_msg,
+                    max_mem_allocated_msg,
                )
            )
            log_writer.add_scalar("loss", float(loss), global_step)
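The same peak-memory reporting pattern is added to the two transformer training scripts below. As a standalone sketch (assuming a CUDA-enabled Paddle build; the right shift by 20, like the // (1024 ** 2) form used elsewhere, converts bytes to MB):

import paddle

max_mem_reserved_msg = ""
max_mem_allocated_msg = ""
if paddle.device.is_compiled_with_cuda():
    # Peak memory reserved by the allocator vs. peak memory actually allocated, in MB.
    max_mem_reserved_msg = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() >> 20} MB,"
    max_mem_allocated_msg = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() >> 20} MB"
print(max_mem_reserved_msg, max_mem_allocated_msg)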
13 changes: 12 additions & 1 deletion examples/machine_translation/transformer/static/train.py
@@ -317,11 +317,20 @@ def do_train(args):
                )
            else:
                train_avg_batch_cost = args.print_step / batch_cost_avg.get_total_time()
+            max_mem_reserved_msg = ""
+            max_mem_allocated_msg = ""
+            if paddle.device.is_compiled_with_cuda():
+                max_mem_reserved_msg = (
+                    f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB,"
+                )
+                max_mem_allocated_msg = (
+                    f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
+                )
            logger.info(
                "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                "normalized loss: %f, ppl: %f, avg_speed: %.2f step/s, "
                "batch_cost: %.5f sec, reader_cost: %.5f sec, tokens: %d, "
-                "ips: %.5f words/sec"
+                "ips: %.5f words/sec, %s %s"
                % (
                    step_idx,
                    pass_id,
@@ -334,6 +343,8 @@
                    reader_cost_avg.get_average(),
                    batch_ips_avg.get_total_cnt(),
                    batch_ips_avg.get_average_per_sec(),
+                    max_mem_reserved_msg,
+                    max_mem_allocated_msg,
                )
            )
            reader_cost_avg.reset()
13 changes: 12 additions & 1 deletion examples/machine_translation/transformer/train.py
@@ -316,11 +316,20 @@ def do_train(args):
                )
            else:
                train_avg_batch_cost = args.print_step / batch_cost_avg.get_total_time()
+            max_mem_reserved_msg = ""
+            max_mem_allocated_msg = ""
+            if paddle.device.is_compiled_with_cuda():
+                max_mem_reserved_msg = (
+                    f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB,"
+                )
+                max_mem_allocated_msg = (
+                    f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
+                )
            logger.info(
                "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                "normalized loss: %f, ppl: %f, avg_speed: %.2f step/sec, "
                "batch_cost: %.5f sec, reader_cost: %.5f sec, tokens: %d, "
-                "ips: %.5f words/sec"
+                "ips: %.5f words/sec, %s %s"
                % (
                    step_idx,
                    pass_id,
@@ -333,6 +342,8 @@
                    reader_cost_avg.get_average(),
                    batch_ips_avg.get_total_cnt(),
                    batch_ips_avg.get_average_per_sec(),
+                    max_mem_reserved_msg,
+                    max_mem_allocated_msg,
                )
            )
            reader_cost_avg.reset()
19 changes: 19 additions & 0 deletions llm/argument.py
@@ -58,6 +58,25 @@ class ModelArgument:
        default=None, metadata={"help": "Build-in pretrained model name or the path to local model."}
    )
    use_flash_attention: bool = field(default=False, metadata={"help": "Whether to use flash attention"})
+    weight_quantize_algo: str = field(
+        default=None,
+        metadata={
+            "help": "Model weight quantization algorithm including 'nf4', 'fp4','weight_only_int4', 'weight_only_int8'."
+        },
+    )
+    weight_blocksize: int = field(
+        default=64,
+        metadata={"help": "Block size for weight quantization(Only available for nf4 or fp4 quant_scale.)."},
+    )
+    weight_double_quant: bool = field(
+        default=False, metadata={"help": "Whether apply double quant(Only available for nf4 or fp4 quant_scale.)."}
+    )
+    weight_double_quant_block_size: int = field(
+        default=256,
+        metadata={
+            "help": "Block size for quant_scale of weight quant_scale(Only available for nf4 or fp4 quant_scale.)"
+        },
+    )

    # LoRA related parameters
    lora: bool = field(default=False, metadata={"help": "Whether to use LoRA technique"})
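These dataclass fields surface as command-line flags once the script parses its arguments. A minimal sketch of reading them; PdArgumentParser and the bare "argument" import path are assumptions about how the llm scripts consume this file, not something shown in this diff:

from paddlenlp.trainer import PdArgumentParser

from argument import ModelArgument  # llm/argument.py

parser = PdArgumentParser(ModelArgument)
# Parse an explicit argv list here instead of sys.argv, just for the example.
(model_args,) = parser.parse_args_into_dataclasses(["--weight_quantize_algo", "nf4", "--weight_blocksize", "64"])
print(model_args.weight_quantize_algo, model_args.weight_double_quant_block_size)  # nf4 256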
5 changes: 3 additions & 2 deletions llm/data.py
@@ -159,7 +159,7 @@ def tokenize_rounds_example(tokenizer, example, data_args):


def convert_example_common(example, tokenizer, data_args, is_test=True, intokens=False):
-    if data_args.chat_template is not None:
+    if tokenizer.chat_template is not None:
        return convert_rounds_example_common(example, tokenizer, data_args, is_test, intokens)

    tokenized_source, tokenized_target_input_ids = tokenize_example(tokenizer, example, data_args)
@@ -208,6 +208,7 @@ def convert_rounds_example_common(example, tokenizer, data_args, is_test=True, i
    input_ids = rounds_inputs.pop("input_ids")
    # shift input_ids and labels
    input_ids, labels = input_ids[:-1], labels[1:]
+
    seq_length = len(input_ids)
    features = {"input_ids": input_ids, "labels": labels}
    if intokens:
@@ -221,7 +222,7 @@


def convert_example_chatglm(example, tokenizer, data_args, is_test=True, intokens=False):
-    if data_args.chat_template is not None:
+    if tokenizer.chat_template is not None:
        # chatglm only support single-round finetune
        example = convert_multi_rounds_to_single_round(example, tokenizer)

