
Commit

Merge branch 'hpcaitech:main' into feat/allgather_overlap
BurkeHulk authored Jan 15, 2025
2 parents 348520d + 5b094a8 commit 2254d8b
Showing 60 changed files with 4,924 additions and 711 deletions.
1 change: 1 addition & 0 deletions .github/workflows/doc_check_on_pr.yml
@@ -58,6 +58,7 @@ jobs:
# there is no main branch, so it's safe to checkout the main branch from the merged branch
# docer will rebase the remote main branch to the merged branch, so we have to config user
- name: Make the merged branch main

run: |
cd ColossalAI
git checkout -b main
1 change: 1 addition & 0 deletions .github/workflows/release_test_pypi_before_merge.yml
@@ -49,6 +49,7 @@ jobs:
# we need to install the requirements.txt first
# as test-pypi may not contain the distributions for libs listed in the txt file
pip install -r requirements/requirements.txt
pip install -U setuptools==68.2.2 wheel
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION
env:
VERSION: ${{ steps.prep-version.outputs.version }}
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.2
rev: v19.1.5
hooks:
- id: clang-format
name: clang formatter
3 changes: 2 additions & 1 deletion README.md
@@ -9,7 +9,7 @@
<a href="https://www.colossalai.org/"> Documentation </a> |
<a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
<a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
<a href="https://cloud.luchentech.com/">GPU Cloud Playground </a> |
<a href="https://colossalai.org/zh-Hans/docs/get_started/bonus/">GPU Cloud Playground </a> |
<a href="https://hpc-ai.com/blog"> Blog </a></h3>

[![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
@@ -44,6 +44,7 @@ Limited Academic Bonuses:


## Latest News
* [2024/12] [The development cost of video generation models has saved by 50%! Open-source solutions are now available with H200 GPU vouchers](https://company.hpc-ai.com/blog/the-development-cost-of-video-generation-models-has-saved-by-50-open-source-solutions-are-now-available-with-h200-gpu-vouchers) [[code]](https://github.com/hpcaitech/Open-Sora/blob/main/scripts/train.py) [[vouchers]](https://colossalai.org/zh-Hans/docs/get_started/bonus/)
* [2024/10] [How to build a low-cost Sora-like app? Solutions for you](https://company.hpc-ai.com/blog/how-to-build-a-low-cost-sora-like-app-solutions-for-you)
* [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
* [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
32 changes: 25 additions & 7 deletions colossalai/booster/booster.py
@@ -288,7 +288,14 @@ def enable_lora(

return self.plugin.enable_lora(model, pretrained_dir, lora_config, bnb_quantization_config)

def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
def load_model(
self,
model: Union[nn.Module, ModelWrapper],
checkpoint: str,
strict: bool = True,
low_cpu_mem_mode: bool = True,
num_threads: int = 1,
) -> None:
"""Load model from checkpoint.
Args:
@@ -298,8 +305,12 @@ def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, str
strict (bool, optional): whether to strictly enforce that the keys
in :attr:`state_dict` match the keys returned by this module's
:meth:`~torch.nn.Module.state_dict` function. Defaults to True.
low_cpu_mem_mode (bool): whether to load the model in low cpu memory mode. If false, it will use RAM cache to accelerate loading. Default: True.
num_threads (int): number of threads to use when loading the model. Only useful when disabling low cpu mem mode. Default: 1.
"""
self.checkpoint_io.load_model(model, checkpoint, strict)
self.checkpoint_io.load_model(
model, checkpoint, strict, low_cpu_mem_mode=low_cpu_mem_mode, num_threads=num_threads
)

def save_model(
self,
@@ -338,18 +349,25 @@ def save_model(
use_async=use_async,
)

def load_optimizer(self, optimizer: Optimizer, checkpoint: str) -> None:
def load_optimizer(
self,
optimizer: Optimizer,
checkpoint: str,
low_cpu_mem_mode: bool = True,
num_threads: int = 1,
) -> None:
"""Load optimizer from checkpoint.
Args:
optimizer (Optimizer): An optimizer boosted by Booster.
checkpoint (str): Path to the checkpoint. It must be a local path.
It should be a directory path if the checkpoint is sharded. Otherwise, it should be a file path.
prefix (str, optional): A prefix added to parameter and buffer
names to compose the keys in state_dict. Defaults to None.
size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
low_cpu_mem_mode (bool): whether to load the optimizer in low cpu memory mode. If false, it will use RAM cache to accelerate loading. Default: True.
num_threads (int): number of threads to use when loading the optimizer. Only useful when disabling low cpu mem mode. Default: 1.
"""
self.checkpoint_io.load_optimizer(optimizer, checkpoint)
self.checkpoint_io.load_optimizer(
optimizer, checkpoint, low_cpu_mem_mode=low_cpu_mem_mode, num_threads=num_threads
)

def save_optimizer(
self,
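The two hunks above add low_cpu_mem_mode and num_threads keyword arguments to Booster.load_model and Booster.load_optimizer. A minimal usage sketch follows; the plugin, model, optimizer, and checkpoint paths are placeholders from a hypothetical training script, not part of this diff.

# Hypothetical sketch of resuming with the new loading options.
# plugin, model, optimizer, and the checkpoint paths are assumed to be defined
# elsewhere in the caller's script; only the keyword arguments come from this commit.
from colossalai.booster import Booster

booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)

# Default behaviour: low CPU memory mode, single-threaded loading.
booster.load_model(model, "ckpt/model", strict=True)

# Trade host RAM for speed: disable low CPU memory mode and load with several threads.
booster.load_model(model, "ckpt/model", low_cpu_mem_mode=False, num_threads=4)
booster.load_optimizer(optimizer, "ckpt/optimizer", low_cpu_mem_mode=False, num_threads=4)

Disabling low CPU memory mode keeps an extra RAM cache alive to speed up loading, so it only pays off when enough host memory is available.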
167 changes: 127 additions & 40 deletions colossalai/booster/plugin/gemini_plugin.py
@@ -1,4 +1,3 @@
import gc
import os
import random
from pathlib import Path
@@ -17,9 +16,11 @@
from colossalai.accelerator import get_accelerator
from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO
from colossalai.checkpoint_io.utils import (
async_save_state_dict_shards,
create_pinned_state_dict,
get_model_base_filenames,
get_optimizer_base_filenames,
load_shard_state_dict,
load_state_dict_shards,
save_config_file,
save_state_dict,
save_state_dict_shards,
@@ -82,17 +83,34 @@ def save_unsharded_model(
state_dict = model.state_dict(only_rank_0=True)
if self.coordinator.is_master():
if use_async:
super().save_unsharded_model(model, checkpoint, gather_dtensor, use_safetensors, use_async)
from colossalai.utils.safetensors import save

if id(model) not in self.pinned_state_dicts:
self.pinned_state_dicts[id(model)] = create_pinned_state_dict(state_dict)
for k, v in state_dict.items():
self.pinned_state_dicts[id(model)][k].copy_(v)
state_dict[k] = self.pinned_state_dicts[id(model)][k]
writer = save(checkpoint, state_dict)
self.async_writers.append(writer)
else:
save_state_dict(state_dict, checkpoint, use_safetensors)

def load_unsharded_model(self, model: GeminiDDP, checkpoint: str, strict: bool = True):
def load_unsharded_model(
self,
model: GeminiDDP,
checkpoint: str,
strict: bool = True,
low_cpu_mem_mode: bool = True,
num_threads: int = 1,
):
"""
Load model from checkpoint with automatic unwrapping.
The model should be unwrapped in self.load_model via ModelWrapper.unwrap.
"""
assert isinstance(model, GeminiDDP), "Please boost the model before loading!"
super().load_unsharded_model(model, checkpoint, strict=strict)
super().load_unsharded_model(
model, checkpoint, strict=strict, low_cpu_mem_mode=low_cpu_mem_mode, num_threads=num_threads
)

def save_unsharded_optimizer(
self, optimizer: GeminiOptimizer, checkpoint: str, gather_dtensor: bool, use_async: bool = False
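In the async branch of save_unsharded_model above, the plugin keeps a pinned (page-locked) host copy of the state dict keyed by id(model), copies device tensors into it, and hands the file write to a background writer collected in self.async_writers. The sketch below illustrates the same idea in isolation with plain PyTorch and a thread; save_async and _pinned_cache are assumed names for the sketch, not ColossalAI APIs.

# Conceptual sketch of asynchronous checkpoint saving via pinned host buffers.
# save_async and _pinned_cache are illustrative names, not part of ColossalAI.
import threading

import torch

_pinned_cache = {}  # id(model) -> {param name: pinned CPU tensor}

def save_async(model, path):
    state_dict = model.state_dict()
    cache = _pinned_cache.setdefault(
        id(model),
        {k: torch.empty(v.shape, dtype=v.dtype, device="cpu", pin_memory=True) for k, v in state_dict.items()},
    )
    # Device-to-host copies into pinned memory can overlap with other work.
    for k, v in state_dict.items():
        cache[k].copy_(v, non_blocking=True)
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # ensure the copies have finished before writing
    writer = threading.Thread(target=torch.save, args=(cache, path))
    writer.start()
    return writer  # the caller joins the writer before exiting or saving again

Reusing the pinned buffers across calls avoids re-allocating page-locked memory on every checkpoint, which is the point of the id(model) cache in the plugin code.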
@@ -106,15 +124,31 @@ def save_unsharded_optimizer(
assert isinstance(optimizer, GeminiOptimizer), "Please boost the optimizer before saving!"
state_dict = optimizer.state_dict()
if self.coordinator.is_master():
save_state_dict(state_dict, checkpoint, use_safetensors=False)
if use_async:
from colossalai.utils.safetensors import _flatten_optim_state_dict, save

flatten_state_dict, metadata = _flatten_optim_state_dict(state_dict)
if id(optimizer) not in self.pinned_state_dicts:
self.pinned_state_dicts[id(optimizer)] = create_pinned_state_dict(flatten_state_dict)
for k, v in flatten_state_dict.items():
self.pinned_state_dicts[id(optimizer)][k].copy_(v)
flatten_state_dict[k] = self.pinned_state_dicts[id(optimizer)][k]
writer = save(checkpoint, flatten_state_dict, metadata)
self.async_writers.append(writer)
else:
save_state_dict(state_dict, checkpoint, use_safetensors=False)

def load_unsharded_optimizer(self, optimizer: GeminiOptimizer, checkpoint: str):
def load_unsharded_optimizer(
self, optimizer: GeminiOptimizer, checkpoint: str, low_cpu_mem_mode: bool = True, num_threads: int = 1
):
"""
Loading unsharded optimizer from checkpoint file.
For each process, only loading optimizer states of parameters it controls.
"""
assert isinstance(optimizer, GeminiOptimizer), "Please boost the optimizer before loading!"
super().load_unsharded_optimizer(optimizer, checkpoint)
super().load_unsharded_optimizer(
optimizer, checkpoint, low_cpu_mem_mode=low_cpu_mem_mode, num_threads=num_threads
)

def save_sharded_model(
self,
@@ -137,17 +171,29 @@ def save_sharded_model(

Path(checkpoint_path).mkdir(parents=True, exist_ok=True)

state_dict_shard = model.state_dict_shard(max_shard_size=max_shard_size, only_rank_0=True)
if use_async and self.coordinator.is_master():
if id(model) not in self.pinned_state_dicts:
self.pinned_state_dicts[id(model)] = {}
pinned_state_dicts = self.pinned_state_dicts[id(model)]
else:
pinned_state_dicts = None
state_dict_shard = model.state_dict_shard(
max_shard_size=max_shard_size, only_rank_0=True, pinned_state_dicts=pinned_state_dicts
)
weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors)
index_file = CheckpointIndexFile(checkpoint_path)

# Save shards of optimizer states.
is_master = self.coordinator.is_master()
if use_async:
super().save_sharded_model(
model, checkpoint_path, gather_dtensor, prefix, max_shard_size, use_safetensors, use_async
total_size, writers = async_save_state_dict_shards(
sharded_state_dict=state_dict_shard,
checkpoint=checkpoint_path,
index_file=index_file,
base_filename=weights_name,
is_master=is_master,
)

self.async_writers.extend(writers)
else:
total_size = save_state_dict_shards(
sharded_state_dict=state_dict_shard,
@@ -158,26 +204,40 @@ def save_sharded_model(
use_safetensors=use_safetensors,
)

# only save the index file on the master rank
if self.coordinator.is_master():
index_file.append_meta_data("total_size", total_size)
index_file.write_index_file(save_index_file)
save_config_file(model.unwrap(), checkpoint_path)
self.logger.info(
f"The model is split into checkpoint shards. "
f"You can find where each parameters has been saved in the "
f"index located at {save_index_file}.",
ranks=[0],
)
# only save the index file on the master rank
if self.coordinator.is_master():
index_file.append_meta_data("total_size", total_size)
index_file.write_index_file(save_index_file)
save_config_file(model.unwrap(), checkpoint_path)
self.logger.info(
f"The model is split into checkpoint shards. "
f"You can find where each parameters has been saved in the "
f"index located at {save_index_file}.",
ranks=[0],
)

def load_sharded_model(
self, model: GeminiDDP, checkpoint_index_file: Path, strict: bool = False, use_safetensors: bool = False
self,
model: GeminiDDP,
checkpoint_index_file: Path,
strict: bool = False,
use_safetensors: bool = False,
low_cpu_mem_mode: bool = True,
num_threads: int = 1,
):
"""
Load shard model, load model from multiple files.
"""
assert isinstance(model, GeminiDDP), "Please boost the model before loading!"
return super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module=False)
return super().load_sharded_model(
model,
checkpoint_index_file,
strict,
use_safetensors,
load_sub_module=False,
low_cpu_mem_mode=low_cpu_mem_mode,
num_threads=num_threads,
)

def save_sharded_optimizer(
self,
@@ -201,7 +261,7 @@ def save_sharded_optimizer(
Path(checkpoint).mkdir(parents=True, exist_ok=True)

# Preparing file paths and index file.
states_name, save_index_file, param_group_file = get_optimizer_base_filenames(prefix)
states_name, save_index_file, param_group_file = get_optimizer_base_filenames(prefix, use_safetensors=use_async)
index_file = CheckpointIndexFile(checkpoint)
index_file.append_meta_data("param_groups", param_group_file)

@@ -212,17 +272,36 @@ def save_sharded_optimizer(
torch.save(param_groups, group_file_path)

# States are broken into shards within max_shard_size.
state_dict_shard = optimizer.state_shard(prefix=prefix, max_shard_size=size_per_shard, only_rank_0=True)
if use_async and self.coordinator.is_master():
if id(optimizer) not in self.pinned_state_dicts:
self.pinned_state_dicts[id(optimizer)] = {}
pinned_state_dicts = self.pinned_state_dicts[id(optimizer)]
else:
pinned_state_dicts = None
state_dict_shard = optimizer.state_shard(
prefix=prefix, max_shard_size=size_per_shard, only_rank_0=True, pinned_state_dicts=pinned_state_dicts
)

# Save shards of optimizer states.
total_size = save_state_dict_shards(
sharded_state_dict=state_dict_shard,
checkpoint=checkpoint,
index_file=index_file,
base_filename=states_name,
is_master=self.coordinator.is_master(),
use_safetensors=False,
)
if use_async:
total_size, writers = async_save_state_dict_shards(
sharded_state_dict=state_dict_shard,
checkpoint=checkpoint,
index_file=index_file,
base_filename=states_name,
is_master=self.coordinator.is_master(),
state_preprocess=True,
)
self.async_writers.extend(writers)
else:
total_size = save_state_dict_shards(
sharded_state_dict=state_dict_shard,
checkpoint=checkpoint,
index_file=index_file,
base_filename=states_name,
is_master=self.coordinator.is_master(),
use_safetensors=False,
)

# Wrap up index file. Only save it on master rank.
if self.coordinator.is_master():
@@ -235,7 +314,14 @@ def save_sharded_optimizer(
ranks=[0],
)

def load_sharded_optimizer(self, optimizer: GeminiOptimizer, checkpoint_index_file: Path, prefix: str):
def load_sharded_optimizer(
self,
optimizer: GeminiOptimizer,
checkpoint_index_file: Path,
prefix: str,
low_cpu_mem_mode: bool = True,
num_threads: int = 1,
):
"""
Loading sharded optimizer from checkpoint folder, with index file given.
For each process, only loading optimizer states of parameters it controls.
@@ -263,11 +349,12 @@ def load_sharded_optimizer(self, optimizer: GeminiOptimizer, checkpoint_index_fi

# Load optimizer states from shard files under checkpoint path.
# For each file, only load the states managed by current process.
for shard_file in checkpoint_files:
state_dict_shard = load_shard_state_dict(Path(shard_file), use_safetensors=False)
for state_dict_shard in load_state_dict_shards(
checkpoint_files, True, False, low_cpu_mem_mode=low_cpu_mem_mode
):
if not low_cpu_mem_mode:
state_dict_shard = create_pinned_state_dict(state_dict_shard, empty=False, num_threads=num_threads)
optimizer.load_param_states(state_dict_shard)
del state_dict_shard
gc.collect()

optimizer.optimizer_loading_epilogue()

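The rewritten loop above streams shard files through load_state_dict_shards and, when low_cpu_mem_mode is disabled, converts each loaded shard into pinned host memory with create_pinned_state_dict(..., empty=False, num_threads=...). A rough standalone sketch of such a multi-threaded pinning step is below; pin_state_dict is an assumed name that only mirrors the role of the real helper in colossalai.checkpoint_io.utils, and it assumes a flat {name: tensor} mapping for simplicity.

# Illustrative sketch: copy a loaded state dict into pinned CPU memory using a thread pool.
# pin_state_dict is a hypothetical helper, not the ColossalAI implementation.
from concurrent.futures import ThreadPoolExecutor

import torch

def pin_state_dict(state_dict, num_threads=1):
    def _pin(item):
        key, tensor = item
        pinned = torch.empty(tensor.shape, dtype=tensor.dtype, device="cpu", pin_memory=True)
        pinned.copy_(tensor)  # keep the loaded values (the empty=False behaviour)
        return key, pinned

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        return dict(pool.map(_pin, state_dict.items()))

Pinned buffers make later host-to-device transfers faster, which is why the non-low-memory path accepts the extra RAM cost.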
2 changes: 1 addition & 1 deletion colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1488,7 +1488,7 @@ def seed_worker(worker_id):
)

def get_checkpoint_io(self) -> CheckpointIO:
return HybridParallelCheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage)
return HybridParallelCheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.sp_group, self.zero_stage)

def no_sync(self, model: Module, optimizer: OptimizerWrapper) -> Iterator[None]:
assert (
(The diffs for the remaining changed files in this commit are not shown.)
