Skip to content

Commit

Permalink
remove cpu restriction for bnb training (#3062)
Browse files Browse the repository at this point in the history
* rm cpu restriction for 8-bit training

* check bnb version

* def is bnb multi backend available

* fix log
  • Loading branch information
jiqing-feng authored Sep 30, 2024
1 parent 018a99e commit 5060574
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 5 deletions.
12 changes: 7 additions & 5 deletions src/accelerate/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
get_mixed_precision_context_manager,
get_pretty_name,
is_bf16_available,
is_bitsandbytes_multi_backend_available,
is_deepspeed_available,
is_ipex_available,
is_lomo_available,
Expand Down Expand Up @@ -1425,8 +1426,8 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
model_devices = set(model.hf_device_map.values())
if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
raise ValueError(
"You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode."
" In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
"You can't train a model that has been loaded in 8-bit or 4-bit precision on multiple devices in any distributed mode."
" In order to use 8-bit or 4-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
" Therefore you should not specify that you are under any distributed regime in your accelerate config."
)
elif len(model_devices) == 1:
Expand All @@ -1439,13 +1440,14 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
# if on the first device (GPU 0) we don't care
if (self.device.index is not None) or (current_device_index != 0):
raise ValueError(
"You can't train a model that has been loaded in 8-bit precision on a different device than the one "
"You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one "
"you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`"
)

if "cpu" in model_devices or "disk" in model_devices:
if ("cpu" in model_devices and not is_bitsandbytes_multi_backend_available()) or "disk" in model_devices:
raise ValueError(
"You can't train a model that has been loaded in 8-bit precision with CPU or disk offload."
"You can't train a model that has been loaded in 8-bit or 4-bit precision with CPU or disk offload. "
"If you want train the 8-bit or 4-bit model in CPU, please install bitsandbytes with multi-backend, see https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend"
)
elif device_placement and not self.verify_device_map(model):
model = model.to(self.device)
Expand Down
1 change: 1 addition & 0 deletions src/accelerate/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
is_8bit_bnb_available,
is_aim_available,
is_bf16_available,
is_bitsandbytes_multi_backend_available,
is_bnb_available,
is_boto3_available,
is_ccl_available,
Expand Down
8 changes: 8 additions & 0 deletions src/accelerate/utils/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ def is_bnb_available():
return _is_package_available("bitsandbytes")


def is_bitsandbytes_multi_backend_available():
    """Return True when the installed ``bitsandbytes`` build advertises multi-backend support.

    Short-circuits to False if ``bitsandbytes`` is not installed at all; otherwise
    inspects the package's ``features`` set (absent on older single-backend builds,
    hence the ``getattr`` default).
    """
    if not is_bnb_available():
        return False

    # Imported lazily so merely calling this helper never hard-requires bitsandbytes.
    import bitsandbytes as bnb

    backend_features = getattr(bnb, "features", set())
    return "multi_backend" in backend_features


def is_torchvision_available():
    """Return True if the ``torchvision`` package can be found in the environment."""
    available = _is_package_available("torchvision")
    return available

Expand Down

0 comments on commit 5060574

Please sign in to comment.