CUDA error #1

Open · yuw444 opened this issue Jul 12, 2021 · 0 comments

yuw444 commented Jul 12, 2021

Hi,

I tried both the pancreas and pbmc examples. The pancreas example worked great on both CPU and GPU, but the pbmc example fails on GPU with the following error.

```
RuntimeError Traceback (most recent call last)
in
----> 1 hd_ae.train(num_epochs=100)
2 source_embeddings = hd_ae.embed_data(source_adata)
3 sc.pp.neighbors(source_embeddings)
4 sc.tl.umap(source_embeddings)
5 sc.pl.umap(source_embeddings, color=['cell_type', 'study'], wspace=0.4)

~/.local/lib/python3.8/site-packages/hd_ae/models.py in train(self, num_epochs)
122 train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
123 trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0, max_epochs=num_epochs)
--> 124 trainer.fit(self.model, train_loader)
125 self.model.eval()
126

~/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
456 )
457
--> 458 self._run(model)
459
460 assert self.state.stopped

~/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
713 self.call_setup_hook(model) # allow user to setup lightning_module in accelerator environment
714 self.call_configure_sharded_model(model) # allow user to setup in model sharded environment
--> 715 self.accelerator.setup(self, model) # note: this sets up self.lightning_module
716
717 # ----------------------------

~/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/gpu.py in setup(self, trainer, model)
39 self.set_nvidia_flags(trainer.local_rank)
40 torch.cuda.set_device(self.root_device)
---> 41 return super().setup(trainer, model)
42
43 def on_train_start(self) -> None:

~/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in setup(self, trainer, model)
88 model: the LightningModule
89 """
---> 90 self.setup_training_type_plugin(self.training_type_plugin, model)
91 if not self.training_type_plugin.setup_optimizers_in_pre_dispatch:
92 self.setup_optimizers(trainer)

~/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in setup_training_type_plugin(self, plugin, model)
381 def setup_training_type_plugin(self, plugin: TrainingTypePlugin, model: 'pl.LightningModule') -> None:
382 """Attaches the training type plugin to the accelerator."""
--> 383 plugin.setup(model)
384
385 def setup_precision_plugin(self, plugin: PrecisionPlugin) -> None:

~/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/single_device.py in setup(self, model)
67
68 def setup(self, model: torch.nn.Module) -> torch.nn.Module:
---> 69 self.model_to_device()
70 return self.model
71

~/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/single_device.py in model_to_device(self)
64 torch.cuda.set_device(self.root_device)
65
---> 66 self._model.to(self.root_device)
67
68 def setup(self, model: torch.nn.Module) -> torch.nn.Module:

~/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/device_dtype_mixin.py in to(self, *args, **kwargs)
107 out = torch._C._nn._parse_to(*args, **kwargs)
108 self.__update_properties(device=out[0], dtype=out[1])
--> 109 return super().to(*args, **kwargs)
110
111 def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Module:

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
850 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
851
--> 852 return self._apply(convert)
853
854 def register_backward_hook(

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _apply(self, fn)
528 def _apply(self, fn):
529 for module in self.children():
--> 530 module._apply(fn)
531
532 def compute_should_use_set_data(tensor, tensor_applied):

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _apply(self, fn)
528 def _apply(self, fn):
529 for module in self.children():
--> 530 module._apply(fn)
531
532 def compute_should_use_set_data(tensor, tensor_applied):

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in _apply(self, fn)
550 # with torch.no_grad():
551 with torch.no_grad():
--> 552 param_applied = fn(param)
553 should_use_set_data = compute_should_use_set_data(param, param_applied)
554 if should_use_set_data:

~/.local/lib/python3.8/site-packages/torch/nn/modules/module.py in convert(t)
848 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
849 non_blocking, memory_format=convert_to_format)
--> 850 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
851
852 return self._apply(convert)

RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
```

I have tested the pbmc example on CPU and it works fine; I just wonder why the GPU run fails.
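For what it's worth, here is a minimal sketch (plain PyTorch, outside hd_ae; the tensor shape is arbitrary) that exercises the same CPU-to-GPU transfer the trainer performs, and also sets the CUDA_LAUNCH_BLOCKING flag the error message suggests:

```python
import os

# Optional: make CUDA errors surface at the failing call rather than
# asynchronously, as the error message above suggests. Set this before
# CUDA is initialized.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

# Report what PyTorch can see.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device count:", torch.cuda.device_count())
    print("Device name:", torch.cuda.get_device_name(0))

    # The traceback fails inside model.to(device); moving any tensor
    # reproduces the same transfer without involving hd_ae.
    x = torch.randn(4, 4).to("cuda:0")
    print("Transfer OK:", x.device)
```

If this snippet raises the same RuntimeError, the GPU is busy or unavailable independently of hd_ae, for example because another process holds it or the device is in exclusive-compute mode.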

Please advise.

Thanks
