
Fixed failing tests for mps device #3143

Closed · wants to merge 2 commits
2 changes: 2 additions & 0 deletions tests/ignite/distributed/comp_models/test_base.py
@@ -19,6 +19,8 @@ def test_serial_model():
assert model.get_node_rank() == 0
if torch.cuda.is_available():
assert model.device().type == "cuda"
elif torch.backends.mps.is_available():
assert model.device().type == "mps"
else:
assert model.device().type == "cpu"
assert model.backend() is None
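
Every mps assertion this PR adds follows the same resolution order: cuda first, then mps, then cpu. A minimal standalone sketch of that order (the helper name is hypothetical; it only mirrors what the updated tests assert about the resolved device):

import torch

def expected_device_type() -> str:
    # Hypothetical helper, not part of the PR: encodes the order the
    # updated tests assert for a single-process run.
    if torch.cuda.is_available():
        return "cuda"
    # torch.backends.mps only exists on PyTorch builds >= 1.12
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"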
2 changes: 1 addition & 1 deletion tests/ignite/distributed/test_launcher.py
@@ -262,7 +262,7 @@ def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname,
_torch_version_le_112 and torch.backends.mps.is_available(), reason="Temporary skip if MPS is available"
)
def test_idist_parallel_no_dist():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = idist.device()
with idist.Parallel(backend=None) as parallel:
parallel.run(_test_func, ws=1, device=device, backend=None, true_init_method=None)

4 changes: 4 additions & 0 deletions tests/ignite/distributed/utils/test_serial.py
@@ -22,6 +22,8 @@ def test_no_distrib(capsys):
assert idist.backend() is None
if torch.cuda.is_available():
assert idist.device().type == "cuda"
elif torch.backends.mps.is_available():
Collaborator: We have to put a guard, _torch_version_le_112, here, as we also have tests for older PyTorch versions where the mps backend does not exist.
Collaborator: Also, to run the test, you need to remove the @pytest.mark.skipif.
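
A hedged sketch of what these two comments ask for, assuming _torch_version_le_112 is the same version flag used by the skipif in tests/ignite/distributed/test_launcher.py and is False on builds that predate torch.backends.mps:

if torch.cuda.is_available():
    assert idist.device().type == "cuda"
elif _torch_version_le_112 and torch.backends.mps.is_available():
    # the short-circuit keeps older builds from touching torch.backends.mps
    assert idist.device().type == "mps"
else:
    assert idist.device().type == "cpu"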

assert idist.device().type == "mps"
else:
assert idist.device().type == "cpu"
assert idist.get_rank() == 0
@@ -43,6 +45,8 @@ def test_no_distrib(capsys):
assert "ignite.distributed.utils INFO: backend: None" in out[-1]
if torch.cuda.is_available():
assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
elif torch.backends.mps.is_available():
Collaborator: Same here.

assert "ignite.distributed.utils INFO: device: mps" in out[-1]
else:
assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
65 changes: 22 additions & 43 deletions tests/ignite/engine/test_create_supervised.py
@@ -32,13 +32,11 @@ def __init__(self, output_as_list=False):
self.output_as_list = output_as_list
self.fc = torch.nn.Linear(1, 1, bias=False)

def forward(self, x, bias=None):
if bias is None:
bias = 0.0
def forward(self, x):
Collaborator: Let's revert this change.
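
For reference, a sketch of the method as it stands on master, reassembled from the deleted lines above; this is what reverting would restore:

def forward(self, x, bias=None):
    if bias is None:
        bias = 0.0
    if self.output_as_list:
        return self.fc(x) + bias, self.fc(x) + bias
    return self.fc(x) + bias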

if self.output_as_list:
return self.fc(x) + bias, self.fc(x) + bias
return self.fc(x), self.fc(x)

return self.fc(x) + bias
return self.fc(x)


@@ -49,7 +47,6 @@ def _default_create_supervised_trainer(
amp_mode: str = None,
scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
with_model_transform: bool = False,
with_model_fn: bool = False,
):
if with_model_transform:

@@ -69,8 +66,8 @@ def get_first_element(output):
optimizer = SGD(model.parameters(), 0.1)

if trace:
example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1)
model = torch.jit.trace(model, example_inputs)
example_input = torch.randn(1)
vfdev-5 (Collaborator), Nov 23, 2023: Same here, let's revert this. Probably you need to merge origin/master into your branch. Also, it is good practice to work on git branches rather than directly on master (pranavvp16:master).
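
For reference, the deleted lines traced the model with a tuple of example inputs; torch.jit.trace takes a tuple when forward has more than one argument. A minimal sketch with a hypothetical stand-in module:

import torch

class TwoArgModule(torch.nn.Module):  # hypothetical stand-in for the test's model
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(1, 1, bias=False)

    def forward(self, x, bias):
        return self.fc(x) + bias

model = TwoArgModule()
# a tuple supplies one example input per forward argument
traced = torch.jit.trace(model, (torch.randn(1), torch.randn(1)))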

model = torch.jit.trace(model, example_input)

if amp_mode == "apex" and model_device == trainer_device == "cuda":
from apex import amp
@@ -87,9 +84,6 @@ def get_first_element(output):
scaler=scaler,
gradient_accumulation_steps=gradient_accumulation_steps,
model_transform=model_transform if model_transform is not None else lambda x: x,
model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device)))
if with_model_fn
else (lambda model, x: model(x)),
)
assert model.fc.weight.data[0, 0].item() == approx(0.0)
return trainer, model
@@ -103,7 +97,6 @@ def _test_create_supervised_trainer(
amp_mode: str = None,
scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
with_model_transform: bool = False,
with_model_fn: bool = False,
):
trainer, model = _default_create_supervised_trainer(
gradient_accumulation_steps=gradient_accumulation_steps,
@@ -113,13 +106,10 @@ def _test_create_supervised_trainer(
amp_mode=amp_mode,
scaler=scaler,
with_model_transform=with_model_transform,
with_model_fn=with_model_fn,
)

x = torch.tensor([[0.01], [0.02], [0.03], [0.04], [0.05]])
y = torch.tensor([[0.015], [0.025], [0.035], [0.045], [0.055]])
if with_model_fn:
y += 0.01
data = [(_x, _y) for _x, _y in zip(x, y)]

theta = [0.0]
@@ -131,14 +121,12 @@ def _():
assert model.fc.weight.grad != 0
_x, _y = trainer.state.batch
_x, _y = _x.to(model_device), _y.to(model_device)
bias = 0.01 if with_model_fn else 0.0
accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - (_y.item() - bias))
accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - _y.item())
# value of loss should not be accumulated
_y_pred = model(_x, torch.tensor([bias], device=model_device)) if with_model_fn else model(_x)
if with_model_transform:
_y_pred = _y_pred[0]

loss[0] = mse_loss(_y_pred, _y).item()
loss[0] = mse_loss(model(_x)[0], _y).item()
else:
loss[0] = mse_loss(model(_x), _y).item()
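
# Aside (not part of the diff): a hedged check of the 0.2 constant in the
# accumulation line above. For a bias-free linear model under mse_loss,
# d/dtheta (theta*x - y)**2 = 2*x*(theta*x - y), and an SGD step with
# lr=0.1 therefore moves theta by 0.2*x*(theta*x - y):
lr, theta_val, x_val, y_val = 0.1, 0.0, 0.01, 0.015
grad = 2 * x_val * (theta_val * x_val - y_val)
assert abs(lr * grad - 0.2 * x_val * (theta_val * x_val - y_val)) < 1e-12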

@trainer.on(Events.ITERATION_COMPLETED(every=gradient_accumulation_steps))
def _():
@@ -232,7 +220,6 @@ def _default_create_supervised_evaluator(
trace: bool = False,
amp_mode: str = None,
with_model_transform: bool = False,
with_model_fn: bool = False,
):
if with_model_transform:

@@ -251,17 +238,14 @@ def get_first_element(output):
model.fc.weight.data.zero_()

if trace:
example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1)
model = torch.jit.trace(model, example_inputs)
example_input = torch.randn(1, 1)
model = torch.jit.trace(model, example_input)

evaluator = create_supervised_evaluator(
model,
device=evaluator_device,
amp_mode=amp_mode,
model_transform=model_transform if model_transform is not None else lambda x: x,
model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device)))
if with_model_fn
else (lambda model, x: model(x)),
)

assert model.fc.weight.data[0, 0].item() == approx(0.0)
@@ -275,29 +259,27 @@ def _test_create_supervised_evaluator(
trace: bool = False,
amp_mode: str = None,
with_model_transform: bool = False,
with_model_fn: bool = False,
):
model, evaluator = _default_create_supervised_evaluator(
model_device=model_device,
evaluator_device=evaluator_device,
trace=trace,
amp_mode=amp_mode,
with_model_transform=with_model_transform,
with_model_fn=with_model_fn,
)
x = torch.tensor([[1.0], [2.0]])
y = torch.tensor([[3.0], [5.0]])
if with_model_fn:
y += 0.01
x = torch.tensor([[1.0], [2.0]], device=model_device)
y = torch.tensor([[3.0], [5.0]], device=evaluator_device)
data = [(x, y)]

if model_device == evaluator_device or ((model_device == "cpu") ^ (evaluator_device == "cpu")):
if (
model_device == evaluator_device
or ((model_device == "cpu") ^ (evaluator_device == "cpu"))
or ((model_device == "mps") ^ (evaluator_device == "mps"))
):
state = evaluator.run(data)

y_pred, y = state.output
if with_model_fn:
y_pred -= 0.01
y -= 0.01

assert y_pred[0, 0].item() == approx(0.0)
assert y_pred[1, 0].item() == approx(0.0)
assert y[0, 0].item() == approx(3.0)
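
The reworked guard above runs the evaluator when the two devices match, or when exactly one side is "cpu", or when exactly one side is "mps" (the new clause). A quick standalone check of that predicate (function name hypothetical):

def devices_compatible(model_device: str, evaluator_device: str) -> bool:
    # hypothetical restatement of the test's guard, for inspection
    return (
        model_device == evaluator_device
        or ((model_device == "cpu") ^ (evaluator_device == "cpu"))
        or ((model_device == "mps") ^ (evaluator_device == "mps"))
    )

assert devices_compatible("mps", "mps")
assert devices_compatible("mps", "cpu")  # mixed mps/cpu pairs now run
assert devices_compatible("cuda", "cpu")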
@@ -420,7 +402,6 @@ def test_create_supervised_trainer(trainer_device, trace):
_test_create_supervised_trainer(gradient_accumulation_steps=1, trainer_device=trainer_device, trace=trace)
_test_create_supervised_trainer(gradient_accumulation_steps=3, trainer_device=trainer_device, trace=trace)
_test_create_supervised_trainer(with_model_transform=True, trainer_device=trainer_device, trace=trace)
_test_create_supervised_trainer(with_model_fn=True, trainer_device=trainer_device, trace=trace)
_test_create_mocked_supervised_trainer(trainer_device=trainer_device, trace=trace)


@@ -618,8 +599,6 @@ def test_create_supervised_evaluator():

def test_create_supervised_evaluator():
_test_create_supervised_evaluator()
_test_create_supervised_evaluator(with_model_transform=True)
_test_create_supervised_evaluator(with_model_fn=True)
_test_mocked_supervised_evaluator()

# older versions didn't have the autocast method so we skip the test for older builds
@@ -669,10 +648,10 @@ def test_create_supervised_evaluator_on_mps():
_test_mocked_supervised_evaluator(model_device=model_device, evaluator_device=evaluator_device)


@pytest.mark.skipif(not (_torch_version_le_112 and torch.backends.mps.is_available()), reason="Skip if no MPS")
@pytest.mark.skipif(not torch.backends.mps.is_available(), reason="Skip if no MPS Backend")
def test_create_supervised_evaluator_on_mps_with_model_on_cpu():
_test_create_supervised_evaluator(evaluator_device="mps")
_test_mocked_supervised_evaluator(evaluator_device="mps")
_test_create_supervised_evaluator(model_device="mps", evaluator_device="mps")
_test_mocked_supervised_evaluator(model_device="mps", evaluator_device="mps")
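
One caveat with the loosened skipif above: on PyTorch builds that predate the mps backend, torch.backends.mps does not exist, so evaluating the unguarded condition raises AttributeError at collection time. A hedged sketch of the version-guarded form the reviewers favor elsewhere in this PR (assuming _torch_version_le_112 is False on such builds, so the condition short-circuits):

@pytest.mark.skipif(
    not (_torch_version_le_112 and torch.backends.mps.is_available()),
    reason="Skip if no MPS",
)
def test_create_supervised_evaluator_on_mps_with_model_on_cpu():
    ...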


@pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0")