Fix incorrect DDP test (#64074)
author Rohan Varma <rvarm1@fb.com>
Wed, 1 Sep 2021 23:25:00 +0000 (16:25 -0700)
committer Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Wed, 1 Sep 2021 23:34:06 +0000 (16:34 -0700)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/64074

Previous PR https://github.com/pytorch/pytorch/pull/63831 did not actually exercise the error reported in
https://github.com/pytorch/pytorch/issues/63812. This change introduces a test, taken directly from the repro in that issue, that simulates it.
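
For reference, a minimal standalone sketch of the repro pattern the new test is based on (a sketch only: the single-process gloo bootstrap and the rendezvous values below are illustrative assumptions, not part of this PR):

    # Illustrative repro sketch; assumes a single-process gloo setup.
    import os
    import torch
    import torch.distributed as dist
    import torch.nn as nn

    # Assumption: rendezvous values chosen for illustration only.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.p = nn.Parameter(torch.tensor(1.))

        def forward(self):
            return self.p.pow(2)

    # The pattern from the issue: DDP + backward(create_graph=True)
    # should warn about potential issues but not throw.
    ddp_model = nn.parallel.DistributedDataParallel(Model())
    for _ in range(6):
        ddp_model().backward(create_graph=True)

    dist.destroy_process_group()
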
ghstack-source-id: 137171460

Test Plan: CI

Reviewed By: SciPioneer

Differential Revision: D30569719

fbshipit-source-id: fd61250ef6d291c093607663d91d6d2cb5574eb7

torch/testing/_internal/distributed/distributed_test.py

index 333458c..f17842e 100644
@@ -3761,25 +3761,28 @@ class DistributedTest:
             self._barrier()
 
         @sandcastle_skip_if(
-            BACKEND != "nccl" and BACKEND != "gloo",
-            "Only NCCL and GLOO backend support DistributedDataParallel",
+            BACKEND == "nccl",
+            "Gloo-only test",
         )
-        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
         def test_ddp_create_graph(self):
-            rank = self.rank
-            torch.cuda.set_device(rank)
-            net = torch.nn.parallel.DistributedDataParallel(
-                torch.nn.Linear(1, 1, bias=False).cuda(rank),
-                device_ids=[rank]
-            )
-            inp = torch.randn((2, 1), device=rank)
+            class Model(nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.p = nn.Parameter(torch.tensor(1.))
+
+                def forward(self):
+                    return self.p.pow(2)
+
+            model = Model()
+            ddp_model = torch.nn.parallel.DistributedDataParallel(model)
             for _ in range(6):
-                loss = net(inp).sum()
-                # Verify DDP works with create_graph=True
-                loss.backward(create_graph=True)
+                # Verify DDP doesn't throw when run with create_graph=True.
+                # We do warn about potential issues, though; see
+                # https://github.com/pytorch/pytorch/issues/63929 for details.
+                ddp_model().backward(create_graph=True)
                 # grad tensors should require grad.
                 self.assertTrue(
-                    all([param.requires_grad for param in net.parameters()])
+                    all([param.requires_grad for param in ddp_model.parameters()])
                 )
 
         @sandcastle_skip_if(