Summary:
Fixed: https://github.com/pytorch/pytorch/issues/14445
Also bumped the default test timeout up to 30 seconds, since on 8-GPU machines the DDP tests sometimes take more than 15 seconds.
Tested on 8-GPU machines:
```
tengli@learnfair062:~/pytorch/test$ python test_c10d.py --verbose
test_dist_broadcast_coalesced_gloo (__main__.DistributedDataParallelTest) ... ok
test_dist_broadcast_coalesced_nccl (__main__.DistributedDataParallelTest) ... skipped 'Test skipped due to known issues'
test_fp16 (__main__.DistributedDataParallelTest) ... ok
test_gloo_backend (__main__.DistributedDataParallelTest) ... ok
test_nccl_backend (__main__.DistributedDataParallelTest) ... ok
test_queue_reduction (__main__.DistributedDataParallelTest) ... ok
test_sync_params_no_buffers (__main__.DistributedDataParallelTest) ... ok
test_sync_params_with_buffers (__main__.DistributedDataParallelTest) ... ok
test_sync_reduction (__main__.DistributedDataParallelTest) ... ok
test_set_get (__main__.FileStoreTest) ... ok
test_set_get (__main__.PrefixFileStoreTest) ... ok
test_set_get (__main__.PrefixTCPStoreTest) ... ok
test_allgather_basics (__main__.ProcessGroupGlooTest) ... ok
test_allgather_checks (__main__.ProcessGroupGlooTest) ... ok
test_allreduce_basics (__main__.ProcessGroupGlooTest) ... ok
test_allreduce_basics_cuda (__main__.ProcessGroupGlooTest) ... ok
test_allreduce_checks (__main__.ProcessGroupGlooTest) ... ok
test_allreduce_stress (__main__.ProcessGroupGlooTest) ... ok
test_allreduce_stress_cuda (__main__.ProcessGroupGlooTest) ... ok
test_broadcast_basics (__main__.ProcessGroupGlooTest) ... ok
test_broadcast_basics_cuda (__main__.ProcessGroupGlooTest) ... ok
test_broadcast_checks (__main__.ProcessGroupGlooTest) ... ok
test_broadcast_stress (__main__.ProcessGroupGlooTest) ... ok
test_broadcast_stress_cuda (__main__.ProcessGroupGlooTest) ... ok
test_gather_basics (__main__.ProcessGroupGlooTest) ... ok
test_gather_checks (__main__.ProcessGroupGlooTest) ... ok
test_reduce_basics (__main__.ProcessGroupGlooTest) ... ok
test_reduce_checks (__main__.ProcessGroupGlooTest) ... ok
test_scatter_basics (__main__.ProcessGroupGlooTest) ... ok
test_scatter_checks (__main__.ProcessGroupGlooTest) ... ok
test_send_recv_all_to_all (__main__.ProcessGroupGlooTest) ... ok
test_timeout_kwarg (__main__.ProcessGroupGlooTest) ... ok
test_allgather_ops (__main__.ProcessGroupNCCLTest) ... ok
test_allreduce_ops (__main__.ProcessGroupNCCLTest) ... ok
test_barrier (__main__.ProcessGroupNCCLTest) ... ok
test_broadcast_ops (__main__.ProcessGroupNCCLTest) ... ok
test_reduce_ops (__main__.ProcessGroupNCCLTest) ... ok
test_common_errors (__main__.RendezvousEnvTest) ... ok
test_nominal (__main__.RendezvousEnvTest) ... ok
test_common_errors (__main__.RendezvousFileTest) ... ok
test_nominal (__main__.RendezvousFileTest) ... ok
test_common_errors (__main__.RendezvousTCPTest) ... ok
test_nominal (__main__.RendezvousTCPTest) ... ok
test_unknown_handler (__main__.RendezvousTest) ... ok
test_address_already_in_use (__main__.TCPStoreTest) ... ok
test_set_get (__main__.TCPStoreTest) ... ok
----------------------------------------------------------------------
Ran 46 tests in 162.980s
OK (skipped=1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14452
Differential Revision: D13230652
Pulled By: teng-li
fbshipit-source-id: 88580fe55b3a4fbc7a499ca3b591958f11623bf8
Relevant diff hunks:

The default per-test timeout is raised from 15 to 30 seconds:
```
sys.exit(0)
-TIMEOUT_DEFAULT = 15
+TIMEOUT_DEFAULT = 30
TIMEOUT_OVERRIDE = {}
TestSkip = namedtuple('TestSkip', 'exit_code, message')
```
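For context, a minimal sketch of how a per-test timeout could be resolved from these two constants; the `get_timeout` helper and keying by test name are assumptions for illustration, not the harness's actual code:
```
TIMEOUT_DEFAULT = 30   # seconds; raised from 15 in this change
TIMEOUT_OVERRIDE = {}  # hypothetical example: {'test_allreduce_stress_cuda': 120}


def get_timeout(test_name):
    # Hypothetical helper: use a per-test override if one is registered,
    # otherwise fall back to the 30-second default.
    return TIMEOUT_OVERRIDE.get(test_name, TIMEOUT_DEFAULT)
```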
In `test_sync_params_no_buffers` and `test_sync_params_with_buffers`, the broadcast source tensor is now placed on the first GPU assigned to the current rank instead of the hard-coded `cuda:0`, which may belong to a different rank on a multi-GPU machine (see the sketch after these hunks):
```
# Use all available devices on every process here (data is small, so should be fine).
devices = gpus_for_rank(self.world_size)[self.rank]
- target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
+ target = torch.arange(10, dtype=torch.float64, device='cuda:{}'.format(devices[0])).chunk(5)
parameter_data = [target]
parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]
buffer_data = [[]] * len(parameter_data)
```
```
process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)
devices = gpus_for_rank(self.world_size)[self.rank]
- target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
+ target = torch.arange(10, dtype=torch.float64, device='cuda:{}'.format(devices[0])).chunk(5)
parameter_data = [target]
parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]
```
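For reference, a sketch of how a `gpus_for_rank`-style partition behaves; the even split of visible GPUs across ranks is an assumption for illustration, not a copy of the helper in test_c10d.py:
```
import torch


def gpus_for_rank(world_size):
    # Assumed behavior: split the visible GPUs evenly across ranks, e.g. with
    # 8 GPUs and world_size=2: rank 0 -> [0, 1, 2, 3], rank 1 -> [4, 5, 6, 7].
    visible = list(range(torch.cuda.device_count()))
    per_rank = len(visible) // world_size
    return [visible[r * per_rank:(r + 1) * per_rank] for r in range(world_size)]


# With the fix, rank 1 allocates its broadcast source on its own first device
# (cuda:4 in the example above) instead of on cuda:0, which belongs to rank 0.
devices = gpus_for_rank(world_size=2)[1]
target = torch.arange(10, dtype=torch.float64,
                      device='cuda:{}'.format(devices[0])).chunk(5)
```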
In `test_queue_reduction`, the expected value is scaled by the number of local GPUs, since each rank's local gradient sum now covers all of its devices (see the worked example after the next hunk):
```
# The expected result of the allreduce should be the average
self.assertEqual(local_grad_sum,
- torch.ones(10) * (self.world_size + 1) / 2.0)
+ torch.ones(10) * (self.world_size + 1) * len(devices) / 2.0)
```
The same scaling is applied to the assertion in `test_sync_reduction`:
```
@skip_if_not_nccl
def test_sync_reduction(self):
devices)
c10d._sync_reduction(work, grads_batch[0], local_grad_sum)
# The expected result of the allreduce should be the average
- self.assertEqual(grads_batch[0], (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
+ self.assertEqual(grads_batch[0], (torch.ones(10) * (self.world_size + 1) * len(devices) / 2.0).chunk(5))
if __name__ == '__main__':
```
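To make the `len(devices)` factor concrete, here is a worked example under the assumption (consistent with the old expectation of `(world_size + 1) / 2`) that rank `r` fills each of its per-device gradient tensors with `r + 1` and the local tensors are summed before the allreduce averages across ranks:
```
# Worked example (assumed setup): 2 ranks on an 8-GPU machine, 4 GPUs each.
world_size = 2
devices_per_rank = 4

# Each rank sums one gradient tensor per local GPU, each filled with rank + 1.
per_rank_sums = [(rank + 1) * devices_per_rank for rank in range(world_size)]

# The allreduce averages those sums across ranks...
result = sum(per_rank_sums) / world_size

# ...which matches the updated expectation in both assertions above.
expected = (world_size + 1) * devices_per_rank / 2.0
assert result == expected == 6.0
```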