From 9b8f9d5a25ca7a9ebd54a07ab88b3a540111e5b3 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 1 Sep 2021 16:21:31 -0700 Subject: [PATCH] [c10d] Prefer use of torch_check (#63928) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63928 throw std::invalid_argument results in not getting stacktraces with TORCH_SHOW_CPP_STACKTRACES=1, so instead prefer torch_check here. ghstack-source-id: 137135328 Test Plan: CI Reviewed By: mrshenli Differential Revision: D30533955 fbshipit-source-id: 33e5bf4f449e3043dec68da93f8022f6624d9675 --- test/distributed/test_c10d_gloo.py | 114 +++++++++++------------ torch/csrc/distributed/c10d/ProcessGroupGloo.cpp | 18 ++-- 2 files changed, 66 insertions(+), 66 deletions(-) diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 55b2948..789d76e 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -259,43 +259,43 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.BroadcastOptions() opts.rootRank = -1 opts.rootTensor = 0 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.BroadcastOptions() opts.rootRank = self.world_size opts.rootTensor = 0 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = -1 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 1 pg.broadcast([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 pg.broadcast([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 pg.broadcast([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.BroadcastOptions() opts.rootRank = self.rank opts.rootTensor = 0 @@ -394,15 +394,15 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceOptions() pg.allreduce([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t3], opts) @@ -553,19 +553,19 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros(1, dtype=torch.float64) t3 = torch.sparse_coo_tensor([[0]], [1], size=(1,)) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([], opts) - with self.assertRaisesRegex(ValueError, "tensors must all have the same type"): + with self.assertRaisesRegex(RuntimeError, "tensors must all have the same type"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor layout at index"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor layout at index"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1, t3], opts) - with self.assertRaisesRegex(ValueError, "unsupported layout"): + with self.assertRaisesRegex(RuntimeError, "unsupported layout"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t3, t3.clone()], opts) @@ -579,7 +579,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t1 = torch.zeros(1, dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "unsupported device type"): + with self.assertRaisesRegex(RuntimeError, "unsupported device type"): opts = c10d.AllreduceCoalescedOptions() pg.allreduce_coalesced([t1.cuda(), t1.cuda()], opts) @@ -647,21 +647,21 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.sparse_coo_tensor([[0]], [1], size=(2,)) t3 = torch.sparse_coo_tensor([[0]], [1], size=(4,)) - with self.assertRaisesRegex(ValueError, "requires non-empty tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty tensor list"): opts = c10d.AllreduceOptions() pg.allreduce([], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor layout"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor layout"): opts = c10d.AllreduceOptions() pg.allreduce([t1, t2], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.AllreduceOptions() pg.allreduce([t2, t3], opts) # Sparse allreduce only works with c10d.ReduceOp.SUM. for op in [c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX]: - with self.assertRaisesRegex(ValueError, "unsupported reduction operation"): + with self.assertRaisesRegex(RuntimeError, "unsupported reduction operation"): opts = c10d.AllreduceOptions() opts.reduceOp = op pg.allreduce([t3], opts) @@ -705,36 +705,36 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ScatterOptions() opts.rootRank = -1 pg.scatter([t1], [], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ScatterOptions() opts.rootRank = self.world_size pg.scatter([t1], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" + RuntimeError, "requires a single-element output tensor list" ): opts = c10d.ScatterOptions() opts.rootRank = 0 pg.scatter([], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output tensor list" + RuntimeError, "requires a single-element output tensor list" ): opts = c10d.ScatterOptions() opts.rootRank = 0 pg.scatter([t1, t1], [], opts) - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [], opts) - with self.assertRaisesRegex(ValueError, "requires a single-element input list"): + with self.assertRaisesRegex(RuntimeError, "requires a single-element input list"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t1] * self.world_size, [t1] * self.world_size], opts) @@ -743,7 +743,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): incorrect_list_size = self.world_size - 1 err_str = "Incorrect input list size {}. Input list size should be {}" with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.ScatterOptions() opts.rootRank = self.rank @@ -751,23 +751,23 @@ class ProcessGroupGlooTest(MultiProcessTestCase): incorrect_list_size = self.world_size + 1 with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t1] * incorrect_list_size], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t2] * self.world_size], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.ScatterOptions() opts.rootRank = self.rank pg.scatter([t1], [[t3] * self.world_size], opts) - with self.assertRaisesRegex(ValueError, "requires empty input on non-root"): + with self.assertRaisesRegex(RuntimeError, "requires empty input on non-root"): opts = c10d.ScatterOptions() opts.rootRank = (self.rank + 1) % self.world_size pg.scatter([t1], [[t1] * self.world_size], opts) @@ -872,39 +872,39 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.GatherOptions() opts.rootRank = -1 pg.gather([], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.GatherOptions() opts.rootRank = self.world_size pg.gather([], [t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" + RuntimeError, "requires a single-element input tensor list" ): opts = c10d.GatherOptions() opts.rootRank = 0 pg.gather([], [], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element input tensor list" + RuntimeError, "requires a single-element input tensor list" ): opts = c10d.GatherOptions() opts.rootRank = 0 pg.gather([], [t1, t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output list" + RuntimeError, "requires a single-element output list" ): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([], [t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element output list" + RuntimeError, "requires a single-element output list" ): opts = c10d.GatherOptions() opts.rootRank = self.rank @@ -914,7 +914,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): incorrect_list_size = self.world_size - 1 err_str = "Incorrect output list size {}. Output list size should be {}" with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.GatherOptions() opts.rootRank = self.rank @@ -922,23 +922,23 @@ class ProcessGroupGlooTest(MultiProcessTestCase): incorrect_list_size = self.world_size + 1 with self.assertRaisesRegex( - ValueError, err_str.format(incorrect_list_size, desired_list_size) + RuntimeError, err_str.format(incorrect_list_size, desired_list_size) ): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t1] * incorrect_list_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t2] * self.world_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): opts = c10d.GatherOptions() opts.rootRank = self.rank pg.gather([[t3] * self.world_size], [t1], opts) - with self.assertRaisesRegex(ValueError, "requires empty output on non-root"): + with self.assertRaisesRegex(RuntimeError, "requires empty output on non-root"): opts = c10d.GatherOptions() opts.rootRank = (self.rank + 1) % self.world_size pg.gather([[t1] * self.world_size], [t1], opts) @@ -1039,39 +1039,39 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t2 = torch.zeros([1], dtype=torch.float64) t3 = torch.zeros([2], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "requires non-empty input tensor list"): + with self.assertRaisesRegex(RuntimeError, "requires non-empty input tensor list"): pg.allgather([], []) with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" + RuntimeError, "requires input/output tensor lists to have the same length" ): pg.allgather([], [t1]) with self.assertRaisesRegex( - ValueError, "requires input/output tensor lists to have the same length" + RuntimeError, "requires input/output tensor lists to have the same length" ): pg.allgather([[t1] * self.world_size, [t1] * self.world_size], [t1]) - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"): pg.allgather([[t1] * (self.world_size - 1)], [t1]) - with self.assertRaisesRegex(ValueError, "invalid output tensor list"): + with self.assertRaisesRegex(RuntimeError, "invalid output tensor list"): pg.allgather([[t1] * (self.world_size + 1)], [t1]) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): pg.allgather( [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t2] ) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): pg.allgather( [[t1, t1] * (self.world_size), [t1, t1] * (self.world_size)], [t1, t3] ) - with self.assertRaisesRegex(ValueError, "invalid tensor type"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type"): pg.allgather([([t1, t2] * (self.world_size))[: self.world_size]], [t1]) - with self.assertRaisesRegex(ValueError, "invalid tensor size"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor size"): pg.allgather([([t1, t3] * (self.world_size))[: self.world_size]], [t1]) def _test_allgather_basics(self, fn): @@ -1160,13 +1160,13 @@ class ProcessGroupGlooTest(MultiProcessTestCase): # One of output tensors does not match input list. dummy_output_lists[0] = [torch.zeros([0], dtype=torch.float32)] with self.assertRaisesRegex( - ValueError, "invalid size of output tensor at index 0" + RuntimeError, "invalid size of output tensor at index 0" ): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) # One of output tensors does not match input list. dummy_output_lists[0] = [torch.zeros([1], dtype=torch.float64)] - with self.assertRaisesRegex(ValueError, "invalid tensor type at index 0"): + with self.assertRaisesRegex(RuntimeError, "invalid tensor type at index 0"): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) # Output lists have too many elements @@ -1174,7 +1174,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): [torch.zeros([1], dtype=torch.float32)] for _ in range(self.world_size + 1) ] with self.assertRaisesRegex( - ValueError, "output lists should be equal to world size" + RuntimeError, "output lists should be equal to world size" ): c10d.all_gather_coalesced(dummy_output_lists, dummy_input, pg) @@ -1194,26 +1194,26 @@ class ProcessGroupGlooTest(MultiProcessTestCase): t1 = torch.zeros([1], dtype=torch.float32) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ReduceOptions() opts.rootRank = -1 opts.rootTensor = 0 pg.reduce([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root rank"): + with self.assertRaisesRegex(RuntimeError, "invalid root rank"): opts = c10d.ReduceOptions() opts.rootRank = self.world_size opts.rootTensor = 0 pg.reduce([t1], opts) - with self.assertRaisesRegex(ValueError, "invalid root tensor"): + with self.assertRaisesRegex(RuntimeError, "invalid root tensor"): opts = c10d.ReduceOptions() opts.rootRank = self.rank opts.rootTensor = 1 pg.reduce([t1], opts) with self.assertRaisesRegex( - ValueError, "requires a single-element tensor list" + RuntimeError, "requires a single-element tensor list" ): opts = c10d.ReduceOptions() opts.rootRank = self.rank diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index ba26409..b8f5aa3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -923,7 +923,7 @@ c10::intrusive_ptr ProcessGroupGloo::broadcast( std::vector& inputs, const BroadcastOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::broadcast: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::broadcast: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -1414,7 +1414,7 @@ c10::intrusive_ptr ProcessGroupGloo::allreduce( std::vector& inputs, const AllreduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::allreduce: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::allreduce: " + msg); }; assertNonEmpty(invalidArgument, inputs); @@ -1475,7 +1475,7 @@ c10::intrusive_ptr ProcessGroupGloo::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument( + TORCH_CHECK(false, "ProcessGroupGloo::allreduce_coalesced: " + msg); }; assertNonEmpty(invalidArgument, tensors); @@ -1644,7 +1644,7 @@ c10::intrusive_ptr ProcessGroupGloo::reduce( std::vector& inputs, const ReduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::reduce: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::reduce: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -1821,7 +1821,7 @@ c10::intrusive_ptr ProcessGroupGloo::allgather( std::vector& inputs, const AllgatherOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::allgather: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::allgather: " + msg); }; if (inputs.size() == 0) { @@ -1955,7 +1955,7 @@ c10::intrusive_ptr ProcessGroupGloo::allgather_coalesced( std::vector& input_list, const AllgatherOptions& /* unused */) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument( + TORCH_CHECK(false, "ProcessGroupGloo::allgather_coalesced: " + msg); }; @@ -2152,7 +2152,7 @@ c10::intrusive_ptr ProcessGroupGloo::gather( std::vector& inputs, const GatherOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::gather: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::gather: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -2336,7 +2336,7 @@ c10::intrusive_ptr ProcessGroupGloo::scatter( std::vector>& inputs, const ScatterOptions& opts) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::scatter: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::scatter: " + msg); }; assertRootRank(invalidArgument, opts.rootRank, size_); @@ -2530,7 +2530,7 @@ c10::intrusive_ptr ProcessGroupGloo::alltoall_base( std::vector& inputCounts, const AllToAllOptions& /* unused */) { static auto invalidArgument = [](const std::string& msg) { - throw std::invalid_argument("ProcessGroupGloo::alltoall_base: " + msg); + TORCH_CHECK(false, "ProcessGroupGloo::alltoall_base: " + msg); }; TORCH_CHECK( -- 2.7.4