From 828cb18fa35c7c132ab920de1c0dc6d859f152d6 Mon Sep 17 00:00:00 2001
From: albanD
Date: Tue, 8 Jan 2019 19:57:16 -0800
Subject: [PATCH] Allow ReadyQueue to handle empty tasks (#15791)

Summary:
Allow the comparison function used in ReadyQueue to handle the empty FunctionTasks created by the reentrant autograd.

Fix #11732
Pull Request resolved: https://github.com/pytorch/pytorch/pull/15791

Differential Revision: D13598006

Pulled By: soumith

fbshipit-source-id: 0bfdf28a735fbfe44f0fdbaf8b74a6198e6a1984
---
 test/test_autograd.py          | 32 ++++++++++++++++++++++++++++++++
 torch/csrc/autograd/engine.cpp |  9 ++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 284c45a..608c95e 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -14,12 +14,14 @@ from torch._six import inf, nan
 from torch.autograd.gradcheck import gradgradcheck, gradcheck
 from torch.autograd.function import once_differentiable
 from torch.autograd.profiler import profile
+from torch.utils.checkpoint import checkpoint
 from common_utils import (TEST_MKL, TestCase, run_tests, skipIfNoLapack,
                           suppress_warnings, skipIfRocm,
                           prod_single_zero, random_square_matrix_of_rank,
                           random_symmetric_matrix, random_symmetric_psd_matrix,
                           random_symmetric_pd_matrix, make_nonzero_det,
                           random_fullrank_matrix_distinct_singular_value, load_tests)
+from common_cuda import TEST_CUDA
 from torch.autograd import Variable, Function, detect_anomaly
 from torch.autograd.function import InplaceFunction
 from torch.testing import make_non_contiguous, randn_like
@@ -2722,6 +2724,36 @@ class TestAutograd(TestCase):
         with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
             gradcheck(fn, torch.rand(10).to_sparse().requires_grad_(True), check_sparse_nnz=False)
 
+    @unittest.skipIf(not TEST_CUDA, "Requires cuda for multi device")
+    def test_multi_device_reentrant_autograd(self):
+        # Output on gpu so that this task will be associated with the gpu thread
+        def fn_on_gpu(inp):
+            # Artificially increase the priority of the next op to make sure it runs
+            # as soon as we reach it before the ops of branch1.
+            dummy = inp * 2 * 2 * 2 * 2
+            return inp.cuda()
+
+        def parent_on_cpu(inp):
+            # Slow branch of ops on gpu so that the work queue for the gpu thread
+            # won't empty too quickly. They also have smaller priorities than the
+            # ones created by fn_on_gpu
+            branch1 = inp.cuda()
+            branch1 = branch1 / branch1
+            branch1 = branch1 / branch1
+            branch1 = branch1 / branch1
+            # Perform checkpoint on cpu tensors. So the last op performed in the reentrant
+            # autograd is an AccumulateGrad that runs on the cpu thread for the gpu thread.
+            # So the cpu thread will notify the gpu thread with an empty FunctionTask.
+            branch2 = checkpoint(fn_on_gpu, inp)
+            out = branch2 + branch1
+            return out
+
+        inp = torch.rand(2, requires_grad=True)
+        out = parent_on_cpu(inp)
+        # This will segfault if the empty FunctionTask is not handled properly in the
+        # gpu thread ReadyQueue
+        out.sum().backward()
+
 
 def index_variable(shape, max_indices):
     if not isinstance(shape, tuple):
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 29149e4..6063baa 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -75,9 +75,16 @@ struct FunctionTask {
 };
 
 // Returns true when t2 should be (weakly) BEFORE t1 in the queue.
+// Empty FunctionTasks are first.
 struct CompareFunctionTaskTime {
   bool operator()(FunctionTask const & t1, FunctionTask const & t2) {
-    return t1.fn->sequence_nr() < t2.fn->sequence_nr();
+    if (!t1.fn) {
+      return false;
+    } else if (!t2.fn) {
+      return true;
+    } else {
+      return t1.fn->sequence_nr() < t2.fn->sequence_nr();
+    }
   }
 };
 
-- 
2.7.4
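
For context, the comparator patched above backs the engine's ReadyQueue, which wraps a std::priority_queue of FunctionTasks. Below is a standalone C++ sketch, not part of the patch, that illustrates the ordering the patched comparator produces; the Node and Task types and the main driver are simplified stand-ins assumed for illustration only. Empty tasks (null fn) pop first, and the remaining tasks pop in decreasing sequence_nr order.

#include <cstdint>
#include <iostream>
#include <memory>
#include <queue>
#include <vector>

struct Node {
  uint64_t seq;
  explicit Node(uint64_t s) : seq(s) {}
  uint64_t sequence_nr() const { return seq; }
};

struct Task {
  std::shared_ptr<Node> fn;  // a null fn models the "empty" FunctionTask
};

// Same logic as the patched CompareFunctionTaskTime: returns true when t2
// should be (weakly) before t1, so empty tasks pop first, then higher sequence_nr.
struct CompareTaskTime {
  bool operator()(Task const& t1, Task const& t2) const {
    if (!t1.fn) {
      return false;
    } else if (!t2.fn) {
      return true;
    } else {
      return t1.fn->sequence_nr() < t2.fn->sequence_nr();
    }
  }
};

int main() {
  std::priority_queue<Task, std::vector<Task>, CompareTaskTime> queue;
  queue.push(Task{std::make_shared<Node>(3)});
  queue.push(Task{nullptr});  // empty task, e.g. a dummy wake-up from another thread
  queue.push(Task{std::make_shared<Node>(7)});

  // Expected pop order: empty task, then sequence_nr 7, then sequence_nr 3.
  while (!queue.empty()) {
    Task t = queue.top();
    queue.pop();
    if (!t.fn) {
      std::cout << "empty task\n";
    } else {
      std::cout << "task with sequence_nr " << t.fn->sequence_nr() << "\n";
    }
  }
  return 0;
}

Because std::priority_queue pops the element the comparator ranks greatest, returning false when t1.fn is null and true when only t2.fn is null makes an empty task compare greater than every real task, so a worker thread dequeues the empty wake-up task immediately instead of dereferencing a null fn in sequence_nr().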