From ba4de667fa578c84a255da3ef57948e14af7f01a Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Mon, 25 Mar 2019 19:57:06 -0700
Subject: [PATCH] change dropout lowering in symbolic_script (#18375)

Summary:
Dropout is now eligible for fusion, and generated fused kernels are just as fast
as dropout in ATen. Change its lowering in symbolic script so that it can
actually be fused. Still special-cased for cuda, because without fusion this
lowering is less efficient than current (bernoulli_ * input).

Testing is covered by the test case that ailzhang added (test_dropout_cuda).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18375

Differential Revision: D14611938

Pulled By: soumith

fbshipit-source-id: 11b18f4784e6c9265e382a8f8deca7add8df3b37
---
 test/test_jit.py                   |  2 ++
 torch/csrc/jit/symbolic_script.cpp | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/test/test_jit.py b/test/test_jit.py
index 1c59929..c2318e1 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -1362,6 +1362,8 @@ class TestJit(JitTestCase):
         self.assertEqual(outputs, m(*inputs))

     @unittest.skipIf(not RUN_CUDA, "test_dropout_cuda require CUDA")
+    @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows")
+    @skipIfRocm
     def test_dropout_cuda(self):
         # Dropout AD is dispatched to _fused_dropout in CUDA case,
         # which is not included in TestJitGeneratedFunctional
diff --git a/torch/csrc/jit/symbolic_script.cpp b/torch/csrc/jit/symbolic_script.cpp
index cce3552..6974936 100644
--- a/torch/csrc/jit/symbolic_script.cpp
+++ b/torch/csrc/jit/symbolic_script.cpp
@@ -725,20 +725,20 @@ const std::vector<std::string> functions = {
                             mask,
                             p1m: float):
                 p1r = 1. / p1m
-                if grad.requires_grad:
-                    grad_input = grad * (mask.type_as(grad) * p1r)
-                else:
-                    grad_input = torch._masked_scale(grad, mask, p1r)
+                grad_input = grad * (mask.type_as(grad) * p1r)
                 return grad_input

            def dropout(input,
                        p: float,
                        train: bool):
                use_cuda = input.is_cuda
-               # CUDA has a fused dropout implementation
+               # lowering is specialized for cuda because cuda fuser can efficiently fuse those operations
+               # for cpu backend, where fusions are disabled, a different lowering that is more efficient
+               # in the absence of fusion is used
                p1m = 1. - p
                if use_cuda:
-                   res, mask = torch._fused_dropout(input, p1m)
+                   mask = torch.rand_like(input) < p1m
+                   res = mask.type_as(input) * input * (1./p1m)
                else:
                    mask = torch.empty_like(input)
                    mask.bernoulli_(p1m)
-- 
2.7.4
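
For context, below is a minimal eager-mode sketch of the two forward lowerings the
patched dropout() chooses between, written as plain PyTorch. The function name
dropout_lowering_sketch and the simplified handling (no train=False or p == 1.
special cases, and no mask returned) are assumptions made for illustration, not
part of the patch; only the shape of the two branches mirrors the diff above.

# Hedged sketch, not the patched symbolic_script code itself.
import torch

def dropout_lowering_sketch(input: torch.Tensor, p: float) -> torch.Tensor:
    p1m = 1. - p
    if input.is_cuda:
        # Fuser-friendly CUDA lowering: rand_like, the comparison, and the
        # rescaling are all pointwise ops, so the JIT fuser can combine them
        # into a single kernel.
        mask = torch.rand_like(input) < p1m
        return mask.type_as(input) * input * (1. / p1m)
    else:
        # CPU lowering (unchanged by the patch): bernoulli_ into a preallocated
        # mask is more efficient when fusion is unavailable.
        mask = torch.empty_like(input)
        mask.bernoulli_(p1m)
        return mask * input / p1m

A call such as dropout_lowering_sketch(torch.randn(8, device='cuda'), 0.5) exercises
the CUDA branch; the same call with a CPU tensor takes the bernoulli_ path.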