From ba4de667fa578c84a255da3ef57948e14af7f01a Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Mon, 25 Mar 2019 19:57:06 -0700
Subject: [PATCH] change dropout lowering in symbolic_script (#18375)

Summary:
Dropout is now eligible for fusion, and generated fused kernels are just as fast
as dropout in ATen. Change its lowering in symbolic script so that it can
actually be fused. Still special-cased for cuda, because without fusion this
lowering is less efficient than current (bernoulli_ * input).

Testing is covered by the test case that ailzhang added (test_dropout_cuda).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18375

Differential Revision: D14611938

Pulled By: soumith

fbshipit-source-id: 11b18f4784e6c9265e382a8f8deca7add8df3b37
---
 test/test_jit.py                   |  2 ++
 torch/csrc/jit/symbolic_script.cpp | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/test/test_jit.py b/test/test_jit.py
index 1c59929..c2318e1 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -1362,6 +1362,8 @@ class TestJit(JitTestCase):
         self.assertEqual(outputs, m(*inputs))

     @unittest.skipIf(not RUN_CUDA, "test_dropout_cuda require CUDA")
+    @unittest.skipIf(IS_WINDOWS, "NYI: fuser support for Windows")
+    @skipIfRocm
     def test_dropout_cuda(self):
         # Dropout AD is dispatched to _fused_dropout in CUDA case,
         # which is not included in TestJitGeneratedFunctional
diff --git a/torch/csrc/jit/symbolic_script.cpp b/torch/csrc/jit/symbolic_script.cpp
index cce3552..6974936 100644
--- a/torch/csrc/jit/symbolic_script.cpp
+++ b/torch/csrc/jit/symbolic_script.cpp
@@ -725,20 +725,20 @@ const std::vector<std::string> functions = {
                             mask,
                             p1m: float):
                 p1r = 1. / p1m
-                if grad.requires_grad:
-                    grad_input = grad * (mask.type_as(grad) * p1r)
-                else:
-                    grad_input = torch._masked_scale(grad, mask, p1r)
+                grad_input = grad * (mask.type_as(grad) * p1r)
                 return grad_input

            def dropout(input,
                        p: float,
                        train: bool):
                use_cuda = input.is_cuda
-               # CUDA has a fused dropout implementation
+               # lowering is specialized for cuda because cuda fuser can efficiently fuse those operations
+               # for cpu backend, where fusions are disabled, a different lowering that is more efficient
+               # in the absence of fusion is used
                p1m = 1. - p
                if use_cuda:
-                   res, mask = torch._fused_dropout(input, p1m)
+                   mask = torch.rand_like(input) < p1m
+                   res = mask.type_as(input) * input * (1./p1m)
                else:
                    mask = torch.empty_like(input)
                    mask.bernoulli_(p1m)
-- 
2.7.4
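
For context, below is a minimal eager-mode sketch of the two forward lowerings the
patched dropout() chooses between, written as plain PyTorch. The function name
dropout_lowering_sketch and the simplified handling (no train=False or p == 1.
special cases, and no mask returned) are assumptions made for illustration, not
part of the patch; only the shape of the two branches mirrors the diff above.

# Hedged sketch, not the patched symbolic_script code itself.
import torch

def dropout_lowering_sketch(input: torch.Tensor, p: float) -> torch.Tensor:
    p1m = 1. - p
    if input.is_cuda:
        # Fuser-friendly CUDA lowering: rand_like, the comparison, and the
        # rescaling are all pointwise ops, so the JIT fuser can combine them
        # into a single kernel.
        mask = torch.rand_like(input) < p1m
        return mask.type_as(input) * input * (1. / p1m)
    else:
        # CPU lowering (unchanged by the patch): bernoulli_ into a preallocated
        # mask is more efficient when fusion is unavailable.
        mask = torch.empty_like(input)
        mask.bernoulli_(p1m)
        return mask * input / p1m

A call such as dropout_lowering_sketch(torch.randn(8, device='cuda'), 0.5) exercises
the CUDA branch; the same call with a CPU tensor takes the bernoulli_ path.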