From 5fa78303edcfcc841c30ef27462f80904b3c79a2 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Thu, 21 Feb 2019 14:35:20 -0800
Subject: [PATCH] fix double backward for half softmax/logsoftmax (#17330)

Summary:
Fix for #17261. SsnL, do you have tests for it in your other PR? If not, I'll add them to this one.
The example from #17261 no longer errors out (and the same holds for log_softmax).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17330

Differential Revision: D14171529

Pulled By: soumith

fbshipit-source-id: ee925233feb1b44ef9f1d757db59ca3601aadef2
---
 test/test_nn.py                 |  3 +--
 tools/autograd/derivatives.yaml | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 0a5d070..2640666 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -2142,8 +2142,7 @@ class TestNN(NNTestCase):

     def _test_softmax_backward(self, device):
         if device.type == 'cuda':
-            dtypes = [torch.float]
-            # FIXME: add torch.half after https://github.com/pytorch/pytorch/issues/17261 is fixed
+            dtypes = [torch.float, torch.half]
         else:
             dtypes = [torch.float]
         # FIXME: add (10, 0) after https://github.com/pytorch/pytorch/issues/17262 is fixed
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index d690119..6dac188 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -214,10 +214,10 @@
   self: clamp_backward(grad, self, min, max)

 - name: clamp_min(Tensor self, Scalar min)
-  self: grad * (self >= min).type_as(grad)
+  self: grad * (self >= min).to(grad.dtype())

 - name: clamp_max(Tensor self, Scalar max)
-  self: grad * (self <= max).type_as(grad)
+  self: grad * (self <= max).to(grad.dtype())

 - name: clone(Tensor self)
   self: grad
@@ -1206,8 +1206,8 @@
   self: log_sigmoid_double_backward(grad * grad_output, self)

 - name: _log_softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self)
-  grad_output: grad - (grad * output.exp()).sum(dim, true)
-  self: log_softmax_double_backward(grad, grad_output, dim, output).type_as(self)
+  grad_output: grad.to(output.dtype()) - (grad.to(output.dtype()) * output.exp()).sum(dim, true)
+  self: log_softmax_double_backward(grad.to(output.dtype()), grad_output, dim, output).to(self.dtype())

 - name: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope)
   grad_output: leaky_relu_backward(grad, self, negative_slope)
@@ -1270,8 +1270,8 @@
   self: softplus_double_backward(grad * grad_output, self, beta, threshold)

 - name: _softmax_backward_data(Tensor grad_output, Tensor output, int64_t dim, Tensor self)
-  grad_output: _softmax_backward_data(grad, output, dim, self)
-  self: softmax_double_backward(grad, grad_output, dim, output).type_as(self)
+  grad_output: _softmax_backward_data(grad.to(output.dtype()), output, dim, self)
+  self: softmax_double_backward(grad.to(output.dtype()), grad_output, dim, output).to(self.dtype())

 - name: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int64_t reduction)
   grad_output: soft_margin_loss_double_backward_grad_output(grad, grad_output, self, target, reduction)
--
2.7.4
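
A minimal double-backward check of the kind the summary refers to, given as a sketch rather than the exact snippet from #17261; it assumes a CUDA device with half-precision support, and the tensor names are illustrative:

    import torch
    import torch.nn.functional as F

    # Half-precision input on CUDA; double backward through softmax raised a
    # dtype error before this patch (see #17261).
    x = torch.randn(4, 8, device='cuda', dtype=torch.half, requires_grad=True)
    y = F.softmax(x, dim=1)          # F.log_softmax(x, dim=1) exercises the same path
    grad_out = torch.randn_like(y)

    # First backward with create_graph=True so the backward graph is itself
    # differentiable.
    grad_x, = torch.autograd.grad(y, x, grad_out, create_graph=True)

    # Second (double) backward through the first gradient.
    grad_x.sum().backward()
    print(x.grad.dtype)              # torch.float16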