for (int64_t d = 0; d < dim_size; d++) {
Vec output_vec_o1 = Vec::loadu(temp_vec_input_data + d*vectorized_step*2);
Vec output_vec_o2 = Vec::loadu(temp_vec_input_data + d*vectorized_step*2 + vectorized_step);
- output_vec_o1 = output_vec_o1.exp();
- output_vec_o2 = output_vec_o2.exp();
-
+ output_vec_o1 = (output_vec_o1 - max_vec_o1).exp();
+ output_vec_o2 = (output_vec_o2 - max_vec_o2).exp();
output_vec_o1.store(temp_vec_output_data + d*vectorized_step*2);
output_vec_o2.store(temp_vec_output_data + d*vectorized_step*2 + vectorized_step);
lambda: nn.functional.multi_margin_loss(torch.randn(5, device=device),
torch.zeros(3, device=device)))
- def _test_bfloat16_ops(self, op, device, inp_dims=(), prec=1e-2):
+ def _test_bfloat16_ops(self, op, device, inp_dims=(), prec=1e-2, scale_factor=None):
# fp32 compute
input1 = torch.randn(inp_dims, dtype=torch.float32, device=device, requires_grad=True)
+ if scale_factor is not None:
+ input1 = (torch.rand(inp_dims, dtype=torch.bfloat16, device=device) * scale_factor).float().requires_grad_()
out1 = op(input1)
grad_input1 = torch.randn_like(out1, device=device)
out1.backward(grad_input1)
out2 = op_bfp16(input2)
out2.backward(grad_input2)
- self.assertEqual(out1, out2, atol=prec, rtol=0, exact_dtype=False)
- self.assertEqual(input1.grad.data, input2.grad.data, atol=prec, rtol=0, exact_dtype=False)
+ self.assertEqual(out1, out2, atol=prec, rtol=prec, exact_dtype=False)
+ self.assertEqual(input1.grad.data, input2.grad.data, atol=prec, rtol=prec, exact_dtype=False)
@onlyCUDA
def test_activations_bfloat16(self, device):
def test_softmax_bfloat16(self, device):
for dim in [0, 1, 2, 3]:
self._test_bfloat16_ops(torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=1e-2)
+ # test softmax with large input value which causes exp() to overflow
+ self._test_bfloat16_ops(torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0)
@onlyCUDA
@skipCUDAIfRocm