From fa29c179b708881ae0985fbe6ad4065256e769bb Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <johannes.dieterich@amd.com>
Date: Mon, 11 Mar 2019 14:39:07 -0700
Subject: [PATCH] Optimize fused_dropout_kernel launch bounds for AMD hardware

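Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17870

Split the combined CUDA/HIP preprocessor guard in front of
fused_dropout_kernel: builds with __CUDA_ARCH__ >= 350 keep
C10_LAUNCH_BOUNDS_2(256, 8), while builds that define __HIP_PLATFORM_HCC__
now use C10_LAUNCH_BOUNDS_2(256, 4).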

Differential Revision: D14409990

Pulled By: ezyang

fbshipit-source-id: 0452282f459770823641b2527f47b1186ab14666
---
 aten/src/ATen/native/cuda/Dropout.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu
index 16d0b71..d858a0e 100644
--- a/aten/src/ATen/native/cuda/Dropout.cu
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -34,8 +34,10 @@ template <
           typename accscalar_t,
           typename IndexType,
           int ADims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
+#if __CUDA_ARCH__ >= 350
 C10_LAUNCH_BOUNDS_2(256, 8)
+#elif defined (__HIP_PLATFORM_HCC__)
+C10_LAUNCH_BOUNDS_2(256, 4)
 #endif
 __global__ void
 fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
-- 
2.7.4