From fa29c179b708881ae0985fbe6ad4065256e769bb Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich
Date: Mon, 11 Mar 2019 14:39:07 -0700
Subject: [PATCH] Optimize fused_dropout_kernel launch bounds for AMD hardware

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17870

Differential Revision: D14409990

Pulled By: ezyang

fbshipit-source-id: 0452282f459770823641b2527f47b1186ab14666
---
 aten/src/ATen/native/cuda/Dropout.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu
index 16d0b71..d858a0e 100644
--- a/aten/src/ATen/native/cuda/Dropout.cu
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -34,8 +34,10 @@ template <
           typename accscalar_t,
           typename IndexType,
           int ADims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
+#if __CUDA_ARCH__ >= 350
 C10_LAUNCH_BOUNDS_2(256, 8)
+#elif defined (__HIP_PLATFORM_HCC__)
+C10_LAUNCH_BOUNDS_2(256, 4)
 #endif
 __global__ void
 fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
-- 
2.7.4