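Optimize fused_dropout_kernel launch bounds for AMD hardware

Split the launch-bounds guard: CUDA builds (__CUDA_ARCH__ >= 350) keep C10_LAUNCH_BOUNDS_2(256, 8), while HIP builds (__HIP_PLATFORM_HCC__), which previously shared that setting, now use C10_LAUNCH_BOUNDS_2(256, 4).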

author Johannes M Dieterich <johannes.dieterich@amd.com>

Mon, 11 Mar 2019 21:39:07 +0000 (14:39 -0700)

committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>

Mon, 11 Mar 2019 21:45:42 +0000 (14:45 -0700)
author Johannes M Dieterich <johannes.dieterich@amd.com>
Mon, 11 Mar 2019 21:39:07 +0000 (14:39 -0700)
committer Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
Mon, 11 Mar 2019 21:45:42 +0000 (14:45 -0700)
diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu

index 16d0b71..d858a0e 100644 (file)
--- a/aten/src/ATen/native/cuda/Dropout.cu
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -34,8 +34,10 @@ template <
            typename accscalar_t,
            typename IndexType,
            int ADims>
-#if __CUDA_ARCH__ >= 350 || defined __HIP_PLATFORM_HCC__
+#if __CUDA_ARCH__ >= 350
  C10_LAUNCH_BOUNDS_2(256, 8)
+#elif defined (__HIP_PLATFORM_HCC__)
+C10_LAUNCH_BOUNDS_2(256, 4)
  #endif
  __global__ void
  fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
author	Johannes M Dieterich <johannes.dieterich@amd.com>
	Mon, 11 Mar 2019 21:39:07 +0000 (14:39 -0700)
committer	Facebook Github Bot <facebook-github-bot@users.noreply.github.com>
	Mon, 11 Mar 2019 21:45:42 +0000 (14:45 -0700)