From 4741d613eed2588a13c0f7a331500716393de3de Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Wed, 20 Mar 2019 07:58:11 -0700 Subject: [PATCH] Do not apply these explicit unroll pragmas for ROCm. (#18204) Summary: Loop analysis indicates that there is a runtime trip count and hence unrolling cannot take place. This will silence compile-time warnings we have been observing with recent ROCm releases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/18204 Differential Revision: D14539875 Pulled By: ezyang fbshipit-source-id: a7ea7f2a95603754296b76a6b62a154f56f4ad4d --- aten/src/ATen/native/cuda/WeightNorm.cu | 4 ++++ aten/src/THC/THCSortUtils.cuh | 10 ++++++++++ aten/src/THCUNN/SpatialDepthwiseConvolution.cu | 6 ++++++ caffe2/operators/top_k_heap_selection.cuh | 6 ++++++ 4 files changed, 26 insertions(+) diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index 76f4272..151f4c9 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -45,7 +45,9 @@ __device__ __forceinline__ void reduce_block_into_lanes __syncthreads(); } +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for(int i = (blockSize >> 1); i >= 64; i >>= 1) { if(tid < i) @@ -62,7 +64,9 @@ __device__ __forceinline__ void reduce_block_into_lanes final = val; // __SYNCWARP(); +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for(int i = 16; i >= lanes; i >>= 1) final = reduceOp(final, WARP_SHFL_DOWN(final, i)); diff --git a/aten/src/THC/THCSortUtils.cuh b/aten/src/THC/THCSortUtils.cuh index c60bfe8..4980ee1 100644 --- a/aten/src/THC/THCSortUtils.cuh +++ b/aten/src/THC/THCSortUtils.cuh @@ -60,7 +60,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize], V values[Power2SortSize], bool valid[Power2SortSize], const Comparator& comp) { +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (unsigned int size = 2; size < Power2SortSize; size *= 2) { bool flag = ((threadIdx.x & (size / 2)) != 0); @@ -77,7 +79,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize], } } +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { __syncthreads(); @@ -98,11 +102,15 @@ template 0; stride /= 2) { __syncthreads(); @@ -115,7 +123,9 @@ __device__ inline void bitonicSortKeys(K keys[Power2SortSize], } } +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { __syncthreads(); diff --git a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu index 61803f5..baf7610 100644 --- a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu @@ -78,7 +78,9 @@ __global__ void spatialDepthwiseConvolutionUpdateOutput( const IndexType offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth; #pragma unroll for (int kH = 0; kH < KH_LIMIT; ++kH) { +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int kW = 0; kW < KW_LIMIT; ++kW) { const int h_in = -padHeight + h * strideHeight + kH * dilationHeight; const int w_in = -padWidth + w * strideWidth + kW * dilationWidth; @@ -138,9 +140,13 @@ __global__ void spatialDepthwiseConvolutionUpdateGradInput( for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) { int och = (c * depthwiseMultiplier) + multiplier; int weightOffset = och * kernelHeight * kernelWidth; +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int kh = 0; kh < KH_LIMIT; ++kh) { +#ifdef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int kw = 0; kw < KW_LIMIT; ++kw) { int h_out = h + padHeight - kh * dilationHeight; int w_out = w + padWidth - kw * dilationWidth; diff --git a/caffe2/operators/top_k_heap_selection.cuh b/caffe2/operators/top_k_heap_selection.cuh index e9c5c0f..921266c 100644 --- a/caffe2/operators/top_k_heap_selection.cuh +++ b/caffe2/operators/top_k_heap_selection.cuh @@ -71,7 +71,9 @@ __device__ inline void warpHeapInsert(K k, V v, K* keyHeap, V* valueHeap) { // (0 12 3456) // log2(8 / 2) = 2 levels of interior nodes for heap size 8 (0 and 12) int i = 0; +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int levels = 0; levels < math::IntegerLog2(HeapSize / 2); ++levels) { int leftChild = i * 2 + 1; int rightChild = leftChild + 1; @@ -263,12 +265,16 @@ __global__ void selectRowsViaHeap( V vals[Unroll]; for (int i = threadIdx.x; i < n; i += blockDim.x * Unroll) { +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int j = 0; j < Unroll; ++j) { vals[j] = inputStart[i + j * blockDim.x]; } +#ifndef __HIP_PLATFORM_HCC__ #pragma unroll +#endif for (int j = 0; j < Unroll; ++j) { heap.add(vals[j], (IndexType)i + j * blockDim.x); } -- 2.7.4