From 4741d613eed2588a13c0f7a331500716393de3de Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <johannes.dieterich@amd.com>
Date: Wed, 20 Mar 2019 07:58:11 -0700
Subject: [PATCH] Do not apply these explicit unroll pragmas for ROCm. (#18204)

Summary:
Loop analysis indicates that there is a runtime trip count and hence
unrolling cannot take place.

This will silence compile-time warnings we have been observing with recent ROCm releases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18204

Differential Revision: D14539875

Pulled By: ezyang

fbshipit-source-id: a7ea7f2a95603754296b76a6b62a154f56f4ad4d
---
 aten/src/ATen/native/cuda/WeightNorm.cu        |  4 ++++
 aten/src/THC/THCSortUtils.cuh                  | 10 ++++++++++
 aten/src/THCUNN/SpatialDepthwiseConvolution.cu |  6 ++++++
 caffe2/operators/top_k_heap_selection.cuh      |  6 ++++++
 4 files changed, 26 insertions(+)

diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu
index 76f4272..151f4c9 100644
--- a/aten/src/ATen/native/cuda/WeightNorm.cu
+++ b/aten/src/ATen/native/cuda/WeightNorm.cu
@@ -45,7 +45,9 @@ __device__ __forceinline__ void reduce_block_into_lanes
     __syncthreads();
   }
 
+#ifndef __HIP_PLATFORM_HCC__
   #pragma unroll
+#endif
   for(int i = (blockSize >> 1); i >= 64; i >>= 1)
   {
     if(tid < i)
@@ -62,7 +64,9 @@ __device__ __forceinline__ void reduce_block_into_lanes
       final = val;
     // __SYNCWARP();
 
+#ifndef __HIP_PLATFORM_HCC__
     #pragma unroll
+#endif
     for(int i = 16; i >= lanes; i >>= 1)
       final = reduceOp(final, WARP_SHFL_DOWN(final, i));
 
diff --git a/aten/src/THC/THCSortUtils.cuh b/aten/src/THC/THCSortUtils.cuh
index c60bfe8..4980ee1 100644
--- a/aten/src/THC/THCSortUtils.cuh
+++ b/aten/src/THC/THCSortUtils.cuh
@@ -60,7 +60,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
                                    V values[Power2SortSize],
                                    bool valid[Power2SortSize],
                                    const Comparator& comp) {
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
   for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
     bool flag = ((threadIdx.x & (size / 2)) != 0);
 
@@ -77,7 +79,9 @@ __device__ inline void bitonicSort(K keys[Power2SortSize],
     }
   }
 
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
   for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
 
     __syncthreads();
@@ -98,11 +102,15 @@ template <typename Comparator, typename K,
 __device__ inline void bitonicSortKeys(K keys[Power2SortSize],
                                    bool valid[Power2SortSize],
                                    const Comparator& comp) {
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
   for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
     bool flag = ((threadIdx.x & (size / 2)) != 0);
 
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
     for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
 
       __syncthreads();
@@ -115,7 +123,9 @@ __device__ inline void bitonicSortKeys(K keys[Power2SortSize],
     }
   }
 
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
   for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
     __syncthreads();
 
diff --git a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu
index 61803f5..baf7610 100644
--- a/aten/src/THCUNN/SpatialDepthwiseConvolution.cu
+++ b/aten/src/THCUNN/SpatialDepthwiseConvolution.cu
@@ -78,7 +78,9 @@ __global__ void spatialDepthwiseConvolutionUpdateOutput(
     const IndexType offset0 = (n * inputChannels + inputChannel) * inputHeight * inputWidth;
 #pragma unroll
     for (int kH = 0; kH < KH_LIMIT; ++kH) {
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
       for (int kW = 0; kW < KW_LIMIT; ++kW) {
         const int h_in = -padHeight + h * strideHeight + kH * dilationHeight;
         const int w_in = -padWidth + w * strideWidth + kW * dilationWidth;
@@ -138,9 +140,13 @@ __global__ void spatialDepthwiseConvolutionUpdateGradInput(
     for (int multiplier = 0; multiplier < depthwiseMultiplier; ++multiplier) {
       int och = (c * depthwiseMultiplier) + multiplier;
       int weightOffset = och * kernelHeight * kernelWidth;
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
       for (int kh = 0; kh < KH_LIMIT; ++kh) {
+#ifdef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
         for (int kw = 0; kw < KW_LIMIT; ++kw) {
           int h_out = h + padHeight - kh * dilationHeight;
           int w_out = w + padWidth - kw * dilationWidth;
diff --git a/caffe2/operators/top_k_heap_selection.cuh b/caffe2/operators/top_k_heap_selection.cuh
index e9c5c0f..921266c 100644
--- a/caffe2/operators/top_k_heap_selection.cuh
+++ b/caffe2/operators/top_k_heap_selection.cuh
@@ -71,7 +71,9 @@ __device__ inline void warpHeapInsert(K k, V v, K* keyHeap, V* valueHeap) {
   // (0 12 3456)
   // log2(8 / 2) = 2 levels of interior nodes for heap size 8 (0 and 12)
   int i = 0;
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
   for (int levels = 0; levels < math::IntegerLog2(HeapSize / 2); ++levels) {
     int leftChild = i * 2 + 1;
     int rightChild = leftChild + 1;
@@ -263,12 +265,16 @@ __global__ void selectRowsViaHeap(
   V vals[Unroll];
 
   for (int i = threadIdx.x; i < n; i += blockDim.x * Unroll) {
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
     for (int j = 0; j < Unroll; ++j) {
       vals[j] = inputStart[i + j * blockDim.x];
     }
 
+#ifndef __HIP_PLATFORM_HCC__
 #pragma unroll
+#endif
     for (int j = 0; j < Unroll; ++j) {
       heap.add(vals[j], (IndexType)i + j * blockDim.x);
     }
-- 
2.7.4