}\r
\r
\r
+ template <int size, typename T>\r
+ __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const unsigned int tid)\r
+ {\r
+ if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }\r
+\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) merge(tid, 32, minval, maxval);\r
+ if (size >= 32) merge(tid, 16, minval, maxval);\r
+ if (size >= 16) merge(tid, 8, minval, maxval);\r
+ if (size >= 8) merge(tid, 4, minval, maxval);\r
+ if (size >= 4) merge(tid, 2, minval, maxval);\r
+ if (size >= 2) merge(tid, 1, minval, maxval);\r
+ }\r
+ }\r
+\r
+\r
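For reference, the two-pair `merge` helper this reduction relies on is defined elsewhere in the file and is not part of this hunk. A minimal sketch of what it presumably looks like:

    template <typename T>
    __device__ void merge(unsigned int tid, unsigned int offset,
                          volatile T* minval, volatile T* maxval)
    {
        // Fold the element `offset` slots away into this thread's slot.
        minval[tid] = min(minval[tid], minval[tid + offset]);
        maxval[tid] = max(maxval[tid], maxval[tid + offset]);
    }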
template <int nthreads, typename T>\r
__global__ void min_max_kernel(const DevMem2D src, T* minval, T* maxval)\r
{\r
\r
sminval[tid] = mymin;\r
smaxval[tid] = mymax;\r
-\r
__syncthreads();\r
\r
- if (nthreads >= 512) if (tid < 256) { merge(tid, 256, sminval, smaxval); __syncthreads(); }\r
- if (nthreads >= 256) if (tid < 128) { merge(tid, 128, sminval, smaxval); __syncthreads(); }\r
- if (nthreads >= 128) if (tid < 64) { merge(tid, 64, sminval, smaxval); __syncthreads(); }\r
-\r
- if (tid < 32)\r
- {\r
- if (nthreads >= 64) merge(tid, 32, sminval, smaxval);\r
- if (nthreads >= 32) merge(tid, 16, sminval, smaxval);\r
- if (nthreads >= 16) merge(tid, 8, sminval, smaxval);\r
- if (nthreads >= 8) merge(tid, 4, sminval, smaxval);\r
- if (nthreads >= 4) merge(tid, 2, sminval, smaxval);\r
- if (nthreads >= 2) merge(tid, 1, sminval, smaxval);\r
- }\r
+ find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);\r
\r
- if (tid == 0) 
- {
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
- }
\r
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
- \r
- // Process partial results in the first thread of the last block \r
- if ((gridDim.x > 1 || gridDim.y > 1) && tid == 0)\r
- {\r
- __threadfence();\r
- if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)\r
+ __shared__ bool is_last;\r
+\r
+ if (tid == 0)\r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ __threadfence();\r
+\r
+ unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
+ unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);\r
+\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ __syncthreads();\r
+\r
+ find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);\r
+\r
+ if (tid == 0) \r
{\r
- mymin = minval[0];\r
- mymax = maxval[0];\r
- for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)\r
- { \r
- mymin = min(mymin, minval[i]);\r
- mymax = max(mymax, maxval[i]);\r
- }\r
- minval[0] = mymin;\r
- maxval[0] = mymax;\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
}\r
+ }\r
+#else\r
+ if (tid == 0) \r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
}\r
-\r
#endif\r
}\r
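The `blocks_finished` counter used above is assumed to be a file-scope device variable, reset by the host before each launch (see the cudaMemcpyToSymbol call further down):

    // Assumed declaration, not shown in this hunk:
    __device__ unsigned int blocks_finished = 0;

The ordering argument: `__threadfence()` makes a block's partial result visible device-wide before its ticket is drawn, so the block that draws the last ticket is guaranteed to observe every other block's partials.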
\r
\r
\r
// This kernel will be used only when compute capability is 1.0\r
- template <typename T>\r
+ template <int nthreads, typename T>\r
__global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)\r
{\r
- T val;\r
- T mymin = minval[0];\r
- T mymax = maxval[0];\r
- for (unsigned int i = 1; i < size; ++i)\r
- { \r
- val = minval[i]; if (val < mymin) mymin = val;\r
- val = maxval[i]; if (val > mymax) mymax = val;\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+ \r
+ unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ unsigned int idx = min(tid, size - 1); // clamp: extra threads reload the last partial
+\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ __syncthreads();\r
+\r
+ find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
}\r
- minval[0] = mymin;\r
- maxval[0] = mymax;\r
}\r
\r
\r
\r
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));\r
min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);\r
- min_max_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
+ min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
}\r
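Note that the second pass clamps its load index instead of bounds-checking it: threads beyond `size` simply reload the last partial, which is harmless for min/max. This reduces all partials only if grid.x * grid.y <= 256, so the first-pass grid must be capped at nthreads blocks. An illustrative estimator under that assumption (the name, the divUp helper, and the 32x8 block shape are placeholders, not from this patch):

    #include <algorithm> // std::min

    void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
    {
        threads = dim3(32, 8); // 256 threads per block, matching nthreads
        grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
        grid.x = std::min(grid.x, threads.x); // cap so grid.x * grid.y <= 256,
        grid.y = std::min(grid.y, threads.y); // i.e. at most nthreads partials
    }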
\r
\r
+ template <int size, typename T>\r
+ __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile unsigned int* minloc, \r
+ volatile unsigned int* maxloc, const unsigned int tid)\r
+ {\r
+ if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }\r
+\r
+ if (tid < 32)\r
+ {\r
+ if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);\r
+ if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);\r
+ if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);\r
+ if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);\r
+ if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);\r
+ if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);\r
+ }\r
+ }\r
+\r
+\r
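As above, the location-tracking `merge` overload is defined elsewhere in the file; a plausible sketch, in which a winning neighbour's location is carried along with its value:

    template <typename T>
    __device__ void merge(unsigned int tid, unsigned int offset,
                          volatile T* minval, volatile T* maxval,
                          volatile unsigned int* minloc, volatile unsigned int* maxloc)
    {
        T val = minval[tid + offset];
        if (val < minval[tid])
        {
            minval[tid] = val;                   // neighbour holds a smaller value:
            minloc[tid] = minloc[tid + offset];  // take its location too
        }
        val = maxval[tid + offset];
        if (val > maxval[tid])
        {
            maxval[tid] = val;
            maxloc[tid] = maxloc[tid + offset];
        }
    }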
template <int nthreads, typename T>\r
__global__ void min_max_loc_kernel(const DevMem2D src, T* minval, T* maxval, \r
unsigned int* minloc, unsigned int* maxloc)\r
smaxval[tid] = mymax;\r
sminloc[tid] = myminloc;\r
smaxloc[tid] = mymaxloc;\r
-\r
__syncthreads();\r
\r
- if (nthreads >= 512) if (tid < 256) { merge(tid, 256, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }\r
- if (nthreads >= 256) if (tid < 128) { merge(tid, 128, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }\r
- if (nthreads >= 128) if (tid < 64) { merge(tid, 64, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }\r
+ find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
\r
- if (tid < 32)\r
- {\r
- if (nthreads >= 64) merge(tid, 32, sminval, smaxval, sminloc, smaxloc);\r
- if (nthreads >= 32) merge(tid, 16, sminval, smaxval, sminloc, smaxloc);\r
- if (nthreads >= 16) merge(tid, 8, sminval, smaxval, sminloc, smaxloc);\r
- if (nthreads >= 8) merge(tid, 4, sminval, smaxval, sminloc, smaxloc);\r
- if (nthreads >= 4) merge(tid, 2, sminval, smaxval, sminloc, smaxloc);\r
- if (nthreads >= 2) merge(tid, 1, sminval, smaxval, sminloc, smaxloc);\r
- }\r
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
+ __shared__ bool is_last;\r
\r
- if (tid == 0) \r
- {\r
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ if (tid == 0)\r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
- }\r
+ __threadfence();\r
\r
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
- \r
- // Process partial results in the first thread of the last block \r
- if ((gridDim.x > 1 || gridDim.y > 1) && tid == 0)\r
- {\r
- __threadfence();\r
- if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)\r
+ unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);\r
+ is_last = ticket == gridDim.x * gridDim.y - 1;\r
+ }\r
+\r
+ __syncthreads();\r
+\r
+ if (is_last)\r
+ {\r
+ unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);\r
+\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ sminloc[tid] = minloc[idx];\r
+ smaxloc[tid] = maxloc[idx];\r
+ __syncthreads();\r
+\r
+ find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+\r
+ if (tid == 0) \r
{\r
- mymin = minval[0];\r
- mymax = maxval[0];\r
- unsigned int imin = 0, imax = 0;\r
- for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)\r
- { \r
- val = minval[i]; if (val < mymin) { mymin = val; imin = i; }\r
- val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }\r
- }\r
- minval[0] = mymin;\r
- maxval[0] = mymax;\r
- minloc[0] = minloc[imin];\r
- maxloc[0] = maxloc[imax];\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
+ minloc[0] = sminloc[0];\r
+ maxloc[0] = smaxloc[0];\r
}\r
+ }\r
+#else\r
+ if (tid == 0) \r
+ {\r
+ minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];\r
+ maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];\r
+ minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];\r
+ maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];\r
}\r
#endif\r
}\r
\r
\r
// This kernel will be used only when compute capability is 1.0\r
- template <typename T>\r
+ template <int nthreads, typename T>\r
__global__ void min_max_loc_kernel_2ndstep(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int size)\r
{\r
- T val;\r
- T mymin = minval[0];\r
- T mymax = maxval[0];\r
- unsigned int imin = 0, imax = 0;\r
- for (unsigned int i = 1; i < size; ++i)\r
- { \r
- val = minval[i]; if (val < mymin) { mymin = val; imin = i; }\r
- val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }\r
+ typedef typename MinMaxTypeTraits<T>::best_type best_type;\r
+ __shared__ best_type sminval[nthreads];\r
+ __shared__ best_type smaxval[nthreads];\r
+ __shared__ unsigned int sminloc[nthreads];\r
+ __shared__ unsigned int smaxloc[nthreads];\r
+\r
+ unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
+ unsigned int idx = min(tid, size - 1); // clamp: extra threads reload the last partial
+\r
+ sminval[tid] = minval[idx];\r
+ smaxval[tid] = maxval[idx];\r
+ sminloc[tid] = minloc[idx];\r
+ smaxloc[tid] = maxloc[idx];\r
+ __syncthreads();\r
+\r
+ find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);\r
+\r
+ if (tid == 0) \r
+ {\r
+ minval[0] = (T)sminval[0];\r
+ maxval[0] = (T)smaxval[0];\r
+ minloc[0] = sminloc[0];\r
+ maxloc[0] = smaxloc[0];\r
}\r
- minval[0] = mymin;\r
- maxval[0] = mymax;\r
- minloc[0] = minloc[imin];\r
- maxloc[0] = maxloc[imax];\r
}\r
\r
\r
\r
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));\r
min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);\r
- min_max_loc_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
+ min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);\r
cudaSafeCall(cudaThreadSynchronize());\r
\r
T minval_, maxval_;\r
\r
\r
template <int size, typename T>\r
- __device__ void sum_shared_mem(volatile T* data, const unsigned int tid)\r
+ __device__ void sum_in_smem(volatile T* data, const unsigned int tid)
{\r
T sum = data[tid];\r
\r
- if (size >= 512) if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads();\r
- if (size >= 256) if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads();\r
- if (size >= 128) if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads();\r
+ if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }\r
+ if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }\r
+ if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }\r
\r
if (tid < 32)\r
{\r
scount[tid] = cnt;\r
__syncthreads();\r
\r
- sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
+ sum_in_smem<nthreads, unsigned int>(scount, tid);
\r
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110\r
__shared__ bool is_last;\r
\r
if (is_last)\r
{\r
- scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;\r
- sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
+ scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;\r
+ __syncthreads();\r
+\r
+ sum_in_smem<nthreads, unsigned int>(scount, tid);
+\r
if (tid == 0) count[0] = scount[0];\r
}\r
#else\r
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;\r
\r
scount[tid] = tid < size ? count[tid] : 0;\r
- sum_shared_mem<nthreads, unsigned int>(scount, tid);\r
+ sum_in_smem<nthreads, unsigned int>(scount, tid);
\r
if (tid == 0) count[0] = scount[0];\r
}\r
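The warp-synchronous tail of sum_in_smem is elided by the hunk above; it presumably mirrors the min/max reductions, relying on `data` being declared volatile for intra-warp visibility:

    if (tid < 32)
    {
        if (size >= 64) data[tid] = sum = sum + data[tid + 32];
        if (size >= 32) data[tid] = sum = sum + data[tid + 16];
        if (size >= 16) data[tid] = sum = sum + data[tid + 8];
        if (size >= 8) data[tid] = sum = sum + data[tid + 4];
        if (size >= 4) data[tid] = sum = sum + data[tid + 2];
        if (size >= 2) data[tid] = sum = sum + data[tid + 1];
    }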