#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__

namespace cv { namespace gpu { namespace device
{
    // Sum-reduces the 32 consecutive elements ptr[warpBase .. warpBase+31]
    // belonging to the calling thread's warp, where warpBase = tid - (tid & 31).
    //
    // ptr: buffer holding one value per thread; must have at least 32 valid
    //      elements for this warp (lanes 0..15 read ptr[tid + 16], so the
    //      upper half of the warp's slots must be populated). Presumably
    //      shared memory in practice -- the volatile qualifier is what forces
    //      the per-step loads/stores to be re-read instead of cached in
    //      registers.
    // tid: linear thread index within the block (defaults to threadIdx.x).
    //
    // Returns the warp-wide sum (the value accumulated into the warp's lane-0
    // slot). All 32 lanes of the warp receive the same result. The input
    // slots ptr[warpBase .. warpBase+15] are clobbered with partial sums.
    //
    // NOTE(review): this is a classic implicit-warp-synchronous tree
    // reduction (no barriers between steps). That idiom is only safe on
    // pre-Volta hardware; under independent thread scheduling (SM70+) it
    // needs __syncwarp() between steps or a __shfl_down_sync-based rewrite.
    // Left as-is to preserve the original contract -- confirm target
    // architectures before reusing.
    template <class T>
    __device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
    {
        const unsigned int lane = tid & 31; // index of thread in warp (0..31)

        if (lane < 16)
        {
            T partial = ptr[tid];

            ptr[tid] = partial = partial + ptr[tid + 16];
            ptr[tid] = partial = partial + ptr[tid + 8];
            ptr[tid] = partial = partial + ptr[tid + 4];
            ptr[tid] = partial = partial + ptr[tid + 2];
            ptr[tid] = partial = partial + ptr[tid + 1];
        }

        return ptr[tid - lane];
    }
}}} // namespace cv { namespace gpu { namespace device

#endif // OPENCV_GPU_WARP_REDUCE_HPP__