From: Vladislav Vinogradov Date: Fri, 5 Oct 2012 13:43:22 +0000 (+0400) Subject: minor refactoring: X-Git-Tag: accepted/2.0/20130307.220821~364^2~132^2~2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3ebec7448dd616ea71e93eff3833eb7e64e769bd;p=profile%2Fivi%2Fopencv.git minor refactoring: moved lbp.hpp to src/cuda folder added missing cv::gpu::device namespace deleted whitespaces --- diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu index ce5e9ae..9b729fe 100644 --- a/modules/gpu/src/cuda/lbp.cu +++ b/modules/gpu/src/cuda/lbp.cu @@ -42,9 +42,9 @@ #if !defined CUDA_DISABLER -#include -#include -#include +#include "lbp.hpp" +#include "opencv2/gpu/device/vec_traits.hpp" +#include "opencv2/gpu/device/saturate_cast.hpp" namespace cv { namespace gpu { namespace device { @@ -299,4 +299,4 @@ namespace cv { namespace gpu { namespace device } }}} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/cuda/lbp.hpp similarity index 100% rename from modules/gpu/src/opencv2/gpu/device/lbp.hpp rename to modules/gpu/src/cuda/lbp.hpp diff --git a/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp index 61f9f2c..22e639e 100644 --- a/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp +++ b/modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp @@ -1535,6 +1535,8 @@ namespace cv { namespace gpu { namespace device return functor_type(); \ } \ }; + + #undef CV_DESCALE }}} // namespace cv { namespace gpu { namespace device #endif // __OPENCV_GPU_COLOR_DETAIL_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/reduction_detail.hpp similarity index 89% rename from modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp rename to modules/gpu/src/opencv2/gpu/device/detail/reduction_detail.hpp index a0a3750..b700b3f 100644 --- a/modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp +++ b/modules/gpu/src/opencv2/gpu/device/detail/reduction_detail.hpp @@ -40,27 +40,27 @@ // //M*/ -#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__ -#define __OPENCV_GPU_UTILITY_DETAIL_HPP__ +#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { namespace utility_detail { /////////////////////////////////////////////////////////////////////////////// - // Reduction + // Reductor template struct WarpReductor { template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) { if (tid < n) - data[tid] = partial_reduction; + data[tid] = partial_reduction; if (n > 32) __syncthreads(); if (n > 32) { - if (tid < n - 32) + if (tid < n - 32) data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); if (tid < 16) { @@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device } else if (n > 16) { - if (tid < n - 16) + if (tid < n - 16) data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); if (tid < 8) { @@ -85,7 +85,7 @@ namespace cv { namespace gpu { namespace device } else if (n > 8) { - if (tid < n - 8) + if (tid < n - 8) data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); if (tid < 4) { @@ -96,23 +96,23 @@ namespace cv { namespace gpu { namespace device } else if (n > 4) { - if (tid < n - 4) + if (tid < n - 4) data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); if (tid < 2) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); } - } + } else if (n > 2) { - if (tid < n - 2) + if (tid < n - 2) data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); if (tid < 2) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); } - } + } } }; template <> struct WarpReductor<64> @@ -121,15 +121,15 @@ namespace cv { namespace gpu { namespace device { data[tid] = partial_reduction; __syncthreads(); - - if (tid < 32) + + if (tid < 32) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); } } }; @@ -138,14 +138,14 @@ namespace cv { namespace gpu { namespace device template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) { data[tid] = partial_reduction; - - if (tid < 16) + + if (tid < 16) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); } } }; @@ -154,13 +154,13 @@ namespace cv { namespace gpu { namespace device template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) { data[tid] = partial_reduction; - - if (tid < 8) + + if (tid < 8) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); } } }; @@ -169,12 +169,12 @@ namespace cv { namespace gpu { namespace device template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) { data[tid] = partial_reduction; - - if (tid < 4) + + if (tid < 4) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); } } }; @@ -214,11 +214,11 @@ namespace cv { namespace gpu { namespace device /////////////////////////////////////////////////////////////////////////////// // PredValWarpReductor - + template struct PredValWarpReductor; template <> struct PredValWarpReductor<64> { - template + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) { if (tid < 32) @@ -253,14 +253,14 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -272,7 +272,7 @@ namespace cv { namespace gpu { namespace device }; template <> struct PredValWarpReductor<32> { - template + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) { if (tid < 16) @@ -300,14 +300,14 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -320,7 +320,7 @@ namespace cv { namespace gpu { namespace device template <> struct PredValWarpReductor<16> { - template + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) { if (tid < 8) @@ -341,14 +341,14 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -360,7 +360,7 @@ namespace cv { namespace gpu { namespace device }; template <> struct PredValWarpReductor<8> { - template + template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) { if (tid < 4) @@ -374,14 +374,14 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -407,7 +407,7 @@ namespace cv { namespace gpu { namespace device myData = sdata[tid]; myVal = sval[tid]; - if (n >= 512 && tid < 256) + if (n >= 512 && tid < 256) { T reg = sdata[tid + 256]; @@ -416,9 +416,9 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 256]; } - __syncthreads(); + __syncthreads(); } - if (n >= 256 && tid < 128) + if (n >= 256 && tid < 128) { T reg = sdata[tid + 128]; @@ -427,9 +427,9 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 128]; } - __syncthreads(); + __syncthreads(); } - if (n >= 128 && tid < 64) + if (n >= 128 && tid < 64) { T reg = sdata[tid + 64]; @@ -438,13 +438,13 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 64]; } - __syncthreads(); - } + __syncthreads(); + } if (tid < 32) { - if (n >= 64) - { + if (n >= 64) + { T reg = sdata[tid + 32]; if (pred(reg, myData)) @@ -453,8 +453,8 @@ namespace cv { namespace gpu { namespace device sval[tid] = myVal = sval[tid + 32]; } } - if (n >= 32) - { + if (n >= 32) + { T reg = sdata[tid + 16]; if (pred(reg, myData)) @@ -463,8 +463,8 @@ namespace cv { namespace gpu { namespace device sval[tid] = myVal = sval[tid + 16]; } } - if (n >= 16) - { + if (n >= 16) + { T reg = sdata[tid + 8]; if (pred(reg, myData)) @@ -473,8 +473,8 @@ namespace cv { namespace gpu { namespace device sval[tid] = myVal = sval[tid + 8]; } } - if (n >= 8) - { + if (n >= 8) + { T reg = sdata[tid + 4]; if (pred(reg, myData)) @@ -483,18 +483,18 @@ namespace cv { namespace gpu { namespace device sval[tid] = myVal = sval[tid + 4]; } } - if (n >= 4) - { + if (n >= 4) + { T reg = sdata[tid + 2]; if (pred(reg, myData)) { sdata[tid] = myData = reg; sval[tid] = myVal = sval[tid + 2]; - } + } } - if (n >= 2) - { + if (n >= 2) + { T reg = sdata[tid + 1]; if (pred(reg, myData)) @@ -513,7 +513,7 @@ namespace cv { namespace gpu { namespace device template struct PredVal2WarpReductor; template <> struct PredVal2WarpReductor<64> { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { if (tid < 32) @@ -553,7 +553,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { @@ -561,7 +561,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -574,7 +574,7 @@ namespace cv { namespace gpu { namespace device }; template <> struct PredVal2WarpReductor<32> { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { if (tid < 16) @@ -606,7 +606,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { @@ -614,7 +614,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -628,7 +628,7 @@ namespace cv { namespace gpu { namespace device template <> struct PredVal2WarpReductor<16> { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { if (tid < 8) @@ -652,7 +652,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { @@ -660,7 +660,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -673,7 +673,7 @@ namespace cv { namespace gpu { namespace device }; template <> struct PredVal2WarpReductor<8> { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { if (tid < 4) @@ -689,7 +689,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; } - + reg = sdata[tid + 2]; if (pred(reg, myData)) { @@ -697,7 +697,7 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; } - + reg = sdata[tid + 1]; if (pred(reg, myData)) { @@ -712,7 +712,7 @@ namespace cv { namespace gpu { namespace device template struct PredVal2ReductionDispatcher; template <> struct PredVal2ReductionDispatcher { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { PredVal2WarpReductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); @@ -720,14 +720,14 @@ namespace cv { namespace gpu { namespace device }; template <> struct PredVal2ReductionDispatcher { - template + template static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) { myData = sdata[tid]; myVal1 = sval1[tid]; myVal2 = sval2[tid]; - if (n >= 512 && tid < 256) + if (n >= 512 && tid < 256) { T reg = sdata[tid + 256]; @@ -737,9 +737,9 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 256]; sval2[tid] = myVal2 = sval2[tid + 256]; } - __syncthreads(); + __syncthreads(); } - if (n >= 256 && tid < 128) + if (n >= 256 && tid < 128) { T reg = sdata[tid + 128]; @@ -749,9 +749,9 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 128]; sval2[tid] = myVal2 = sval2[tid + 128]; } - __syncthreads(); + __syncthreads(); } - if (n >= 128 && tid < 64) + if (n >= 128 && tid < 64) { T reg = sdata[tid + 64]; @@ -761,13 +761,13 @@ namespace cv { namespace gpu { namespace device sval1[tid] = myVal1 = sval1[tid + 64]; sval2[tid] = myVal2 = sval2[tid + 64]; } - __syncthreads(); - } + __syncthreads(); + } if (tid < 32) { - if (n >= 64) - { + if (n >= 64) + { T reg = sdata[tid + 32]; if (pred(reg, myData)) @@ -777,8 +777,8 @@ namespace cv { namespace gpu { namespace device sval2[tid] = myVal2 = sval2[tid + 32]; } } - if (n >= 32) - { + if (n >= 32) + { T reg = sdata[tid + 16]; if (pred(reg, myData)) @@ -788,8 +788,8 @@ namespace cv { namespace gpu { namespace device sval2[tid] = myVal2 = sval2[tid + 16]; } } - if (n >= 16) - { + if (n >= 16) + { T reg = sdata[tid + 8]; if (pred(reg, myData)) @@ -799,8 +799,8 @@ namespace cv { namespace gpu { namespace device sval2[tid] = myVal2 = sval2[tid + 8]; } } - if (n >= 8) - { + if (n >= 8) + { T reg = sdata[tid + 4]; if (pred(reg, myData)) @@ -810,8 +810,8 @@ namespace cv { namespace gpu { namespace device sval2[tid] = myVal2 = sval2[tid + 4]; } } - if (n >= 4) - { + if (n >= 4) + { T reg = sdata[tid + 2]; if (pred(reg, myData)) @@ -819,10 +819,10 @@ namespace cv { namespace gpu { namespace device sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; - } + } } - if (n >= 2) - { + if (n >= 2) + { T reg = sdata[tid + 1]; if (pred(reg, myData)) @@ -838,4 +838,4 @@ namespace cv { namespace gpu { namespace device } // namespace utility_detail }}} // namespace cv { namespace gpu { namespace device -#endif // __OPENCV_GPU_UTILITY_DETAIL_HPP__ +#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp index f5eaefb..9c7bbd9 100644 --- a/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp +++ b/modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp @@ -47,7 +47,7 @@ #include "../vec_traits.hpp" #include "../functional.hpp" -namespace cv { namespace gpu { namespace device +namespace cv { namespace gpu { namespace device { namespace transform_detail { @@ -203,7 +203,7 @@ namespace cv { namespace gpu { namespace device }; template - __global__ static void transformSmart(const PtrStepSz src_, PtrStep dst_, const Mask mask, const UnOp op) + static __global__ void transformSmart(const PtrStepSz src_, PtrStep dst_, const Mask mask, const UnOp op) { typedef TransformFunctorTraits ft; typedef typename UnaryReadWriteTraits::read_type read_type; @@ -239,10 +239,10 @@ namespace cv { namespace gpu { namespace device } template - static __global__ void transformSimple(const PtrStepSz src, PtrStep dst, const Mask mask, const UnOp op) + __global__ static void transformSimple(const PtrStepSz src, PtrStep dst, const Mask mask, const UnOp op) { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; if (x < src.cols && y < src.rows && mask(y, x)) { @@ -251,7 +251,7 @@ namespace cv { namespace gpu { namespace device } template - __global__ static void transformSmart(const PtrStepSz src1_, const PtrStep src2_, PtrStep dst_, + static __global__ void transformSmart(const PtrStepSz src1_, const PtrStep src2_, PtrStep dst_, const Mask mask, const BinOp op) { typedef TransformFunctorTraits ft; @@ -274,7 +274,7 @@ namespace cv { namespace gpu { namespace device const read_type1 src1_n_el = ((const read_type1*)src1)[x]; const read_type2 src2_n_el = ((const read_type2*)src2)[x]; write_type dst_n_el = ((const write_type*)dst)[x]; - + OpUnroller::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y); ((write_type*)dst)[x] = dst_n_el; @@ -291,11 +291,11 @@ namespace cv { namespace gpu { namespace device } template - static __global__ void transformSimple(const PtrStepSz src1, const PtrStep src2, PtrStep dst, + static __global__ void transformSimple(const PtrStepSz src1, const PtrStep src2, PtrStep dst, const Mask mask, const BinOp op) { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; if (x < src1.cols && y < src1.rows && mask(y, x)) { @@ -314,13 +314,13 @@ namespace cv { namespace gpu { namespace device typedef TransformFunctorTraits ft; const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1); - const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1); + const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1); transformSimple<<>>(src, dst, mask, op); cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } template @@ -329,13 +329,13 @@ namespace cv { namespace gpu { namespace device typedef TransformFunctorTraits ft; const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1); - const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1); + const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1); transformSimple<<>>(src1, src2, dst, mask, op); cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } }; template<> struct TransformDispatcher @@ -347,7 +347,7 @@ namespace cv { namespace gpu { namespace device StaticAssert::check(); - if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) || + if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) || !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D))) { TransformDispatcher::call(src, dst, op, mask, stream); @@ -355,7 +355,7 @@ namespace cv { namespace gpu { namespace device } const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); - const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); + const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1); transformSmart<<>>(src, dst, mask, op); cudaSafeCall( cudaGetLastError() ); @@ -380,15 +380,15 @@ namespace cv { namespace gpu { namespace device } const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1); - const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); + const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1); transformSmart<<>>(src1, src2, dst, mask, op); cudaSafeCall( cudaGetLastError() ); if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + cudaSafeCall( cudaDeviceSynchronize() ); } - }; + }; } // namespace transform_detail }}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/opencv2/gpu/device/scan.hpp b/modules/gpu/src/opencv2/gpu/device/scan.hpp index b55ff41..f6dc693 100644 --- a/modules/gpu/src/opencv2/gpu/device/scan.hpp +++ b/modules/gpu/src/opencv2/gpu/device/scan.hpp @@ -43,124 +43,129 @@ #ifndef __OPENCV_GPU_SCAN_HPP__ #define __OPENCV_GPU_SCAN_HPP__ - enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 }; +#include "common.hpp" - template struct WarpScan - { - __device__ __forceinline__ WarpScan() {} - __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; } - - __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) - { - const unsigned int lane = idx & 31; - F op; +namespace cv { namespace gpu { namespace device +{ + enum ScanKind { EXCLUSIVE = 0, INCLUSIVE = 1 }; - if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]); - if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]); - if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]); - if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]); - if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]); + template struct WarpScan + { + __device__ __forceinline__ WarpScan() {} + __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; } - if( Kind == INCLUSIVE ) - return ptr [idx]; - else - return (lane > 0) ? ptr [idx - 1] : 0; - } + __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) + { + const unsigned int lane = idx & 31; + F op; + + if ( lane >= 1) ptr [idx ] = op(ptr [idx - 1], ptr [idx]); + if ( lane >= 2) ptr [idx ] = op(ptr [idx - 2], ptr [idx]); + if ( lane >= 4) ptr [idx ] = op(ptr [idx - 4], ptr [idx]); + if ( lane >= 8) ptr [idx ] = op(ptr [idx - 8], ptr [idx]); + if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]); + + if( Kind == INCLUSIVE ) + return ptr [idx]; + else + return (lane > 0) ? ptr [idx - 1] : 0; + } + + __device__ __forceinline__ unsigned int index(const unsigned int tid) + { + return tid; + } - __device__ __forceinline__ unsigned int index(const unsigned int tid) - { - return tid; - } + __device__ __forceinline__ void init(volatile T *ptr){} - __device__ __forceinline__ void init(volatile T *ptr){} + static const int warp_offset = 0; - static const int warp_offset = 0; + typedef WarpScan merge; + }; - typedef WarpScan merge; - }; + template struct WarpScanNoComp + { + __device__ __forceinline__ WarpScanNoComp() {} + __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; } - template struct WarpScanNoComp + __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) + { + const unsigned int lane = threadIdx.x & 31; + F op; + + ptr [idx ] = op(ptr [idx - 1], ptr [idx]); + ptr [idx ] = op(ptr [idx - 2], ptr [idx]); + ptr [idx ] = op(ptr [idx - 4], ptr [idx]); + ptr [idx ] = op(ptr [idx - 8], ptr [idx]); + ptr [idx ] = op(ptr [idx - 16], ptr [idx]); + + if( Kind == INCLUSIVE ) + return ptr [idx]; + else + return (lane > 0) ? ptr [idx - 1] : 0; + } + + __device__ __forceinline__ unsigned int index(const unsigned int tid) { - __device__ __forceinline__ WarpScanNoComp() {} - __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; } - - __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx) - { - const unsigned int lane = threadIdx.x & 31; - F op; - - ptr [idx ] = op(ptr [idx - 1], ptr [idx]); - ptr [idx ] = op(ptr [idx - 2], ptr [idx]); - ptr [idx ] = op(ptr [idx - 4], ptr [idx]); - ptr [idx ] = op(ptr [idx - 8], ptr [idx]); - ptr [idx ] = op(ptr [idx - 16], ptr [idx]); - - if( Kind == INCLUSIVE ) - return ptr [idx]; - else - return (lane > 0) ? ptr [idx - 1] : 0; - } - - __device__ __forceinline__ unsigned int index(const unsigned int tid) - { - return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask); - } - - __device__ __forceinline__ void init(volatile T *ptr) - { - ptr[threadIdx.x] = 0; - } - - static const int warp_smem_stride = 32 + 16 + 1; - static const int warp_offset = 16; - static const int warp_log = 5; - static const int warp_mask = 31; - - typedef WarpScanNoComp merge; - }; - - template struct BlockScan + return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask); + } + + __device__ __forceinline__ void init(volatile T *ptr) { - __device__ __forceinline__ BlockScan() {} - __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; } + ptr[threadIdx.x] = 0; + } + + static const int warp_smem_stride = 32 + 16 + 1; + static const int warp_offset = 16; + static const int warp_log = 5; + static const int warp_mask = 31; + + typedef WarpScanNoComp merge; + }; - __device__ __forceinline__ T operator()(volatile T *ptr) - { - const unsigned int tid = threadIdx.x; - const unsigned int lane = tid & warp_mask; - const unsigned int warp = tid >> warp_log; + template struct BlockScan + { + __device__ __forceinline__ BlockScan() {} + __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; } + + __device__ __forceinline__ T operator()(volatile T *ptr) + { + const unsigned int tid = threadIdx.x; + const unsigned int lane = tid & warp_mask; + const unsigned int warp = tid >> warp_log; - Sc scan; - typename Sc::merge merge_scan; - const unsigned int idx = scan.index(tid); + Sc scan; + typename Sc::merge merge_scan; + const unsigned int idx = scan.index(tid); - T val = scan(ptr, idx); - __syncthreads (); + T val = scan(ptr, idx); + __syncthreads (); - if( warp == 0) - scan.init(ptr); - __syncthreads (); + if( warp == 0) + scan.init(ptr); + __syncthreads (); - if( lane == 31 ) - ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx]; - __syncthreads (); + if( lane == 31 ) + ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx]; + __syncthreads (); - if( warp == 0 ) - merge_scan(ptr, idx); - __syncthreads(); + if( warp == 0 ) + merge_scan(ptr, idx); + __syncthreads(); - if ( warp > 0) - val = ptr [scan.warp_offset + warp - 1] + val; - __syncthreads (); + if ( warp > 0) + val = ptr [scan.warp_offset + warp - 1] + val; + __syncthreads (); - ptr[idx] = val; - __syncthreads (); + ptr[idx] = val; + __syncthreads (); - return val ; - } + return val ; + } - static const int warp_log = 5; - static const int warp_mask = 31; - }; + static const int warp_log = 5; + static const int warp_mask = 31; + }; +}}} -#endif \ No newline at end of file +#endif // __OPENCV_GPU_SCAN_HPP__ diff --git a/modules/gpu/src/opencv2/gpu/device/static_check.hpp b/modules/gpu/src/opencv2/gpu/device/static_check.hpp index 178c0f7..db472d5 100644 --- a/modules/gpu/src/opencv2/gpu/device/static_check.hpp +++ b/modules/gpu/src/opencv2/gpu/device/static_check.hpp @@ -60,10 +60,8 @@ namespace cv { namespace gpu __OPENCV_GPU_HOST_DEVICE__ static void check() {}; }; } - - using ::cv::gpu::device::Static; }} #undef __OPENCV_GPU_HOST_DEVICE__ -#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */ \ No newline at end of file +#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */ diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp index 78d82e3..072f42d 100644 --- a/modules/gpu/src/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp @@ -45,7 +45,7 @@ #include "saturate_cast.hpp" #include "datamov_utils.hpp" -#include "detail/utility_detail.hpp" +#include "detail/reduction_detail.hpp" namespace cv { namespace gpu { namespace device {