From: Alexey Spizhevoy
Date: Mon, 20 Dec 2010 08:06:13 +0000 (+0000)
Subject: updated gpu bitwise operations
X-Git-Tag: accepted/2.0/20130307.220821~3794
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e62bf3a2ae9a8deaed3131fcd7709f472af135e8;p=profile%2Fivi%2Fopencv.git

updated gpu bitwise operations
---

diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index ba64111..8a7abb8 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -1002,18 +1002,25 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
 namespace cv { namespace gpu { namespace mathfunc
 {
-    void bitwise_not_caller(int rows, int cols, const PtrStep src, int elemSize, PtrStep dst, cudaStream_t stream);
-    void bitwise_not_caller(int rows, int cols, const PtrStep src, int elemSize, PtrStep dst, const PtrStep mask, cudaStream_t stream);
-    void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, cudaStream_t stream);
-    void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, const PtrStep mask, cudaStream_t stream);
-    void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, cudaStream_t stream);
-    void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, const PtrStep mask, cudaStream_t stream);
-    void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, cudaStream_t stream);
-    void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elemSize, PtrStep dst, const PtrStep mask, cudaStream_t stream);
-
-
-    template <typename Mask>
-    void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst, int elem_size, Mask mask, cudaStream_t stream);
+    void bitwise_not_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+
+    void bitwise_or_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+
+    void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
+
+    void bitwise_xor_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream);
+
+    template <typename T>
+    void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream);
 }}}

 namespace
@@ -1021,60 +1028,123 @@ namespace
     void bitwise_not_caller(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
     {
         dst.create(src.size(), src.type());
-        mathfunc::bitwise_not_caller(src.rows, src.cols, src, src.elemSize(), dst, stream);
+
+        cv::gpu::mathfunc::bitwise_not_caller(src.rows, src.cols, src.elemSize1(),
+                                              dst.channels(), src, dst, stream);
     }

+
     void bitwise_not_caller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwise_mask_not_caller<uchar>, mathfunc::bitwise_mask_not_caller<uchar>,
+                                   mathfunc::bitwise_mask_not_caller<ushort>, mathfunc::bitwise_mask_not_caller<ushort>,
+                                   mathfunc::bitwise_mask_not_caller<uint>, mathfunc::bitwise_mask_not_caller<uint>,
+                                   mathfunc::bitwise_mask_not_caller<uint>};
+
         CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
         dst.create(src.size(), src.type());
-        mathfunc::bitwise_not_caller(src.rows, src.cols, src, src.elemSize(), dst, mask, stream);
+
+        Caller caller = callers[src.depth()];
+        CV_Assert(caller);
+
+        int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(src.rows, src.cols, cn, src, mask, dst, stream);
     }

+
     void bitwise_or_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_or_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, stream);
+
+        cv::gpu::mathfunc::bitwise_or_caller(dst.rows, dst.cols, dst.elemSize1(),
+                                             dst.channels(), src1, src2, dst, stream);
     }

+
     void bitwise_or_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwise_mask_or_caller<uchar>, mathfunc::bitwise_mask_or_caller<uchar>,
+                                   mathfunc::bitwise_mask_or_caller<ushort>, mathfunc::bitwise_mask_or_caller<ushort>,
+                                   mathfunc::bitwise_mask_or_caller<uint>, mathfunc::bitwise_mask_or_caller<uint>,
+                                   mathfunc::bitwise_mask_or_caller<uint>};
+
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_or_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, mask, stream);
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }

+
     void bitwise_and_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_and_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, stream);
+
+        cv::gpu::mathfunc::bitwise_and_caller(dst.rows, dst.cols, dst.elemSize1(),
+                                              dst.channels(), src1, src2, dst, stream);
     }

+
     void bitwise_and_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwise_mask_and_caller<uchar>, mathfunc::bitwise_mask_and_caller<uchar>,
+                                   mathfunc::bitwise_mask_and_caller<ushort>, mathfunc::bitwise_mask_and_caller<ushort>,
+                                   mathfunc::bitwise_mask_and_caller<uint>, mathfunc::bitwise_mask_and_caller<uint>,
+                                   mathfunc::bitwise_mask_and_caller<uint>};
+
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_and_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, mask, stream);
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }

+
     void bitwise_xor_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
     {
-        CV_Assert(src1.size() == src2.size());
-        CV_Assert(src1.type() == src2.type());
+        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_xor_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, stream);
+
+        cv::gpu::mathfunc::bitwise_xor_caller(dst.rows, dst.cols, dst.elemSize1(),
+                                              dst.channels(), src1, src2, dst, stream);
     }

+
     void bitwise_xor_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
     {
+        using namespace cv::gpu;
+
+        typedef void (*Caller)(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+        static Caller callers[] = {mathfunc::bitwise_mask_xor_caller<uchar>, mathfunc::bitwise_mask_xor_caller<uchar>,
+                                   mathfunc::bitwise_mask_xor_caller<ushort>, mathfunc::bitwise_mask_xor_caller<ushort>,
+                                   mathfunc::bitwise_mask_xor_caller<uint>, mathfunc::bitwise_mask_xor_caller<uint>,
+                                   mathfunc::bitwise_mask_xor_caller<uint>};
+
         CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
         dst.create(src1.size(), src1.type());
-        mathfunc::bitwise_xor_caller(dst.rows, dst.cols, src1, src2, dst.elemSize(), dst, mask, stream);
+
+        Caller caller = callers[src1.depth()];
+        CV_Assert(caller);
+
+        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+        caller(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
     }
 }
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 2de34ec..387d49d 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -299,18 +299,12 @@ namespace cv { namespace gpu { namespace mathfunc
     template <typename T>
     struct UnOp<T, UN_OP_NOT>
     {
-        typedef typename TypeVec<T, 2>::vec_t Vec2;
-        typedef typename TypeVec<T, 3>::vec_t Vec3;
-        typedef typename TypeVec<T, 4>::vec_t Vec4;
         static __device__ T call(T v) { return ~v; }
-        static __device__ Vec2 call(Vec2 v) { return VecTraits<Vec2>::make(~v.x, ~v.y); }
-        static __device__ Vec3 call(Vec3 v) { return VecTraits<Vec3>::make(~v.x, ~v.y, ~v.z); }
-        static __device__ Vec4 call(Vec4 v) { return VecTraits<Vec4>::make(~v.x, ~v.y, ~v.z, ~v.w); }
     };


     template <int opid>
-    __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst)
+    __global__ void bitwise_un_op_kernel(int rows, int width, const PtrStep src, PtrStep dst)
     {
         const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -319,13 +313,13 @@ namespace cv { namespace gpu { namespace mathfunc
         {
             uchar* dst_ptr = dst.ptr(y) + x;
             const uchar* src_ptr = src.ptr(y) + x;
-            if (x + sizeof(uint) - 1 < cols)
+            if (x + sizeof(uint) - 1 < width)
             {
                 *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
             }
             else
             {
-                const uchar* src_end = src.ptr(y) + cols;
+                const uchar* src_end = src.ptr(y) + width;
                 while (src_ptr < src_end)
                 {
                     *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
@@ -335,105 +329,65 @@ namespace cv { namespace gpu { namespace mathfunc
     }


-    template <int opid, typename T, int cn>
-    __global__ void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask)
+    template <int opid>
+    void bitwise_un_op(int rows, int width, const PtrStep src, PtrStep dst, cudaStream_t stream)
     {
-        typedef typename TypeVec<T, cn>::vec_t Type;
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        dim3 threads(16, 16);
+        dim3 grid(divUp(width, threads.x * sizeof(uint)),
+                  divUp(rows, threads.y));

-        if (x < cols && y < rows && mask.ptr(y)[x])
-        {
-            Type* dst_row = (Type*)dst.ptr(y);
-            const Type* src_row = (const Type*)src.ptr(y);
-            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
-        }
+        bitwise_un_op_kernel<opid><<<grid, threads, 0, stream>>>(rows, width, src, dst);
+
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
     }


-    template <int opid, typename T, int cn>
-    __global__ void bitwise_un_op_two_loads(int rows, int cols, const PtrStep src, PtrStep dst, const PtrStep mask)
+    template <typename T, int opid>
+    __global__ void bitwise_un_op_kernel(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst)
     {
-        typedef typename TypeVec<T, cn>::vec_t Type;
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;

-        if (x < cols && y < rows && mask.ptr(y)[x])
+        if (x < cols && y < rows && mask.ptr(y)[x / cn])
         {
-            Type* dst_row = (Type*)dst.ptr(y);
-            const Type* src_row = (const Type*)src.ptr(y);
-            dst_row[2 * x] = UnOp<T, opid>::call(src_row[2 * x]);
-            dst_row[2 * x + 1] = UnOp<T, opid>::call(src_row[2 * x + 1]);
+            T* dst_row = (T*)dst.ptr(y);
+            const T* src_row = (const T*)src.ptr(y);
+
+            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
         }
     }


-    template <int opid>
-    void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, cudaStream_t stream)
+    template <typename T, int opid>
+    void bitwise_un_op(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
         dim3 threads(16, 16);
-        dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)),
-                  divUp(rows, threads.y));
-        bitwise_un_op<opid><<<grid, threads, 0, stream>>>(rows, cols * elem_size, src, dst);
-        if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
-    }
+        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+        bitwise_un_op_kernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src, mask, dst);
+
-
-
-    template <int opid>
-    void bitwise_un_op(int rows, int cols, const PtrStep src, PtrStep dst, int elem_size, const PtrStep mask, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-        switch (elem_size)
-        {
-        case 1:
-            bitwise_un_op<opid, uchar, 1><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 2:
-            bitwise_un_op<opid, ushort, 1><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 3:
-            bitwise_un_op<opid, uchar, 3><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 4:
-            bitwise_un_op<opid, uint, 1><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 6:
-            bitwise_un_op<opid, ushort, 3><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 8:
-            bitwise_un_op<opid, uint, 2><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 12:
-            bitwise_un_op<opid, uint, 3><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 16:
-            bitwise_un_op<opid, uint, 4><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 24:
-            bitwise_un_op_two_loads<opid, uint, 3><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        case 32:
-            bitwise_un_op_two_loads<opid, uint, 4><<<grid, threads, 0, stream>>>(rows, cols, src, dst, mask);
-            break;
-        }
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }


-    void bitwise_not_caller(int rows, int cols, const PtrStep src, int elem_size, PtrStep dst, cudaStream_t stream)
+    void bitwise_not_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, stream);
+        bitwise_un_op<UN_OP_NOT>(rows, cols * elem_size1 * cn, src, dst, stream);
     }

-    void bitwise_not_caller(int rows, int cols,const PtrStep src, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
+    template <typename T>
+    void bitwise_mask_not_caller(int rows, int cols, int cn, const PtrStep src, const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_un_op<UN_OP_NOT>(rows, cols, src, dst, elem_size, mask, stream);
+        bitwise_un_op<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
     }

+    template void bitwise_mask_not_caller<uchar>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_not_caller<ushort>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_not_caller<uint>(int, int, int, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+

     //------------------------------------------------------------------------
     // Binary operations
@@ -445,43 +399,25 @@ namespace cv { namespace gpu { namespace mathfunc

     template <typename T>
     struct BinOp<T, BIN_OP_OR>
     {
-        typedef typename TypeVec<T, 2>::vec_t Vec2;
-        typedef typename TypeVec<T, 3>::vec_t Vec3;
-        typedef typename TypeVec<T, 4>::vec_t Vec4;
         static __device__ T call(T a, T b) { return a | b; }
-        static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x | b.x, a.y | b.y); }
-        static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x | b.x, a.y | b.y, a.z | b.z); }
-        static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); }
     };


     template <typename T>
     struct BinOp<T, BIN_OP_AND>
     {
-        typedef typename TypeVec<T, 2>::vec_t Vec2;
-        typedef typename TypeVec<T, 3>::vec_t Vec3;
-        typedef typename TypeVec<T, 4>::vec_t Vec4;
         static __device__ T call(T a, T b) { return a & b; }
-        static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x & b.x, a.y & b.y); }
-        static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x & b.x, a.y & b.y, a.z & b.z); }
-        static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); }
     };


     template <typename T>
     struct BinOp<T, BIN_OP_XOR>
     {
-        typedef typename TypeVec<T, 2>::vec_t Vec2;
-        typedef typename TypeVec<T, 3>::vec_t Vec3;
-        typedef typename TypeVec<T, 4>::vec_t Vec4;
         static __device__ T call(T a, T b) { return a ^ b; }
-        static __device__ Vec2 call(Vec2 a, Vec2 b) { return VecTraits<Vec2>::make(a.x ^ b.x, a.y ^ b.y); }
-        static __device__ Vec3 call(Vec3 a, Vec3 b) { return VecTraits<Vec3>::make(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z); }
-        static __device__ Vec4 call(Vec4 a, Vec4 b) { return VecTraits<Vec4>::make(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
     };


     template <int opid>
-    __global__ void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst)
+    __global__ void bitwise_bin_op_kernel(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst)
     {
         const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -491,13 +427,14 @@ namespace cv { namespace gpu { namespace mathfunc
             uchar* dst_ptr = dst.ptr(y) + x;
             const uchar* src1_ptr = src1.ptr(y) + x;
             const uchar* src2_ptr = src2.ptr(y) + x;
-            if (x + sizeof(uint) - 1 < cols)
+
+            if (x + sizeof(uint) - 1 < width)
             {
                 *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
             }
             else
             {
-                const uchar* src1_end = src1.ptr(y) + cols;
+                const uchar* src1_end = src1.ptr(y) + width;
                 while (src1_ptr < src1_end)
                 {
                     *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
@@ -507,134 +444,102 @@ namespace cv { namespace gpu { namespace mathfunc
     }


-    template <int opid, typename T, int cn>
-    __global__ void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2,
-                                   PtrStep dst, const PtrStep mask)
+    template <int opid>
+    void bitwise_bin_op(int rows, int width, const PtrStep src1, const PtrStep src2, PtrStep dst,
+                        cudaStream_t stream)
     {
-        typedef typename TypeVec<T, cn>::vec_t Type;
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        dim3 threads(16, 16);
+        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));

-        if (x < cols && y < rows && mask.ptr(y)[x])
-        {
-            Type* dst_row = (Type*)dst.ptr(y);
-            const Type* src1_row = (const Type*)src1.ptr(y);
-            const Type* src2_row = (const Type*)src2.ptr(y);
-            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
-        }
+        bitwise_bin_op_kernel<opid><<<grid, threads, 0, stream>>>(rows, width, src1, src2, dst);
+
+        if (stream == 0)
+            cudaSafeCall(cudaThreadSynchronize());
     }


-    template <int opid, typename T, int cn>
-    __global__ void bitwise_bin_op_two_loads(int rows, int cols, const PtrStep src1, const PtrStep src2,
-                                             PtrStep dst, const PtrStep mask)
+    template <typename T, int opid>
+    __global__ void bitwise_bin_op_kernel(
+            int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
+            const PtrStep mask, PtrStep dst)
     {
-        typedef typename TypeVec<T, cn>::vec_t Type;
         const int x = blockDim.x * blockIdx.x + threadIdx.x;
         const int y = blockDim.y * blockIdx.y + threadIdx.y;

-        if (x < cols && y < rows && mask.ptr(y)[x])
+        if (x < cols && y < rows && mask.ptr(y)[x / cn])
         {
-            Type* dst_row = (Type*)dst.ptr(y);
-            const Type* src1_row = (const Type*)src1.ptr(y);
-            const Type* src2_row = (const Type*)src2.ptr(y);
-            dst_row[2 * x] = BinOp<T, opid>::call(src1_row[2 * x], src2_row[2 * x]);
-            dst_row[2 * x + 1] = BinOp<T, opid>::call(src1_row[2 * x + 1], src2_row[2 * x + 1]);
+            T* dst_row = (T*)dst.ptr(y);
+            const T* src1_row = (const T*)src1.ptr(y);
+            const T* src2_row = (const T*)src2.ptr(y);
+
+            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
         }
     }


-    template <int opid>
-    void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst,
-                        int elem_size, cudaStream_t stream)
+    template <typename T, int opid>
+    void bitwise_bin_op(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2,
+                        const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
         dim3 threads(16, 16);
-        dim3 grid(divUp(cols * elem_size, threads.x * sizeof(uint)),
-                  divUp(rows, threads.y));
-        bitwise_bin_op<opid><<<grid, threads, 0, stream>>>(rows, cols * elem_size, src1, src2, dst);
-        if (stream == 0)
-            cudaSafeCall(cudaThreadSynchronize());
-    }
+        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+        bitwise_bin_op_kernel<T, opid><<<grid, threads, 0, stream>>>(rows, cols, cn, src1, src2, mask, dst);
+
-
-
-    template <int opid>
-    void bitwise_bin_op(int rows, int cols, const PtrStep src1, const PtrStep src2, PtrStep dst,
-                        int elem_size, const PtrStep mask, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-        switch (elem_size)
-        {
-        case 1:
-            bitwise_bin_op<opid, uchar, 1><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 2:
-            bitwise_bin_op<opid, ushort, 1><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 3:
-            bitwise_bin_op<opid, uchar, 3><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 4:
-            bitwise_bin_op<opid, uint, 1><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 6:
-            bitwise_bin_op<opid, ushort, 3><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 8:
-            bitwise_bin_op<opid, uint, 2><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 12:
-            bitwise_bin_op<opid, uint, 3><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 16:
-            bitwise_bin_op<opid, uint, 4><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 24:
-            bitwise_bin_op_two_loads<opid, uint, 3><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        case 32:
-            bitwise_bin_op_two_loads<opid, uint, 4><<<grid, threads, 0, stream>>>(rows, cols, src1, src2, dst, mask);
-            break;
-        }
         if (stream == 0)
             cudaSafeCall(cudaThreadSynchronize());
     }


-    void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
+    void bitwise_or_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_OR>(rows, cols, src1, src2, dst, elem_size, stream);
+        bitwise_bin_op<BIN_OP_OR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
     }

-    void bitwise_or_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
+    template <typename T>
+    void bitwise_mask_or_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_OR>(rows, cols, src1, src2, dst, elem_size, mask, stream);
+        bitwise_bin_op<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
     }

+    template void bitwise_mask_or_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_or_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_or_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

-    void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
+
+    void bitwise_and_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_AND>(rows, cols, src1, src2, dst, elem_size, stream);
+        bitwise_bin_op<BIN_OP_AND>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
     }

-    void bitwise_and_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
+    template <typename T>
+    void bitwise_mask_and_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_AND>(rows, cols, src1, src2, dst, elem_size, mask, stream);
+        bitwise_bin_op<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
     }

+    template void bitwise_mask_and_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_and_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_and_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);

-    void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, cudaStream_t stream)
+
+    void bitwise_xor_caller(int rows, int cols, int elem_size1, int cn, const PtrStep src1, const PtrStep src2, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_XOR>(rows, cols, src1, src2, dst, elem_size, stream);
+        bitwise_bin_op<BIN_OP_XOR>(rows, cols * elem_size1 * cn, src1, src2, dst, stream);
     }

-    void bitwise_xor_caller(int rows, int cols, const PtrStep src1, const PtrStep src2, int elem_size, PtrStep dst, const PtrStep mask, cudaStream_t stream)
+    template <typename T>
+    void bitwise_mask_xor_caller(int rows, int cols, int cn, const PtrStep src1, const PtrStep src2, const PtrStep mask, PtrStep dst, cudaStream_t stream)
     {
-        bitwise_bin_op<BIN_OP_XOR>(rows, cols, src1, src2, dst, elem_size, mask, stream);
-    }
+        bitwise_bin_op<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
+    }
+
+    template void bitwise_mask_xor_caller<uchar>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_xor_caller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
+    template void bitwise_mask_xor_caller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
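
Usage sketch (illustrative, not part of the patch): a minimal host-side program that reaches both code paths reworked above through the public OpenCV 2.x gpu API. The matrix type, size and fill values are arbitrary assumptions; only cv::gpu::bitwise_not, the GpuMat upload/download calls and the core Mat calls are relied on as real library API.

// Illustrative usage only -- assumes an OpenCV 2.x build with the gpu module
// enabled and a CUDA-capable device.
#include <opencv2/core/core.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // CV_32SC3: depth CV_32S selects the 32-bit masked caller, 3 channels exercise the cn handling.
    cv::Mat src_host(128, 128, CV_32SC3, cv::Scalar::all(0x0F0F0F0F));
    cv::Mat mask_host(128, 128, CV_8UC1, cv::Scalar::all(255));
    mask_host.colRange(0, 64).setTo(cv::Scalar::all(0));        // mask out the left half

    cv::gpu::GpuMat src(src_host), mask(mask_host), dst_plain, dst_masked;

    cv::gpu::bitwise_not(src, dst_plain);           // unmasked path: rows processed as raw bytes/words
    cv::gpu::bitwise_not(src, dst_masked, mask);    // masked path: per-depth caller, mask tested per pixel;
                                                    // pixels whose mask byte is zero are left unwritten

    cv::Mat result;
    dst_masked.download(result);                    // copy the masked result back to the host
    return 0;
}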