#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/type_traits.hpp"
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
{
- namespace matrix_reductions
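+ // Unroll<cn> adapts the generic device::reduce() primitive to cn-channel
+ // accumulators: smem_tuple() carves one BLOCK_SIZE shared-memory plane per
+ // channel, tie() exposes the channels of a vector value as a thrust tuple
+ // of references, and op() replicates the reduction functor per channel,
+ // so all channels are reduced in a single call.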
+ template <int cn> struct Unroll;
+ template <> struct Unroll<1>
{
- // Performs reduction in shared memory
- template <int size, typename T>
- __device__ void sumInSmem(volatile T* data, const uint tid)
+ template <int BLOCK_SIZE, typename R>
+ static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
- T sum = data[tid];
-
- if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
- if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
- if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
+ return smem;
+ }
- if (tid < 32)
- {
- if (size >= 64) data[tid] = sum = sum + data[tid + 32];
- if (size >= 32) data[tid] = sum = sum + data[tid + 16];
- if (size >= 16) data[tid] = sum = sum + data[tid + 8];
- if (size >= 8) data[tid] = sum = sum + data[tid + 4];
- if (size >= 4) data[tid] = sum = sum + data[tid + 2];
- if (size >= 2) data[tid] = sum = sum + data[tid + 1];
- }
+ template <typename R>
+ static __device__ __forceinline__ R& tie(R& val)
+ {
+ return val;
}
- struct Mask8U
+ template <class Op>
+ static __device__ __forceinline__ const Op& op(const Op& op)
+ {
+ return op;
+ }
+ };
+ template <> struct Unroll<2>
+ {
+ template <int BLOCK_SIZE, typename R>
+ static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
- explicit Mask8U(PtrStepb mask_): mask(mask_) {}
+ return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
+ }
- __device__ __forceinline__ bool operator()(int y, int x) const
- {
- return mask.ptr(y)[x];
- }
+ template <typename R>
+ static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
+ {
+ return thrust::tie(val.x, val.y);
+ }
- PtrStepb mask;
- };
+ template <class Op>
+ static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
+ {
+ return thrust::make_tuple(op, op);
+ }
+ };
+ template <> struct Unroll<3>
+ {
+ template <int BLOCK_SIZE, typename R>
+ static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
+ {
+ return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
+ }
- struct MaskTrue
+ template <typename R>
+ static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
{
- __device__ __forceinline__ bool operator()(int y, int x) const
- {
- return true;
- }
- __device__ __forceinline__ MaskTrue(){}
- __device__ __forceinline__ MaskTrue(const MaskTrue& mask_){}
- };
+ return thrust::tie(val.x, val.y, val.z);
+ }
- //////////////////////////////////////////////////////////////////////////////
- // Min max
-
- // To avoid shared bank conflicts we convert each value into value of
- // appropriate type (32 bits minimum)
- template <typename T> struct MinMaxTypeTraits {};
- template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
- template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
- template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
-
- namespace minmax
+ template <class Op>
+ static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
- __constant__ int ctwidth;
- __constant__ int ctheight;
+ return thrust::make_tuple(op, op, op);
+ }
+ };
+ template <> struct Unroll<4>
+ {
+ template <int BLOCK_SIZE, typename R>
+ static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
+ {
+ return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
+ }
- // Global counter of blocks finished its work
- __device__ uint blocks_finished = 0;
+ template <typename R>
+ static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
+ {
+ return thrust::tie(val.x, val.y, val.z, val.w);
+ }
+ template <class Op>
+ static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
+ {
+ return thrust::make_tuple(op, op, op, op);
+ }
+ };
+}
- // Estimates good thread configuration
- // - threads variable satisfies to threads.x * threads.y == 256
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
- {
- threads = dim3(32, 8);
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
- grid.x = std::min(grid.x, threads.x);
- grid.y = std::min(grid.y, threads.y);
- }
+/////////////////////////////////////////////////////////////
+// sum
+namespace sum
+{
+ __device__ unsigned int blocks_finished = 0;
- // Returns required buffer sizes
- void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
- {
- dim3 threads, grid;
- estimateThreadCfg(cols, rows, threads, grid);
- bufcols = grid.x * grid.y * elem_size;
- bufrows = 2;
- }
+ template <typename R, int cn> struct AtomicAdd;
+ template <typename R> struct AtomicAdd<R, 1>
+ {
+ static __device__ void run(R* ptr, R val)
+ {
+ ::atomicAdd(ptr, val);
+ }
+ };
+ template <typename R> struct AtomicAdd<R, 2>
+ {
+ typedef typename TypeVec<R, 2>::vec_type val_type;
+ static __device__ void run(R* ptr, val_type val)
+ {
+ ::atomicAdd(ptr, val.x);
+ ::atomicAdd(ptr + 1, val.y);
+ }
+ };
+ template <typename R> struct AtomicAdd<R, 3>
+ {
+ typedef typename TypeVec<R, 3>::vec_type val_type;
- // Estimates device constants which are used in the kernels using specified thread configuration
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
- {
- int twidth = divUp(divUp(cols, grid.x), threads.x);
- int theight = divUp(divUp(rows, grid.y), threads.y);
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
- }
+ static __device__ void run(R* ptr, val_type val)
+ {
+ ::atomicAdd(ptr, val.x);
+ ::atomicAdd(ptr + 1, val.y);
+ ::atomicAdd(ptr + 2, val.z);
+ }
+ };
+ template <typename R> struct AtomicAdd<R, 4>
+ {
+ typedef typename TypeVec<R, 4>::vec_type val_type;
+ static __device__ void run(R* ptr, val_type val)
+ {
+ ::atomicAdd(ptr, val.x);
+ ::atomicAdd(ptr + 1, val.y);
+ ::atomicAdd(ptr + 2, val.z);
+ ::atomicAdd(ptr + 3, val.w);
+ }
+ };
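+ // atomicAdd() operates on scalars only, so multi-channel partial sums are
+ // folded into the global accumulator one channel at a time.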
- // Does min and max in shared memory
- template <typename T>
- __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
- {
- minval[tid] = ::min(minval[tid], minval[tid + offset]);
- maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);
- }
+ template <int BLOCK_SIZE, typename R, int cn>
+ struct GlobalReduce
+ {
+ typedef typename TypeVec<R, cn>::vec_type result_type;
+ static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
+ {
+ #if __CUDA_ARCH__ >= 200
+ if (tid == 0)
+ AtomicAdd<R, cn>::run((R*) result, sum);
+ #else
+ __shared__ bool is_last;
- template <int size, typename T>
- __device__ void findMinMaxInSmem(volatile T* minval, volatile T* maxval, const uint tid)
+ if (tid == 0)
{
- if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
- if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
- if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
+ result[bid] = sum;
- if (tid < 32)
- {
- if (size >= 64) merge(tid, 32, minval, maxval);
- if (size >= 32) merge(tid, 16, minval, maxval);
- if (size >= 16) merge(tid, 8, minval, maxval);
- if (size >= 8) merge(tid, 4, minval, maxval);
- if (size >= 4) merge(tid, 2, minval, maxval);
- if (size >= 2) merge(tid, 1, minval, maxval);
- }
+ __threadfence();
+
+ unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
}
+ __syncthreads();
- template <int nthreads, typename T, typename Mask>
- __global__ void minMaxKernel(const PtrStepSzb src, Mask mask, T* minval, T* maxval)
+ if (is_last)
{
- typedef typename MinMaxTypeTraits<T>::best_type best_type;
- __shared__ best_type sminval[nthreads];
- __shared__ best_type smaxval[nthreads];
-
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
-
- T mymin = numeric_limits<T>::max();
- T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
- uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
- uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
- for (uint y = y0; y < y_end; y += blockDim.y)
- {
- const T* src_row = (const T*)src.ptr(y);
- for (uint x = x0; x < x_end; x += blockDim.x)
- {
- T val = src_row[x];
- if (mask(y, x))
- {
- mymin = ::min(mymin, val);
- mymax = ::max(mymax, val);
- }
- }
- }
-
- sminval[tid] = mymin;
- smaxval[tid] = mymax;
- __syncthreads();
+ sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
+ device::reduce<BLOCK_SIZE>(Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), Unroll<cn>::tie(sum), tid, Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
+ result[0] = sum;
+ blocks_finished = 0;
}
-
- #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
-
- if (tid == 0)
- {
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = ticket == gridDim.x * gridDim.y - 1;
- }
-
- __syncthreads();
-
- if (is_last)
- {
- uint idx = ::min(tid, gridDim.x * gridDim.y - 1);
-
- sminval[tid] = minval[idx];
- smaxval[tid] = maxval[idx];
- __syncthreads();
-
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
-
- if (tid == 0)
- {
- minval[0] = (T)sminval[0];
- maxval[0] = (T)smaxval[0];
- blocks_finished = 0;
- }
- }
- #else
- if (tid == 0)
- {
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
- }
- #endif
- }
-
-
- template <typename T>
- void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)buf.ptr(0);
- T* maxval_buf = (T*)buf.ptr(1);
-
- minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- T minval_, maxval_;
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- *minval = minval_;
- *maxval = maxval_;
}
+ #endif
+ }
+ };
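+ // double is specialized separately: the GPUs targeted here have no native
+ // atomicAdd(double*), so the double path always uses the ticket-based
+ // last-block reduction, regardless of architecture.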
+ template <int BLOCK_SIZE, int cn>
+ struct GlobalReduce<BLOCK_SIZE, double, cn>
+ {
+ typedef typename TypeVec<double, cn>::vec_type result_type;
- template void minMaxMaskCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskCaller<double>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
-
+ static __device__ void run(result_type& sum, result_type* result, int tid, int bid, double* smem)
+ {
+ __shared__ bool is_last;
- template <typename T>
- void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf)
+ if (tid == 0)
{
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)buf.ptr(0);
- T* maxval_buf = (T*)buf.ptr(1);
-
- minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
- cudaSafeCall( cudaGetLastError() );
+ result[bid] = sum;
- cudaSafeCall( cudaDeviceSynchronize() );
+ __threadfence();
- T minval_, maxval_;
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- *minval = minval_;
- *maxval = maxval_;
+ unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
}
- template void minMaxCaller<uchar>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxCaller<char>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxCaller<ushort>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxCaller<short>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxCaller<int>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxCaller<float>(const PtrStepSzb, double*,double*, PtrStepb);
- template void minMaxCaller<double>(const PtrStepSzb, double*, double*, PtrStepb);
-
+ __syncthreads();
- template <int nthreads, typename T>
- __global__ void minMaxPass2Kernel(T* minval, T* maxval, int size)
+ if (is_last)
{
- typedef typename MinMaxTypeTraits<T>::best_type best_type;
- __shared__ best_type sminval[nthreads];
- __shared__ best_type smaxval[nthreads];
-
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
- uint idx = ::min(tid, size - 1);
-
- sminval[tid] = minval[idx];
- smaxval[tid] = maxval[idx];
- __syncthreads();
+ sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
- findMinMaxInSmem<nthreads, best_type>(sminval, smaxval, tid);
+ device::reduce<BLOCK_SIZE>(Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), Unroll<cn>::tie(sum), tid, Unroll<cn>::op(plus<double>()));
if (tid == 0)
{
- minval[0] = (T)sminval[0];
- maxval[0] = (T)smaxval[0];
+ result[0] = sum;
+ blocks_finished = 0;
}
}
+ }
+ };
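+ // Each thread accumulates a private sum over a twidth x theight pattern of
+ // strided samples, the block then reduces those sums in shared memory, and
+ // GlobalReduce folds the per-block results into result[0].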
+ template <int BLOCK_SIZE, typename src_type, typename result_type, class Op>
+ __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Op op, const int twidth, const int theight)
+ {
+ typedef typename VecTraits<src_type>::elem_type T;
+ typedef typename VecTraits<result_type>::elem_type R;
+ const int cn = VecTraits<src_type>::cn;
- template <typename T>
- void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)buf.ptr(0);
- T* maxval_buf = (T*)buf.ptr(1);
-
- minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
- cudaSafeCall( cudaGetLastError() );
- minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall(cudaDeviceSynchronize());
-
- T minval_, maxval_;
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- *minval = minval_;
- *maxval = maxval_;
- }
-
- template void minMaxMaskMultipassCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskMultipassCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskMultipassCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskMultipassCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskMultipassCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
- template void minMaxMaskMultipassCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
-
-
- template <typename T>
- void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)buf.ptr(0);
- T* maxval_buf = (T*)buf.ptr(1);
-
- minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
- cudaSafeCall( cudaGetLastError() );
- minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
+ __shared__ R smem[BLOCK_SIZE * cn];
- T minval_, maxval_;
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- *minval = minval_;
- *maxval = maxval_;
- }
+ const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
+ const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
- template void minMaxMultipassCaller<uchar>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxMultipassCaller<char>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxMultipassCaller<ushort>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxMultipassCaller<short>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxMultipassCaller<int>(const PtrStepSzb, double*, double*, PtrStepb);
- template void minMaxMultipassCaller<float>(const PtrStepSzb, double*, double*, PtrStepb);
- } // namespace minmax
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;
- ///////////////////////////////////////////////////////////////////////////////
- // minMaxLoc
+ result_type sum = VecTraits<result_type>::all(0);
- namespace minmaxloc
+ for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
- __constant__ int ctwidth;
- __constant__ int ctheight;
+ const src_type* ptr = src.ptr(y);
- // Global counter of blocks finished its work
- __device__ uint blocks_finished = 0;
-
-
- // Estimates good thread configuration
- // - threads variable satisfies to threads.x * threads.y == 256
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
+ for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
- threads = dim3(32, 8);
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
- grid.x = std::min(grid.x, threads.x);
- grid.y = std::min(grid.y, threads.y);
- }
-
+ const src_type srcVal = ptr[x];
- // Returns required buffer sizes
- void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
- int& b1rows, int& b2cols, int& b2rows)
- {
- dim3 threads, grid;
- estimateThreadCfg(cols, rows, threads, grid);
- b1cols = grid.x * grid.y * elem_size; // For values
- b1rows = 2;
- b2cols = grid.x * grid.y * sizeof(int); // For locations
- b2rows = 2;
+ sum = sum + op(saturate_cast<result_type>(srcVal));
}
+ }
+ device::reduce<BLOCK_SIZE>(Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), Unroll<cn>::tie(sum), tid, Unroll<cn>::op(plus<R>()));
- // Estimates device constants which are used in the kernels using specified thread configuration
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
- {
- int twidth = divUp(divUp(cols, grid.x), threads.x);
- int theight = divUp(divUp(rows, grid.y), threads.y);
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
- }
+ GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
+ }
+ const int threads_x = 32;
+ const int threads_y = 8;
- template <typename T>
- __device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
- volatile uint* minloc, volatile uint* maxloc)
- {
- T val = minval[tid + offset];
- if (val < minval[tid])
- {
- minval[tid] = val;
- minloc[tid] = minloc[tid + offset];
- }
- val = maxval[tid + offset];
- if (val > maxval[tid])
- {
- maxval[tid] = val;
- maxloc[tid] = maxloc[tid + offset];
- }
- }
+ void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+ {
+ block = dim3(threads_x, threads_y);
+ grid = dim3(divUp(cols, block.x * block.y),
+ divUp(rows, block.y * block.x));
- template <int size, typename T>
- __device__ void findMinMaxLocInSmem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
- volatile uint* maxloc, const uint tid)
- {
- if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
- if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
- if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
+ grid.x = ::min(grid.x, block.x);
+ grid.y = ::min(grid.y, block.y);
+ }
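+ // Capping the grid at block.x x block.y (32 x 8) bounds the number of
+ // per-block partial results by BLOCK_SIZE = 256, so the last block can
+ // load every partial with a single element per thread.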
- if (tid < 32)
- {
- if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
- if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
- if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
- if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
- if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
- if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
- }
- }
+ void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
+ {
+ dim3 block, grid;
+ getLaunchCfg(cols, rows, block, grid);
+ bufcols = grid.x * grid.y * sizeof(double) * cn;
+ bufrows = 1;
+ }
- template <int nthreads, typename T, typename Mask>
- __global__ void minMaxLocKernel(const PtrStepSzb src, Mask mask, T* minval, T* maxval,
- uint* minloc, uint* maxloc)
- {
- typedef typename MinMaxTypeTraits<T>::best_type best_type;
- __shared__ best_type sminval[nthreads];
- __shared__ best_type smaxval[nthreads];
- __shared__ uint sminloc[nthreads];
- __shared__ uint smaxloc[nthreads];
-
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
-
- T mymin = numeric_limits<T>::max();
- T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
- uint myminloc = 0;
- uint mymaxloc = 0;
- uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
- uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
-
- for (uint y = y0; y < y_end; y += blockDim.y)
- {
- const T* ptr = (const T*)src.ptr(y);
- for (uint x = x0; x < x_end; x += blockDim.x)
- {
- if (mask(y, x))
- {
- T val = ptr[x];
- if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
- if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
- }
- }
- }
+ template <typename T, typename R, int cn, template <typename> class Op>
+ void caller(PtrStepSzb src_, void* buf_, double* out)
+ {
+ typedef typename TypeVec<T, cn>::vec_type src_type;
+ typedef typename TypeVec<R, cn>::vec_type result_type;
- sminval[tid] = mymin;
- smaxval[tid] = mymax;
- sminloc[tid] = myminloc;
- smaxloc[tid] = mymaxloc;
- __syncthreads();
+ PtrStepSz<src_type> src(src_);
+ result_type* buf = (result_type*) buf_;
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+ dim3 block, grid;
+ getLaunchCfg(src.cols, src.rows, block, grid);
- #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
+ const int twidth = divUp(divUp(src.cols, grid.x), block.x);
+ const int theight = divUp(divUp(src.rows, grid.y), block.y);
- if (tid == 0)
- {
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
- minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
- maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = ticket == gridDim.x * gridDim.y - 1;
- }
+ Op<result_type> op;
- __syncthreads();
+ kernel<threads_x * threads_y><<<grid, block>>>(src, buf, op, twidth, theight);
+ cudaSafeCall( cudaGetLastError() );
- if (is_last)
- {
- uint idx = ::min(tid, gridDim.x * gridDim.y - 1);
+ cudaSafeCall( cudaDeviceSynchronize() );
- sminval[tid] = minval[idx];
- smaxval[tid] = maxval[idx];
- sminloc[tid] = minloc[idx];
- smaxloc[tid] = maxloc[idx];
- __syncthreads();
+ R result[4] = {0, 0, 0, 0};
+ cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+ out[0] = result[0];
+ out[1] = result[1];
+ out[2] = result[2];
+ out[3] = result[3];
+ }
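+ // A minimal host-side sketch of the intended call sequence (hypothetical
+ // caller; the real buffer plumbing lives in the host wrappers):
+ //   int bufcols, bufrows;
+ //   sum::getBufSize(src.cols, src.rows, 3, bufcols, bufrows);
+ //   // ... allocate a bufrows x bufcols byte buffer on the device ...
+ //   double out[4] = {0, 0, 0, 0};
+ //   sum::run<uchar, 3>(src, buf_ptr, out); // out[0..2] receive the channel sums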
- if (tid == 0)
- {
- minval[0] = (T)sminval[0];
- maxval[0] = (T)smaxval[0];
- minloc[0] = sminloc[0];
- maxloc[0] = smaxloc[0];
- blocks_finished = 0;
- }
- }
- #else
- if (tid == 0)
- {
- minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
- maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
- minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
- maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
- }
- #endif
- }
+ template <typename T> struct SumType;
+ template <> struct SumType<uchar> { typedef unsigned int R; };
+ template <> struct SumType<schar> { typedef int R; };
+ template <> struct SumType<ushort> { typedef unsigned int R; };
+ template <> struct SumType<short> { typedef int R; };
+ template <> struct SumType<int> { typedef int R; };
+ template <> struct SumType<float> { typedef float R; };
+ template <> struct SumType<double> { typedef double R; };
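+
+ // Partial sums are accumulated in a type wider than the source elements
+ // (uchar as unsigned int, short as int, etc.) so that the per-thread
+ // accumulators do not overflow on moderately sized images.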
+ template <typename T, int cn>
+ void run(PtrStepSzb src, void* buf, double* out)
+ {
+ typedef typename SumType<T>::R R;
+ caller<T, R, cn, identity>(src, buf, out);
+ }
+
+ template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<schar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<schar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<schar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<schar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<short, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<short, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<short, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<short, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<int, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<int, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<int, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<int, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<float, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<float, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<float, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<float, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void run<double, 1>(PtrStepSzb src, void* buf, double* out);
+ template void run<double, 2>(PtrStepSzb src, void* buf, double* out);
+ template void run<double, 3>(PtrStepSzb src, void* buf, double* out);
+ template void run<double, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template <typename T, int cn>
+ void runAbs(PtrStepSzb src, void* buf, double* out)
+ {
+ typedef typename SumType<T>::R R;
+ caller<T, R, cn, abs_func>(src, buf, out);
+ }
+
+ template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template <typename T> struct Sqr : unary_function<T, T>
+ {
+ __device__ __forceinline__ T operator ()(T x) const
+ {
+ return x * x;
+ }
+ };
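+ // runSqr accumulates in double for every source type, since squared values
+ // overflow the narrower SumType accumulators much sooner.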
- template <typename T>
- void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)valbuf.ptr(0);
- T* maxval_buf = (T*)valbuf.ptr(1);
- uint* minloc_buf = (uint*)locbuf.ptr(0);
- uint* maxloc_buf = (uint*)locbuf.ptr(1);
-
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
- minloc_buf, maxloc_buf);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- T minval_, maxval_;
- cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
- *minval = minval_;
- *maxval = maxval_;
-
- uint minloc_, maxloc_;
- cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
- cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost) );
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
- }
+ template <typename T, int cn>
+ void runSqr(PtrStepSzb src, void* buf, double* out)
+ {
+ caller<T, double, cn, Sqr>(src, buf, out);
+ }
+
+ template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out);
+
+ template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out);
+ template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out);
+}
+
+/////////////////////////////////////////////////////////////
+// minMax
+
+namespace minMax
+{
+ __device__ unsigned int blocks_finished = 0;
+
+ // To avoid shared-memory bank conflicts we widen each value to an
+ // appropriate type of at least 32 bits before staging it in shared memory.
+ template <typename T> struct MinMaxTypeTraits;
+ template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
+ template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
+
+ template <int BLOCK_SIZE, typename R>
+ struct GlobalReduce
+ {
+ static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
+ {
+ __shared__ bool is_last;
- template void minMaxLocMaskCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskCaller<double>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+ if (tid == 0)
+ {
+ minval[bid] = mymin;
+ maxval[bid] = mymax;
+ __threadfence();
- template <typename T>
- void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)valbuf.ptr(0);
- T* maxval_buf = (T*)valbuf.ptr(1);
- uint* minloc_buf = (uint*)locbuf.ptr(0);
- uint* maxloc_buf = (uint*)locbuf.ptr(1);
-
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
- minloc_buf, maxloc_buf);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- T minval_, maxval_;
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- *minval = minval_;
- *maxval = maxval_;
-
- uint minloc_, maxloc_;
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
+ unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
}
- template void minMaxLocCaller<uchar>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<char>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<ushort>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<short>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<int>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<float>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocCaller<double>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-
+ __syncthreads();
- // This kernel will be used only when compute capability is 1.0
- template <int nthreads, typename T>
- __global__ void minMaxLocPass2Kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
+ if (is_last)
{
- typedef typename MinMaxTypeTraits<T>::best_type best_type;
- __shared__ best_type sminval[nthreads];
- __shared__ best_type smaxval[nthreads];
- __shared__ uint sminloc[nthreads];
- __shared__ uint smaxloc[nthreads];
+ int idx = ::min(tid, gridDim.x * gridDim.y - 1);
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
- uint idx = ::min(tid, size - 1);
+ mymin = minval[idx];
+ mymax = maxval[idx];
- sminval[tid] = minval[idx];
- smaxval[tid] = maxval[idx];
- sminloc[tid] = minloc[idx];
- smaxloc[tid] = maxloc[idx];
- __syncthreads();
-
- findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+ const minimum<R> minOp;
+ const maximum<R> maxOp;
+ device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
- minval[0] = (T)sminval[0];
- maxval[0] = (T)smaxval[0];
- minloc[0] = sminloc[0];
- maxloc[0] = smaxloc[0];
- }
- }
-
-
- template <typename T>
- void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)valbuf.ptr(0);
- T* maxval_buf = (T*)valbuf.ptr(1);
- uint* minloc_buf = (uint*)locbuf.ptr(0);
- uint* maxloc_buf = (uint*)locbuf.ptr(1);
-
- minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
- minloc_buf, maxloc_buf);
- cudaSafeCall( cudaGetLastError() );
- minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- T minval_, maxval_;
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- *minval = minval_;
- *maxval = maxval_;
-
- uint minloc_, maxloc_;
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
- }
-
- template void minMaxLocMaskMultipassCaller<uchar>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskMultipassCaller<char>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskMultipassCaller<ushort>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskMultipassCaller<short>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskMultipassCaller<int>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMaskMultipassCaller<float>(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
+ minval[0] = mymin;
+ maxval[0] = mymax;
-
- template <typename T>
- void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valbuf, PtrStepb locbuf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- T* minval_buf = (T*)valbuf.ptr(0);
- T* maxval_buf = (T*)valbuf.ptr(1);
- uint* minloc_buf = (uint*)locbuf.ptr(0);
- uint* maxloc_buf = (uint*)locbuf.ptr(1);
-
- minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
- minloc_buf, maxloc_buf);
- cudaSafeCall( cudaGetLastError() );
- minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- T minval_, maxval_;
- cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
- *minval = minval_;
- *maxval = maxval_;
-
- uint minloc_, maxloc_;
- cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
- minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
- maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
+ blocks_finished = 0;
+ }
}
-
- template void minMaxLocMultipassCaller<uchar>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMultipassCaller<char>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMultipassCaller<ushort>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMultipassCaller<short>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMultipassCaller<int>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- template void minMaxLocMultipassCaller<float>(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- } // namespace minmaxloc
-
- //////////////////////////////////////////////////////////////////////////////////////////////////////////
- // countNonZero
-
- namespace countnonzero
+ }
+ };
+ template <int BLOCK_SIZE>
+ struct GlobalReduce<BLOCK_SIZE, int>
+ {
+ static __device__ void run(int& mymin, int& mymax, int* minval, int* maxval, int tid, int bid, int* sminval, int* smaxval)
{
- __constant__ int ctwidth;
- __constant__ int ctheight;
-
- __device__ uint blocks_finished = 0;
-
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
+ #if __CUDA_ARCH__ >= 200
+ if (tid == 0)
{
- threads = dim3(32, 8);
- grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
- grid.x = std::min(grid.x, threads.x);
- grid.y = std::min(grid.y, threads.y);
+ ::atomicMin(minval, mymin);
+ ::atomicMax(maxval, mymax);
}
+ #else
+ __shared__ bool is_last;
-
- void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows)
+ if (tid == 0)
{
- dim3 threads, grid;
- estimateThreadCfg(cols, rows, threads, grid);
- bufcols = grid.x * grid.y * sizeof(int);
- bufrows = 1;
- }
+ minval[bid] = mymin;
+ maxval[bid] = mymax;
+ __threadfence();
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
- {
- int twidth = divUp(divUp(cols, grid.x), threads.x);
- int theight = divUp(divUp(rows, grid.y), threads.y);
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
+ unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
}
+ __syncthreads();
- template <int nthreads, typename T>
- __global__ void countNonZeroKernel(const PtrStepSzb src, volatile uint* count)
+ if (is_last)
{
- __shared__ uint scount[nthreads];
+ int idx = ::min(tid, gridDim.x * gridDim.y - 1);
- uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
+ mymin = minval[idx];
+ mymax = maxval[idx];
- uint cnt = 0;
- for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
- {
- const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
- for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
- cnt += ptr[x0 + x * blockDim.x] != 0;
- }
-
- scount[tid] = cnt;
- __syncthreads();
-
- sumInSmem<nthreads, uint>(scount, tid);
-
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
+ const minimum<int> minOp;
+ const maximum<int> maxOp;
+ device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
- count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = ticket == gridDim.x * gridDim.y - 1;
- }
-
- __syncthreads();
-
- if (is_last)
- {
- scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
- __syncthreads();
+ minval[0] = mymin;
+ maxval[0] = mymax;
- sumInSmem<nthreads, uint>(scount, tid);
-
- if (tid == 0)
- {
- count[0] = scount[0];
- blocks_finished = 0;
- }
+ blocks_finished = 0;
}
- #else
- if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
- #endif
- }
-
-
- template <typename T>
- int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- uint* count_buf = (uint*)buf.ptr(0);
-
- countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- uint count;
- cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
-
- return count;
- }
-
- template int countNonZeroCaller<uchar>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<char>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<ushort>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<short>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<int>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<float>(const PtrStepSzb, PtrStepb);
- template int countNonZeroCaller<double>(const PtrStepSzb, PtrStepb);
-
-
- template <int nthreads, typename T>
- __global__ void countNonZeroPass2Kernel(uint* count, int size)
- {
- __shared__ uint scount[nthreads];
- uint tid = threadIdx.y * blockDim.x + threadIdx.x;
-
- scount[tid] = tid < size ? count[tid] : 0;
- __syncthreads();
-
- sumInSmem<nthreads, uint>(scount, tid);
-
- if (tid == 0)
- count[0] = scount[0];
}
+ #endif
+ }
+ };
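+ // Only the int instantiation gets the atomic fast path: atomicMin/atomicMax
+ // exist for integer types only, so the float and double instantiations fall
+ // back to the generic ticket-based last-block reduction above.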
+ template <int BLOCK_SIZE, typename T, typename R, class Mask>
+ __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
+ {
+ __shared__ R sminval[BLOCK_SIZE];
+ __shared__ R smaxval[BLOCK_SIZE];
- template <typename T>
- int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf)
- {
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- uint* count_buf = (uint*)buf.ptr(0);
-
- countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
- cudaSafeCall( cudaGetLastError() );
- countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- uint count;
- cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
-
- return count;
- }
-
- template int countNonZeroMultipassCaller<uchar>(const PtrStepSzb, PtrStepb);
- template int countNonZeroMultipassCaller<char>(const PtrStepSzb, PtrStepb);
- template int countNonZeroMultipassCaller<ushort>(const PtrStepSzb, PtrStepb);
- template int countNonZeroMultipassCaller<short>(const PtrStepSzb, PtrStepb);
- template int countNonZeroMultipassCaller<int>(const PtrStepSzb, PtrStepb);
- template int countNonZeroMultipassCaller<float>(const PtrStepSzb, PtrStepb);
+ const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
+ const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
- } // namespace countnonzero
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+ R mymin = numeric_limits<R>::max();
+ R mymax = -numeric_limits<R>::max();
- //////////////////////////////////////////////////////////////////////////
- // Sum
+ const minimum<R> minOp;
+ const maximum<R> maxOp;
- namespace sum
+ for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
- template <typename T> struct SumType {};
- template <> struct SumType<uchar> { typedef uint R; };
- template <> struct SumType<char> { typedef int R; };
- template <> struct SumType<ushort> { typedef uint R; };
- template <> struct SumType<short> { typedef int R; };
- template <> struct SumType<int> { typedef int R; };
- template <> struct SumType<float> { typedef float R; };
- template <> struct SumType<double> { typedef double R; };
-
- template <typename R>
- struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };
+ const T* ptr = src.ptr(y);
- template <typename R>
- struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };
-
- template <>
- struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };
-
- template <typename R>
- struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };
-
- __constant__ int ctwidth;
- __constant__ int ctheight;
- __device__ uint blocks_finished = 0;
-
- const int threads_x = 32;
- const int threads_y = 8;
-
- void estimateThreadCfg(int cols, int rows, dim3& threads, dim3& grid)
+ for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
- threads = dim3(threads_x, threads_y);
- grid = dim3(divUp(cols, threads.x * threads.y),
- divUp(rows, threads.y * threads.x));
- grid.x = std::min(grid.x, threads.x);
- grid.y = std::min(grid.y, threads.y);
- }
-
-
- void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows)
- {
- dim3 threads, grid;
- estimateThreadCfg(cols, rows, threads, grid);
- bufcols = grid.x * grid.y * sizeof(double) * cn;
- bufrows = 1;
- }
-
-
- void setKernelConsts(int cols, int rows, const dim3& threads, const dim3& grid)
- {
- int twidth = divUp(divUp(cols, grid.x), threads.x);
- int theight = divUp(divUp(rows, grid.y), threads.y);
- cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
- cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
- }
-
- template <typename T, typename R, typename Op, int nthreads>
- __global__ void sumKernel(const PtrStepSzb src, R* result)
- {
- __shared__ R smem[nthreads];
-
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;
-
- R sum = 0;
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+ if (mask(y, x))
{
- const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
- sum += Op::call(ptr[x0 + x * blockDim.x]);
- }
-
- smem[tid] = sum;
- __syncthreads();
+ const R srcVal = ptr[x];
- sumInSmem<nthreads, R>(smem, tid);
-
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
-
- if (tid == 0)
- {
- result[bid] = smem[0];
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = (ticket == gridDim.x * gridDim.y - 1);
- }
-
- __syncthreads();
-
- if (is_last)
- {
- smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
- __syncthreads();
-
- sumInSmem<nthreads, R>(smem, tid);
-
- if (tid == 0)
- {
- result[0] = smem[0];
- blocks_finished = 0;
- }
+ mymin = minOp(mymin, srcVal);
+ mymax = maxOp(mymax, srcVal);
}
- #else
- if (tid == 0) result[bid] = smem[0];
- #endif
}
+ }
+ device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
- template <typename T, typename R, int nthreads>
- __global__ void sumPass2Kernel(R* result, int size)
- {
- __shared__ R smem[nthreads];
- int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
- smem[tid] = tid < size ? result[tid] : 0;
- __syncthreads();
-
- sumInSmem<nthreads, R>(smem, tid);
+ GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
+ }
- if (tid == 0)
- result[0] = smem[0];
- }
+ const int threads_x = 32;
+ const int threads_y = 8;
+ void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+ {
+ block = dim3(threads_x, threads_y);
- template <typename T, typename R, typename Op, int nthreads>
- __global__ void sumKernel_C2(const PtrStepSzb src, typename TypeVec<R, 2>::vec_type* result)
- {
- typedef typename TypeVec<T, 2>::vec_type SrcType;
- typedef typename TypeVec<R, 2>::vec_type DstType;
+ grid = dim3(divUp(cols, block.x * block.y),
+ divUp(rows, block.y * block.x));
- __shared__ R smem[nthreads * 2];
+ grid.x = ::min(grid.x, block.x);
+ grid.y = ::min(grid.y, block.y);
+ }
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+ void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
+ {
+ dim3 block, grid;
+ getLaunchCfg(cols, rows, block, grid);
- SrcType val;
- DstType sum = VecTraits<DstType>::all(0);
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
- {
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
- {
- val = ptr[x0 + x * blockDim.x];
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
- }
- }
+ bufcols = grid.x * grid.y * sizeof(double);
+ bufrows = 2;
+ }
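+
+ // Buffer layout (see run below): row 0 holds one running minimum per block,
+ // row 1 one running maximum; sizeof(double) per slot covers the widest work type.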
- smem[tid] = sum.x;
- smem[tid + nthreads] = sum.y;
- __syncthreads();
+ __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
+ {
+ *minval_buf = numeric_limits<int>::max();
+ *maxval_buf = numeric_limits<int>::min();
+ }
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
+ template <typename R>
+ void setDefault(R*, R*)
+ {
+ }
+ void setDefault(int* minval_buf, int* maxval_buf)
+ {
+ setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
+ }
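+
+ // Only the int overload above launches a seeding kernel: the integer
+ // GlobalReduce presumably combines per-block results with atomicMin/atomicMax,
+ // which need valid initial values; the other types rely on the
+ // blocks_finished ticket scheme instead.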
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
+ template <typename T>
+ void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
+ {
+ typedef typename MinMaxTypeTraits<T>::best_type R;
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- result[bid] = res;
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = (ticket == gridDim.x * gridDim.y - 1);
- }
+ dim3 block, grid;
+ getLaunchCfg(src.cols, src.rows, block, grid);
- __syncthreads();
+ const int twidth = divUp(divUp(src.cols, grid.x), block.x);
+ const int theight = divUp(divUp(src.rows, grid.y), block.y);
- if (is_last)
- {
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- __syncthreads();
+ R* minval_buf = (R*) buf.ptr(0);
+ R* maxval_buf = (R*) buf.ptr(1);
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
+ setDefault(minval_buf, maxval_buf);
- if (tid == 0)
- {
- res.x = smem[0];
- res.y = smem[nthreads];
- result[0] = res;
- blocks_finished = 0;
- }
- }
- #else
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- result[bid] = res;
- }
- #endif
- }
+ if (mask.data)
+ kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
+ else
+ kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
+ cudaSafeCall( cudaGetLastError() );
- template <typename T, typename R, int nthreads>
- __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_type* result, int size)
- {
- typedef typename TypeVec<R, 2>::vec_type DstType;
+ cudaSafeCall( cudaDeviceSynchronize() );
- __shared__ R smem[nthreads * 2];
+ R minval_, maxval_;
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
+ *minval = minval_;
+ *maxval = maxval_;
+ }
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+}
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- __syncthreads();
+/////////////////////////////////////////////////////////////
+// minMaxLoc
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
+namespace minMaxLoc
+{
+ __device__ unsigned int blocks_finished = 0;
+
+ // To avoid shared-memory bank conflicts, each value is promoted to a
+ // type of at least 32 bits before the reduction
+ template <typename T> struct MinMaxTypeTraits;
+ template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
+ template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
+ template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
+
+ template <int BLOCK_SIZE, typename T, class Mask>
+ __global__ void kernel(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
+ {
+ typedef typename MinMaxTypeTraits<T>::best_type work_type;
- if (tid == 0)
- {
- res.x = smem[0];
- res.y = smem[nthreads];
- result[0] = res;
- }
- }
+ __shared__ work_type sminval[BLOCK_SIZE];
+ __shared__ work_type smaxval[BLOCK_SIZE];
+ __shared__ unsigned int sminloc[BLOCK_SIZE];
+ __shared__ unsigned int smaxloc[BLOCK_SIZE];
+ __shared__ bool is_last;
+ const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
+ const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
- template <typename T, typename R, typename Op, int nthreads>
- __global__ void sumKernel_C3(const PtrStepSzb src, typename TypeVec<R, 3>::vec_type* result)
- {
- typedef typename TypeVec<T, 3>::vec_type SrcType;
- typedef typename TypeVec<R, 3>::vec_type DstType;
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;
- __shared__ R smem[nthreads * 3];
+ work_type mymin = numeric_limits<work_type>::max();
+ work_type mymax = -numeric_limits<work_type>::max();
+ unsigned int myminloc = 0;
+ unsigned int mymaxloc = 0;
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+ for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
+ {
+ const T* ptr = src.ptr(y);
- SrcType val;
- DstType sum = VecTraits<DstType>::all(0);
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
+ for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
+ {
+ if (mask(y, x))
{
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+ const work_type srcVal = ptr[x];
+
+ if (srcVal < mymin)
{
- val = ptr[x0 + x * blockDim.x];
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
+ mymin = srcVal;
+ myminloc = y * src.cols + x;
}
- }
-
- smem[tid] = sum.x;
- smem[tid + nthreads] = sum.y;
- smem[tid + 2 * nthreads] = sum.z;
- __syncthreads();
-
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
-
- #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
- __shared__ bool is_last;
-
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- result[bid] = res;
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = (ticket == gridDim.x * gridDim.y - 1);
- }
-
- __syncthreads();
-
- if (is_last)
- {
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- smem[tid + 2 * nthreads] = res.z;
- __syncthreads();
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
-
- if (tid == 0)
+ if (srcVal > mymax)
{
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- result[0] = res;
- blocks_finished = 0;
+ mymax = srcVal;
+ mymaxloc = y * src.cols + x;
}
}
- #else
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- result[bid] = res;
- }
- #endif
- }
-
-
- template <typename T, typename R, int nthreads>
- __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_type* result, int size)
- {
- typedef typename TypeVec<R, 3>::vec_type DstType;
-
- __shared__ R smem[nthreads * 3];
-
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- smem[tid + 2 * nthreads] = res.z;
- __syncthreads();
-
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
-
- if (tid == 0)
- {
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- result[0] = res;
- }
}
+ }
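+
+ // reduceKeyVal below runs two key/value reductions at once: mymin/mymax are
+ // the keys, compared with less<> and greater<>, while the flattened locations
+ // in sminloc/smaxloc ride along so the winning index survives each comparison.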
- template <typename T, typename R, typename Op, int nthreads>
- __global__ void sumKernel_C4(const PtrStepSzb src, typename TypeVec<R, 4>::vec_type* result)
- {
- typedef typename TypeVec<T, 4>::vec_type SrcType;
- typedef typename TypeVec<R, 4>::vec_type DstType;
+ reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
+ smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
+ tid,
+ thrust::make_tuple(less<work_type>(), greater<work_type>()));
- __shared__ R smem[nthreads * 4];
+ if (tid == 0)
+ {
+ minval[bid] = (T) mymin;
+ maxval[bid] = (T) mymax;
+ minloc[bid] = myminloc;
+ maxloc[bid] = mymaxloc;
- const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
- const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
- const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+ __threadfence();
- SrcType val;
- DstType sum = VecTraits<DstType>::all(0);
- for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
- {
- const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
- for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
- {
- val = ptr[x0 + x * blockDim.x];
- sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
- Op::call(val.z), Op::call(val.w));
- }
- }
+ unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
+ }
- smem[tid] = sum.x;
- smem[tid + nthreads] = sum.y;
- smem[tid + 2 * nthreads] = sum.z;
- smem[tid + 3 * nthreads] = sum.w;
- __syncthreads();
+ __syncthreads();
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
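+ // Ticket scheme: atomicInc hands each block one ticket; the block that draws
+ // the last one knows every partial result has been published (guaranteed by
+ // the __threadfence above) and folds them all into slot 0 below.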
+ if (is_last)
+ {
+ unsigned int idx = ::min(tid, gridDim.x * gridDim.y - 1);
- #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
- __shared__ bool is_last;
+ mymin = minval[idx];
+ mymax = maxval[idx];
+ myminloc = minloc[idx];
+ mymaxloc = maxloc[idx];
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- res.w = smem[3 * nthreads];
- result[bid] = res;
- __threadfence();
-
- uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
- is_last = (ticket == gridDim.x * gridDim.y - 1);
- }
+ reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
+ smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
+ tid,
+ thrust::make_tuple(less<work_type>(), greater<work_type>()));
- __syncthreads();
+ if (tid == 0)
+ {
+ minval[0] = (T) mymin;
+ maxval[0] = (T) mymax;
+ minloc[0] = myminloc;
+ maxloc[0] = mymaxloc;
- if (is_last)
- {
- DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- smem[tid + 2 * nthreads] = res.z;
- smem[tid + 3 * nthreads] = res.w;
- __syncthreads();
-
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
-
- if (tid == 0)
- {
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- res.w = smem[3 * nthreads];
- result[0] = res;
- blocks_finished = 0;
- }
- }
- #else
- if (tid == 0)
- {
- DstType res;
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- res.w = smem[3 * nthreads];
- result[bid] = res;
- }
- #endif
+ blocks_finished = 0;
}
+ }
+ }
+ const int threads_x = 32;
+ const int threads_y = 8;
- template <typename T, typename R, int nthreads>
- __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_type* result, int size)
- {
- typedef typename TypeVec<R, 4>::vec_type DstType;
-
- __shared__ R smem[nthreads * 4];
+ void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+ {
+ block = dim3(threads_x, threads_y);
- const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ grid = dim3(divUp(cols, block.x * block.y),
+ divUp(rows, block.y * block.x));
- DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
- smem[tid] = res.x;
- smem[tid + nthreads] = res.y;
- smem[tid + 2 * nthreads] = res.z;
- smem[tid + 3 * nthreads] = res.w;
- __syncthreads();
+ grid.x = ::min(grid.x, block.x);
+ grid.y = ::min(grid.y, block.y);
+ }
- sumInSmem<nthreads, R>(smem, tid);
- sumInSmem<nthreads, R>(smem + nthreads, tid);
- sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
- sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);
+ void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
+ {
+ dim3 block, grid;
+ getLaunchCfg(cols, rows, block, grid);
- if (tid == 0)
- {
- res.x = smem[0];
- res.y = smem[nthreads];
- res.z = smem[2 * nthreads];
- res.w = smem[3 * nthreads];
- result[0] = res;
- }
- }
+ // For values
+ b1cols = (int) (grid.x * grid.y * elem_size);
+ b1rows = 2;
- template <typename T>
- void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
- {
- typedef typename SumType<T>::R R;
+ // For locations
+ b2cols = grid.x * grid.y * sizeof(int);
+ b2rows = 2;
+ }
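+
+ // Two staging buffers: b1 stores per-block min/max values (elem_size wide),
+ // b2 the matching flattened locations as unsigned int.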
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
+ template <typename T>
+ void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
+ {
+ dim3 block, grid;
+ getLaunchCfg(src.cols, src.rows, block, grid);
+
+ const int twidth = divUp(divUp(src.cols, grid.x), block.x);
+ const int theight = divUp(divUp(src.rows, grid.y), block.y);
+
+ T* minval_buf = (T*) valbuf.ptr(0);
+ T* maxval_buf = (T*) valbuf.ptr(1);
+ unsigned int* minloc_buf = locbuf.ptr(0);
+ unsigned int* maxloc_buf = locbuf.ptr(1);
+
+ if (mask.data)
+ kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
+ else
+ kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
+
+ cudaSafeCall( cudaGetLastError() );
+
+ cudaSafeCall( cudaDeviceSynchronize() );
+
+ T minval_, maxval_;
+ cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+ cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
+ *minval = minval_;
+ *maxval = maxval_;
+
+ unsigned int minloc_, maxloc_;
+ cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+ cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
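+
+ // locations were stored flattened as y * src.cols + x: decode the row into
+ // index 1 and the column into index 0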
+ minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
+ maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
+ }
+
+ template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+}
+
+/////////////////////////////////////////////////////////////
+// countNonZero
+
+namespace countNonZero
+{
+ __device__ unsigned int blocks_finished = 0;
- switch (cn)
- {
- case 1:
- sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 2:
- sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 3:
- sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 4:
- sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- }
- cudaSafeCall( cudaDeviceSynchronize() );
+ template <int BLOCK_SIZE, typename T>
+ __global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
+ {
+ __shared__ unsigned int scount[BLOCK_SIZE];
+ __shared__ bool is_last;
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+ const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
+ const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
- }
+ const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+ const int bid = blockIdx.y * gridDim.x + blockIdx.x;
- template void sumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
+ unsigned int mycount = 0;
+ for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
+ {
+ const T* ptr = src.ptr(y);
- template <typename T>
- void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
+ for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
- typedef typename SumType<T>::R R;
-
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- switch (cn)
- {
- case 1:
- sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- break;
- case 2:
- sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- break;
- case 3:
- sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- break;
- case 4:
- sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- break;
- }
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
+ const T srcVal = ptr[x];
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(&result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
-
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
+ mycount += (srcVal != 0);
}
+ }
- template void sumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void sumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
-
+ device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
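+
+ // On sm_20 and newer, each block folds its count into the global total with a
+ // single atomicAdd; older architectures fall back to the same last-block
+ // ticket scheme used by minMax and minMaxLoc.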
- template <typename T>
- void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
- {
- typedef typename SumType<T>::R R;
+ #if __CUDA_ARCH__ >= 200
+ if (tid == 0)
+ ::atomicAdd(count, mycount);
+ #else
+ if (tid == 0)
+ {
+ count[bid] = mycount;
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
+ __threadfence();
- switch (cn)
- {
- case 1:
- sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 2:
- sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 3:
- sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 4:
- sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- }
- cudaSafeCall( cudaDeviceSynchronize() );
-
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+ unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+ is_last = (ticket == gridDim.x * gridDim.y - 1);
+ }
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
- }
+ __syncthreads();
- template void absSumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
+ if (is_last)
+ {
+ mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
+ device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
- template <typename T>
- void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
+ if (tid == 0)
{
- typedef typename SumType<T>::R R;
-
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
-
- switch (cn)
- {
- case 1:
- sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- break;
- case 2:
- sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- break;
- case 3:
- sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- break;
- case 4:
- sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- break;
- }
- cudaSafeCall( cudaGetLastError() );
-
- cudaSafeCall( cudaDeviceSynchronize() );
-
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+ count[0] = mycount;
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
+ blocks_finished = 0;
}
+ }
+ #endif
+ }
- template void absSumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void absSumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
-
+ const int threads_x = 32;
+ const int threads_y = 8;
- template <typename T>
- void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
- {
- typedef typename SumType<T>::R R;
+ void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
+ {
+ block = dim3(threads_x, threads_y);
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
+ grid = dim3(divUp(cols, block.x * block.y),
+ divUp(rows, block.y * block.x));
- switch (cn)
- {
- case 1:
- sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 2:
- sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 3:
- sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- case 4:
- sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- cudaSafeCall( cudaGetLastError() );
-
- sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
- (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
- cudaSafeCall( cudaGetLastError() );
-
- break;
- }
- cudaSafeCall( cudaDeviceSynchronize() );
+ grid.x = ::min(grid.x, block.x);
+ grid.y = ::min(grid.y, block.y);
+ }
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+ void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
+ {
+ dim3 block, grid;
+ getLaunchCfg(cols, rows, block, grid);
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
- }
+ bufcols = grid.x * grid.y * sizeof(int);
+ bufrows = 1;
+ }
- template void sqrSumMultipassCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumMultipassCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumMultipassCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumMultipassCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumMultipassCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumMultipassCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
+ template <typename T>
+ int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
+ {
+ dim3 block, grid;
+ getLaunchCfg(src.cols, src.rows, block, grid);
+ const int twidth = divUp(divUp(src.cols, grid.x), block.x);
+ const int theight = divUp(divUp(src.rows, grid.y), block.y);
- template <typename T>
- void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn)
- {
- typedef double R;
+ unsigned int* count_buf = buf.ptr(0);
- dim3 threads, grid;
- estimateThreadCfg(src.cols, src.rows, threads, grid);
- setKernelConsts(src.cols, src.rows, threads, grid);
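+ // the slot must start at zero: on sm_20+ the kernel accumulates into it with
+ // atomicAdd rather than overwriting it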
+ cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
- switch (cn)
- {
- case 1:
- sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
- break;
- case 2:
- sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
- break;
- case 3:
- sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
- break;
- case 4:
- sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
- src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
- break;
- }
- cudaSafeCall( cudaGetLastError() );
+ kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
+ cudaSafeCall( cudaGetLastError() );
- cudaSafeCall( cudaDeviceSynchronize() );
+ cudaSafeCall( cudaDeviceSynchronize() );
- R result[4] = {0, 0, 0, 0};
- cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
+ unsigned int count;
+ cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
- sum[0] = result[0];
- sum[1] = result[1];
- sum[2] = result[2];
- sum[3] = result[3];
- }
+ return count;
+ }
- template void sqrSumCaller<uchar>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumCaller<char>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumCaller<ushort>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumCaller<short>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumCaller<int>(const PtrStepSzb, PtrStepb, double*, int);
- template void sqrSumCaller<float>(const PtrStepSzb, PtrStepb, double*, int);
- } // namespace sum
+ template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
+}
- //////////////////////////////////////////////////////////////////////////////
- // reduce
+//////////////////////////////////////////////////////////////////////////////
+// reduce
- template <typename S> struct SumReductor
+namespace reduce
+{
+ struct Sum
+ {
+ template <typename T>
+ __device__ __forceinline__ T startValue() const
{
- __device__ __forceinline__ S startValue() const
- {
- return 0;
- }
-
- __device__ __forceinline__ SumReductor(const SumReductor& other){}
- __device__ __forceinline__ SumReductor(){}
-
- __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
- {
- return a + b;
- }
-
- __device__ __forceinline__ S result(S r, double) const
- {
- return r;
- }
- };
+ return VecTraits<T>::all(0);
+ }
- template <typename S> struct AvgReductor
+ template <typename T>
+ __device__ __forceinline__ T operator ()(T a, T b) const
{
- __device__ __forceinline__ S startValue() const
- {
- return 0;
- }
+ return a + b;
+ }
- __device__ __forceinline__ AvgReductor(const AvgReductor& other){}
- __device__ __forceinline__ AvgReductor(){}
+ template <typename T>
+ __device__ __forceinline__ T result(T r, double) const
+ {
+ return r;
+ }
- __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
- {
- return a + b;
- }
+ __device__ __forceinline__ Sum() {}
+ __device__ __forceinline__ Sum(const Sum&) {}
+ };
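+
+ // Every reducer exposes the same protocol: startValue<T>() yields the
+ // identity, operator() combines two partials, and result() finalizes; only
+ // Avg uses the size argument, dividing the accumulated sum by the element count.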
- __device__ __forceinline__ double result(S r, double sz) const
- {
- return r / sz;
- }
- };
+ struct Avg
+ {
+ template <typename T>
+ __device__ __forceinline__ T startValue() const
+ {
+ return VecTraits<T>::all(0);
+ }
- template <typename S> struct MinReductor
+ template <typename T>
+ __device__ __forceinline__ T operator ()(T a, T b) const
{
- __device__ __forceinline__ S startValue() const
- {
- return numeric_limits<S>::max();
- }
+ return a + b;
+ }
- __device__ __forceinline__ MinReductor(const MinReductor& other){}
- __device__ __forceinline__ MinReductor(){}
+ template <typename T>
+ __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
+ {
+ return r / sz;
+ }
- template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
- {
- return saturate_cast<T>(::min(a, b));
- }
- __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
- {
- return ::fmin(a, b);
- }
+ __device__ __forceinline__ Avg() {}
+ __device__ __forceinline__ Avg(const Avg&) {}
+ };
- __device__ __forceinline__ S result(S r, double) const
- {
- return r;
- }
- };
+ struct Min
+ {
+ template <typename T>
+ __device__ __forceinline__ T startValue() const
+ {
+ return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
+ }
- template <typename S> struct MaxReductor
+ template <typename T>
+ __device__ __forceinline__ T operator ()(T a, T b) const
{
- __device__ __forceinline__ S startValue() const
- {
- return numeric_limits<S>::min();
- }
+ minimum<T> minOp;
+ return minOp(a, b);
+ }
- __device__ __forceinline__ MaxReductor(const MaxReductor& other){}
- __device__ __forceinline__ MaxReductor(){}
+ template <typename T>
+ __device__ __forceinline__ T result(T r, double) const
+ {
+ return r;
+ }
- template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
- {
- return ::max(a, b);
- }
- __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
- {
- return ::fmax(a, b);
- }
+ __device__ __forceinline__ Min() {}
+ __device__ __forceinline__ Min(const Min&) {}
+ };
- __device__ __forceinline__ S result(S r, double) const
- {
- return r;
- }
- };
+ struct Max
+ {
+ template <typename T>
+ __device__ __forceinline__ T startValue() const
+ {
+ return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
+ }
- template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const PtrStepSz<T> src, D* dst, const Op op)
+ template <typename T>
+ __device__ __forceinline__ T operator ()(T a, T b) const
{
- __shared__ S smem[16 * 16];
+ maximum<T> maxOp;
+ return maxOp(a, b);
+ }
- const int x = blockIdx.x * 16 + threadIdx.x;
+ template <typename T>
+ __device__ __forceinline__ T result(T r, double) const
+ {
+ return r;
+ }
- S myVal = op.startValue();
+ __device__ __forceinline__ Max() {}
+ __device__ __forceinline__ Max(const Max&) {}
+ };
- if (x < src.cols)
- {
- for (int y = threadIdx.y; y < src.rows; y += 16)
- myVal = op(myVal, src.ptr(y)[x]);
- }
+ ///////////////////////////////////////////////////////////
- smem[threadIdx.x * 16 + threadIdx.y] = myVal;
- __syncthreads();
+ template <typename T, typename S, typename D, class Op>
+ __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
+ {
+ __shared__ S smem[16 * 16];
- if (threadIdx.x < 8)
- {
- volatile S* srow = smem + threadIdx.y * 16;
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);
- srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);
- }
- __syncthreads();
+ const int x = blockIdx.x * 16 + threadIdx.x;
- if (threadIdx.y == 0 && x < src.cols)
- dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
- }
+ S myVal = op.template startValue<S>();
- template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream)
+ if (x < src.cols)
{
- const dim3 block(16, 16);
- const dim3 grid(divUp(src.cols, block.x));
-
- Op<S> op;
- reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
- cudaSafeCall( cudaGetLastError() );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
-
+ for (int y = threadIdx.y; y < src.rows; y += 16)
+ {
+ S srcVal = src(y, x);
+ myVal = op(myVal, srcVal);
+ }
}
- template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream)
- {
- typedef void (*caller_t)(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream);
+ smem[threadIdx.x * 16 + threadIdx.y] = myVal;
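+
+ // The transposed store above puts the 16 partials for output column x in one
+ // contiguous 16-element row of smem, which device::reduce<16> folds below.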
- static const caller_t callers[] =
- {
- reduceRows_caller<SumReductor, T, S, D>,
- reduceRows_caller<AvgReductor, T, S, D>,
- reduceRows_caller<MaxReductor, T, S, D>,
- reduceRows_caller<MinReductor, T, S, D>
- };
+ __syncthreads();
- callers[reduceOp](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<D> >(dst), stream);
- }
+ volatile S* srow = smem + threadIdx.y * 16;
- template void reduceRows_gpu<uchar, int, uchar>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<uchar, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<uchar, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ myVal = srow[threadIdx.x];
+ device::reduce<16>(srow, myVal, threadIdx.x, op);
- template void reduceRows_gpu<ushort, int, ushort>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<ushort, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<ushort, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ if (threadIdx.x == 0)
+ srow[0] = myVal;
- template void reduceRows_gpu<short, int, short>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<short, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<short, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ __syncthreads();
- template void reduceRows_gpu<int, int, int>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceRows_gpu<int, int, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ if (threadIdx.y == 0 && x < src.cols)
+ dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
+ }
- template void reduceRows_gpu<float, float, float>(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ template <typename T, typename S, typename D, class Op>
+ void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
+ {
+ const dim3 block(16, 16);
+ const dim3 grid(divUp(src.cols, block.x));
+ Op op;
+ rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
+ cudaSafeCall( cudaGetLastError() );
+ if (stream == 0)
+ cudaSafeCall( cudaDeviceSynchronize() );
+ }
- template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const PtrStepSz<T> src, D* dst, const Op op)
+ template <typename T, typename S, typename D>
+ void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
+ {
+ typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
+ static const func_t funcs[] =
{
- __shared__ S smem[256 * cn];
+ rowsCaller<T, S, D, Sum>,
+ rowsCaller<T, S, D, Avg>,
+ rowsCaller<T, S, D, Max>,
+ rowsCaller<T, S, D, Min>
+ };
- const int y = blockIdx.x;
+ funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
+ }
- const T* src_row = src.ptr(y);
+ template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- S myVal[cn];
+ template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- myVal[c] = op.startValue();
+ template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
+ template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- // For cc >= 2.0 prefer L1 cache
- for (int x = threadIdx.x; x < src.cols; x += 256)
- {
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- myVal[c] = op(myVal[c], src_row[x * cn + c]);
- }
+ template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- #else // __CUDA_ARCH__ >= 200
+ template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- // For older arch use shared memory for cache
- for (int x = 0; x < src.cols; x += 256)
- {
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- {
- smem[c * 256 + threadIdx.x] = op.startValue();
- const int load_x = x * cn + c * 256 + threadIdx.x;
- if (load_x < src.cols * cn)
- smem[c * 256 + threadIdx.x] = src_row[load_x];
- }
- __syncthreads();
+ ///////////////////////////////////////////////////////////
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);
- __syncthreads();
- }
+ template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
+ __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
+ {
+ typedef typename TypeVec<T, cn>::vec_type src_type;
+ typedef typename TypeVec<S, cn>::vec_type work_type;
+ typedef typename TypeVec<D, cn>::vec_type dst_type;
- #endif // __CUDA_ARCH__ >= 200
+ __shared__ S smem[BLOCK_SIZE * cn];
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- smem[c * 256 + threadIdx.x] = myVal[c];
- __syncthreads();
+ const int y = blockIdx.x;
- if (threadIdx.x < 128)
- {
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);
- }
- __syncthreads();
+ const src_type* srcRow = src.ptr(y);
- if (threadIdx.x < 64)
- {
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);
- }
- __syncthreads();
+ work_type myVal = op.template startValue<work_type>();
- volatile S* sdata = smem;
+ for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
+ myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
- if (threadIdx.x < 32)
- {
- #pragma unroll
- for (int c = 0; c < cn; ++c)
- {
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);
- sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);
- }
- }
- __syncthreads();
+ device::reduce<BLOCK_SIZE>(Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), Unroll<cn>::tie(myVal), threadIdx.x, Unroll<cn>::op(op));
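+
+ // The single call above reduces all cn channel planes of smem at once: the
+ // Unroll<cn> helpers bundle the planes and per-channel values into tuples.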
- if (threadIdx.x < cn)
- dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
- }
+ if (threadIdx.x == 0)
+ dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
+ }
- template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream)
- {
- const dim3 block(256);
- const dim3 grid(src.rows);
+ template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
+ {
+ const int BLOCK_SIZE = 256;
- Op<S> op;
- reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
- cudaSafeCall( cudaGetLastError() );
+ const dim3 block(BLOCK_SIZE);
+ const dim3 grid(src.rows);
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
+ Op op;
+ colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
+ cudaSafeCall( cudaGetLastError() );
- }
+ if (stream == 0)
+ cudaSafeCall( cudaDeviceSynchronize() );
- template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream)
+ }
+
+ template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
+ {
+ typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
+ static const func_t funcs[5][4] =
{
- typedef void (*caller_t)(const PtrStepSz<T>& src, PtrStepSz<D> dst, cudaStream_t stream);
+ {0,0,0,0},
+ {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
+ {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
+ {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
+ {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
+ };
- static const caller_t callers[4][4] =
- {
- {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
- {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
- {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},
- {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
- };
+ funcs[cn][op](src, dst, stream);
+ }
- callers[cn - 1][reduceOp](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<D> >(dst), stream);
- }
+ template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void reduceCols_gpu<uchar, int, uchar>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<uchar, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<uchar, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void reduceCols_gpu<ushort, int, ushort>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<ushort, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<ushort, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void reduceCols_gpu<short, int, short>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<short, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<short, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void reduceCols_gpu<int, int, int>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template void reduceCols_gpu<int, int, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
+ template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void reduceCols_gpu<float, float, float>(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- } // namespace mattrix_reductions
-}}} // namespace cv { namespace gpu { namespace device
+ template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+}
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
////////////////////////////////////////////////////////////////////////
// Sum
-namespace cv { namespace gpu { namespace device
+namespace sum
{
- namespace matrix_reductions
- {
- namespace sum
- {
- template <typename T>
- void sumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
- template <typename T>
- void sumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
- template <typename T>
- void absSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
+ void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
- template <typename T>
- void absSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
+ template <typename T, int cn>
+ void run(PtrStepSzb src, void* buf, double* sum);
- template <typename T>
- void sqrSumCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
+ template <typename T, int cn>
+ void runAbs(PtrStepSzb src, void* buf, double* sum);
- template <typename T>
- void sqrSumMultipassCaller(const PtrStepSzb src, PtrStepb buf, double* sum, int cn);
-
- void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
- }
- }
-}}}
+ template <typename T, int cn>
+ void runSqr(PtrStepSzb src, void* buf, double* sum);
+}
Scalar cv::gpu::sum(const GpuMat& src)
{
GpuMat buf;
return sum(src, buf);
}
-
Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
- using namespace cv::gpu::device::matrix_reductions::sum;
-
- typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
- static Caller multipass_callers[] =
+ typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+ static const func_t funcs[7][5] =
{
- sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
- sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
- sumMultipassCaller<int>, sumMultipassCaller<float>
+ {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
+ {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
+ {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
+ {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
+ {0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
+ {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
+ {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
- static Caller singlepass_callers[] = {
- sumCaller<unsigned char>, sumCaller<char>,
- sumCaller<unsigned short>, sumCaller<short>,
- sumCaller<int>, sumCaller<float>
- };
-
- CV_Assert(src.depth() <= CV_32F);
+ if (src.depth() == CV_64F)
+ {
+ if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
Size buf_size;
- getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+ ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
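+ // buf is used as a device-side accumulator by the kernels, hence the explicit zeroing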
+ buf.setTo(Scalar::all(0));
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
-
- Caller caller = callers[src.depth()];
+ const func_t func = funcs[src.depth()][src.channels()];
double result[4];
- caller(src, buf, result, src.channels());
+ func(src, buf.data, result);
+
return Scalar(result[0], result[1], result[2], result[3]);
}
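For context, a host-side usage sketch of the rewritten entry points, assuming the OpenCV 2.4 `gpu` module headers (not part of this patch). Note the `funcs` table is indexed as `funcs[src.depth()][src.channels()]`; column 0 is a placeholder so the 1-based channel count can index directly:

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat h_src(480, 640, CV_8UC3, cv::Scalar(1, 2, 3));
        cv::gpu::GpuMat d_src(h_src); // uploads to the device

        cv::gpu::GpuMat buf; // reused across calls to avoid reallocation
        cv::Scalar s  = cv::gpu::sum(d_src, buf);    // per-channel sums
        cv::Scalar as = cv::gpu::absSum(d_src, buf); // sums of |x|
        cv::Scalar qs = cv::gpu::sqrSum(d_src, buf); // sums of x*x

        std::cout << s[0] << " " << as[1] << " " << qs[2] << std::endl;
        return 0;
    }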
-
Scalar cv::gpu::absSum(const GpuMat& src)
{
GpuMat buf;
return absSum(src, buf);
}
-
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
- using namespace cv::gpu::device::matrix_reductions::sum;
-
- typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
- static Caller multipass_callers[] =
- {
- absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
- absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
- absSumMultipassCaller<int>, absSumMultipassCaller<float>
- };
-
- static Caller singlepass_callers[] =
+ typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+ static const func_t funcs[7][5] =
{
- absSumCaller<unsigned char>, absSumCaller<char>,
- absSumCaller<unsigned short>, absSumCaller<short>,
- absSumCaller<int>, absSumCaller<float>
+ {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
+ {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
+ {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
+ {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
+ {0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
+ {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
+ {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
- CV_Assert(src.depth() <= CV_32F);
-
Size buf_size;
- getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+ ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
+ buf.setTo(Scalar::all(0));
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
-
- Caller caller = callers[src.depth()];
+ const func_t func = funcs[src.depth()][src.channels()];
double result[4];
- caller(src, buf, result, src.channels());
+ func(src, buf.data, result);
+
return Scalar(result[0], result[1], result[2], result[3]);
}
-
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
return sqrSum(src, buf);
}
-
Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
- using namespace cv::gpu::device::matrix_reductions::sum;
-
- typedef void (*Caller)(const PtrStepSzb, PtrStepb, double*, int);
-
- static Caller multipass_callers[] =
+ typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum);
+ static const func_t funcs[7][5] =
{
- sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
- sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
- sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>
+ {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
+ {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
+ {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
+ {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
+ {0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
+ {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
+ {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
- static Caller singlepass_callers[7] =
- {
- sqrSumCaller<unsigned char>, sqrSumCaller<char>,
- sqrSumCaller<unsigned short>, sqrSumCaller<short>,
- sqrSumCaller<int>, sqrSumCaller<float>
- };
-
- CV_Assert(src.depth() <= CV_32F);
-
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
-
Size buf_size;
- getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
+ ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
+ buf.setTo(Scalar::all(0));
- Caller caller = callers[src.depth()];
+ const func_t func = funcs[src.depth()][src.channels()];
double result[4];
- caller(src, buf, result, src.channels());
+ func(src, buf.data, result);
+
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
-// Find min or max
+// minMax
-namespace cv { namespace gpu { namespace device
+namespace minMax
{
- namespace matrix_reductions
- {
- namespace minmax
- {
- void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);
-
- template <typename T>
- void minMaxCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
-
- template <typename T>
- void minMaxMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-
- template <typename T>
- void minMaxMultipassCaller(const PtrStepSzb src, double* minval, double* maxval, PtrStepb buf);
-
- template <typename T>
- void minMaxMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
- }
- }
-}}}
+ void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
+ template <typename T>
+ void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+}
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
    GpuMat buf;
    minMax(src, minVal, maxVal, mask, buf);
}
-
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
- using namespace ::cv::gpu::device::matrix_reductions::minmax;
-
- typedef void (*Caller)(const PtrStepSzb, double*, double*, PtrStepb);
- typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, PtrStepb);
-
- static Caller multipass_callers[] =
- {
- minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
- minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
- minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
- };
-
- static Caller singlepass_callers[] =
- {
- minMaxCaller<unsigned char>, minMaxCaller<char>,
- minMaxCaller<unsigned short>, minMaxCaller<short>,
- minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
- };
-
- static MaskedCaller masked_multipass_callers[] =
- {
- minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
- minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
- minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
- };
-
- static MaskedCaller masked_singlepass_callers[] =
+ typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
+ static const func_t funcs[] =
{
- minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
- minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
- minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
+ ::minMax::run<uchar>,
+ ::minMax::run<schar>,
+ ::minMax::run<ushort>,
+ ::minMax::run<short>,
+ ::minMax::run<int>,
+ ::minMax::run<float>,
+ ::minMax::run<double>
};
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(src.channels() == 1);
- CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
+ CV_Assert( src.channels() == 1 );
+ CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- double minVal_; if (!minVal) minVal = &minVal_;
- double maxVal_; if (!maxVal) maxVal = &maxVal_;
-
Size buf_size;
- getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), buf_size.width, buf_size.height);
+ ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
- if (mask.empty())
- {
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
-
- Caller caller = callers[src.type()];
- CV_Assert(caller != 0);
- caller(src, minVal, maxVal, buf);
- }
- else
- {
- MaskedCaller* callers = masked_multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = masked_singlepass_callers;
+ const func_t func = funcs[src.depth()];
- MaskedCaller caller = callers[src.type()];
- CV_Assert(caller != 0);
- caller(src, mask, minVal, maxVal, buf);
- }
+ double temp1, temp2;
+ func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
}
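The `temp1`/`temp2` fallbacks above let callers pass a null pointer for whichever extremum they do not need. A usage sketch under the same assumed 2.4 API:

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat h_src(64, 64, CV_32FC1);
        cv::randu(h_src, 0.0, 100.0);

        cv::gpu::GpuMat d_src(h_src), buf;
        double maxv = 0.0;
        // minVal may be 0 (null); the wrapper routes it to a scratch double
        cv::gpu::minMax(d_src, 0, &maxv, cv::gpu::GpuMat(), buf);
        std::cout << "max = " << maxv << std::endl;
        return 0;
    }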
-
////////////////////////////////////////////////////////////////////////
-// Locate min and max
+// minMaxLoc
-namespace cv { namespace gpu { namespace device
+namespace minMaxLoc
{
- namespace matrix_reductions
- {
- namespace minmaxloc
- {
- void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
- int& b1rows, int& b2cols, int& b2rows);
-
- template <typename T>
- void minMaxLocCaller(const PtrStepSzb src, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-
- template <typename T>
- void minMaxLocMaskCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
-
- template <typename T>
- void minMaxLocMultipassCaller(const PtrStepSzb src, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
+ void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
- template <typename T>
- void minMaxLocMaskMultipassCaller(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval,
- int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
- }
- }
-}}}
+ template <typename T>
+ void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+}
void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
    GpuMat valBuf, locBuf;
    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
- using namespace ::cv::gpu::device::matrix_reductions::minmaxloc;
-
- typedef void (*Caller)(const PtrStepSzb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
- typedef void (*MaskedCaller)(const PtrStepSzb, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
-
- static Caller multipass_callers[] =
- {
- minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
- minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
- minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
- };
-
- static Caller singlepass_callers[] =
+ typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
+ static const func_t funcs[] =
{
- minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
- minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
- minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
+ ::minMaxLoc::run<uchar>,
+ ::minMaxLoc::run<schar>,
+ ::minMaxLoc::run<ushort>,
+ ::minMaxLoc::run<short>,
+ ::minMaxLoc::run<int>,
+ ::minMaxLoc::run<float>,
+ ::minMaxLoc::run<double>
};
- static MaskedCaller masked_multipass_callers[] =
- {
- minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
- minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
- minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
- };
-
- static MaskedCaller masked_singlepass_callers[] =
- {
- minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
- minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
- minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
- };
-
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(src.channels() == 1);
- CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
+ CV_Assert( src.channels() == 1 );
+ CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- double minVal_; if (!minVal) minVal = &minVal_;
- double maxVal_; if (!maxVal) maxVal = &maxVal_;
- int minLoc_[2];
- int maxLoc_[2];
-
Size valbuf_size, locbuf_size;
- getBufSizeRequired(src.cols, src.rows, static_cast<int>(src.elemSize()), valbuf_size.width,
- valbuf_size.height, locbuf_size.width, locbuf_size.height);
+ ::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
- if (mask.empty())
- {
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
-
- Caller caller = callers[src.type()];
- CV_Assert(caller != 0);
- caller(src, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
- }
- else
- {
- MaskedCaller* callers = masked_multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = masked_singlepass_callers;
-
- MaskedCaller caller = callers[src.type()];
- CV_Assert(caller != 0);
- caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valBuf, locBuf);
- }
+ const func_t func = funcs[src.depth()];
- if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
- if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
+ double temp1, temp2;
+ Point temp3, temp4;
+ func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}
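Passing `&minLoc->x` as the `int*` pair works because `cv::Point` lays out `x` and `y` contiguously. A usage sketch, again assuming the 2.4 API:

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat h_src = cv::Mat::zeros(100, 100, CV_8UC1);
        h_src.at<uchar>(17, 42) = 255; // row 17, column 42

        cv::gpu::GpuMat d_src(h_src);
        double minv = 0.0, maxv = 0.0;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(d_src, &minv, &maxv, &minLoc, &maxLoc);
        std::cout << maxv << " at (" << maxLoc.x << ", " << maxLoc.y << ")"
                  << std::endl; // 255 at (42, 17)
        return 0;
    }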
//////////////////////////////////////////////////////////////////////////////
-// Count non-zero elements
+// countNonZero
-namespace cv { namespace gpu { namespace device
+namespace countNonZero
{
- namespace matrix_reductions
- {
- namespace countnonzero
- {
- void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);
+ void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
- template <typename T>
- int countNonZeroCaller(const PtrStepSzb src, PtrStepb buf);
-
- template <typename T>
- int countNonZeroMultipassCaller(const PtrStepSzb src, PtrStepb buf);
- }
- }
-}}}
+ template <typename T>
+ int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
+}
int cv::gpu::countNonZero(const GpuMat& src)
{
    GpuMat buf;
    return countNonZero(src, buf);
}
-
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
- using namespace ::cv::gpu::device::matrix_reductions::countnonzero;
-
- typedef int (*Caller)(const PtrStepSzb src, PtrStepb buf);
-
- static Caller multipass_callers[7] =
+ typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
+ static const func_t funcs[] =
{
- countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
- countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
- countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
+ ::countNonZero::run<uchar>,
+ ::countNonZero::run<schar>,
+ ::countNonZero::run<ushort>,
+ ::countNonZero::run<short>,
+ ::countNonZero::run<int>,
+ ::countNonZero::run<float>,
+ ::countNonZero::run<double>
};
- static Caller singlepass_callers[7] =
- {
- countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
- countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
- countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double>
- };
-
- CV_Assert(src.depth() <= CV_64F);
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
    if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
        CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
- getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
+ ::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
- Caller* callers = multipass_callers;
- if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
- callers = singlepass_callers;
+ const func_t func = funcs[src.depth()];
- Caller caller = callers[src.type()];
- CV_Assert(caller != 0);
- return caller(src, buf);
+ return func(src, buf);
}
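A usage sketch for the buffered overload, assuming the 2.4 API; the host allocates `buf` as CV_8U bytes while the device side views it as `PtrStep<unsigned int>`:

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat h_src = cv::Mat::zeros(32, 32, CV_32FC1);
        h_src.at<float>(0, 0) = 1.0f;
        h_src.at<float>(5, 7) = -2.5f;

        cv::gpu::GpuMat d_src(h_src), buf; // buf is reused across calls
        std::cout << cv::gpu::countNonZero(d_src, buf) << std::endl; // 2
        return 0;
    }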
//////////////////////////////////////////////////////////////////////////////
// reduce
-namespace cv { namespace gpu { namespace device
+namespace reduce
{
- namespace matrix_reductions
- {
- template <typename T, typename S, typename D> void reduceRows_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- template <typename T, typename S, typename D> void reduceCols_gpu(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
- }
-}}}
+ template <typename T, typename S, typename D>
+ void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+
+ template <typename T, typename S, typename D>
+ void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+}
void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
- using namespace ::cv::gpu::device::matrix_reductions;
-
- CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
- CV_Assert(dim == 0 || dim == 1);
- CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
+ CV_Assert( src.channels() <= 4 );
+ CV_Assert( dim == 0 || dim == 1 );
+ CV_Assert( reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
- dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKETYPE(dtype, src.channels()));
+ dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
if (dim == 0)
{
- typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-
- static const caller_t callers[6][6] =
+ typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
+ static const func_t funcs[7][7] =
{
{
- reduceRows_gpu<unsigned char, int, unsigned char>,
- 0/*reduceRows_gpu<unsigned char, int, signed char>*/,
- 0/*reduceRows_gpu<unsigned char, int, unsigned short>*/,
- 0/*reduceRows_gpu<unsigned char, int, short>*/,
- reduceRows_gpu<unsigned char, int, int>,
- reduceRows_gpu<unsigned char, int, float>
+ ::reduce::rows<unsigned char, int, unsigned char>,
+ 0/*::reduce::rows<unsigned char, int, signed char>*/,
+ 0/*::reduce::rows<unsigned char, int, unsigned short>*/,
+ 0/*::reduce::rows<unsigned char, int, short>*/,
+ ::reduce::rows<unsigned char, int, int>,
+ ::reduce::rows<unsigned char, float, float>,
+ ::reduce::rows<unsigned char, double, double>
+ },
+ {
+ 0/*::reduce::rows<signed char, int, unsigned char>*/,
+ 0/*::reduce::rows<signed char, int, signed char>*/,
+ 0/*::reduce::rows<signed char, int, unsigned short>*/,
+ 0/*::reduce::rows<signed char, int, short>*/,
+ 0/*::reduce::rows<signed char, int, int>*/,
+ 0/*::reduce::rows<signed char, float, float>*/,
+ 0/*::reduce::rows<signed char, double, double>*/
},
{
- 0/*reduceRows_gpu<signed char, int, unsigned char>*/,
- 0/*reduceRows_gpu<signed char, int, signed char>*/,
- 0/*reduceRows_gpu<signed char, int, unsigned short>*/,
- 0/*reduceRows_gpu<signed char, int, short>*/,
- 0/*reduceRows_gpu<signed char, int, int>*/,
- 0/*reduceRows_gpu<signed char, int, float>*/
+ 0/*::reduce::rows<unsigned short, int, unsigned char>*/,
+ 0/*::reduce::rows<unsigned short, int, signed char>*/,
+ ::reduce::rows<unsigned short, int, unsigned short>,
+ 0/*::reduce::rows<unsigned short, int, short>*/,
+ ::reduce::rows<unsigned short, int, int>,
+ ::reduce::rows<unsigned short, float, float>,
+ ::reduce::rows<unsigned short, double, double>
},
{
- 0/*reduceRows_gpu<unsigned short, int, unsigned char>*/,
- 0/*reduceRows_gpu<unsigned short, int, signed char>*/,
- reduceRows_gpu<unsigned short, int, unsigned short>,
- 0/*reduceRows_gpu<unsigned short, int, short>*/,
- reduceRows_gpu<unsigned short, int, int>,
- reduceRows_gpu<unsigned short, int, float>
+ 0/*::reduce::rows<short, int, unsigned char>*/,
+ 0/*::reduce::rows<short, int, signed char>*/,
+ 0/*::reduce::rows<short, int, unsigned short>*/,
+ ::reduce::rows<short, int, short>,
+ ::reduce::rows<short, int, int>,
+ ::reduce::rows<short, float, float>,
+ ::reduce::rows<short, double, double>
},
{
- 0/*reduceRows_gpu<short, int, unsigned char>*/,
- 0/*reduceRows_gpu<short, int, signed char>*/,
- 0/*reduceRows_gpu<short, int, unsigned short>*/,
- reduceRows_gpu<short, int, short>,
- reduceRows_gpu<short, int, int>,
- reduceRows_gpu<short, int, float>
+ 0/*::reduce::rows<int, int, unsigned char>*/,
+ 0/*::reduce::rows<int, int, signed char>*/,
+ 0/*::reduce::rows<int, int, unsigned short>*/,
+ 0/*::reduce::rows<int, int, short>*/,
+ ::reduce::rows<int, int, int>,
+ ::reduce::rows<int, float, float>,
+ ::reduce::rows<int, double, double>
},
{
- 0/*reduceRows_gpu<int, int, unsigned char>*/,
- 0/*reduceRows_gpu<int, int, signed char>*/,
- 0/*reduceRows_gpu<int, int, unsigned short>*/,
- 0/*reduceRows_gpu<int, int, short>*/,
- reduceRows_gpu<int, int, int>,
- reduceRows_gpu<int, int, float>
+ 0/*::reduce::rows<float, float, unsigned char>*/,
+ 0/*::reduce::rows<float, float, signed char>*/,
+ 0/*::reduce::rows<float, float, unsigned short>*/,
+ 0/*::reduce::rows<float, float, short>*/,
+ 0/*::reduce::rows<float, float, int>*/,
+ ::reduce::rows<float, float, float>,
+ ::reduce::rows<float, double, double>
},
{
- 0/*reduceRows_gpu<float, float, unsigned char>*/,
- 0/*reduceRows_gpu<float, float, signed char>*/,
- 0/*reduceRows_gpu<float, float, unsigned short>*/,
- 0/*reduceRows_gpu<float, float, short>*/,
- 0/*reduceRows_gpu<float, float, int>*/,
- reduceRows_gpu<float, float, float>
+ 0/*::reduce::rows<double, double, unsigned char>*/,
+ 0/*::reduce::rows<double, double, signed char>*/,
+ 0/*::reduce::rows<double, double, unsigned short>*/,
+ 0/*::reduce::rows<double, double, short>*/,
+ 0/*::reduce::rows<double, double, int>*/,
+ 0/*::reduce::rows<double, double, float>*/,
+ ::reduce::rows<double, double, double>
}
};
- const caller_t func = callers[src.depth()][dst.depth()];
+ const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
- func(src.reshape(1), dst.reshape(1), reduceOp, StreamAccessor::getStream(stream));
+ func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
- typedef void (*caller_t)(const PtrStepSzb& src, int cn, const PtrStepSzb& dst, int reduceOp, cudaStream_t stream);
-
- static const caller_t callers[6][6] =
+ typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ static const func_t funcs[7][7] =
{
{
- reduceCols_gpu<unsigned char, int, unsigned char>,
- 0/*reduceCols_gpu<unsigned char, int, signed char>*/,
- 0/*reduceCols_gpu<unsigned char, int, unsigned short>*/,
- 0/*reduceCols_gpu<unsigned char, int, short>*/,
- reduceCols_gpu<unsigned char, int, int>,
- reduceCols_gpu<unsigned char, int, float>
+ ::reduce::cols<unsigned char, int, unsigned char>,
+ 0/*::reduce::cols<unsigned char, int, signed char>*/,
+ 0/*::reduce::cols<unsigned char, int, unsigned short>*/,
+ 0/*::reduce::cols<unsigned char, int, short>*/,
+ ::reduce::cols<unsigned char, int, int>,
+ ::reduce::cols<unsigned char, float, float>,
+ ::reduce::cols<unsigned char, double, double>
+ },
+ {
+ 0/*::reduce::cols<signed char, int, unsigned char>*/,
+ 0/*::reduce::cols<signed char, int, signed char>*/,
+ 0/*::reduce::cols<signed char, int, unsigned short>*/,
+ 0/*::reduce::cols<signed char, int, short>*/,
+ 0/*::reduce::cols<signed char, int, int>*/,
+ 0/*::reduce::cols<signed char, float, float>*/,
+ 0/*::reduce::cols<signed char, double, double>*/
},
{
- 0/*reduceCols_gpu<signed char, int, unsigned char>*/,
- 0/*reduceCols_gpu<signed char, int, signed char>*/,
- 0/*reduceCols_gpu<signed char, int, unsigned short>*/,
- 0/*reduceCols_gpu<signed char, int, short>*/,
- 0/*reduceCols_gpu<signed char, int, int>*/,
- 0/*reduceCols_gpu<signed char, int, float>*/
+ 0/*::reduce::cols<unsigned short, int, unsigned char>*/,
+ 0/*::reduce::cols<unsigned short, int, signed char>*/,
+ ::reduce::cols<unsigned short, int, unsigned short>,
+ 0/*::reduce::cols<unsigned short, int, short>*/,
+ ::reduce::cols<unsigned short, int, int>,
+ ::reduce::cols<unsigned short, float, float>,
+ ::reduce::cols<unsigned short, double, double>
},
{
- 0/*reduceCols_gpu<unsigned short, int, unsigned char>*/,
- 0/*reduceCols_gpu<unsigned short, int, signed char>*/,
- reduceCols_gpu<unsigned short, int, unsigned short>,
- 0/*reduceCols_gpu<unsigned short, int, short>*/,
- reduceCols_gpu<unsigned short, int, int>,
- reduceCols_gpu<unsigned short, int, float>
+ 0/*::reduce::cols<short, int, unsigned char>*/,
+ 0/*::reduce::cols<short, int, signed char>*/,
+ 0/*::reduce::cols<short, int, unsigned short>*/,
+ ::reduce::cols<short, int, short>,
+ ::reduce::cols<short, int, int>,
+ ::reduce::cols<short, float, float>,
+ ::reduce::cols<short, double, double>
},
{
- 0/*reduceCols_gpu<short, int, unsigned char>*/,
- 0/*reduceCols_gpu<short, int, signed char>*/,
- 0/*reduceCols_gpu<short, int, unsigned short>*/,
- reduceCols_gpu<short, int, short>,
- reduceCols_gpu<short, int, int>,
- reduceCols_gpu<short, int, float>
+ 0/*::reduce::cols<int, int, unsigned char>*/,
+ 0/*::reduce::cols<int, int, signed char>*/,
+ 0/*::reduce::cols<int, int, unsigned short>*/,
+ 0/*::reduce::cols<int, int, short>*/,
+ ::reduce::cols<int, int, int>,
+ ::reduce::cols<int, float, float>,
+ ::reduce::cols<int, double, double>
},
{
- 0/*reduceCols_gpu<int, int, unsigned char>*/,
- 0/*reduceCols_gpu<int, int, signed char>*/,
- 0/*reduceCols_gpu<int, int, unsigned short>*/,
- 0/*reduceCols_gpu<int, int, short>*/,
- reduceCols_gpu<int, int, int>,
- reduceCols_gpu<int, int, float>
+ 0/*::reduce::cols<float, float, unsigned char>*/,
+ 0/*::reduce::cols<float, float, signed char>*/,
+ 0/*::reduce::cols<float, float, unsigned short>*/,
+ 0/*::reduce::cols<float, float, short>*/,
+ 0/*::reduce::cols<float, float, int>*/,
+ ::reduce::cols<float, float, float>,
+ ::reduce::cols<float, double, double>
},
{
- 0/*reduceCols_gpu<float, unsigned char>*/,
- 0/*reduceCols_gpu<float, signed char>*/,
- 0/*reduceCols_gpu<float, unsigned short>*/,
- 0/*reduceCols_gpu<float, short>*/,
- 0/*reduceCols_gpu<float, int>*/,
- reduceCols_gpu<float, float, float>
+ 0/*::reduce::cols<double, double, unsigned char>*/,
+ 0/*::reduce::cols<double, double, signed char>*/,
+ 0/*::reduce::cols<double, double, unsigned short>*/,
+ 0/*::reduce::cols<double, double, short>*/,
+ 0/*::reduce::cols<double, double, int>*/,
+ 0/*::reduce::cols<double, double, float>*/,
+ ::reduce::cols<double, double, double>
}
};
- const caller_t func = callers[src.depth()][dst.depth()];
+ const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of input and output array formats");
- func(src, src.channels(), dst, reduceOp, StreamAccessor::getStream(stream));
+ func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}
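Finally, a usage sketch of the public wrapper, assuming the 2.4 API. The `funcs` tables are indexed by `[src.depth()][dst.depth()]`, so an unsupported source/destination pairing hits a 0 entry and raises `CV_StsUnsupportedFormat`; in the 2.4 header layout the `CV_REDUCE_*` constants come from `core_c.h`:

    #include <opencv2/core/core.hpp>
    #include <opencv2/core/core_c.h> // CV_REDUCE_SUM in the 2.4 layout
    #include <opencv2/gpu/gpu.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat h_src = cv::Mat::ones(4, 8, CV_8UC1);
        cv::gpu::GpuMat d_src(h_src), d_dst;

        // dim == 0 collapses the rows into a single row; widening the
        // destination to CV_32S keeps the column sums from overflowing
        cv::gpu::reduce(d_src, d_dst, 0, CV_REDUCE_SUM, CV_32S);

        cv::Mat h_dst;
        d_dst.download(h_dst); // 1x8 row, every element == 4
        std::cout << h_dst.at<int>(0, 3) << std::endl;
        return 0;
    }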