//
//M*/
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/limits.hpp"
+#ifndef HAVE_OPENCV_CUDEV
-#include "unroll_detail.hpp"
+#error "opencv_cudev is required"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
-namespace reduce
-{
- struct Sum
- {
- template <typename T>
- __device__ __forceinline__ T startValue() const
- {
- return VecTraits<T>::all(0);
- }
-
- template <typename T>
- __device__ __forceinline__ T operator ()(T a, T b) const
- {
- return a + b;
- }
-
- template <typename T>
- __device__ __forceinline__ T result(T r, int) const
- {
- return r;
- }
-
- __host__ __device__ __forceinline__ Sum() {}
- __host__ __device__ __forceinline__ Sum(const Sum&) {}
- };
-
- template <typename T> struct OutputType
- {
- typedef float type;
- };
- template <> struct OutputType<double>
- {
- typedef double type;
- };
-
- struct Avg
- {
- template <typename T>
- __device__ __forceinline__ T startValue() const
- {
- return VecTraits<T>::all(0);
- }
-
- template <typename T>
- __device__ __forceinline__ T operator ()(T a, T b) const
- {
- return a + b;
- }
-
- template <typename T>
- __device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
- {
- return r / sz;
- }
-
- __host__ __device__ __forceinline__ Avg() {}
- __host__ __device__ __forceinline__ Avg(const Avg&) {}
- };
-
- struct Min
- {
- template <typename T>
- __device__ __forceinline__ T startValue() const
- {
- return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
- }
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
- template <typename T>
- __device__ __forceinline__ T operator ()(T a, T b) const
- {
- minimum<T> minOp;
- return minOp(a, b);
- }
-
- template <typename T>
- __device__ __forceinline__ T result(T r, int) const
- {
- return r;
- }
+using namespace cv::cudev;
- __host__ __device__ __forceinline__ Min() {}
- __host__ __device__ __forceinline__ Min(const Min&) {}
- };
-
- struct Max
+namespace
+{
+ template <typename T, typename S, typename D>
+ void reduceToRowImpl(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
- template <typename T>
- __device__ __forceinline__ T startValue() const
- {
- return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
- }
+ const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
+ GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
- template <typename T>
- __device__ __forceinline__ T operator ()(T a, T b) const
+ switch (reduceOp)
{
- maximum<T> maxOp;
- return maxOp(a, b);
- }
+ case cv::REDUCE_SUM:
+ gridReduceToRow< Sum<S> >(src, dst, stream);
+ break;
- template <typename T>
- __device__ __forceinline__ T result(T r, int) const
- {
- return r;
- }
+ case cv::REDUCE_AVG:
+ gridReduceToRow< Avg<S> >(src, dst, stream);
+ break;
- __host__ __device__ __forceinline__ Max() {}
- __host__ __device__ __forceinline__ Max(const Max&) {}
- };
+ case cv::REDUCE_MIN:
+ gridReduceToRow< Min<S> >(src, dst, stream);
+ break;
- ///////////////////////////////////////////////////////////
+ case cv::REDUCE_MAX:
+ gridReduceToRow< Max<S> >(src, dst, stream);
+ break;
+ };
+ }
- template <typename T, typename S, typename D, class Op>
- __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
+ template <typename T, typename S, typename D>
+ void reduceToColumnImpl_(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream)
{
- __shared__ S smem[16 * 16];
+ const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
+ GpuMat_<D>& dst = (GpuMat_<D>&) _dst;
- const int x = blockIdx.x * 16 + threadIdx.x;
-
- S myVal = op.template startValue<S>();
-
- if (x < src.cols)
+ switch (reduceOp)
{
- for (int y = threadIdx.y; y < src.rows; y += 16)
- {
- S srcVal = src(y, x);
- myVal = op(myVal, srcVal);
- }
- }
-
- smem[threadIdx.x * 16 + threadIdx.y] = myVal;
-
- __syncthreads();
-
- volatile S* srow = smem + threadIdx.y * 16;
-
- myVal = srow[threadIdx.x];
- device::reduce<16>(srow, myVal, threadIdx.x, op);
-
- if (threadIdx.x == 0)
- srow[0] = myVal;
-
- __syncthreads();
-
- if (threadIdx.y == 0 && x < src.cols)
- dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
- }
+ case cv::REDUCE_SUM:
+ gridReduceToColumn< Sum<S> >(src, dst, stream);
+ break;
- template <typename T, typename S, typename D, class Op>
- void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
- {
- const dim3 block(16, 16);
- const dim3 grid(divUp(src.cols, block.x));
+ case cv::REDUCE_AVG:
+ gridReduceToColumn< Avg<S> >(src, dst, stream);
+ break;
- Op op;
- rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
- cudaSafeCall( cudaGetLastError() );
+ case cv::REDUCE_MIN:
+ gridReduceToColumn< Min<S> >(src, dst, stream);
+ break;
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
+ case cv::REDUCE_MAX:
+ gridReduceToColumn< Max<S> >(src, dst, stream);
+ break;
+ };
}
template <typename T, typename S, typename D>
- void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
+ void reduceToColumnImpl(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream)
{
- typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
- static const func_t funcs[] =
+ typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int reduceOp, Stream& stream);
+ static const func_t funcs[4] =
{
- rowsCaller<T, S, D, Sum>,
- rowsCaller<T, S, D, Avg>,
- rowsCaller<T, S, D, Max>,
- rowsCaller<T, S, D, Min>
+ reduceToColumnImpl_<T, S, D>,
+ reduceToColumnImpl_<typename MakeVec<T, 2>::type, typename MakeVec<S, 2>::type, typename MakeVec<D, 2>::type>,
+ reduceToColumnImpl_<typename MakeVec<T, 3>::type, typename MakeVec<S, 3>::type, typename MakeVec<D, 3>::type>,
+ reduceToColumnImpl_<typename MakeVec<T, 4>::type, typename MakeVec<S, 4>::type, typename MakeVec<D, 4>::type>
};
- funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
+ funcs[src.channels() - 1](src, dst, reduceOp, stream);
}
+}
- template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- ///////////////////////////////////////////////////////////
-
- template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
- __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
- {
- typedef typename TypeVec<T, cn>::vec_type src_type;
- typedef typename TypeVec<S, cn>::vec_type work_type;
- typedef typename TypeVec<D, cn>::vec_type dst_type;
-
- __shared__ S smem[BLOCK_SIZE * cn];
-
- const int y = blockIdx.x;
-
- const src_type* srcRow = src.ptr(y);
+void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
+{
+ GpuMat src = _src.getGpuMat();
- work_type myVal = op.template startValue<work_type>();
+ CV_Assert( src.channels() <= 4 );
+ CV_Assert( dim == 0 || dim == 1 );
+ CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
- for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
- myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
+ if (dtype < 0)
+ dtype = src.depth();
- device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
+ _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ GpuMat dst = _dst.getGpuMat();
- if (threadIdx.x == 0)
- dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
- }
-
- template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
+ if (dim == 0)
{
- const int BLOCK_SIZE = 256;
-
- const dim3 block(BLOCK_SIZE);
- const dim3 grid(src.rows);
+ typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
+ static const func_t funcs[7][7] =
+ {
+ {
+ reduceToRowImpl<uchar, int, uchar>,
+ 0 /*reduceToRowImpl<uchar, int, schar>*/,
+ 0 /*reduceToRowImpl<uchar, int, ushort>*/,
+ 0 /*reduceToRowImpl<uchar, int, short>*/,
+ reduceToRowImpl<uchar, int, int>,
+ reduceToRowImpl<uchar, float, float>,
+ reduceToRowImpl<uchar, double, double>
+ },
+ {
+ 0 /*reduceToRowImpl<schar, int, uchar>*/,
+ 0 /*reduceToRowImpl<schar, int, schar>*/,
+ 0 /*reduceToRowImpl<schar, int, ushort>*/,
+ 0 /*reduceToRowImpl<schar, int, short>*/,
+ 0 /*reduceToRowImpl<schar, int, int>*/,
+ 0 /*reduceToRowImpl<schar, float, float>*/,
+ 0 /*reduceToRowImpl<schar, double, double>*/
+ },
+ {
+ 0 /*reduceToRowImpl<ushort, int, uchar>*/,
+ 0 /*reduceToRowImpl<ushort, int, schar>*/,
+ reduceToRowImpl<ushort, int, ushort>,
+ 0 /*reduceToRowImpl<ushort, int, short>*/,
+ reduceToRowImpl<ushort, int, int>,
+ reduceToRowImpl<ushort, float, float>,
+ reduceToRowImpl<ushort, double, double>
+ },
+ {
+ 0 /*reduceToRowImpl<short, int, uchar>*/,
+ 0 /*reduceToRowImpl<short, int, schar>*/,
+ 0 /*reduceToRowImpl<short, int, ushort>*/,
+ reduceToRowImpl<short, int, short>,
+ reduceToRowImpl<short, int, int>,
+ reduceToRowImpl<short, float, float>,
+ reduceToRowImpl<short, double, double>
+ },
+ {
+ 0 /*reduceToRowImpl<int, int, uchar>*/,
+ 0 /*reduceToRowImpl<int, int, schar>*/,
+ 0 /*reduceToRowImpl<int, int, ushort>*/,
+ 0 /*reduceToRowImpl<int, int, short>*/,
+ reduceToRowImpl<int, int, int>,
+ reduceToRowImpl<int, float, float>,
+ reduceToRowImpl<int, double, double>
+ },
+ {
+ 0 /*reduceToRowImpl<float, float, uchar>*/,
+ 0 /*reduceToRowImpl<float, float, schar>*/,
+ 0 /*reduceToRowImpl<float, float, ushort>*/,
+ 0 /*reduceToRowImpl<float, float, short>*/,
+ 0 /*reduceToRowImpl<float, float, int>*/,
+ reduceToRowImpl<float, float, float>,
+ reduceToRowImpl<float, double, double>
+ },
+ {
+ 0 /*reduceToRowImpl<double, double, uchar>*/,
+ 0 /*reduceToRowImpl<double, double, schar>*/,
+ 0 /*reduceToRowImpl<double, double, ushort>*/,
+ 0 /*reduceToRowImpl<double, double, short>*/,
+ 0 /*reduceToRowImpl<double, double, int>*/,
+ 0 /*reduceToRowImpl<double, double, float>*/,
+ reduceToRowImpl<double, double, double>
+ }
+ };
- Op op;
- colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
- cudaSafeCall( cudaGetLastError() );
+ const func_t func = funcs[src.depth()][dst.depth()];
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
+ if (!func)
+ CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
+ GpuMat dst_cont = dst.reshape(1);
+ func(src.reshape(1), dst_cont, reduceOp, stream);
}
-
- template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
+ else
{
- typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
- static const func_t funcs[5][4] =
+ typedef void (*func_t)(const GpuMat& _src, GpuMat& _dst, int reduceOp, Stream& stream);
+ static const func_t funcs[7][7] =
{
- {0,0,0,0},
- {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
- {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
- {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
- {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
+ {
+ reduceToColumnImpl<uchar, int, uchar>,
+ 0 /*reduceToColumnImpl<uchar, int, schar>*/,
+ 0 /*reduceToColumnImpl<uchar, int, ushort>*/,
+ 0 /*reduceToColumnImpl<uchar, int, short>*/,
+ reduceToColumnImpl<uchar, int, int>,
+ reduceToColumnImpl<uchar, float, float>,
+ reduceToColumnImpl<uchar, double, double>
+ },
+ {
+ 0 /*reduceToColumnImpl<schar, int, uchar>*/,
+ 0 /*reduceToColumnImpl<schar, int, schar>*/,
+ 0 /*reduceToColumnImpl<schar, int, ushort>*/,
+ 0 /*reduceToColumnImpl<schar, int, short>*/,
+ 0 /*reduceToColumnImpl<schar, int, int>*/,
+ 0 /*reduceToColumnImpl<schar, float, float>*/,
+ 0 /*reduceToColumnImpl<schar, double, double>*/
+ },
+ {
+ 0 /*reduceToColumnImpl<ushort, int, uchar>*/,
+ 0 /*reduceToColumnImpl<ushort, int, schar>*/,
+ reduceToColumnImpl<ushort, int, ushort>,
+ 0 /*reduceToColumnImpl<ushort, int, short>*/,
+ reduceToColumnImpl<ushort, int, int>,
+ reduceToColumnImpl<ushort, float, float>,
+ reduceToColumnImpl<ushort, double, double>
+ },
+ {
+ 0 /*reduceToColumnImpl<short, int, uchar>*/,
+ 0 /*reduceToColumnImpl<short, int, schar>*/,
+ 0 /*reduceToColumnImpl<short, int, ushort>*/,
+ reduceToColumnImpl<short, int, short>,
+ reduceToColumnImpl<short, int, int>,
+ reduceToColumnImpl<short, float, float>,
+ reduceToColumnImpl<short, double, double>
+ },
+ {
+ 0 /*reduceToColumnImpl<int, int, uchar>*/,
+ 0 /*reduceToColumnImpl<int, int, schar>*/,
+ 0 /*reduceToColumnImpl<int, int, ushort>*/,
+ 0 /*reduceToColumnImpl<int, int, short>*/,
+ reduceToColumnImpl<int, int, int>,
+ reduceToColumnImpl<int, float, float>,
+ reduceToColumnImpl<int, double, double>
+ },
+ {
+ 0 /*reduceToColumnImpl<float, float, uchar>*/,
+ 0 /*reduceToColumnImpl<float, float, schar>*/,
+ 0 /*reduceToColumnImpl<float, float, ushort>*/,
+ 0 /*reduceToColumnImpl<float, float, short>*/,
+ 0 /*reduceToColumnImpl<float, float, int>*/,
+ reduceToColumnImpl<float, float, float>,
+ reduceToColumnImpl<float, double, double>
+ },
+ {
+ 0 /*reduceToColumnImpl<double, double, uchar>*/,
+ 0 /*reduceToColumnImpl<double, double, schar>*/,
+ 0 /*reduceToColumnImpl<double, double, ushort>*/,
+ 0 /*reduceToColumnImpl<double, double, short>*/,
+ 0 /*reduceToColumnImpl<double, double, int>*/,
+ 0 /*reduceToColumnImpl<double, double, float>*/,
+ reduceToColumnImpl<double, double, double>
+ }
};
- funcs[cn][op](src, dst, stream);
- }
+ const func_t func = funcs[src.depth()][dst.depth()];
- template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ if (!func)
+ CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
- template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-
- template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-
- template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-
- template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-
- template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
+ func(src, dst, reduceOp, stream);
+ }
}
-#endif /* CUDA_DISABLER */
+#endif
return retVal;
}
-//////////////////////////////////////////////////////////////////////////////
-// reduce
-
-namespace reduce
-{
- template <typename T, typename S, typename D>
- void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
- template <typename T, typename S, typename D>
- void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-}
-
-void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
-{
- GpuMat src = _src.getGpuMat();
-
- CV_Assert( src.channels() <= 4 );
- CV_Assert( dim == 0 || dim == 1 );
- CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
-
- if (dtype < 0)
- dtype = src.depth();
-
- _dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
- GpuMat dst = _dst.getGpuMat();
-
- if (dim == 0)
- {
- typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
- static const func_t funcs[7][7] =
- {
- {
- ::reduce::rows<unsigned char, int, unsigned char>,
- 0/*::reduce::rows<unsigned char, int, signed char>*/,
- 0/*::reduce::rows<unsigned char, int, unsigned short>*/,
- 0/*::reduce::rows<unsigned char, int, short>*/,
- ::reduce::rows<unsigned char, int, int>,
- ::reduce::rows<unsigned char, float, float>,
- ::reduce::rows<unsigned char, double, double>
- },
- {
- 0/*::reduce::rows<signed char, int, unsigned char>*/,
- 0/*::reduce::rows<signed char, int, signed char>*/,
- 0/*::reduce::rows<signed char, int, unsigned short>*/,
- 0/*::reduce::rows<signed char, int, short>*/,
- 0/*::reduce::rows<signed char, int, int>*/,
- 0/*::reduce::rows<signed char, float, float>*/,
- 0/*::reduce::rows<signed char, double, double>*/
- },
- {
- 0/*::reduce::rows<unsigned short, int, unsigned char>*/,
- 0/*::reduce::rows<unsigned short, int, signed char>*/,
- ::reduce::rows<unsigned short, int, unsigned short>,
- 0/*::reduce::rows<unsigned short, int, short>*/,
- ::reduce::rows<unsigned short, int, int>,
- ::reduce::rows<unsigned short, float, float>,
- ::reduce::rows<unsigned short, double, double>
- },
- {
- 0/*::reduce::rows<short, int, unsigned char>*/,
- 0/*::reduce::rows<short, int, signed char>*/,
- 0/*::reduce::rows<short, int, unsigned short>*/,
- ::reduce::rows<short, int, short>,
- ::reduce::rows<short, int, int>,
- ::reduce::rows<short, float, float>,
- ::reduce::rows<short, double, double>
- },
- {
- 0/*::reduce::rows<int, int, unsigned char>*/,
- 0/*::reduce::rows<int, int, signed char>*/,
- 0/*::reduce::rows<int, int, unsigned short>*/,
- 0/*::reduce::rows<int, int, short>*/,
- ::reduce::rows<int, int, int>,
- ::reduce::rows<int, float, float>,
- ::reduce::rows<int, double, double>
- },
- {
- 0/*::reduce::rows<float, float, unsigned char>*/,
- 0/*::reduce::rows<float, float, signed char>*/,
- 0/*::reduce::rows<float, float, unsigned short>*/,
- 0/*::reduce::rows<float, float, short>*/,
- 0/*::reduce::rows<float, float, int>*/,
- ::reduce::rows<float, float, float>,
- ::reduce::rows<float, double, double>
- },
- {
- 0/*::reduce::rows<double, double, unsigned char>*/,
- 0/*::reduce::rows<double, double, signed char>*/,
- 0/*::reduce::rows<double, double, unsigned short>*/,
- 0/*::reduce::rows<double, double, short>*/,
- 0/*::reduce::rows<double, double, int>*/,
- 0/*::reduce::rows<double, double, float>*/,
- ::reduce::rows<double, double, double>
- }
- };
-
- const func_t func = funcs[src.depth()][dst.depth()];
-
- if (!func)
- CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
-
- func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
- }
- else
- {
- typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
- static const func_t funcs[7][7] =
- {
- {
- ::reduce::cols<unsigned char, int, unsigned char>,
- 0/*::reduce::cols<unsigned char, int, signed char>*/,
- 0/*::reduce::cols<unsigned char, int, unsigned short>*/,
- 0/*::reduce::cols<unsigned char, int, short>*/,
- ::reduce::cols<unsigned char, int, int>,
- ::reduce::cols<unsigned char, float, float>,
- ::reduce::cols<unsigned char, double, double>
- },
- {
- 0/*::reduce::cols<signed char, int, unsigned char>*/,
- 0/*::reduce::cols<signed char, int, signed char>*/,
- 0/*::reduce::cols<signed char, int, unsigned short>*/,
- 0/*::reduce::cols<signed char, int, short>*/,
- 0/*::reduce::cols<signed char, int, int>*/,
- 0/*::reduce::cols<signed char, float, float>*/,
- 0/*::reduce::cols<signed char, double, double>*/
- },
- {
- 0/*::reduce::cols<unsigned short, int, unsigned char>*/,
- 0/*::reduce::cols<unsigned short, int, signed char>*/,
- ::reduce::cols<unsigned short, int, unsigned short>,
- 0/*::reduce::cols<unsigned short, int, short>*/,
- ::reduce::cols<unsigned short, int, int>,
- ::reduce::cols<unsigned short, float, float>,
- ::reduce::cols<unsigned short, double, double>
- },
- {
- 0/*::reduce::cols<short, int, unsigned char>*/,
- 0/*::reduce::cols<short, int, signed char>*/,
- 0/*::reduce::cols<short, int, unsigned short>*/,
- ::reduce::cols<short, int, short>,
- ::reduce::cols<short, int, int>,
- ::reduce::cols<short, float, float>,
- ::reduce::cols<short, double, double>
- },
- {
- 0/*::reduce::cols<int, int, unsigned char>*/,
- 0/*::reduce::cols<int, int, signed char>*/,
- 0/*::reduce::cols<int, int, unsigned short>*/,
- 0/*::reduce::cols<int, int, short>*/,
- ::reduce::cols<int, int, int>,
- ::reduce::cols<int, float, float>,
- ::reduce::cols<int, double, double>
- },
- {
- 0/*::reduce::cols<float, float, unsigned char>*/,
- 0/*::reduce::cols<float, float, signed char>*/,
- 0/*::reduce::cols<float, float, unsigned short>*/,
- 0/*::reduce::cols<float, float, short>*/,
- 0/*::reduce::cols<float, float, int>*/,
- ::reduce::cols<float, float, float>,
- ::reduce::cols<float, double, double>
- },
- {
- 0/*::reduce::cols<double, double, unsigned char>*/,
- 0/*::reduce::cols<double, double, signed char>*/,
- 0/*::reduce::cols<double, double, unsigned short>*/,
- 0/*::reduce::cols<double, double, short>*/,
- 0/*::reduce::cols<double, double, int>*/,
- 0/*::reduce::cols<double, double, float>*/,
- ::reduce::cols<double, double, double>
- }
- };
-
- const func_t func = funcs[src.depth()][dst.depth()];
-
- if (!func)
- CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
-
- func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
- }
-}
-
////////////////////////////////////////////////////////////////////////
// meanStdDev