//
//M*/
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
-namespace cv { namespace cuda { namespace device
-{
- template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
- {
- };
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudev.hpp"
- template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
- {
- };
+using namespace cv::cudev;
- template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
- {
- };
+void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
- template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
- {
- };
-}}}
+//////////////////////////////////////////////////////////////////////////////
+/// bitwise_not
-namespace arithm
+namespace
{
- template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ template <typename T>
+ void bitMatNot(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
+ GlobPtrSz<T> vsrc = globPtr((T*) src.data, src.step, src.rows, src.cols * src.channels());
+ GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src.rows, src.cols * src.channels());
+
if (mask.data)
- device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
+ gridTransformUnary(vsrc, vdst, bit_not<T>(), singleMaskChannels(globPtr<uchar>(mask), src.channels()), stream);
else
- device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
+ gridTransformUnary(vsrc, vdst, bit_not<T>(), stream);
}
+}
- template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& stream)
+{
+ GpuMat src = _src.getGpuMat();
+ GpuMat mask = _mask.getGpuMat();
+
+ const int depth = src.depth();
+
+ CV_DbgAssert( depth <= CV_32F );
+ CV_DbgAssert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
+
+ _dst.create(src.size(), src.type());
+ GpuMat dst = _dst.getGpuMat();
+
+ if (depth == CV_32F || depth == CV_32S)
{
- if (mask.data)
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
- else
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
+ bitMatNot<uint>(src, dst, mask, stream);
}
-
- template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ else if (depth == CV_16S || depth == CV_16U)
{
- if (mask.data)
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
- else
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
+ bitMatNot<ushort>(src, dst, mask, stream);
}
+ else
+ {
+ bitMatNot<uchar>(src, dst, mask, stream);
+ }
+}
- template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+//////////////////////////////////////////////////////////////////////////////
+/// Binary bitwise logical operations
+
+namespace
+{
+ template <template <typename> class Op, typename T>
+ void bitMatOp(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
+ GlobPtrSz<T> vsrc1 = globPtr((T*) src1.data, src1.step, src1.rows, src1.cols * src1.channels());
+ GlobPtrSz<T> vsrc2 = globPtr((T*) src2.data, src2.step, src1.rows, src1.cols * src1.channels());
+ GlobPtrSz<T> vdst = globPtr((T*) dst.data, dst.step, src1.rows, src1.cols * src1.channels());
+
if (mask.data)
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
+ gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), singleMaskChannels(globPtr<uchar>(mask), src1.channels()), stream);
else
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
+ gridTransformBinary(vsrc1, vsrc2, vdst, Op<T>(), stream);
}
+}
- template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
+{
+ typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream);
+ static const func_t funcs32[] =
+ {
+ bitMatOp<bit_and, uint>,
+ bitMatOp<bit_or, uint>,
+ bitMatOp<bit_xor, uint>
+ };
+ static const func_t funcs16[] =
+ {
+ bitMatOp<bit_and, ushort>,
+ bitMatOp<bit_or, ushort>,
+ bitMatOp<bit_xor, ushort>
+ };
+ static const func_t funcs8[] =
+ {
+ bitMatOp<bit_and, uchar>,
+ bitMatOp<bit_or, uchar>,
+ bitMatOp<bit_xor, uchar>
+ };
- template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ const int depth = src1.depth();
- template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ CV_DbgAssert( depth <= CV_32F );
+ CV_DbgAssert( op >= 0 && op < 3 );
- template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ if (depth == CV_32F || depth == CV_32S)
+ {
+ funcs32[op](src1, src2, dst, mask, stream);
+ }
+ else if (depth == CV_16S || depth == CV_16U)
+ {
+ funcs16[op](src1, src2, dst, mask, stream);
+ }
+ else
+ {
+ funcs8[op](src1, src2, dst, mask, stream);
+ }
}
-#endif // CUDA_DISABLER
+#endif
//
//M*/
-#if !defined CUDA_DISABLER
+#include "opencv2/opencv_modules.hpp"
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
+#ifndef HAVE_OPENCV_CUDEV
-#include "arithm_func_traits.hpp"
+#error "opencv_cudev is required"
-using namespace cv::cuda;
-using namespace cv::cuda::device;
+#else
-namespace cv { namespace cuda { namespace device
+#include "opencv2/cudev.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace cv::cudev;
+
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
+
+namespace
{
- template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+ template <template <typename> class Op, typename T>
+ void bitScalarOp(const GpuMat& src, uint value, GpuMat& dst, Stream& stream)
{
- };
+ gridTransformUnary(globPtr<T>(src), globPtr<T>(dst), bind2nd(Op<T>(), value), stream);
+ }
+
+ typedef void (*bit_scalar_func_t)(const GpuMat& src, uint value, GpuMat& dst, Stream& stream);
- template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+ template <typename T, bit_scalar_func_t func> struct BitScalar
{
+ static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+ {
+ func(src, cv::saturate_cast<T>(value[0]), dst, stream);
+ }
};
- template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+ template <bit_scalar_func_t func> struct BitScalar4
{
+ static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream)
+ {
+ uint packedVal = 0;
+
+ packedVal |= cv::saturate_cast<uchar>(value[0]);
+ packedVal |= cv::saturate_cast<uchar>(value[1]) << 8;
+ packedVal |= cv::saturate_cast<uchar>(value[2]) << 16;
+ packedVal |= cv::saturate_cast<uchar>(value[3]) << 24;
+
+ func(src, packedVal, dst, stream);
+ }
};
-}}}
-namespace arithm
-{
- template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+ template <int DEPTH, int cn> struct NppBitwiseCFunc
{
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
- }
+ typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
+
+ typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const npp_type* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
+ };
- template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+ template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
{
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
- }
+ typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_type npp_type;
+
+ static void call(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& _stream)
+ {
+ cudaStream_t stream = StreamAccessor::getStream(_stream);
+ NppStreamHandler h(stream);
+
+ NppiSize oSizeROI;
+ oSizeROI.width = src.cols;
+ oSizeROI.height = src.rows;
+
+ const npp_type pConstants[] =
+ {
+ cv::saturate_cast<npp_type>(value[0]),
+ cv::saturate_cast<npp_type>(value[1]),
+ cv::saturate_cast<npp_type>(value[2]),
+ cv::saturate_cast<npp_type>(value[3])
+ };
+
+ nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
+
+ if (stream == 0)
+ CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+ }
+ };
+}
+
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
+{
+ (void) mask;
- template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+ typedef void (*func_t)(const GpuMat& src, cv::Scalar value, GpuMat& dst, Stream& stream);
+ static const func_t funcs[3][6][4] =
{
- device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
- }
+ {
+ {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+ {BitScalar<uchar, bitScalarOp<bit_and, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_and, uint> >::call},
+ {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+ {BitScalar<ushort, bitScalarOp<bit_and, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_and, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+ },
+ {
+ {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+ {BitScalar<uchar, bitScalarOp<bit_or, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_or, uint> >::call},
+ {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+ {BitScalar<ushort, bitScalarOp<bit_or, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_or, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+ },
+ {
+ {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+ {BitScalar<uchar, bitScalarOp<bit_xor, uchar> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarOp<bit_xor, uint> >::call},
+ {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+ {BitScalar<ushort, bitScalarOp<bit_xor, ushort> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call},
+ {BitScalar<uint, bitScalarOp<bit_xor, uint> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+ }
+ };
- template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ const int depth = src.depth();
+ const int cn = src.channels();
- template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ CV_DbgAssert( depth <= CV_32F );
+ CV_DbgAssert( cn == 1 || cn == 3 || cn == 4 );
+ CV_DbgAssert( mask.empty() );
+ CV_DbgAssert( op >= 0 && op < 3 );
- template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ funcs[op][depth][cn - 1](src, value, dst, stream);
}
-#endif // CUDA_DISABLER
+#endif
}
}
-
-////////////////////////////////////////////////////////////////////////
-// Basic arithmetical operations (add subtract multiply divide)
-
-namespace
-{
- template<int DEPTH> struct NppTypeTraits;
- template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
- template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
- template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
- template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
- template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
- template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
- template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
-
- template<int DEPTH, int cn> struct NppArithmScalarFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants,
- npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template<int DEPTH> struct NppArithmScalarFunc<DEPTH, 1>
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants,
- npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template<int DEPTH> struct NppArithmScalarFunc<DEPTH, 2>
- {
- typedef typename NppTypeTraits<DEPTH>::npp_complex_type npp_complex_type;
-
- typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants,
- npp_complex_type* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template<int cn> struct NppArithmScalarFunc<CV_32F, cn>
- {
- typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
- };
- template<> struct NppArithmScalarFunc<CV_32F, 1>
- {
- typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
- };
- template<> struct NppArithmScalarFunc<CV_32F, 2>
- {
- typedef NppStatus (*func_ptr)(const Npp32fc* pSrc1, int nSrc1Step, const Npp32fc pConstants, Npp32fc* pDst, int nDstStep, NppiSize oSizeROI);
- };
-
- template<int DEPTH, int cn, typename NppArithmScalarFunc<DEPTH, cn>::func_ptr func> struct NppArithmScalar
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- const npp_t pConstants[] = { saturate_cast<npp_t>(sc.val[0]), saturate_cast<npp_t>(sc.val[1]), saturate_cast<npp_t>(sc.val[2]), saturate_cast<npp_t>(sc.val[3]) };
-
- nppSafeCall( func((const npp_t*)src.data, static_cast<int>(src.step), pConstants, (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template<int DEPTH, typename NppArithmScalarFunc<DEPTH, 1>::func_ptr func> struct NppArithmScalar<DEPTH, 1, func>
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- nppSafeCall( func((const npp_t*)src.data, static_cast<int>(src.step), saturate_cast<npp_t>(sc.val[0]), (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template<int DEPTH, typename NppArithmScalarFunc<DEPTH, 2>::func_ptr func> struct NppArithmScalar<DEPTH, 2, func>
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
- typedef typename NppTypeTraits<DEPTH>::npp_complex_type npp_complex_type;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- npp_complex_type nConstant;
- nConstant.re = saturate_cast<npp_t>(sc.val[0]);
- nConstant.im = saturate_cast<npp_t>(sc.val[1]);
-
- nppSafeCall( func((const npp_complex_type*)src.data, static_cast<int>(src.step), nConstant,
- (npp_complex_type*)dst.data, static_cast<int>(dst.step), sz, 0) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template<int cn, typename NppArithmScalarFunc<CV_32F, cn>::func_ptr func> struct NppArithmScalar<CV_32F, cn, func>
- {
- typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- const Npp32f pConstants[] = { saturate_cast<Npp32f>(sc.val[0]), saturate_cast<Npp32f>(sc.val[1]), saturate_cast<Npp32f>(sc.val[2]), saturate_cast<Npp32f>(sc.val[3]) };
-
- nppSafeCall( func((const npp_t*)src.data, static_cast<int>(src.step), pConstants, (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template<typename NppArithmScalarFunc<CV_32F, 1>::func_ptr func> struct NppArithmScalar<CV_32F, 1, func>
- {
- typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- nppSafeCall( func((const npp_t*)src.data, static_cast<int>(src.step), saturate_cast<Npp32f>(sc.val[0]), (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template<typename NppArithmScalarFunc<CV_32F, 2>::func_ptr func> struct NppArithmScalar<CV_32F, 2, func>
- {
- typedef typename NppTypeTraits<CV_32F>::npp_t npp_t;
- typedef typename NppTypeTraits<CV_32F>::npp_complex_type npp_complex_type;
-
- static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src.cols;
- sz.height = src.rows;
-
- Npp32fc nConstant;
- nConstant.re = saturate_cast<Npp32f>(sc.val[0]);
- nConstant.im = saturate_cast<Npp32f>(sc.val[1]);
-
- nppSafeCall( func((const npp_complex_type*)src.data, static_cast<int>(src.step), nConstant, (npp_complex_type*)dst.data, static_cast<int>(dst.step), sz) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
-}
-
////////////////////////////////////////////////////////////////////////
// add
}
//////////////////////////////////////////////////////////////////////////////
-// bitwise_not
-
-namespace arithm
-{
- template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
-
-void cv::cuda::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& _stream)
-{
- using namespace arithm;
-
- GpuMat src = _src.getGpuMat();
- GpuMat mask = _mask.getGpuMat();
-
- const int depth = src.depth();
-
- CV_Assert( depth <= CV_64F );
- CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
- _dst.create(src.size(), src.type());
- GpuMat dst = _dst.getGpuMat();
-
- cudaStream_t stream = StreamAccessor::getStream(_stream);
-
- const int bcols = (int) (src.cols * src.elemSize());
-
- if ((bcols & 3) == 0)
- {
- const int vcols = bcols >> 2;
-
- bitMatNot<unsigned int>(
- PtrStepSzb(src.rows, vcols, src.data, src.step),
- PtrStepSzb(src.rows, vcols, dst.data, dst.step),
- mask, stream);
- }
- else if ((bcols & 1) == 0)
- {
- const int vcols = bcols >> 1;
-
- bitMatNot<unsigned short>(
- PtrStepSzb(src.rows, vcols, src.data, src.step),
- PtrStepSzb(src.rows, vcols, dst.data, dst.step),
- mask, stream);
- }
- else
- {
- bitMatNot<unsigned char>(
- PtrStepSzb(src.rows, bcols, src.data, src.step),
- PtrStepSzb(src.rows, bcols, dst.data, dst.step),
- mask, stream);
- }
-}
-
-//////////////////////////////////////////////////////////////////////////////
// Binary bitwise logical operations
namespace
};
}
-namespace arithm
-{
- template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
-}
+void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
-static void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int op)
-{
- using namespace arithm;
-
- typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- static const func_t funcs32[] =
- {
- bitMatAnd<uint>,
- bitMatOr<uint>,
- bitMatXor<uint>
- };
- static const func_t funcs16[] =
- {
- bitMatAnd<ushort>,
- bitMatOr<ushort>,
- bitMatXor<ushort>
- };
- static const func_t funcs8[] =
- {
- bitMatAnd<uchar>,
- bitMatOr<uchar>,
- bitMatXor<uchar>
- };
-
- cudaStream_t stream = StreamAccessor::getStream(_stream);
-
- const int bcols = (int) (src1.cols * src1.elemSize());
-
- if ((bcols & 3) == 0)
- {
- const int vcols = bcols >> 2;
-
- funcs32[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
- PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
- PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
- mask, stream);
- }
- else if ((bcols & 1) == 0)
- {
- const int vcols = bcols >> 1;
-
- funcs16[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
- PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
- PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
- mask, stream);
- }
- else
- {
-
- funcs8[op](PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
- PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
- PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
- mask, stream);
- }
-}
-
-namespace arithm
-{
- template <typename T> void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
-}
-
-namespace
-{
- typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
-
- template <typename T, bit_scalar_func_t func> struct BitScalar
- {
- static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
- {
- func(src, saturate_cast<T>(sc.val[0]), dst, stream);
- }
- };
-
- template <bit_scalar_func_t func> struct BitScalar4
- {
- static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
- {
- unsigned int packedVal = 0;
-
- packedVal |= (saturate_cast<unsigned char>(sc.val[0]) & 0xffff);
- packedVal |= (saturate_cast<unsigned char>(sc.val[1]) & 0xffff) << 8;
- packedVal |= (saturate_cast<unsigned char>(sc.val[2]) & 0xffff) << 16;
- packedVal |= (saturate_cast<unsigned char>(sc.val[3]) & 0xffff) << 24;
-
- func(src, packedVal, dst, stream);
- }
- };
-
- template <int DEPTH, int cn> struct NppBitwiseCFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
- };
- template <int DEPTH> struct NppBitwiseCFunc<DEPTH, 1>
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstant, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
- };
-
- template <int DEPTH, int cn, typename NppBitwiseCFunc<DEPTH, cn>::func_t func> struct NppBitwiseC
- {
- typedef typename NppBitwiseCFunc<DEPTH, cn>::npp_t npp_t;
-
- static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols;
- oSizeROI.height = src.rows;
-
- const npp_t pConstants[] = {saturate_cast<npp_t>(sc.val[0]), saturate_cast<npp_t>(sc.val[1]), saturate_cast<npp_t>(sc.val[2]), saturate_cast<npp_t>(sc.val[3])};
-
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), pConstants, dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template <int DEPTH, typename NppBitwiseCFunc<DEPTH, 1>::func_t func> struct NppBitwiseC<DEPTH, 1, func>
- {
- typedef typename NppBitwiseCFunc<DEPTH, 1>::npp_t npp_t;
-
- static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols;
- oSizeROI.height = src.rows;
-
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), saturate_cast<npp_t>(sc.val[0]), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
-}
-
-static void bitScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op)
-{
- using namespace arithm;
-
- typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
- static const func_t funcs[3][5][4] =
- {
- {
- {BitScalar<unsigned char, bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
- {0,0,0,0},
- {BitScalar<unsigned short, bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
- {0,0,0,0},
- {BitScalar<int, bitScalarAnd<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
- },
- {
- {BitScalar<unsigned char, bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
- {0,0,0,0},
- {BitScalar<unsigned short, bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
- {0,0,0,0},
- {BitScalar<int, bitScalarOr<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
- },
- {
- {BitScalar<unsigned char, bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
- {0,0,0,0},
- {BitScalar<unsigned short, bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
- {0,0,0,0},
- {BitScalar<int, bitScalarXor<int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
- }
- };
-
- const int depth = src.depth();
- const int cn = src.channels();
-
- CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S );
- CV_Assert( cn == 1 || cn == 3 || cn == 4 );
- CV_Assert( mask.empty() );
-
- funcs[op][depth][cn - 1](src, val, dst, StreamAccessor::getStream(stream));
-}
+void bitScalar(const GpuMat& src, cv::Scalar value, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op);
void cv::cuda::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream)
{
{
template <int DEPTH, int cn> struct NppShiftFunc
{
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
+ typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};
template <int DEPTH> struct NppShiftFunc<DEPTH, 1>
{
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
+ typedef NppStatus (*func_t)(const npp_type* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_type* pDst, int nDstStep, NppiSize oSizeROI);
};
template <int DEPTH, int cn, typename NppShiftFunc<DEPTH, cn>::func_t func> struct NppShift
{
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
static void call(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream)
{
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), sc.val, dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
+ nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val, dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
};
template <int DEPTH, typename NppShiftFunc<DEPTH, 1>::func_t func> struct NppShift<DEPTH, 1, func>
{
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ typedef typename NPPTypeTraits<DEPTH>::npp_type npp_type;
static void call(const GpuMat& src, Scalar_<Npp32u> sc, GpuMat& dst, cudaStream_t stream)
{
oSizeROI.width = src.cols;
oSizeROI.height = src.rows;
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), sc.val[0], dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
+ nppSafeCall( func(src.ptr<npp_type>(), static_cast<int>(src.step), sc.val[0], dst.ptr<npp_type>(), static_cast<int>(dst.step), oSizeROI) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
+template <class MaskPtr> struct SingleMaskChannels
+{
+ typedef typename PtrTraits<MaskPtr>::value_type value_type;
+ typedef typename PtrTraits<MaskPtr>::index_type index_type;
+
+ MaskPtr mask;
+ int channels;
+
+ __device__ __forceinline__ value_type operator()(index_type y, index_type x) const
+ {
+ return mask(y, x / channels);
+ }
+
+};
+
+template <class MaskPtr> struct SingleMaskChannelsSz : SingleMaskChannels<MaskPtr>
+{
+ int rows, cols;
+};
+
+template <class MaskPtr>
+__host__ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type>
+singleMaskChannels(const MaskPtr& mask, int channels)
+{
+ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type> ptr;
+ ptr.mask = shrinkPtr(mask);
+ ptr.channels = channels;
+ ptr.rows = getRows(mask);
+ ptr.cols = getCols(mask) * channels;
+ return ptr;
+}
+
+template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : PtrTraitsBase<SingleMaskChannelsSz<MaskPtr>, SingleMaskChannels<MaskPtr> >
+{
+};
+
}}
#endif