#if !defined CUDA_DISABLER
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+//////////////////////////////////////////////////////////////////////////
+// addMat
+
+namespace
{
- //////////////////////////////////////////////////////////////////////////
- // add
+ // Primary template: declaration only. T is the operand type and D the result type;
+ // only the explicit specializations that follow are defined.
+ template <typename T, typename D> struct VAdd4;
+ // Saturating add of two 32-bit words treated as packed byte lanes, done with PTX video
+ // instructions. Type triple .u32.u32.u32 = (result, a, b): unsigned operands, unsigned result.
+ template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+ {
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ uint res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per byte lane (.b0 .. .b3). res is both the output and the
+ // fourth (merge) operand, so every statement carries forward the lanes written before it.
+ asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd4 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd4() {}
+ __host__ __device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed byte lanes, done with PTX video
+ // instructions. Type triple .u32.s32.s32 = (result, a, b): signed operands, unsigned result.
+ template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
+ {
+ __device__ __forceinline__ uint operator ()(int a, int b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ uint res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per byte lane (.b0 .. .b3). res is both the output and the
+ // fourth (merge) operand, so every statement carries forward the lanes written before it.
+ asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd4 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd4() {}
+ __host__ __device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed byte lanes, done with PTX video
+ // instructions. Type triple .s32.u32.u32 = (result, a, b): unsigned operands, signed result.
+ template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
+ {
+ __device__ __forceinline__ int operator ()(uint a, uint b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ int res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per byte lane (.b0 .. .b3). res is both the output and the
+ // fourth (merge) operand, so every statement carries forward the lanes written before it.
+ asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd4 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd4() {}
+ __host__ __device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed byte lanes, done with PTX video
+ // instructions. Type triple .s32.s32.s32 = (result, a, b): signed operands, signed result.
+ template <> struct VAdd4<int, int> : binary_function<int, int, int>
+ {
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ int res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per byte lane (.b0 .. .b3). res is both the output and the
+ // fourth (merge) operand, so every statement carries forward the lanes written before it.
+ asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd4 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd4() {}
+ __host__ __device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
+ };
+
+ ////////////////////////////////////
+
+ // Primary template: declaration only. T is the operand type and D the result type;
+ // only the explicit specializations that follow are defined.
+ template <typename T, typename D> struct VAdd2;
+ // Saturating add of two 32-bit words treated as packed halfword lanes, done with PTX video
+ // instructions. Type triple .u32.u32.u32 = (result, a, b): unsigned operands, unsigned result.
+ template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+ {
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ uint res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per halfword lane (.h0, .h1). res is both the output and the
+ // fourth (merge) operand, so the second statement carries forward the lane written by the first.
+ asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd2 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd2() {}
+ __host__ __device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed halfword lanes, done with PTX video
+ // instructions. Type triple .s32.u32.u32 = (result, a, b): unsigned operands, signed result.
+ template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
+ {
+ __device__ __forceinline__ int operator ()(uint a, uint b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ int res = 0;
- template <typename T, typename D> struct Add : binary_function<T, T, D>
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per halfword lane (.h0, .h1). res is both the output and the
+ // fourth (merge) operand, so the second statement carries forward the lane written by the first.
+ asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd2 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd2() {}
+ __host__ __device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed halfword lanes, done with PTX video
+ // instructions. Type triple .u32.s32.s32 = (result, a, b): signed operands, unsigned result.
+ template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
+ {
+ __device__ __forceinline__ uint operator ()(int a, int b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ uint res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per halfword lane (.h0, .h1). res is both the output and the
+ // fourth (merge) operand, so the second statement carries forward the lane written by the first.
+ asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd2 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd2() {}
+ __host__ __device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
+ };
+ // Saturating add of two 32-bit words treated as packed halfword lanes, done with PTX video
+ // instructions. Type triple .s32.s32.s32 = (result, a, b): signed operands, signed result.
+ template <> struct VAdd2<int, int> : binary_function<int, int, int>
+ {
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ // Stays 0 when neither branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+ int res = 0;
+
+ #if __CUDA_ARCH__ >= 300
+ // A single SIMD video instruction handles the whole word.
+ asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #elif __CUDA_ARCH__ >= 200
+ // One scalar video add per halfword lane (.h0, .h1). res is both the output and the
+ // fourth (merge) operand, so the second statement carries forward the lane written by the first.
+ asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::vadd2 default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ VAdd2() {}
+ __host__ __device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
+ };
+
+ ////////////////////////////////////
+
+ // Element-wise addition functor: evaluates a + b on two operands of type T and converts
+ // the sum to the destination type D through saturate_cast.
+ template <typename T, typename D> struct AddMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a + b);
}
+
+ // The functor is stateless, so both constructors are empty. They are __host__ __device__
+ // because arithm::addMat default-constructs the functor in host code before passing it to transform().
+ __host__ __device__ __forceinline__ AddMat() {}
+ __host__ __device__ __forceinline__ AddMat(const AddMat& other) {}
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+ template <typename T, typename D> struct TransformFunctorTraits< VAdd4<T, D> > : DefaultTransformFunctorTraits< VAdd4<T, D> > // transform() traits shared by every VAdd4<T, D> specialization
+ {
+ enum { smart_shift = 2 }; // declares smart_shift locally; presumably hides a default inherited from the base - confirm in transform.hpp
+ };
+
+ ////////////////////////////////////
+
+ // transform() traits shared by every VAdd2<T, D> specialization. The base is instantiated
+ // with VAdd2 itself (it previously named VAdd4, a copy-paste slip), so any defaults the base
+ // derives from its functor argument come from the functor these traits describe.
+ template <typename T, typename D> struct TransformFunctorTraits< VAdd2<T, D> > : DefaultTransformFunctorTraits< VAdd2<T, D> >
+ {
+ enum { smart_shift = 2 };
};
- template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >
+ ////////////////////////////////////
+
+ template <> struct TransformFunctorTraits< AddMat<ushort, ushort> > : DefaultTransformFunctorTraits< AddMat<ushort, ushort> > // transform() traits for AddMat with ushort source and destination
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >
+ template <> struct TransformFunctorTraits< AddMat<short, short> > : DefaultTransformFunctorTraits< AddMat<short, short> > // same two overrides for short -> short
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >
+ template <> struct TransformFunctorTraits< AddMat<int, int> > : DefaultTransformFunctorTraits< AddMat<int, int> > // same two overrides for int -> int
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >
+ template <> struct TransformFunctorTraits< AddMat<float, float> > : DefaultTransformFunctorTraits< AddMat<float, float> > // same two overrides for float -> float
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
+
+namespace arithm
+{
+ // Host-side launcher: converts the byte-typed views to PtrStepSz<T> / PtrStepSz<D> and
+ // applies the VAdd4<T, D> functor through transform() with WithOutMask on the given stream.
+ template <typename T, typename D>
+ void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ {
+ const PtrStepSz<T> lhs = static_cast< PtrStepSz<T> >(src1);
+ const PtrStepSz<T> rhs = static_cast< PtrStepSz<T> >(src2);
+ const PtrStepSz<D> out = static_cast< PtrStepSz<D> >(dst);
+
+ transform(lhs, rhs, out, VAdd4<T, D>(), WithOutMask(), stream);
+ }
+
+ // Explicit instantiations, one per defined VAdd4 specialization.
+ template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+ // Host-side launcher: converts the byte-typed views to PtrStepSz<T> / PtrStepSz<D> and
+ // applies the VAdd2<T, D> functor through transform() with WithOutMask on the given stream.
+ template <typename T, typename D>
+ void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ {
+ const PtrStepSz<T> lhs = static_cast< PtrStepSz<T> >(src1);
+ const PtrStepSz<T> rhs = static_cast< PtrStepSz<T> >(src2);
+ const PtrStepSz<D> out = static_cast< PtrStepSz<D> >(dst);
+
+ transform(lhs, rhs, out, VAdd2<T, D>(), WithOutMask(), stream);
+ }
+
+ // Explicit instantiations, one per defined VAdd2 specialization.
+ template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template <typename T, typename D> // T: element type of both sources, D: element type of dst
+ void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // host launcher: applies AddMat<T, D> through transform(), masked only when mask.data is non-null
{
if (mask.data)
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), SingleMask(mask), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream); // the PtrStepb is passed as the mask argument directly, no SingleMask wrapper; NOTE(review): presumably transform() accepts it as a mask predicate - confirm
else
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream); // no mask data supplied: use the WithOutMask tag
}
- template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- template <typename T, typename D> struct AddScalar : unary_function<T, D>
- {
- AddScalar(double val_) : val(val_) {}
+ template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // explicit instantiations of addMat<T, D>, grouped by source type T; commented-out pairs are not instantiated
+ template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// addScalar
+
+namespace
+{
+ template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D> // unary functor adding a stored scalar: T = input element, S = scalar storage type, D = output element
+ {
+ S val; // the addend, kept in type S (the removed version stored a const double)
+
+ explicit AddScalar(S val_) : val(val_) {} // no execution-space qualifier, so this constructor is host-only; the launcher builds the functor on the host
+
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a + val);
}
- const double val;
};
+}
- template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort> >
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< AddScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, float, ushort> > // transform() traits for AddScalar, ushort -> ushort with a float scalar
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >
+ template <> struct TransformFunctorTraits< AddScalar<short, float, short> > : DefaultTransformFunctorTraits< AddScalar<short, float, short> > // same two overrides for short -> short with a float scalar
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >
+ template <> struct TransformFunctorTraits< AddScalar<int, float, int> > : DefaultTransformFunctorTraits< AddScalar<int, float, int> > // same two overrides for int -> int with a float scalar
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >
+ template <> struct TransformFunctorTraits< AddScalar<float, float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float, float> > // same two overrides for float -> float with a float scalar
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T, typename S, typename D> // T: type src1 is cast to, S: type the scalar is converted to, D: type dst is cast to
+ void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // host wrapper: dst = saturate_cast<D>(src1 + val) through transform() on the given stream
{
- cudaSafeCall( cudaSetDoubleForDevice(&val) );
- AddScalar<T, D> op(val);
+ AddScalar<T, S, D> op(static_cast<S>(val)); // the double scalar is converted to S once, here on the host
+
if (mask.data)
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream); // masked path: the PtrStepb mask is passed directly as the mask argument
else
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // unmasked path (mask.data is null)
}
- template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void add_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void add_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void add_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //////////////////////////////////////////////////////////////////////////
- // subtract
-
- template <typename T, typename D> struct Subtract : binary_function<T, T, D>
+ template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // explicit instantiations, listed as <T, S, D>; S is float except where D is double, then S is double; commented-out combinations are not instantiated
+ template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// subMat
+
+namespace
+{
+ template <typename T, typename D> struct VSub4; // primary template is only declared here; behaviour comes from explicit specializations
+ template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
+ { // Saturating (.sat) byte-wise subtraction through PTX vsub4/vsub: .u32 operands, .u32 result.
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ {
+ uint res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub4 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per byte selector .b0-.b3, each fed the previous res as %3
+ asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub4() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub4(const VSub4<uint, uint>& other) {}
+ };
+ template <> struct VSub4<int, uint> : binary_function<int, int, uint>
+ { // Saturating (.sat) byte-wise subtraction through PTX vsub4/vsub: .s32 operands, .u32 result.
+ __device__ __forceinline__ uint operator ()(int a, int b) const
+ {
+ uint res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub4 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per byte selector .b0-.b3, each fed the previous res as %3
+ asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub4() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {}
+ };
+ template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
+ { // Saturating (.sat) byte-wise subtraction through PTX vsub4/vsub: .u32 operands, .s32 result.
+ __device__ __forceinline__ int operator ()(uint a, uint b) const
+ {
+ int res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub4 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per byte selector .b0-.b3, each fed the previous res as %3
+ asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub4() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {}
+ };
+ template <> struct VSub4<int, int> : binary_function<int, int, int>
+ { // Saturating (.sat) byte-wise subtraction through PTX vsub4/vsub: .s32 operands, .s32 result.
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ int res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub4 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per byte selector .b0-.b3, each fed the previous res as %3
+ asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub4() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub4(const VSub4<int, int>& other) {}
+ };
+
+ ////////////////////////////////////
+
+ template <typename T, typename D> struct VSub2; // primary template is only declared here; behaviour comes from explicit specializations
+ template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
+ { // Saturating (.sat) half-word-wise subtraction through PTX vsub2/vsub: .u32 operands, .u32 result.
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ {
+ uint res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub2 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per half-word selector .h0/.h1, each fed the previous res as %3
+ asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub2() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {}
+ };
+ template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
+ { // Saturating (.sat) half-word-wise subtraction through PTX vsub2/vsub: .u32 operands, .s32 result.
+ __device__ __forceinline__ int operator ()(uint a, uint b) const
+ {
+ int res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub2 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per half-word selector .h0/.h1, each fed the previous res as %3
+ asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub2() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {}
+ };
+ template <> struct VSub2<int, uint> : binary_function<int, int, uint>
+ { // Saturating (.sat) half-word-wise subtraction through PTX vsub2/vsub: .s32 operands, .u32 result.
+ __device__ __forceinline__ uint operator ()(int a, int b) const
+ {
+ uint res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub2 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per half-word selector .h0/.h1, each fed the previous res as %3
+ asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub2() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {}
+ };
+ template <> struct VSub2<int, int> : binary_function<int, int, int>
+ { // Saturating (.sat) half-word-wise subtraction through PTX vsub2/vsub: .s32 operands, .s32 result.
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ int res = 0; // also passed to the asm as operand %3; returned as 0 when neither branch below is compiled
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // single vsub2 on the whole 32-bit word
+ #elif __CUDA_ARCH__ >= 200
+ asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // fallback: one vsub per half-word selector .h0/.h1, each fed the previous res as %3
+ asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res;
+ }
+
+ __device__ __forceinline__ VSub2() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ VSub2(const VSub2<int, int>& other) {}
+ };
+
+ ////////////////////////////////////
+
+ template <typename T, typename D> struct SubMat : binary_function<T, T, D> // generic binary functor: returns saturate_cast<D>(a - b) for two operands of type T
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a - b);
}
+
+ __device__ __forceinline__ SubMat() {} // empty constructors: the functor has no data members
+ __device__ __forceinline__ SubMat(const SubMat& other) {}
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{ // Transform traits for the subtraction functors: partial specializations for every VSub4/VSub2, full specializations for four same-type SubMat cases.
+ template <typename T, typename D> struct TransformFunctorTraits< VSub4<T, D> > : DefaultTransformFunctorTraits< VSub4<T, D> >
+ {
+ enum { smart_shift = 2 }; // presumably shadows a default from DefaultTransformFunctorTraits - confirm in transform.hpp
+ };
+
+ ////////////////////////////////////
+
+ template <typename T, typename D> struct TransformFunctorTraits< VSub2<T, D> > : DefaultTransformFunctorTraits< VSub2<T, D> >
+ {
+ enum { smart_shift = 2 }; // same value as the VSub4 traits
};
- template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >
+ ////////////////////////////////////
+
+ template <> struct TransformFunctorTraits< SubMat<ushort, ushort> > : DefaultTransformFunctorTraits< SubMat<ushort, ushort> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >
+ template <> struct TransformFunctorTraits< SubMat<short, short> > : DefaultTransformFunctorTraits< SubMat<short, short> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >
+ template <> struct TransformFunctorTraits< SubMat<int, int> > : DefaultTransformFunctorTraits< SubMat<int, int> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >
+ template <> struct TransformFunctorTraits< SubMat<float, float> > : DefaultTransformFunctorTraits< SubMat<float, float> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
+
+namespace arithm
+{
+ template <typename T, typename D> // Host wrapper: forwards both sources and the destination to transform() with a VSub4<T, D> functor, no mask, on the given stream.
+ void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ {
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream); // The byte-typed PtrStepSzb arguments are cast to PtrStepSz<T> (sources) and PtrStepSz<D> (destination).
+ }
+
+ template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // Explicit instantiations for the four uint/int source-destination combinations.
+ template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+ template <typename T, typename D> // Host wrapper: same shape as vsub4, but forwards a VSub2<T, D> functor to transform(), no mask, on the given stream.
+ void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ {
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream); // Sources are cast to PtrStepSz<T>, the destination to PtrStepSz<D>.
+ }
+
+ template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // Explicit instantiations for the four uint/int source-destination combinations.
+ template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template <typename T, typename D> // Host wrapper for matrix subtraction: forwards to transform() with a SubMat<T, D> functor, masked when mask.data is non-null.
+ void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), SingleMask(mask), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream); // Masked path: the PtrStepb mask is handed to transform() directly (the removed line wrapped it in SingleMask).
else
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream); // Unmasked path: WithOutMask() is passed in the mask position.
}
- template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- template <typename T, typename D> struct SubtractScalar : unary_function<T, D>
- {
- SubtractScalar(double val_) : val(val_) {}
- __device__ __forceinline__ D operator ()(T a) const
+ template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // Explicit instantiations of subMat<T, D>, grouped by source type T; pairs that are commented out below are not instantiated.
+ template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// subScalar
+
+namespace arithm
+{
+ template <typename T, typename S, typename D> // Host wrapper for "matrix minus scalar": T is the source element type, S the type the scalar is converted to, D the destination element type.
+ void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ {
+ AddScalar<T, S, D> op(-static_cast<S>(val)); // Subtraction is expressed as addition of the negated scalar; AddScalar is defined outside this view.
+
+ if (mask.data) // A non-null mask pointer selects the masked transform() call.
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+ else
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+ }
+
+ template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // Explicit instantiations of subScalar<T, S, D>; in this list S is double whenever T or D is double and float otherwise, and commented-out combinations are not instantiated.
+ template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// mulMat
+
+namespace
+{
+ struct Mul_8uc4_32f : binary_function<uint, float, uint> // Binary functor: scales each of the four bytes packed in a uint by one float factor and repacks the saturated results.
+ {
+ __device__ __forceinline__ uint operator ()(uint a, float b) const
{
- return saturate_cast<D>(a - val);
+ uint res = 0; // Accumulates the four result bytes via OR.
+
+ res |= (saturate_cast<uchar>((0xffu & (a )) * b) ); // Byte 0: mask the low 8 bits, multiply by b, saturate to uchar.
+ res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8); // Byte 1: same, result placed at bits 8..15.
+ res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16); // Byte 2: result placed at bits 16..23.
+ res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24); // Byte 3: result placed at bits 24..31. NOTE(review): the uchar is promoted to int before the shift, so << 24 can reach the sign bit - consider casting to uint first.
+
+ return res;
}
- const double val;
+
+ __device__ __forceinline__ Mul_8uc4_32f() {} // Empty device-side default constructor.
+ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} // Copy constructor does nothing; the struct declares no data members of its own.
};
- template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort> >
+ struct Mul_16sc4_32f : binary_function<short4, float, short4> // Binary functor: multiplies every component of a short4 by one float factor, saturating each product to short.
+ {
+ __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+ {
+ return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b), // x and y components
+ saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b)); // z and w components
+ }
+
+ __device__ __forceinline__ Mul_16sc4_32f() {} // Empty device-side default constructor.
+ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} // Copy constructor does nothing; the struct declares no data members of its own.
+ };
+
+ template <typename T, typename D> struct Mul : binary_function<T, T, D> // Binary functor for element-wise multiplication: yields a * b converted to D with saturate_cast.
+ {
+ __device__ __forceinline__ D operator ()(T a, T b) const
+ {
+ return saturate_cast<D>(a * b); // The product is formed in the type produced by the usual arithmetic conversions on T before saturating to D.
+ }
+
+ __device__ __forceinline__ Mul() {} // Empty device-side default constructor.
+ __device__ __forceinline__ Mul(const Mul& other) {} // Copy constructor does nothing; the struct declares no data members of its own.
+ };
+
+ template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D> // Binary functor for scaled multiplication: yields scale * a * b converted to D with saturate_cast.
+ {
+ S scale; // Scale factor, stored in the intermediate type S.
+
+ explicit MulScale(S scale_) : scale(scale_) {} // Declared without a __device__ qualifier, unlike operator() below.
+
+ __device__ __forceinline__ D operator ()(T a, T b) const
+ {
+ return saturate_cast<D>(scale * a * b); // Evaluated left to right: (scale * a) * b.
+ }
+ };
+}
+
+namespace cv { namespace gpu { namespace device // TransformFunctorTraits specializations for the multiplication functors.
+{
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Mul_8uc4_32f) // Macro defined outside this view; presumably expands to the TransformFunctorTraits specialization header for Mul_8uc4_32f - confirm in the device headers.
+ {
+ enum { smart_block_dim_x = 8 };
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 8 };
+ };
+
+ template <> struct TransformFunctorTraits< Mul<ushort, ushort> > : DefaultTransformFunctorTraits< Mul<ushort, ushort> > // Full specializations follow for Mul with identical source/destination type (ushort, short, int, float); each sets smart_block_dim_y = 8 and smart_shift = 4.
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >
+ template <> struct TransformFunctorTraits< Mul<short, short> > : DefaultTransformFunctorTraits< Mul<short, short> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >
+ template <> struct TransformFunctorTraits< Mul<int, int> > : DefaultTransformFunctorTraits< Mul<int, int> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >
+ template <> struct TransformFunctorTraits< Mul<float, float> > : DefaultTransformFunctorTraits< Mul<float, float> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+ template <> struct TransformFunctorTraits< MulScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScale<ushort, float, ushort> > // Same settings for MulScale with a float scale and identical source/destination type (ushort, short, int, float).
{
- cudaSafeCall( cudaSetDoubleForDevice(&val) );
- SubtractScalar<T, D> op(val);
- if (mask.data)
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream);
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScale<short, float, short> > : DefaultTransformFunctorTraits< MulScale<short, float, short> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScale<int, float, int> > : DefaultTransformFunctorTraits< MulScale<int, float, int> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScale<float, float, float> > : DefaultTransformFunctorTraits< MulScale<float, float, float> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+}}}
+
+namespace arithm
+{
+ void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream) // Runs the binary transform() with a Mul_8uc4_32f functor over src1 and src2 into dst.
+ {
+ transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); // no mask (WithOutMask); issued on the caller-supplied stream
+ }
+
+ void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream) // Runs the binary transform() with a Mul_16sc4_32f functor over src1 (short4) and src2 (float) into dst.
+ {
+ transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); // no mask (WithOutMask); issued on the caller-supplied stream
+ }
+
+ template <typename T, typename S, typename D> // T: element type of both sources, S: type the scale is converted to, D: element type of dst
+ void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
+ {
+ if (scale == 1) // exact comparison: only a scale of exactly 1 selects the unscaled Mul functor
+ {
+ Mul<T, D> op;
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // PtrStepSzb arguments are cast to PtrStepSz<T> (sources) and PtrStepSz<D> (destination)
+ }
else
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ {
+ MulScale<T, S, D> op(static_cast<S>(scale)); // scale is converted from double to S before being stored in the functor
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // same casts and unmasked launch as the unscaled branch
+ }
+ }
+
+ template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// mulScalar
+
+namespace
+{
+ template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D> // Unary functor: multiplies its argument by a stored value and converts the result to D.
+ {
+ S val; // multiplier, fixed at construction
+
+ explicit MulScalar(S val_) : val(val_) {} // not marked __device__, unlike operator(); mulScalar() constructs this functor in host code
+
+ __device__ __forceinline__ D operator ()(T a) const
+ {
+ return saturate_cast<D>(a * val); // the product is formed first, then converted to D by saturate_cast
+ }
+ };
+}
+
+namespace cv { namespace gpu { namespace device // reopened so the TransformFunctorTraits specializations below are declared inside cv::gpu::device
+{
+ template <> struct TransformFunctorTraits< MulScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScalar<ushort, float, ushort> > // MulScalar<T, float, T> for ushort/short/int/float: inherit the defaults, override smart_block_dim_y and smart_shift
+ {
+ enum { smart_block_dim_y = 8 }; // NOTE(review): the meaning of smart_block_dim_y / smart_shift is defined by transform(), which is not visible here - confirm in transform.hpp
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScalar<short, float, short> > : DefaultTransformFunctorTraits< MulScalar<short, float, short> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScalar<int, float, int> > : DefaultTransformFunctorTraits< MulScalar<int, float, int> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< MulScalar<float, float, float> > : DefaultTransformFunctorTraits< MulScalar<float, float, float> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+}}}
+
+namespace arithm
+{
+ template <typename T, typename S, typename D> // T: element type of src1, S: type val is converted to, D: element type of dst
+ void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+ {
+ MulScalar<T, S, D> op(static_cast<S>(val)); // val is converted from double to S before being stored in the functor
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // unary transform(): src1 cast to PtrStepSz<T>, dst to PtrStepSz<D>, no mask, caller-supplied stream
}
- template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- //template void subtract_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
- template void subtract_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
- //////////////////////////////////////////////////////////////////////////
- // multiply
-
- struct multiply_8uc4_32f : binary_function<uint, float, uint>
+ template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divMat
+
+namespace
+{
+ struct Div_8uc4_32f : binary_function<uint, float, uint> // Divides each of the four bytes packed in a uint by one float; the result is 0 when the divisor is 0.
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
- res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
- res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
- res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
- res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
+ if (b != 0) // zero divisor: skip the block so res keeps its initial value 0
+ {
+ b = 1.0f / b; // take the reciprocal once so each byte below needs only a multiply
+ res |= (saturate_cast<uchar>((0xffu & (a )) * b) ); // byte 0 (bits 0-7)
+ res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8); // byte 1 (bits 8-15)
+ res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16); // byte 2 (bits 16-23)
+ res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24); // byte 3 (bits 24-31)
+ }
return res;
}
};
- OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)
+ struct Div_16sc4_32f : binary_function<short4, float, short4> // Divides the x, y, z, w components of a short4 by one float; the result is all zeros when the divisor is 0.
{
- enum { smart_block_dim_x = 8 };
- enum { smart_block_dim_y = 8 };
- enum { smart_shift = 8 };
+ __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+ {
+ return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b), // each quotient is a float division converted back to short
+ saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
+ : make_short4(0,0,0,0); // zero divisor: return a zero vector instead of dividing
+ }
};
- void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
+ template <typename T, typename D> struct Div : binary_function<T, T, D> // Quotient functor: returns a / b converted to D, or 0 when b is 0.
{
- cv::gpu::device::transform(static_cast< PtrStepSz<uint> >(src1), src2, static_cast< PtrStepSz<uint> >(dst), multiply_8uc4_32f(), WithOutMask(), stream);
- }
+ __device__ __forceinline__ D operator ()(T a, T b) const
+ {
+ return b != 0 ? saturate_cast<D>(a / b) : 0; // a / b is evaluated on the T operands (integer division for integer T) before the conversion to D
+ }
- struct multiply_16sc4_32f : binary_function<short4, float, short4>
+ __device__ __forceinline__ Div() {} // explicit empty device-side default constructor
+ __device__ __forceinline__ Div(const Div& other) {} // explicit empty device-side copy constructor; the functor has no members to copy
+ };
+ template <typename T> struct Div<T, float> : binary_function<T, T, float> // Partial specialization for a float result: no saturate_cast, division is done in floating point.
{
- __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+ __device__ __forceinline__ float operator ()(T a, T b) const // returns 0 when b is 0
{
- return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
- saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
+ return b != 0 ? static_cast<float>(a) / b : 0; // a is cast to float first so the quotient is not an integer division
}
+
+ __device__ __forceinline__ Div() {} // explicit empty device-side default constructor
+ __device__ __forceinline__ Div(const Div& other) {} // explicit empty device-side copy constructor; the functor has no members to copy
+ };
+ template <typename T> struct Div<T, double> : binary_function<T, T, double> // Partial specialization for a double result: no saturate_cast, division is done in double precision.
+ {
+ __device__ __forceinline__ double operator ()(T a, T b) const // returns 0 when b is 0
+ {
+ return b != 0 ? static_cast<double>(a) / b : 0; // a is cast to double first so the quotient is not an integer division
+ }
+
+ __device__ __forceinline__ Div() {} // explicit empty device-side default constructor
+ __device__ __forceinline__ Div(const Div& other) {} // explicit empty device-side copy constructor; the functor has no members to copy
+ };
+
+ template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D> // Scaled quotient functor: returns scale * a / b converted to D, or 0 when b is 0.
+ {
+ S scale; // multiplier applied to the numerator; set once by the constructor
+
+ explicit DivScale(S scale_) : scale(scale_) {} // not marked __device__, unlike operator(); divMat() constructs this functor in host code
+
+ __device__ __forceinline__ D operator ()(T a, T b) const
+ {
+ return b != 0 ? saturate_cast<D>(scale * a / b) : 0; // evaluated left to right as (scale * a) / b, then converted to D
+ }
+ };
+}
+
+namespace cv { namespace gpu { namespace device // reopened so the TransformFunctorTraits specializations below are declared inside cv::gpu::device
+{
+ OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Div_8uc4_32f) // traits for Div_8uc4_32f: all three enums below are set to 8
+ {
+ enum { smart_block_dim_x = 8 };
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 8 };
+ };
+
+ template <> struct TransformFunctorTraits< Div<ushort, ushort> > : DefaultTransformFunctorTraits< Div<ushort, ushort> > // Div<T, T> for ushort/short/int/float: inherit the defaults, override smart_block_dim_y and smart_shift
+ {
+ enum { smart_block_dim_y = 8 }; // NOTE(review): the meaning of smart_block_dim_y / smart_shift is defined by transform(), which is not visible here - confirm in transform.hpp
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< Div<short, short> > : DefaultTransformFunctorTraits< Div<short, short> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< Div<int, int> > : DefaultTransformFunctorTraits< Div<int, int> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< Div<float, float> > : DefaultTransformFunctorTraits< Div<float, float> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
};
- OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)
+ template <> struct TransformFunctorTraits< DivScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivScale<ushort, float, ushort> > // DivScale<T, float, T> for ushort/short/int/float: same two overrides as the Div specializations
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< DivScale<short, float, short> > : DefaultTransformFunctorTraits< DivScale<short, float, short> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< DivScale<int, float, int> > : DefaultTransformFunctorTraits< DivScale<int, float, int> >
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< DivScale<float, float, float> > : DefaultTransformFunctorTraits< DivScale<float, float, float> >
{
- enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
- enum { smart_shift = 8 };
+ enum { smart_shift = 4 };
};
+}}}
+
+namespace arithm
+{
+ void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream) // Host wrapper: applies the Div_8uc4_32f functor to (src1, src2) and stores into dst; each uint presumably packs one 8UC4 pixel - confirm at the functor definition
+ {
+ transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); // binary transform with no mask, issued on the caller-supplied stream
+ }
- void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream)
+ void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream) // Host wrapper: applies the Div_16sc4_32f functor to short4 elements of src1 and float elements of src2, storing short4 results into dst
{
- cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), multiply_16sc4_32f(), WithOutMask(), stream);
+ transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); // binary transform with no mask, issued on the caller-supplied stream
}
- template <typename T, typename D> struct Multiply : binary_function<T, T, D>
+ template <typename T, typename S, typename D> // T: element type of both inputs, S: type the scale factor is cast to, D: element type of dst
+ void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) // Element-wise division of two matrices with an optional scale factor; the byte views are reinterpreted as PtrStepSz<T> / PtrStepSz<D>
{
- Multiply(float scale_) : scale(scale_) {}
- __device__ __forceinline__ D operator ()(T a, T b) const
+ if (scale == 1) // exact comparison: only a scale of exactly 1 takes the functor that carries no scale member
{
- return saturate_cast<D>(scale * a * b);
+ Div<T, D> op; // unscaled division functor (defined earlier in the file)
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // no mask, caller-supplied stream
}
- const float scale;
- };
- template <typename T> struct Multiply<T, double> : binary_function<T, T, double>
- {
- Multiply(double scale_) : scale(scale_) {}
- __device__ __forceinline__ double operator ()(T a, T b) const
+ else // any other scale: use the functor that stores the scale converted to S
{
- return scale * a * b;
+ DivScale<T, S, D> op(static_cast<S>(scale)); // scale is narrowed from double to S on the host before the launch
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // no mask, caller-supplied stream
}
- const double scale;
- };
- template <> struct Multiply<int, int> : binary_function<int, int, int>
+ }
+
+ template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+ //template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+ template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divScalar
+
+namespace arithm
+{
+ template <typename T, typename S, typename D> // T: source element type, S: type of the stored multiplier, D: destination element type
+ void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) // Divides every element of src1 by the scalar val and stores the result in dst
{
- Multiply(double scale_) : scale(scale_) {}
- __device__ __forceinline__ int operator ()(int a, int b) const
+ MulScalar<T, S, D> op(static_cast<S>(1.0 / val)); // division is carried out as multiplication by the reciprocal, computed in double on the host and then cast to S; val == 0 is not checked here
+ transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // unary transform with no mask, caller-supplied stream
+ }
+
+ template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divInv
+
+namespace
+{
+ template <typename T, typename S, typename D> struct DivInv : unary_function<T, D> // Unary functor computing val / a for each element a of type T, saturated to D; S is the type the numerator is stored in
+ {
+ S val; // numerator of the division
+
+ explicit DivInv(S val_) : val(val_) {} // takes S directly (as DivScale does) so no implicit double-to-S narrowing happens in the initialiser; the caller already passes static_cast<S>(val)
+
+ __device__ __forceinline__ D operator ()(T a) const // returns saturate_cast<D>(val / a), or 0 when a == 0
{
- return saturate_cast<int>(scale * a * b);
+ return a != 0 ? saturate_cast<D>(val / a) : 0; // a zero divisor yields 0 instead of performing the division
}
- const double scale;
};
+}
- template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >
+namespace cv { namespace gpu { namespace device // reopened so TransformFunctorTraits can be specialised for the DivInv functor
+{
+ template <> struct TransformFunctorTraits< DivInv<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivInv<ushort, float, ushort> > // overrides only smart_block_dim_y and smart_shift; their exact meaning is defined by the transform implementation (not visible here)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >
+ template <> struct TransformFunctorTraits< DivInv<short, float, short> > : DefaultTransformFunctorTraits< DivInv<short, float, short> > // same overrides as DivInv<ushort, float, ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >
+ template <> struct TransformFunctorTraits< DivInv<int, float, int> > : DefaultTransformFunctorTraits< DivInv<int, float, int> > // same overrides as DivInv<ushort, float, ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >
+ template <> struct TransformFunctorTraits< DivInv<float, float, float> > : DefaultTransformFunctorTraits< DivInv<float, float, float> > // same overrides as DivInv<ushort, float, ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
+
+namespace arithm
+{
+ template <typename T, typename S, typename D> // T: source element type, S: type the numerator is stored in, D: destination element type
+ void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) // Computes val / src1 element-wise into dst using the DivInv functor
+ {
+ DivInv<T, S, D> functor(static_cast<S>(val)); // numerator converted to S on the host before the launch
+ transform(PtrStepSz<T>(src1), PtrStepSz<D>(dst), functor, WithOutMask(), stream); // typed views of the byte matrices; unary transform, no mask, caller-supplied stream
+ }
+
+ template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+ //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// absDiffMat
+
+namespace
+{
+ template <typename T, typename D> struct VAbsDiff4; // primary template is only declared; the <uint, uint> and <int, int> specialisations below provide the definitions
+ template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint> // Binary functor built on the PTX vabsdiff video instructions with unsigned operand types, treating each 32-bit word as four byte lanes
+ {
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const // returns the packed result for a and b; returns 0 when neither architecture branch below is compiled
+ {
+ uint res = 0; // also fed to the asm statements as their fourth operand
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one instruction covers all four byte lanes
+ #elif __CUDA_ARCH__ >= 200
+ asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one statement per byte lane (.b0 to .b3); res is passed back in as %3 so earlier lanes are presumably carried forward by the PTX merge - confirm in the PTX ISA
+ asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res; // still 0 if __CUDA_ARCH__ is undefined or below 200
+ }
- template <typename T, typename D> struct MultiplyCaller
+ __device__ __forceinline__ VAbsDiff4() {} // explicit empty device-side default constructor; the reason for spelling it out is not visible here
+ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {} // explicit empty copy constructor; the functor has no state to copy
+ };
+ template <> struct VAbsDiff4<int, int> : binary_function<int, int, int> // Signed counterpart of VAbsDiff4<uint, uint>: same structure, with .s32 type suffixes on the PTX instructions
{
- static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+ __device__ __forceinline__ int operator ()(int a, int b) const // returns the packed result for a and b; returns 0 when neither architecture branch below is compiled
{
- Multiply<T, D> op(static_cast<float>(scale));
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ int res = 0; // also fed to the asm statements as their fourth operand
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one instruction covers all four byte lanes
+ #elif __CUDA_ARCH__ >= 200
+ asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one statement per byte lane (.b0 to .b3); res is passed back in as %3 so earlier lanes are presumably carried forward by the PTX merge - confirm in the PTX ISA
+ asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res; // still 0 if __CUDA_ARCH__ is undefined or below 200
}
+
+ __device__ __forceinline__ VAbsDiff4() {} // explicit empty device-side default constructor; the reason for spelling it out is not visible here
+ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {} // explicit empty copy constructor; the functor has no state to copy
};
- template <typename T> struct MultiplyCaller<T, double>
+
+ ////////////////////////////////////
+
+ template <typename T, typename D> struct VAbsDiff2; // primary template is only declared; the <uint, uint> and <int, int> specialisations below provide the definitions
+ template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint> // Binary functor built on the PTX vabsdiff video instructions with unsigned operand types, treating each 32-bit word as two half-word lanes
{
- static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const // returns the packed result for a and b; returns 0 when neither architecture branch below is compiled
{
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );
- Multiply<T, double> op(scale);
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<double>)dst, op, WithOutMask(), stream);
+ uint res = 0; // also fed to the asm statements as their fourth operand
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one instruction covers both half-word lanes
+ #elif __CUDA_ARCH__ >= 200
+ asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one statement per half-word lane (.h0, .h1); res is passed back in as %3 so the first lane is presumably carried forward by the PTX merge - confirm in the PTX ISA
+ asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res; // still 0 if __CUDA_ARCH__ is undefined or below 200
}
+
+ __device__ __forceinline__ VAbsDiff2() {} // explicit empty device-side default constructor; the reason for spelling it out is not visible here
+ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {} // explicit empty copy constructor; the functor has no state to copy
};
- template <> struct MultiplyCaller<int, int>
+ template <> struct VAbsDiff2<int, int> : binary_function<int, int, int> // Signed counterpart of VAbsDiff2<uint, uint>: same structure, with .s32 type suffixes on the PTX instructions
{
- static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+ __device__ __forceinline__ int operator ()(int a, int b) const // returns the packed result for a and b; returns 0 when neither architecture branch below is compiled
{
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );
- Multiply<int, int> op(scale);
- cv::gpu::device::transform((PtrStepSz<int>)src1, (PtrStepSz<int>)src2, (PtrStepSz<int>)dst, op, WithOutMask(), stream);
+ int res = 0; // also fed to the asm statements as their fourth operand
+
+ #if __CUDA_ARCH__ >= 300
+ asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one instruction covers both half-word lanes
+ #elif __CUDA_ARCH__ >= 200
+ asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // one statement per half-word lane (.h0, .h1); res is passed back in as %3 so the first lane is presumably carried forward by the PTX merge - confirm in the PTX ISA
+ asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+
+ return res; // still 0 if __CUDA_ARCH__ is undefined or below 200
}
+
+ __device__ __forceinline__ VAbsDiff2() {} // explicit empty device-side default constructor; the reason for spelling it out is not visible here
+ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {} // explicit empty copy constructor; the functor has no state to copy
};
- template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+ ////////////////////////////////////
+
+ __device__ __forceinline__ int _abs(int a) // Overload set giving AbsDiffMat one name for absolute value across int, float and double
+ {
+ return ::abs(a); // integer absolute value from the global namespace
+ }
+ __device__ __forceinline__ float _abs(float a) // single-precision overload
+ {
+ return ::fabsf(a); // stays in float, no promotion to double
+ }
+ __device__ __forceinline__ double _abs(double a) // double-precision overload
{
- MultiplyCaller<T, D>::call(src1, src2, dst, scale, stream);
+ return ::fabs(a); // double-precision absolute value
}
- template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>
- {
- MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}
- __device__ __forceinline__ D operator ()(T a) const
+ template <typename T> struct AbsDiffMat : binary_function<T, T, T> // Generic element-wise functor returning |a - b| saturated back to T
+ {
+ __device__ __forceinline__ T operator ()(T a, T b) const // a, b: the two input elements; result has the same type T
{
- return saturate_cast<D>(scale * a * val);
+ return saturate_cast<T>(_abs(a - b)); // for integer types narrower than int, a - b is evaluated in int by the usual promotions, so the int overload of _abs is chosen
}
- const double val;
- const double scale;
+
+ __device__ __forceinline__ AbsDiffMat() {} // explicit empty device-side default constructor; the reason for spelling it out is not visible here
+ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} // explicit empty copy constructor; the functor has no state to copy
+ };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+ template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff4<T, D> > : DefaultTransformFunctorTraits< VAbsDiff4<T, D> > // partial specialisation covering every VAbsDiff4<T, D>; inherits the defaults and overrides smart_shift only
+ {
+ enum { smart_shift = 2 }; // tuning constant; its exact meaning is defined by the transform implementation (not visible here)
};
- template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >
+ ////////////////////////////////////
+
+ template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff2<T, D> > : DefaultTransformFunctorTraits< VAbsDiff2<T, D> > // partial specialisation covering every VAbsDiff2<T, D>; the defaults are now derived from VAbsDiff2 itself rather than from the copy-pasted VAbsDiff4
+ {
+ enum { smart_shift = 2 }; // tuning constant; its exact meaning is defined by the transform implementation (not visible here)
+ };
+
+ ////////////////////////////////////
+
+ template <> struct TransformFunctorTraits< AbsDiffMat<ushort> > : DefaultTransformFunctorTraits< AbsDiffMat<ushort> > // overrides only smart_block_dim_y and smart_shift; their exact meaning is defined by the transform implementation (not visible here)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >
+ template <> struct TransformFunctorTraits< AbsDiffMat<short> > : DefaultTransformFunctorTraits< AbsDiffMat<short> > // same overrides as AbsDiffMat<ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >
+ template <> struct TransformFunctorTraits< AbsDiffMat<int> > : DefaultTransformFunctorTraits< AbsDiffMat<int> > // same overrides as AbsDiffMat<ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >
+ template <> struct TransformFunctorTraits< AbsDiffMat<float> > : DefaultTransformFunctorTraits< AbsDiffMat<float> > // same overrides as AbsDiffMat<ushort>
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T> // T is both the input and the output element type
+ void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs VAbsDiff4<T, T> over src1/src2 into dst.
{
- cudaSafeCall( cudaSetDoubleForDevice(&val) );
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );
- MultiplyScalar<T, D> op(val, scale);
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream); // all three buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
}
- template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<float, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void multiply_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void multiply_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //////////////////////////////////////////////////////////////////////////
- // divide
-
- struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>
- {
- __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const
- {
- return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),
- saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b))
- : make_uchar4(0,0,0,0);
- }
- };
+ template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiation for uint
+ template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiation for int
- OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)
+ template <typename T> // T is both the input and the output element type
+ void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs VAbsDiff2<T, T> over src1/src2 into dst.
{
- enum { smart_block_dim_x = 8 };
- enum { smart_block_dim_y = 8 };
- enum { smart_shift = 8 };
- };
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream); // all three buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
+ }
+
+ template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiation for uint
+ template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiation for int
- void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
+ template <typename T> // T is both the input and the output element type
+ void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs AbsDiffMat<T> over src1/src2 into dst.
{
- cv::gpu::device::transform(static_cast< PtrStepSz<uchar4> >(src1), src2, static_cast< PtrStepSz<uchar4> >(dst), divide_8uc4_32f(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream); // all three buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
}
+ template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations: one per supported element type
+ template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// absDiffScalar
- struct divide_16sc4_32f : binary_function<short4, float, short4>
+namespace
+{
+ template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T> // Unary functor: saturate_cast<T>(abs_func<S>(a - val)) for a stored scalar val.
{
- __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+ S val; // scalar operand, held in the working type S
+
+ explicit AbsDiffScalar(S val_) : val(val_) {} // no __device__ qualifier, so this constructor is host-side only
+
+ __device__ __forceinline__ T operator ()(T a) const // device-only call operator
{
- return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
- saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
- : make_short4(0,0,0,0);
+ abs_func<S> f; // absolute-value functor instantiated for S, not T
+ return saturate_cast<T>(f(a - val)); // the difference is passed through f, then saturated back to T
}
};
+}
- OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f)
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< AbsDiffScalar<ushort, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<ushort, float> > // AbsDiffScalar traits for ushort/short/int/float inputs (S = float) follow.
{
- enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
- enum { smart_shift = 8 };
+ enum { smart_shift = 4 }; // overrides the inherited smart_shift value
+ };
+ template <> struct TransformFunctorTraits< AbsDiffScalar<short, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<short, float> > // short specialization
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< AbsDiffScalar<int, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<int, float> > // int specialization
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< AbsDiffScalar<float, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<float, float> > // float specialization
+ {
+ enum { smart_block_dim_y = 8 };
+ enum { smart_shift = 4 };
};
+}}}
- void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T, typename S> // T: element type of src1/dst; S: type the scalar is converted to
+ void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs AbsDiffScalar<T, S> over src1 into dst.
{
- cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), divide_16sc4_32f(), WithOutMask(), stream);
+ AbsDiffScalar<T, S> op(static_cast<S>(val)); // the double scalar is narrowed to S once, on the host
+
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream); // both buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
}
- template <typename T, typename D> struct Divide : binary_function<T, T, D>
- {
- Divide(double scale_) : scale(scale_) {}
- __device__ __forceinline__ D operator ()(T a, T b) const
- {
- return b != 0 ? saturate_cast<D>(a * scale / b) : 0;
- }
- const double scale;
- };
+ template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations; the scalar is named src2 here and val in the definition
+ template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // the only instantiation with S = double
+}
- template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> >
+//////////////////////////////////////////////////////////////////////////
+// absMat
+
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< abs_func<ushort> > : DefaultTransformFunctorTraits< abs_func<ushort> > // abs_func traits for ushort/short/int/float follow; each overrides smart_block_dim_y (8) and smart_shift (4).
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >
+ template <> struct TransformFunctorTraits< abs_func<short> > : DefaultTransformFunctorTraits< abs_func<short> > // short specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >
+ template <> struct TransformFunctorTraits< abs_func<int> > : DefaultTransformFunctorTraits< abs_func<int> > // int specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >
+ template <> struct TransformFunctorTraits< abs_func<float> > : DefaultTransformFunctorTraits< abs_func<float> > // float specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T> // T is both the input and the output element type
+ void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs abs_func<T> over src into dst.
{
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );
- Divide<T, D> op(scale);
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream); // both buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
}
- template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, int >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- template <typename T, typename D> struct DivideScalar : unary_function<T, D>
- {
- DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}
- __device__ __forceinline__ D operator ()(T a) const
+ template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations: one per supported element type
+ template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// sqrMat
+
+namespace
+{
+ template <typename T> struct Sqr : unary_function<T, T> // Stateless unary functor: saturate_cast<T>(x * x).
+ {
+ __device__ __forceinline__ T operator ()(T x) const // device-only call operator
{
- return saturate_cast<D>(scale * a / val);
+ return saturate_cast<T>(x * x); // NOTE(review): the product is formed before saturation, so for T = int it can overflow first
}
- const double val;
- const double scale;
+
+ __device__ __forceinline__ Sqr() {} // empty default constructor
+ __device__ __forceinline__ Sqr(const Sqr& other) {} // copy constructor does nothing: the struct declares no data members
};
+}
- template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> >
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< Sqr<ushort> > : DefaultTransformFunctorTraits< Sqr<ushort> > // Sqr traits for ushort/short/int/float follow; each overrides smart_block_dim_y (8) and smart_shift (4).
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >
+ template <> struct TransformFunctorTraits< Sqr<short> > : DefaultTransformFunctorTraits< Sqr<short> > // short specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >
+ template <> struct TransformFunctorTraits< Sqr<int> > : DefaultTransformFunctorTraits< Sqr<int> > // int specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >
+ template <> struct TransformFunctorTraits< Sqr<float> > : DefaultTransformFunctorTraits< Sqr<float> > // float specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T> // T is both the input and the output element type
+ void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) // Host wrapper: runs Sqr<T> over src into dst.
{
- cudaSafeCall( cudaSetDoubleForDevice(&val) );
- cudaSafeCall( cudaSetDoubleForDevice(&scale) );
- DivideScalar<T, D> op(val, scale);
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream); // both buffers are cast to PtrStepSz<T>; no mask; issued on the caller's stream
}
- template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<float, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, int >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- //template void divide_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
- template void divide_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
- template <typename T, typename D> struct Reciprocal : unary_function<T, D>
- {
- Reciprocal(double scale_) : scale(scale_) {}
- __device__ __forceinline__ D operator ()(T a) const
- {
- return a != 0 ? saturate_cast<D>(scale / a) : 0;
- }
- const double scale;
- };
+ template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations: one per supported element type
+ template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// sqrtMat
- template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< sqrt_func<uchar> > : DefaultTransformFunctorTraits< sqrt_func<uchar> > // Traits for sqrt_func<uchar>; base names the same functor (was sqrt_func<ushort> by copy-paste).
+ {
+ enum { smart_shift = 4 }; // only smart_shift is overridden for the 8-bit types
+ };
+ template <> struct TransformFunctorTraits< sqrt_func<schar> > : DefaultTransformFunctorTraits< sqrt_func<schar> > // schar specialization: overrides smart_shift only
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< sqrt_func<ushort> > : DefaultTransformFunctorTraits< sqrt_func<ushort> > // ushort/short/int/float below also override smart_block_dim_y (8)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >
+ template <> struct TransformFunctorTraits< sqrt_func<short> > : DefaultTransformFunctorTraits< sqrt_func<short> > // short specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >
+ template <> struct TransformFunctorTraits< sqrt_func<int> > : DefaultTransformFunctorTraits< sqrt_func<int> > // int specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >
+ template <> struct TransformFunctorTraits< sqrt_func<float> > : DefaultTransformFunctorTraits< sqrt_func<float> > // float specialization
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T, typename D> void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T>
+ void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) // Runs transform() with sqrt_func<T> from src into dst, unmasked, on `stream`.
{
- cudaSafeCall( cudaSetDoubleForDevice(&scalar) );
- Reciprocal<T, D> op(scalar);
- cv::gpu::device::transform((PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream); // both byte-typed views are converted to PtrStepSz<T>: input and output share element type T
}
- template void divide_gpu<uchar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<uchar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<uchar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<uchar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<uchar, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<uchar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<uchar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<schar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<schar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<ushort, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<ushort, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<ushort, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<ushort, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<ushort, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<ushort, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<ushort, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<short, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<short, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<short, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<short, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<short, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<short, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<short, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<int, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<int, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<int, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<int, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<int, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<int, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<int, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<float, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<float, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<float, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<float, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<float, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<float, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<float, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //template void divide_gpu<double, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<double, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<double, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<double, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<double, int >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- //template void divide_gpu<double, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
- template void divide_gpu<double, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
- //////////////////////////////////////////////////////////////////////////
- // absdiff
-
- template <typename T> struct Absdiff : binary_function<T, T, T>
- {
- static __device__ __forceinline__ int abs(int a)
- {
- return ::abs(a);
- }
- static __device__ __forceinline__ float abs(float a)
- {
- return ::fabsf(a);
- }
- static __device__ __forceinline__ double abs(double a)
- {
- return ::fabs(a);
- }
+ template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
- __device__ __forceinline__ T operator ()(T a, T b) const
- {
- return saturate_cast<T>(::abs(a - b));
- }
- };
+//////////////////////////////////////////////////////////////////////////
+// logMat
- template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >
+namespace cv { namespace gpu { namespace device
+{
+ // Transform traits for log_func<uchar>. The base has to be instantiated on the
+ // same functor that is being specialized (it previously named log_func<ushort>,
+ // a copy-paste slip), so that every default not overridden below is taken for
+ // the uchar functor, as in the sibling specializations.
+ template <> struct TransformFunctorTraits< log_func<uchar> > : DefaultTransformFunctorTraits< log_func<uchar> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< log_func<schar> > : DefaultTransformFunctorTraits< log_func<schar> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< log_func<ushort> > : DefaultTransformFunctorTraits< log_func<ushort> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >
+ template <> struct TransformFunctorTraits< log_func<short> > : DefaultTransformFunctorTraits< log_func<short> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >
+ template <> struct TransformFunctorTraits< log_func<int> > : DefaultTransformFunctorTraits< log_func<int> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >
+ template <> struct TransformFunctorTraits< log_func<float> > : DefaultTransformFunctorTraits< log_func<float> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T> void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T>
+ void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) // Runs transform() with log_func<T> from src into dst, unmasked, on `stream`.
{
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, Absdiff<T>(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream); // both byte-typed views are converted to PtrStepSz<T>: input and output share element type T
}
- //template void absdiff_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- //template void absdiff_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- //template void absdiff_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// expMat
- template <typename T> struct AbsdiffScalar : unary_function<T, T>
+namespace
+{
+ template <typename T> struct Exp : unary_function<T, T> // Unary functor T -> T: evaluates exp_func<T> and converts the result back to T with saturate_cast.
{
- AbsdiffScalar(double val_) : val(val_) {}
- __device__ __forceinline__ T operator ()(T a) const
+ __device__ __forceinline__ T operator ()(T x) const
{
- return saturate_cast<T>(::fabs(a - val));
+ exp_func<T> f;
+ return saturate_cast<T>(f(x)); // saturate_cast<T> brings exp_func's result back to T (presumably clamping for integer T; see saturate_cast.hpp)
}
- double val;
+
+ __device__ __forceinline__ Exp() {} // explicit empty device-side default constructor, same pattern as VAdd4 near the top of the file
+ __device__ __forceinline__ Exp(const Exp& other) {} // explicit empty device-side copy constructor; NOTE(review): rationale is not visible here - confirm before removing
};
+}
- template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >
+namespace cv { namespace gpu { namespace device
+{
+ template <> struct TransformFunctorTraits< Exp<ushort> > : DefaultTransformFunctorTraits< Exp<ushort> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >
+ template <> struct TransformFunctorTraits< Exp<short> > : DefaultTransformFunctorTraits< Exp<short> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >
+ template <> struct TransformFunctorTraits< Exp<int> > : DefaultTransformFunctorTraits< Exp<int> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
- template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >
+ template <> struct TransformFunctorTraits< Exp<float> > : DefaultTransformFunctorTraits< Exp<float> >
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
- template <typename T> void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T>
+ void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) // Runs transform() with the local Exp<T> functor from src into dst, unmasked, on `stream`.
{
- cudaSafeCall( cudaSetDoubleForDevice(&val) );
- AbsdiffScalar<T> op(val);
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)dst, op, WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream); // both byte-typed views are converted to PtrStepSz<T>: input and output share element type T
}
- //template void absdiff_gpu<uchar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<schar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- //template void absdiff_gpu<ushort>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<short >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<int >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- //template void absdiff_gpu<float >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
- template void absdiff_gpu<double>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
- //////////////////////////////////////////////////////////////////////////////////////
- // Compare
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpMat
+namespace
+{
template <template <typename> class Op, typename T>
- struct Compare: binary_function<T, T, uchar>
+ struct Cmp: binary_function<T, T, uchar> // Binary functor (T, T) -> uchar: wraps the comparison functor Op<T> and returns its outcome as a uchar mask value.
{
- __device__ __forceinline__ uchar operator()(T src1, T src2) const
+ __device__ __forceinline__ uchar operator()(T a, T b) const
{
Op<T> op;
- return static_cast<uchar>(static_cast<int>(op(src1, src2)) * 255);
+ return -op(a, b); // negate, then narrow to uchar: assuming op yields bool, true -> -1 -> 255 and false -> 0, the same values the removed `* 255` form produced
}
};
+}
-#define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
- template <> struct TransformFunctorTraits< Compare<op, type> > : DefaultTransformFunctorTraits< Compare<op, type> > \
- { \
- enum { smart_block_dim_y = block_dim_y }; \
- enum { smart_shift = shift }; \
- };
+namespace cv { namespace gpu { namespace device
+{
+ #define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
+ template <> struct TransformFunctorTraits< Cmp<op, type> > : DefaultTransformFunctorTraits< Cmp<op, type> > \
+ { \
+ enum { smart_block_dim_y = block_dim_y }; \
+ enum { smart_shift = shift }; \
+ };
IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, int, 8, 4)
IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, float, 8, 4)
IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, int, 8, 4)
IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4)
-#undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+ #undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+}}}
- template <template <typename> class Op, typename T> void compare(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+ template <template <typename> class Op, typename T>
+ void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Runs the two-input transform() with Cmp<Op, T> over src1 and src2, unmasked, on `stream`.
{
- Compare<Op, T> op;
- cv::gpu::device::transform(static_cast< PtrStepSz<T> >(src1), static_cast< PtrStepSz<T> >(src2), dst, op, WithOutMask(), stream);
+ Cmp<Op, T> op;
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream); // inputs are viewed as T; dst stays byte-typed because Cmp returns uchar
}
- template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Non-template-template entry point: forwards to cmpMat with Op = equal_to.
{
- compare<equal_to, T>(src1, src2, dst, stream);
+ cmpMat<equal_to, T>(src1, src2, dst, stream);
}
- template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Non-template-template entry point: forwards to cmpMat with Op = not_equal_to.
{
- compare<not_equal_to, T>(src1, src2, dst, stream);
+ cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
- template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Non-template-template entry point: forwards to cmpMat with Op = less.
{
- compare<less, T>(src1, src2, dst, stream);
+ cmpMat<less, T>(src1, src2, dst, stream);
}
- template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Non-template-template entry point: forwards to cmpMat with Op = less_equal.
{
- compare<less_equal, T>(src1, src2, dst, stream);
+ cmpMat<less_equal, T>(src1, src2, dst, stream);
}
- template void compare_eq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_ne<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_lt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_le<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
+ template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpScalar
+
+namespace
+{
#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
- template <template <typename> class Op, typename T, int cn> struct CompareScalar;
+ template <template <typename> class Op, typename T, int cn> struct CmpScalar;
template <template <typename> class Op, typename T>
- struct CompareScalar<Op, T, 1>: unary_function<T, uchar>
+ struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
{
const T val;
- __host__ explicit CompareScalar(T val_) : val(val_) {}
+ __host__ explicit CmpScalar(T val_) : val(val_) {}
__device__ __forceinline__ uchar operator()(T src) const
{
- Op<T> op;
- return static_cast<uchar>(static_cast<int>(op(src, val)) * 255);
+ Cmp<Op, T> op;
+ return op(src, val);
}
};
template <template <typename> class Op, typename T>
- struct CompareScalar<Op, T, 2>: unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
+ struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
{
const TYPE_VEC(T, 2) val;
- __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
+ __host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
{
- Op<T> op;
- return VecTraits<TYPE_VEC(uchar, 2)>::make(
- static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255));
+ Cmp<Op, T> op;
+ return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};
template <template <typename> class Op, typename T>
- struct CompareScalar<Op, T, 3>: unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
+ struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
{
const TYPE_VEC(T, 3) val;
- __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
+ __host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
{
- Op<T> op;
- return VecTraits<TYPE_VEC(uchar, 3)>::make(
- static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255));
+ Cmp<Op, T> op;
+ return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};
template <template <typename> class Op, typename T>
- struct CompareScalar<Op, T, 4>: unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
+ struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
{
const TYPE_VEC(T, 4) val;
- __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
+ __host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
{
- Op<T> op;
- return VecTraits<TYPE_VEC(uchar, 4)>::make(
- static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255),
- static_cast<uchar>(static_cast<int>(op(src.w, val.w)) * 255));
+ Cmp<Op, T> op;
+ return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};
#undef TYPE_VEC
+}
+namespace cv { namespace gpu { namespace device
+{
#define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
- template <> struct TransformFunctorTraits< CompareScalar<op, type, 1> > : DefaultTransformFunctorTraits< CompareScalar<op, type, 1> > \
+ template <> struct TransformFunctorTraits< CmpScalar<op, type, 1> > : DefaultTransformFunctorTraits< CmpScalar<op, type, 1> > \
{ \
enum { smart_block_dim_y = block_dim_y }; \
enum { smart_shift = shift }; \
IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4)
#undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+}}}
- template <template <typename> class Op, typename T, int cn> void compare(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+ template <template <typename> class Op, typename T, int cn>
+ void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream) // Compares src (cn-channel elements of T) against the scalar val via CmpScalar<Op, T, cn>; all four val entries are static_cast to T (no saturation) before being packed.
{
typedef typename TypeVec<T, cn>::vec_type src_t;
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
src_t val1 = VecTraits<src_t>::make(sval);
- CompareScalar<Op, T, cn> op(val1);
-
- cv::gpu::device::transform(static_cast< PtrStepSz<src_t> >(src), static_cast< PtrStepSz<dst_t> >(dst), op, WithOutMask(), stream);
+ CmpScalar<Op, T, cn> op(val1); // functor is built on the host and carries the packed scalar by value
+ transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream); // src is viewed as vectors of T, dst as vectors of uchar with the same channel count; unmasked, on `stream`
}
- template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // Runtime-channel-count entry point: selects cmpScalar<equal_to, T, cn> from a table and calls it.
+ {
+ typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ static const func_t funcs[] =
+ {
+ 0, // slot 0 is a null placeholder so the table can be indexed directly by cn
+ cmpScalar<equal_to, T, 1>,
+ cmpScalar<equal_to, T, 2>,
+ cmpScalar<equal_to, T, 3>,
+ cmpScalar<equal_to, T, 4>
+ };
+
+ funcs[cn](src, val, dst, stream); // cn is not validated here: only 1..4 select a real function (0 is null, anything else indexes past the table)
+ }
+ template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
- compare<equal_to, T, 1>,
- compare<equal_to, T, 2>,
- compare<equal_to, T, 3>,
- compare<equal_to, T, 4>
+ cmpScalar<not_equal_to, T, 1>,
+ cmpScalar<not_equal_to, T, 2>,
+ cmpScalar<not_equal_to, T, 3>,
+ cmpScalar<not_equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
- template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
- compare<not_equal_to, T, 1>,
- compare<not_equal_to, T, 2>,
- compare<not_equal_to, T, 3>,
- compare<not_equal_to, T, 4>
+ cmpScalar<less, T, 1>,
+ cmpScalar<less, T, 2>,
+ cmpScalar<less, T, 3>,
+ cmpScalar<less, T, 4>
};
funcs[cn](src, val, dst, stream);
}
- template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
- compare<less, T, 1>,
- compare<less, T, 2>,
- compare<less, T, 3>,
- compare<less, T, 4>
+ cmpScalar<less_equal, T, 1>, // table slots 1..4 hold cmpScalar<less_equal, T, cn>; slot 0 is a null placeholder
+ cmpScalar<less_equal, T, 2>,
+ cmpScalar<less_equal, T, 3>,
+ cmpScalar<less_equal, T, 4> // the call through funcs[cn] that follows does not range-check cn
};
funcs[cn](src, val, dst, stream);
}
- template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
- compare<less_equal, T, 1>,
- compare<less_equal, T, 2>,
- compare<less_equal, T, 3>,
- compare<less_equal, T, 4>
+ cmpScalar<greater, T, 1>, // table slots 1..4 hold cmpScalar<greater, T, cn>; slot 0 is a null placeholder
+ cmpScalar<greater, T, 2>,
+ cmpScalar<greater, T, 3>,
+ cmpScalar<greater, T, 4> // the call through funcs[cn] that follows does not range-check cn
};
funcs[cn](src, val, dst, stream);
}
- template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
- compare<greater, T, 1>,
- compare<greater, T, 2>,
- compare<greater, T, 3>,
- compare<greater, T, 4>
+ cmpScalar<greater_equal, T, 1>, // table slots 1..4 hold cmpScalar<greater_equal, T, cn>; slot 0 is a null placeholder
+ cmpScalar<greater_equal, T, 2>,
+ cmpScalar<greater_equal, T, 3>,
+ cmpScalar<greater_equal, T, 4>
};
- funcs[cn](src, val, dst, stream);
+ funcs[cn](src, val, dst, stream); // cn is not range-checked here: only 1..4 select a valid entry
+ }
+
+ template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarEq: uchar, schar, ushort, short, int, float, double
+ template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // same seven element types for cmpScalarNe
+ template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // same seven element types for cmpScalarLt
+ template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // same seven element types for cmpScalarLe
+ template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // same seven element types for cmpScalarGt
+ template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+
+ template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // same seven element types for cmpScalarGe
+ template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// bitMat
+
+namespace cv { namespace gpu { namespace device
+{ // TransformFunctorTraits specializations for the bit_not / bit_and / bit_or / bit_xor functors on uchar, ushort and uint.
+ template <> struct TransformFunctorTraits< bit_not<uchar> > : DefaultTransformFunctorTraits< bit_not<uchar> >
+ {
+ enum { smart_shift = 4 }; // throughout this namespace: 4 for uchar and ushort, 2 for uint (presumably consumed by transform(), defined elsewhere)
+ };
+ template <> struct TransformFunctorTraits< bit_not<ushort> > : DefaultTransformFunctorTraits< bit_not<ushort> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_not<uint> > : DefaultTransformFunctorTraits< bit_not<uint> >
+ {
+ enum { smart_shift = 2 };
+ };
+
+ template <> struct TransformFunctorTraits< bit_and<uchar> > : DefaultTransformFunctorTraits< bit_and<uchar> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_and<ushort> > : DefaultTransformFunctorTraits< bit_and<ushort> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_and<uint> > : DefaultTransformFunctorTraits< bit_and<uint> >
+ {
+ enum { smart_shift = 2 };
+ };
+
+ template <> struct TransformFunctorTraits< bit_or<uchar> > : DefaultTransformFunctorTraits< bit_or<uchar> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_or<ushort> > : DefaultTransformFunctorTraits< bit_or<ushort> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_or<uint> > : DefaultTransformFunctorTraits< bit_or<uint> >
+ {
+ enum { smart_shift = 2 };
+ };
+
+ template <> struct TransformFunctorTraits< bit_xor<uchar> > : DefaultTransformFunctorTraits< bit_xor<uchar> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_xor<ushort> > : DefaultTransformFunctorTraits< bit_xor<ushort> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< bit_xor<uint> > : DefaultTransformFunctorTraits< bit_xor<uint> >
+ {
+ enum { smart_shift = 2 };
+ };
+}}}
+
+namespace arithm
+{
+ template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ { // Runs transform() with bit_not<T> from src to dst, both cast to PtrStepSz<T>, on the given stream.
+ if (mask.data) // a non-null mask pointer selects the overload that receives the mask
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
+ else // no mask supplied: pass the WithOutMask() tag object instead
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
+ }
+
+ template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ { // Runs the two-input transform() with bit_and<T>; src1, src2 and dst are cast to PtrStepSz<T>.
+ if (mask.data) // a non-null mask pointer selects the overload that receives the mask
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
+ else // no mask supplied: pass the WithOutMask() tag object instead
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
}
- template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+
+ template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
- typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
- static const func_t funcs[] =
- {
- 0,
- compare<greater_equal, T, 1>,
- compare<greater_equal, T, 2>,
- compare<greater_equal, T, 3>,
- compare<greater_equal, T, 4>
- };
+ if (mask.data) // a non-null mask pointer selects the overload that receives the mask
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream); // two-input transform() with bit_or<T>
+ else // no mask supplied: pass the WithOutMask() tag object instead
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
+ }
- funcs[cn](src, val, dst, stream);
+ template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+ { // Runs the two-input transform() with bit_xor<T>; src1, src2 and dst are cast to PtrStepSz<T>.
+ if (mask.data) // a non-null mask pointer selects the overload that receives the mask
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
+ else // no mask supplied: pass the WithOutMask() tag object instead
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
- template void compare_eq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_eq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_ne<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ne<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_lt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_lt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_le<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_le<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_gt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_gt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- template void compare_ge<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template void compare_ge<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-
- //////////////////////////////////////////////////////////////////////////
- // Unary bitwise logical matrix operations
-
- enum { UN_OP_NOT };
-
- template <typename T, int opid>
- struct UnOp;
+ template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // explicit instantiations of bitMatNot: uchar, ushort, uint
+ template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
- template <typename T>
- struct UnOp<T, UN_OP_NOT>
+ template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // same three element types for bitMatAnd
+ template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // same three element types for bitMatOr
+ template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+ template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // same three element types for bitMatXor
+ template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// bitScalar
+
+namespace cv { namespace gpu { namespace device
+{ // TransformFunctorTraits specializations for binder2nd-wrapped bit_and / bit_or / bit_xor on uchar, ushort and uint.
+ template <> struct TransformFunctorTraits< binder2nd< bit_and<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uchar> > >
+ {
+ enum { smart_shift = 4 }; // throughout this namespace: 4 for uchar and ushort, 2 for uint (presumably consumed by transform(), defined elsewhere)
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_and<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<ushort> > >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_and<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uint> > >
{
- static __device__ __forceinline__ T call(T v) { return ~v; }
+ enum { smart_shift = 2 };
};
+ template <> struct TransformFunctorTraits< binder2nd< bit_or<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uchar> > >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_or<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<ushort> > >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_or<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uint> > >
+ {
+ enum { smart_shift = 2 };
+ };
- template <int opid>
- __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)
+ template <> struct TransformFunctorTraits< binder2nd< bit_xor<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uchar> > >
{
- const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
- const int y = blockDim.y * blockIdx.y + threadIdx.y;
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_xor<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<ushort> > >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< bit_xor<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uint> > >
+ {
+ enum { smart_shift = 2 };
+ };
+}}}
- if (y < rows)
- {
- uchar* dst_ptr = dst.ptr(y) + x;
- const uchar* src_ptr = src.ptr(y) + x;
- if (x + sizeof(uint) - 1 < width)
- {
- *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
- }
- else
- {
- const uchar* src_end = src.ptr(y) + width;
- while (src_ptr < src_end)
- {
- *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
- }
- }
- }
+namespace arithm
+{
+ template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+ { // Runs transform() from src1 to dst with bit_and<T> whose second operand is bound to src2; no mask (WithOutMask()).
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream); // NOTE(review): src2 is a uint for every T; any narrowing to T happens inside bind2nd (not shown here) - confirm
}
+ template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
+ { // Runs transform() from src1 to dst with bit_or<T> whose second operand is bound to src2; no mask (WithOutMask()).
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream); // NOTE(review): src2 is a uint for every T; any narrowing to T happens inside bind2nd (not shown here) - confirm
+ }
- template <int opid>
- void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst,
- cudaStream_t stream)
+ template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
- dim3 threads(16, 16);
- dim3 grid(divUp(width, threads.x * sizeof(uint)),
- divUp(rows, threads.y));
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream); // bit_xor<T> with its second operand bound to src2, no mask; NOTE(review): src2 is a uint for every T - confirm how bind2nd narrows it
+ }
- bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);
- cudaSafeCall( cudaGetLastError() );
+ template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of bitScalarAnd: uchar, ushort, uint
+ template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); // same three element types for bitScalarOr
+ template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); // same three element types for bitScalarXor
+ template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+ template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+}
- template <typename T, int opid>
- __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src,
- const PtrStepb mask, PtrStepb dst)
- {
- const int x = blockDim.x * blockIdx.x + threadIdx.x;
- const int y = blockDim.y * blockIdx.y + threadIdx.y;
+//////////////////////////////////////////////////////////////////////////
+// min
- if (x < cols && y < rows && mask.ptr(y)[x / cn])
+namespace
+{
+ template <typename T> struct VMin4; // primary template is declared only; the uint and int specializations follow
+ template <> struct VMin4<uint> : binary_function<uint, uint, uint>
+ { // Device functor that passes both uint operands to the PTX vmin4 / vmin instructions with unsigned (.u32) type suffixes.
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
{
- T* dst_row = (T*)dst.ptr(y);
- const T* src_row = (const T*)src.ptr(y);
+ uint res = 0; // res is both the asm output (%0) and the fourth input operand (%3)
- dst_row[x] = UnOp<T, opid>::call(src_row[x]);
- }
- }
+ #if __CUDA_ARCH__ >= 300
+ asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: a single vmin4 instruction
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200: four vmin instructions, one per byte selector .b0-.b3, each taking the previous res as %3
+ asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+ return res; // still 0 when neither preprocessor branch above was compiled
+ }
- template <typename T, int opid>
- void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+ __device__ __forceinline__ VMin4() {} // user-provided empty device constructors; the copy constructor ignores its argument
+ __device__ __forceinline__ VMin4(const VMin4& other) {}
+ };
+ template <> struct VMin4<int> : binary_function<int, int, int>
{
- dim3 threads(16, 16);
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ { // Passes both int operands to the PTX vmin4 / vmin instructions with signed (.s32) type suffixes.
+ int res = 0; // res is both the asm output (%0) and the fourth input operand (%3)
- bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
- cudaSafeCall( cudaGetLastError() );
+ #if __CUDA_ARCH__ >= 300
+ asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: a single vmin4 instruction
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200: four vmin instructions, one per byte selector .b0-.b3, each taking the previous res as %3
+ asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ return res; // still 0 when neither preprocessor branch above was compiled
+ }
+
+ __device__ __forceinline__ VMin4() {} // user-provided empty device constructors; the copy constructor ignores its argument
+ __device__ __forceinline__ VMin4(const VMin4& other) {}
+ };
+ ////////////////////////////////////
- void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn,
- const PtrStepb src, PtrStepb dst, cudaStream_t stream)
+ template <typename T> struct VMin2; // primary template is declared only; the uint and int specializations follow
+ template <> struct VMin2<uint> : binary_function<uint, uint, uint>
{
- bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);
- }
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ { // Passes both uint operands to the PTX vmin2 / vmin instructions with unsigned (.u32) type suffixes.
+ uint res = 0; // res is both the asm output (%0) and the fourth input operand (%3)
+ #if __CUDA_ARCH__ >= 300
+ asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: a single vmin2 instruction
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200: two vmin instructions, one per half-word selector .h0 / .h1
+ asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
- template <typename T>
- void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+ return res; // still 0 when neither preprocessor branch above was compiled
+ }
+
+ __device__ __forceinline__ VMin2() {} // user-provided empty device constructors; the copy constructor ignores its argument
+ __device__ __forceinline__ VMin2(const VMin2& other) {}
+ };
+ template <> struct VMin2<int> : binary_function<int, int, int>
{
- bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
- }
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ { // Passes both int operands to the PTX vmin2 / vmin instructions with signed (.s32) type suffixes.
+ int res = 0; // res is both the asm output (%0) and the fourth input operand (%3)
- template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+ #if __CUDA_ARCH__ >= 300
+ asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: a single vmin2 instruction
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200: two vmin instructions, one per half-word selector .h0 / .h1
+ asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+ return res; // still 0 when neither preprocessor branch above was compiled
+ }
- //////////////////////////////////////////////////////////////////////////
- // Binary bitwise logical matrix operations
+ __device__ __forceinline__ VMin2() {} // user-provided empty device constructors; the copy constructor ignores its argument
+ __device__ __forceinline__ VMin2(const VMin2& other) {}
+ };
+}
- enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };
+namespace cv { namespace gpu { namespace device
+{ // TransformFunctorTraits specializations for the VMin4 / VMin2 functors and for minimum<T>, plain and binder2nd-wrapped.
+ template <typename T> struct TransformFunctorTraits< VMin4<T> > : DefaultTransformFunctorTraits< VMin4<T> >
+ {
+ enum { smart_block_dim_y = 4 }; // partial specialization: applies to every VMin4<T>
+ enum { smart_shift = 4 };
+ };
- template <typename T, int opid>
- struct BinOp;
+ ////////////////////////////////////
- template <typename T>
- struct BinOp<T, BIN_OP_OR>
+ template <typename T> struct TransformFunctorTraits< VMin2<T> > : DefaultTransformFunctorTraits< VMin2<T> >
{
- static __device__ __forceinline__ T call(T a, T b) { return a | b; }
+ enum { smart_block_dim_y = 4 }; // partial specialization: applies to every VMin2<T>
+ enum { smart_shift = 4 };
};
+ ////////////////////////////////////
- template <typename T>
- struct BinOp<T, BIN_OP_AND>
+ template <> struct TransformFunctorTraits< minimum<ushort> > : DefaultTransformFunctorTraits< minimum<ushort> >
{
- static __device__ __forceinline__ T call(T a, T b) { return a & b; }
+ enum { smart_shift = 4 }; // ushort and short set only smart_shift; int and float further down also set smart_block_dim_y
};
-
- template <typename T>
- struct BinOp<T, BIN_OP_XOR>
+ template <> struct TransformFunctorTraits< minimum<short> > : DefaultTransformFunctorTraits< minimum<short> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< minimum<int> > : DefaultTransformFunctorTraits< minimum<int> >
+ {
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< minimum<float> > : DefaultTransformFunctorTraits< minimum<float> >
{
- static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
};
+ template <> struct TransformFunctorTraits< binder2nd< minimum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< minimum<ushort> > >
+ {
+ enum { smart_shift = 4 }; // the binder2nd-wrapped specializations repeat the values of the unwrapped minimum<T> ones
+ };
+ template <> struct TransformFunctorTraits< binder2nd< minimum<short> > > : DefaultTransformFunctorTraits< binder2nd< minimum<short> > >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< minimum<int> > > : DefaultTransformFunctorTraits< binder2nd< minimum<int> > >
+ {
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< binder2nd< minimum<float> > > : DefaultTransformFunctorTraits< binder2nd< minimum<float> > >
+ {
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
+ };
+}}}
- template <int opid>
- __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1,
- const PtrStepb src2, PtrStepb dst)
+namespace arithm
+{
+ template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
- const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
- const int y = blockDim.y * blockIdx.y + threadIdx.y;
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream); // two-input transform() with the VMin4<T> functor, no mask; all three images are cast to PtrStepSz<T>
+ }
- if (y < rows)
- {
- uchar* dst_ptr = dst.ptr(y) + x;
- const uchar* src1_ptr = src1.ptr(y) + x;
- const uchar* src2_ptr = src2.ptr(y) + x;
+ template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ { // Two-input transform() with the VMin2<T> functor and no mask; all three images are cast to PtrStepSz<T>.
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
+ }
- if (x + sizeof(uint) - 1 < width)
- {
- *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
- }
- else
- {
- const uchar* src1_end = src1.ptr(y) + width;
- while (src1_ptr < src1_end)
- {
- *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
- }
- }
- }
+ template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ { // Two-input transform() with the minimum<T> functor and no mask; all three images are cast to PtrStepSz<T>.
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
+ template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of vmin4: uint and int
+ template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <int opid>
- void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2,
- PtrStepb dst, cudaStream_t stream)
- {
- dim3 threads(16, 16);
- dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
+ template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of vmin2: uint and int
+ template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
- cudaSafeCall( cudaGetLastError() );
+ template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of minMat: uchar, schar, ushort, short, int, float, double
+ template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
+ template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
+ { // Runs transform() from src1 to dst with minimum<T> whose second operand is bound to src2; no mask (WithOutMask()).
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream); // NOTE(review): src2 is a double for every T; any conversion to T happens inside bind2nd (not shown here) - confirm callers pass values representable in T
}
+ template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of minScalar: uchar, schar, ushort, short, int, float, double
+ template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
- template <typename T, int opid>
- __global__ void bitwiseBinOpKernel(
- int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
- const PtrStepb mask, PtrStepb dst)
- {
- const int x = blockDim.x * blockIdx.x + threadIdx.x;
- const int y = blockDim.y * blockIdx.y + threadIdx.y;
+//////////////////////////////////////////////////////////////////////////
+// max
- if (x < cols && y < rows && mask.ptr(y)[x / cn])
+namespace
+{
+ template <typename T> struct VMax4; // primary template is only declared; just the uint and int specializations below are defined
+ template <> struct VMax4<uint> : binary_function<uint, uint, uint> // binary functor: unsigned maximum of two 32-bit words via inline PTX vmax instructions
+ {
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
{
- T* dst_row = (T*)dst.ptr(y);
- const T* src1_row = (const T*)src1.ptr(y);
- const T* src2_row = (const T*)src2.ptr(y);
+ uint res = 0; // stays 0 (and is returned as 0) when neither __CUDA_ARCH__ branch below is compiled
- dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
- }
- }
+ #if __CUDA_ARCH__ >= 300
+ asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: one vmax4 instruction; res is also fed in as the fourth operand
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200 fallback: four vmax instructions, one per byte selector .b0-.b3, each taking the previous res as fourth operand
+ asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+ return res;
+ }
- template <typename T, int opid>
- void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+ __device__ __forceinline__ VMax4() {} // empty default constructor; the functor has no data members
+ __device__ __forceinline__ VMax4(const VMax4& other) {} // empty copy constructor; nothing to copy
+ };
+ template <> struct VMax4<int> : binary_function<int, int, int> // binary functor: signed (.s32) counterpart of the vmax4-based maximum
{
- dim3 threads(16, 16);
- dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
- bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
- cudaSafeCall( cudaGetLastError() );
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ int res = 0; // stays 0 (and is returned as 0) when neither __CUDA_ARCH__ branch below is compiled
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
+ #if __CUDA_ARCH__ >= 300
+ asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: one vmax4 instruction; res is also fed in as the fourth operand
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200 fallback: four vmax instructions, one per byte selector .b0-.b3, each taking the previous res as fourth operand
+ asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+ return res;
+ }
- void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
- {
- bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
- }
+ __device__ __forceinline__ VMax4() {} // empty default constructor; the functor has no data members
+ __device__ __forceinline__ VMax4(const VMax4& other) {} // empty copy constructor; nothing to copy
+ };
+ ////////////////////////////////////
- template <typename T>
- void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+ template <typename T> struct VMax2; // primary template is only declared; just the uint and int specializations below are defined
+ template <> struct VMax2<uint> : binary_function<uint, uint, uint> // binary functor: unsigned maximum of two 32-bit words via inline PTX vmax2/vmax instructions
{
- bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
- }
+ __device__ __forceinline__ uint operator ()(uint a, uint b) const
+ {
+ uint res = 0; // stays 0 (and is returned as 0) when neither __CUDA_ARCH__ branch below is compiled
- template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+ #if __CUDA_ARCH__ >= 300
+ asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: one vmax2 instruction; res is also fed in as the fourth operand
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200 fallback: two vmax instructions, one per half-word selector .h0/.h1, each taking the previous res as fourth operand
+ asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
+ return res;
+ }
- void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+ __device__ __forceinline__ VMax2() {} // empty default constructor; the functor has no data members
+ __device__ __forceinline__ VMax2(const VMax2& other) {} // empty copy constructor; nothing to copy
+ };
+ template <> struct VMax2<int> : binary_function<int, int, int> // binary functor: signed (.s32) counterpart of the vmax2-based maximum
{
- bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
- }
-
+ __device__ __forceinline__ int operator ()(int a, int b) const
+ {
+ int res = 0; // stays 0 (and is returned as 0) when neither __CUDA_ARCH__ branch below is compiled
- template <typename T>
- void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
- {
- bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
- }
+ #if __CUDA_ARCH__ >= 300
+ asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 300: one vmax2 instruction; res is also fed in as the fourth operand
+ #elif __CUDA_ARCH__ >= 200
+ asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); // arch >= 200 fallback: two vmax instructions, one per half-word selector .h0/.h1, each taking the previous res as fourth operand
+ asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+ #endif
- template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+ return res;
+ }
+ __device__ __forceinline__ VMax2() {} // empty default constructor; the functor has no data members
+ __device__ __forceinline__ VMax2(const VMax2& other) {} // empty copy constructor; nothing to copy
+ };
+}
- void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
- const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+namespace cv { namespace gpu { namespace device
+{
+ template <typename T> struct TransformFunctorTraits< VMax4<T> > : DefaultTransformFunctorTraits< VMax4<T> > // transform() tuning traits for every VMax4<T>; everything not redeclared here comes from the Default base
{
- bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
- }
+ enum { smart_block_dim_y = 4 }; // NOTE(review): presumably shadows a same-named default in DefaultTransformFunctorTraits (block height used by transform) - confirm in transform.hpp
+ enum { smart_shift = 4 }; // NOTE(review): presumably the number of elements each thread handles in transform's "smart" path - confirm in transform.hpp
+ };
+ ////////////////////////////////////
- template <typename T>
- void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
- const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+ template <typename T> struct TransformFunctorTraits< VMax2<T> > : DefaultTransformFunctorTraits< VMax2<T> > // transform() tuning traits for every VMax2<T>; same two values as the VMax4 traits
{
- bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
- }
-
- template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+ enum { smart_block_dim_y = 4 }; // NOTE(review): presumably shadows a same-named default in DefaultTransformFunctorTraits - confirm in transform.hpp
+ enum { smart_shift = 4 }; // NOTE(review): presumably the number of elements each thread handles in transform's "smart" path - confirm in transform.hpp
+ };
- //////////////////////////////////////////////////////////////////////////
- // min/max
+ ////////////////////////////////////
- namespace detail
+ template <> struct TransformFunctorTraits< maximum<ushort> > : DefaultTransformFunctorTraits< maximum<ushort> > // full specializations replace the removed sizeof(T)-keyed MinMaxTraits: ushort/short (the old size-2 case) set only smart_shift
{
- template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
- {
- };
- template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
- {
- enum { smart_shift = 4 };
- };
- template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
- {
- enum { smart_block_dim_y = 4 };
- enum { smart_shift = 4 };
- };
- }
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< maximum<short> > : DefaultTransformFunctorTraits< maximum<short> >
+ {
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< maximum<int> > : DefaultTransformFunctorTraits< maximum<int> > // int/float (the old size-4 case) set both smart_block_dim_y and smart_shift
+ {
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
+ };
+ template <> struct TransformFunctorTraits< maximum<float> > : DefaultTransformFunctorTraits< maximum<float> >
+ {
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
+ };
- template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
+ template <> struct TransformFunctorTraits< binder2nd< maximum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< maximum<ushort> > > // traits for the scalar-bound functor type binder2nd< maximum<T> >; values mirror the plain maximum<T> traits
{
+ enum { smart_shift = 4 }; // ushort/short: only smart_shift is set
};
- template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
+ template <> struct TransformFunctorTraits< binder2nd< maximum<short> > > : DefaultTransformFunctorTraits< binder2nd< maximum<short> > >
{
+ enum { smart_shift = 4 };
};
- template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
+ template <> struct TransformFunctorTraits< binder2nd< maximum<int> > > : DefaultTransformFunctorTraits< binder2nd< maximum<int> > >
{
+ enum { smart_block_dim_y = 4 }; // int/float: both smart_block_dim_y and smart_shift are set
+ enum { smart_shift = 4 };
};
- template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
+ template <> struct TransformFunctorTraits< binder2nd< maximum<float> > > : DefaultTransformFunctorTraits< binder2nd< maximum<float> > >
{
+ enum { smart_block_dim_y = 4 };
+ enum { smart_shift = 4 };
};
+}}}
- template <typename T>
- void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+ template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // applies the VMax4<T> functor to src1 and src2 through transform(), writing to dst on the given stream
{
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, minimum<T>(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream); // buffers are cast to PtrStepSz<T>, so each T element (uint or int) is one 32-bit word handed to VMax4; WithOutMask() is passed as the mask argument
}
- template void min_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template <typename T>
- void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // applies the VMax2<T> functor to src1 and src2 through transform(), writing to dst on the given stream
{
- cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, maximum<T>(), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream); // buffers are cast to PtrStepSz<T>, so each T element (uint or int) is one 32-bit word handed to VMax2; WithOutMask() is passed as the mask argument
}
- template void max_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<int >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template <typename T>
- void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream)
+ template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // applies maximum<T> to src1 and src2 through transform(), writing to dst on the given stream
{
- cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(minimum<T>(), val), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream); // all three buffers are cast to PtrStepSz<T>; WithOutMask() is passed as the mask argument
}
- template void min_gpu<uchar >(const PtrStepSzb src, uchar val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<schar >(const PtrStepSzb src, schar val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<short >(const PtrStepSzb src, short val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<int >(const PtrStepSzb src, int val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<float >(const PtrStepSzb src, float val, PtrStepSzb dst, cudaStream_t stream);
- template void min_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of vmax4 for the two types that have a VMax4 specialization (uint, int)
+ template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T>
- void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream)
+ template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of vmax2 for the two types that have a VMax2 specialization (uint, int)
+ template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of maxMat, one per element type listed (uchar .. double)
+ template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+ template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream) // applies maximum<T>(x, src2) through transform(), with src1 as input and dst as output, on the given stream
{
- cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(maximum<T>(), val), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream); // src1/dst are cast to PtrStepSz<T>; bind2nd fixes the double scalar as maximum<T>'s second argument; WithOutMask() is passed as the mask argument
}
- template void max_gpu<uchar >(const PtrStepSzb src, uchar val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<schar >(const PtrStepSzb src, schar val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<short >(const PtrStepSzb src, short val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<int >(const PtrStepSzb src, int val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<float >(const PtrStepSzb src, float val, PtrStepSzb dst, cudaStream_t stream);
- template void max_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of maxScalar, one per element type listed (uchar .. double)
+ template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
- //////////////////////////////////////////////////////////////////////////
- // threshold
+//////////////////////////////////////////////////////////////////////////
+// threshold
+namespace cv { namespace gpu { namespace device
+{
namespace detail
{
template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
{
};
+}}}
+namespace arithm
+{
template <template <typename> class Op, typename T>
- void threshold_caller(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream)
+ void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream) // builds Op<T>(thresh, maxVal) and applies it from src to dst through transform(); src/dst are now taken by value instead of by const reference
{
Op<T> op(thresh, maxVal);
- cv::gpu::device::transform(src, dst, op, WithOutMask(), stream);
+ transform(src, dst, op, WithOutMask(), stream); // unqualified call; the file-scope `using namespace cv::gpu::device` makes transform visible here
}
template <typename T>
- void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type,
- cudaStream_t stream)
+ void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream) // dispatches on `type` to a threshold_caller instantiation; thresh/maxVal arrive as double and are converted to T at the call below
{
- typedef void (*caller_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream);
+ typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream); // function-pointer type matching threshold_caller's by-value signature
static const caller_t callers[] =
{
threshold_caller<thresh_to_zero_inv_func, T>
};
- callers[type]((PtrStepSz<T>)src, (PtrStepSz<T>)dst, thresh, maxVal, stream);
+ callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream); // `type` indexes callers[] directly with no range check here; static_cast<T> narrows the double arguments
}
- template void threshold_gpu<uchar>(const PtrStepSzb& src, const PtrStepSzb& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<schar>(const PtrStepSzb& src, const PtrStepSzb& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<ushort>(const PtrStepSzb& src, const PtrStepSzb& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<short>(const PtrStepSzb& src, const PtrStepSzb& dst, short thresh, short maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<int>(const PtrStepSzb& src, const PtrStepSzb& dst, int thresh, int maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<float>(const PtrStepSzb& src, const PtrStepSzb& dst, float thresh, float maxVal, int type, cudaStream_t stream);
- template void threshold_gpu<double>(const PtrStepSzb& src, const PtrStepSzb& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); // explicit instantiations of threshold, one per element type listed; every one now shares the same double thresh/maxVal signature
+ template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+}
- //////////////////////////////////////////////////////////////////////////
- // pow
+//////////////////////////////////////////////////////////////////////////
+// pow
- template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
+namespace
+{
+ template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
- const float power;
+ float power;
PowOp(double power_) : power(static_cast<float>(power_)) {}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
{
- const float power;
+ float power;
PowOp(double power_) : power(static_cast<float>(power_)) {}
};
template<> struct PowOp<double> : unary_function<double, double>
{
- const double power;
+ double power;
PowOp(double power_) : power(power_) {}
return ::pow(::fabs(e), power);
}
};
+}
+namespace cv { namespace gpu { namespace device
+{
namespace detail
{
template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
{
};
+}}}
+namespace arithm
+{
template<typename T>
- void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
+ void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream) // renamed from pow_caller; applies PowOp<T>(power) from src to dst through transform() on the given stream
{
- cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, PowOp<T>(power), WithOutMask(), stream);
+ transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream); // src/dst are cast to PtrStepSz<T>; PowOp<T> is constructed from the double exponent; WithOutMask() is passed as the mask argument
}
- template void pow_caller<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
- template void pow_caller<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of arithm::pow, one per element type listed
+ template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+ template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+}
- //////////////////////////////////////////////////////////////////////////
- // addWeighted
+//////////////////////////////////////////////////////////////////////////
+// addWeighted
- namespace detail
+namespace
+{
+ template <typename T> struct UseDouble_ // per-type flag consumed by UseDouble<T1, T2, D>; replaces the removed detail::UseDouble
{
- template <typename T> struct UseDouble
- {
- enum {value = 0};
- };
- template <> struct UseDouble<int>
- {
- enum {value = 1};
- };
- template <> struct UseDouble<float>
- {
- enum {value = 1};
- };
- template <> struct UseDouble<double>
- {
- enum {value = 1};
- };
- }
+ enum {value = 0}; // primary template: 0 for every type without a specialization
+ };
+ template <> struct UseDouble_<double> // only double is specialized to 1; the removed int and float specializations are not carried over
+ {
+ enum {value = 1};
+ };
template <typename T1, typename T2, typename D> struct UseDouble
{
- enum {value = (detail::UseDouble<T1>::value || detail::UseDouble<T2>::value || detail::UseDouble<D>::value)};
+ enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)}; // non-zero when any of T1, T2 or D has UseDouble_<...>::value set
};
- namespace detail
+ template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_; // only declared; the two partial specializations on useDouble below provide the definitions
+ template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D> // useDouble == false: coefficients are stored as float
{
- template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted;
- template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, false> : binary_function<T1, T2, D>
- {
- AddWeighted(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
+ float alpha; // members are no longer const (the removed version declared them const)
+ float beta;
+ float gamma;
- __device__ __forceinline__ D operator ()(T1 a, T2 b) const
- {
- return saturate_cast<D>(a * alpha + b * beta + gamma);
- }
+ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {} // narrows the three double coefficients to float once, at construction
- const float alpha;
- const float beta;
- const float gamma;
- };
- template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, true> : binary_function<T1, T2, D>
+ __device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
- AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
+ return saturate_cast<D>(a * alpha + b * beta + gamma); // weighted sum with float coefficients, converted to D by saturate_cast
+ }
+ };
+ template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D> // useDouble == true: coefficients are stored as double
+ {
+ double alpha;
+ double beta;
+ double gamma;
- __device__ __forceinline__ D operator ()(T1 a, T2 b) const
- {
- return saturate_cast<D>(a * alpha + b * beta + gamma);
- }
+ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {} // stores the coefficients unchanged
- const double alpha;
- const double beta;
- const double gamma;
- };
- }
- template <typename T1, typename T2, typename D> struct AddWeighted : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>
+ __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+ {
+ return saturate_cast<D>(a * alpha + b * beta + gamma); // weighted sum with double coefficients, converted to D by saturate_cast
+ }
+ };
+ template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value> // picks the float or double AddWeighted_ base according to UseDouble<T1, T2, D>::value
{
- AddWeighted(double alpha_, double beta_, double gamma_) : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
+ AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {} // forwards the three coefficients to the selected base
};
+}
+namespace cv { namespace gpu { namespace device
+{
template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >
{
enum { smart_shift = 4 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
+}}}
+namespace arithm
+{
template <typename T1, typename T2, typename D>
- void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream)
+ void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream) // applies AddWeighted<T1, T2, D>(alpha, beta, gamma) to src1 and src2, with dst as the output argument of transform()
{
- if (UseDouble<T1, T2, D>::value)
- {
- cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
- cudaSafeCall( cudaSetDoubleForDevice(&beta) );
- cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
- }
-
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
- cv::gpu::device::transform(static_cast< PtrStepSz<T1> >(src1), static_cast< PtrStepSz<T2> >(src2), static_cast< PtrStepSz<D> >(dst), op, WithOutMask(), stream);
+ transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // casts each PtrStepSzb to its element-typed PtrStepSz; passes WithOutMask() and the caller's stream
}
- template void addWeighted_gpu<uchar, uchar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, uchar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, schar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<uchar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<uchar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<schar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, schar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<schar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<schar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<schar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<schar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<schar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<schar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<ushort, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<ushort, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<ushort, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<ushort, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<ushort, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<ushort, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<short, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<short, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<short, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<short, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<short, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<int, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<int, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<int, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<int, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<float, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
- template void addWeighted_gpu<float, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<float, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
-
-
- template void addWeighted_gpu<double, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
- template void addWeighted_gpu<double, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-}}} // namespace cv { namespace gpu { namespace device
-
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+ template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); // first of seven explicit instantiations addWeighted<uchar, uchar, D>, D = uchar, schar, ushort, short, int, float, double; NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition
+ template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, schar, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, ushort, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, short, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, int, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, float, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<uchar, double, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+ // Explicit instantiations of addWeighted<schar, schar, D>, one per D in uchar, schar, ushort, short, int, float, double; no <schar, uchar, D> group appears in this part of the file. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<schar, ushort, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<schar, short, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<schar, int, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<schar, float, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<schar, double, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+ // Explicit instantiations of addWeighted<ushort, ushort, D>, one per D in uchar, schar, ushort, short, int, float, double; no <ushort, uchar, D> or <ushort, schar, D> group appears in this part of the file. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<ushort, short, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<ushort, int, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<ushort, float, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<ushort, double, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+ // Explicit instantiations of addWeighted<short, short, D>, one per D in uchar, schar, ushort, short, int, float, double; no <short, uchar|schar|ushort, D> group appears in this part of the file. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<short, int, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<short, float, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<short, double, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+ // Explicit instantiations of addWeighted<int, int, D>, one per D in uchar, schar, ushort, short, int, float, double; no <int, uchar|schar|ushort|short, D> group appears in this part of the file. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ // Explicit instantiations of addWeighted<int, float, D>, one per D in uchar, schar, ushort, short, int, float, double. NOTE(review): the three arguments presumably are the src1/src2/dst element types -- confirm against the template definition.
+ template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+ template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+
+ template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+ template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+
+
+
+ template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+ template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+}
+
+#endif /* CUDA_DISABLER */
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
- template <int DEPTH> struct NppArithmFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template <> struct NppArithmFunc<CV_32F>
- {
- typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
- };
-
- template <int DEPTH, typename NppArithmFunc<DEPTH>::func_t func> struct NppArithm
- {
- typedef typename NppArithmFunc<DEPTH>::npp_t npp_t;
-
- static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src1.cols;
- sz.height = src1.rows;
-
- nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
- (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template <typename NppArithmFunc<CV_32F>::func_t func> struct NppArithm<CV_32F, func>
- {
- typedef typename NppArithmFunc<CV_32F>::npp_t npp_t;
-
- static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src1.cols;
- sz.height = src1.rows;
-
- nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
- (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
-
template<int DEPTH, int cn> struct NppArithmScalarFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
////////////////////////////////////////////////////////////////////////
// add
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations of the launchers that cv::gpu::add(GpuMat, GpuMat) dispatches to
{
template <typename T, typename D>
- void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // no mask; the caller passes the row length divided by 4
+
+ template <typename T, typename D>
+ void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // no mask; the caller passes the row length divided by 2
template <typename T, typename D>
- void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-}}}
+ void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // generic path; receives the (possibly empty) mask
+}
void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm; // this function validates the arguments, allocates dst and dispatches to an arithm:: launcher chosen by source/destination depth
- typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // signature of the generic launchers; the mask may be empty
static const func_t funcs[7][7] =
{
- {add_gpu<unsigned char, unsigned char> , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short> , add_gpu<unsigned char, short> , add_gpu<unsigned char, int> , add_gpu<unsigned char, float> , add_gpu<unsigned char, double> },
- {0 /*add_gpu<signed char, unsigned char>*/ , 0 /*add_gpu<signed char, signed char>*/ , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/ , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/},
- {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short> , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int> , add_gpu<unsigned short, float> , add_gpu<unsigned short, double> },
- {0 /*add_gpu<short, unsigned char>*/ , 0 /*add_gpu<short, signed char>*/ , 0 /*add_gpu<short, unsigned short>*/ , add_gpu<short, short> , add_gpu<short, int> , add_gpu<short, float> , add_gpu<short, double> },
- {0 /*add_gpu<int, unsigned char>*/ , 0 /*add_gpu<int, signed char>*/ , 0 /*add_gpu<int, unsigned short>*/ , 0 /*add_gpu<int, short>*/ , add_gpu<int, int> , add_gpu<int, float> , add_gpu<int, double> },
- {0 /*add_gpu<float, unsigned char>*/ , 0 /*add_gpu<float, signed char>*/ , 0 /*add_gpu<float, unsigned short>*/ , 0 /*add_gpu<float, short>*/ , 0 /*add_gpu<float, int>*/ , add_gpu<float, float> , add_gpu<float, double> },
- {0 /*add_gpu<double, unsigned char>*/ , 0 /*add_gpu<double, signed char>*/ , 0 /*add_gpu<double, unsigned short>*/ , 0 /*add_gpu<double, short>*/ , 0 /*add_gpu<double, int>*/ , 0 /*add_gpu<double, float>*/ , add_gpu<double, double> }
+ { // rows are indexed by source depth, columns by destination depth; a 0 entry is rejected below with CV_StsUnsupportedFormat
+ addMat<unsigned char, unsigned char>,
+ addMat<unsigned char, signed char>,
+ addMat<unsigned char, unsigned short>,
+ addMat<unsigned char, short>,
+ addMat<unsigned char, int>,
+ addMat<unsigned char, float>,
+ addMat<unsigned char, double>
+ },
+ {
+ addMat<signed char, unsigned char>,
+ addMat<signed char, signed char>,
+ addMat<signed char, unsigned short>,
+ addMat<signed char, short>,
+ addMat<signed char, int>,
+ addMat<signed char, float>,
+ addMat<signed char, double>
+ },
+ {
+ 0 /*addMat<unsigned short, unsigned char>*/,
+ 0 /*addMat<unsigned short, signed char>*/,
+ addMat<unsigned short, unsigned short>,
+ addMat<unsigned short, short>,
+ addMat<unsigned short, int>,
+ addMat<unsigned short, float>,
+ addMat<unsigned short, double>
+ },
+ {
+ 0 /*addMat<short, unsigned char>*/,
+ 0 /*addMat<short, signed char>*/,
+ addMat<short, unsigned short>,
+ addMat<short, short>,
+ addMat<short, int>,
+ addMat<short, float>,
+ addMat<short, double>
+ },
+ {
+ 0 /*addMat<int, unsigned char>*/,
+ 0 /*addMat<int, signed char>*/,
+ 0 /*addMat<int, unsigned short>*/,
+ 0 /*addMat<int, short>*/,
+ addMat<int, int>,
+ addMat<int, float>,
+ addMat<int, double>
+ },
+ {
+ 0 /*addMat<float, unsigned char>*/,
+ 0 /*addMat<float, signed char>*/,
+ 0 /*addMat<float, unsigned short>*/,
+ 0 /*addMat<float, short>*/,
+ 0 /*addMat<float, int>*/,
+ addMat<float, float>,
+ addMat<float, double>
+ },
+ {
+ 0 /*addMat<double, unsigned char>*/,
+ 0 /*addMat<double, signed char>*/,
+ 0 /*addMat<double, unsigned short>*/,
+ 0 /*addMat<double, short>*/,
+ 0 /*addMat<double, int>*/,
+ 0 /*addMat<double, float>*/,
+ addMat<double, double>
+ }
};
- typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
- static const npp_func_t npp_funcs[] =
+ typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // signature of the packed fast-path launchers (no mask parameter)
+ static const vfunc_t vfuncs4[4][4] = // indexed [sdepth][ddepth]; only depth indices 0-1 (the 8-bit types) have entries
{
- NppArithm<CV_8U , nppiAdd_8u_C1RSfs >::call,
- 0,
- NppArithm<CV_16U, nppiAdd_16u_C1RSfs>::call,
- NppArithm<CV_16S, nppiAdd_16s_C1RSfs>::call,
- NppArithm<CV_32S, nppiAdd_32s_C1RSfs>::call,
- NppArithm<CV_32F, nppiAdd_32f_C1R >::call
+ {
+ vadd4<unsigned int, unsigned int>,
+ vadd4<unsigned int, int>,
+ 0,
+ 0
+ },
+ {
+ vadd4<int, unsigned int>,
+ vadd4<int, int>,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ }
+ };
+ static const vfunc_t vfuncs2[4][4] = // indexed [sdepth][ddepth]; only depth indices 2-3 (the 16-bit types) have entries
+ {
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ vadd2<unsigned int, unsigned int>,
+ vadd2<unsigned int, int>
+ },
+ {
+ 0,
+ 0,
+ vadd2<int, unsigned int>,
+ vadd2<int, int>
+ }
};
if (dtype < 0)
dtype = src1.depth();
- CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
- CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
+ const int sdepth = src1.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype); // depth part of the requested output type (defaults to the source depth above)
+ const int cn = src1.channels();
+
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+ CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) ); // a mask is accepted only for single-channel input
- if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+ dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
cudaStream_t stream = StreamAccessor::getStream(s);
- if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F)
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels are flattened: each row is described as cols * cn elements
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+ if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) // packed fast path: no mask, and both depths below CV_32S
{
- npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
- return;
+ const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+ const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+ const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+ const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // all three base pointers 32-byte aligned. NOTE(review): the row steps are not tested here - confirm they keep every row aligned
+
+ if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+ {
+ const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
+ const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
+
+ if (vfunc4 != 0 && (src1_.cols & 3) == 0) // flattened row length must be a multiple of 4
+ {
+ const int vcols = src1_.cols >> 2; // the launcher sees one column per group of 4 elements
+
+ vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+
+ if (vfunc2 != 0 && (src1_.cols & 1) == 0) // flattened row length must be even
+ {
+ const int vcols = src1_.cols >> 1; // the launcher sees one column per pair of elements
+
+ vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+ }
}
- const func_t func = funcs[src1.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth]; // generic fallback, reached whenever no packed variant returned above
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream);
+ func(src1_, src2_, dst_, mask, stream); // the mask is forwarded as-is and may be empty
+}
+
+namespace arithm // forward declaration of the launcher used by cv::gpu::add(GpuMat, Scalar)
+{
+ template <typename T, typename S, typename D> // NOTE(review): judging by the dispatch table, T is the source type, S an intermediate type (float or double) and D the destination type - confirm against the definition
+ void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
- typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
static const func_t funcs[7][7] =
{
- {add_gpu<unsigned char, unsigned char> , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short> , add_gpu<unsigned char, short> , add_gpu<unsigned char, int> , add_gpu<unsigned char, float> , add_gpu<unsigned char, double> },
- {0 /*add_gpu<signed char, unsigned char>*/ , 0 /*add_gpu<signed char, signed char>*/ , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/ , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/},
- {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short> , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int> , add_gpu<unsigned short, float> , add_gpu<unsigned short, double> },
- {0 /*add_gpu<short, unsigned char>*/ , 0 /*add_gpu<short, signed char>*/ , 0 /*add_gpu<short, unsigned short>*/ , add_gpu<short, short> , add_gpu<short, int> , add_gpu<short, float> , add_gpu<short, double> },
- {0 /*add_gpu<int, unsigned char>*/ , 0 /*add_gpu<int, signed char>*/ , 0 /*add_gpu<int, unsigned short>*/ , 0 /*add_gpu<int, short>*/ , add_gpu<int, int> , add_gpu<int, float> , add_gpu<int, double> },
- {0 /*add_gpu<float, unsigned char>*/ , 0 /*add_gpu<float, signed char>*/ , 0 /*add_gpu<float, unsigned short>*/ , 0 /*add_gpu<float, short>*/ , 0 /*add_gpu<float, int>*/ , add_gpu<float, float> , add_gpu<float, double> },
- {0 /*add_gpu<double, unsigned char>*/ , 0 /*add_gpu<double, signed char>*/ , 0 /*add_gpu<double, unsigned short>*/ , 0 /*add_gpu<double, short>*/ , 0 /*add_gpu<double, int>*/ , 0 /*add_gpu<double, float>*/ , add_gpu<double, double> }
+ {
+ addScalar<unsigned char, float, unsigned char>,
+ addScalar<unsigned char, float, signed char>,
+ addScalar<unsigned char, float, unsigned short>,
+ addScalar<unsigned char, float, short>,
+ addScalar<unsigned char, float, int>,
+ addScalar<unsigned char, float, float>,
+ addScalar<unsigned char, double, double>
+ },
+ {
+ addScalar<signed char, float, unsigned char>,
+ addScalar<signed char, float, signed char>,
+ addScalar<signed char, float, unsigned short>,
+ addScalar<signed char, float, short>,
+ addScalar<signed char, float, int>,
+ addScalar<signed char, float, float>,
+ addScalar<signed char, double, double>
+ },
+ {
+ 0 /*addScalar<unsigned short, float, unsigned char>*/,
+ 0 /*addScalar<unsigned short, float, signed char>*/,
+ addScalar<unsigned short, float, unsigned short>,
+ addScalar<unsigned short, float, short>,
+ addScalar<unsigned short, float, int>,
+ addScalar<unsigned short, float, float>,
+ addScalar<unsigned short, double, double>
+ },
+ {
+ 0 /*addScalar<short, float, unsigned char>*/,
+ 0 /*addScalar<short, float, signed char>*/,
+ addScalar<short, float, unsigned short>,
+ addScalar<short, float, short>,
+ addScalar<short, float, int>,
+ addScalar<short, float, float>,
+ addScalar<short, double, double>
+ },
+ {
+ 0 /*addScalar<int, float, unsigned char>*/,
+ 0 /*addScalar<int, float, signed char>*/,
+ 0 /*addScalar<int, float, unsigned short>*/,
+ 0 /*addScalar<int, float, short>*/,
+ addScalar<int, float, int>,
+ addScalar<int, float, float>,
+ addScalar<int, double, double>
+ },
+ {
+ 0 /*addScalar<float, float, unsigned char>*/,
+ 0 /*addScalar<float, float, signed char>*/,
+ 0 /*addScalar<float, float, unsigned short>*/,
+ 0 /*addScalar<float, float, short>*/,
+ 0 /*addScalar<float, float, int>*/,
+ addScalar<float, float, float>,
+ addScalar<float, double, double>
+ },
+ {
+ 0 /*addScalar<double, double, unsigned char>*/,
+ 0 /*addScalar<double, double, signed char>*/,
+ 0 /*addScalar<double, double, unsigned short>*/,
+ 0 /*addScalar<double, double, short>*/,
+ 0 /*addScalar<double, double, int>*/,
+ 0 /*addScalar<double, double, float>*/,
+ addScalar<double, double, double>
+ }
};
typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
if (dtype < 0)
dtype = src.depth();
- CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src.channels() <= 4);
- CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U));
+ const int sdepth = src.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype);
+ const int cn = src.channels();
+
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( cn <= 4 );
+ CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
- if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
cudaStream_t stream = StreamAccessor::getStream(s);
- if (mask.empty() && dst.type() == src.type())
+ const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
+ if (ddepth == sdepth && cn > 1 && npp_func != 0)
{
- const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
-
- if (npp_func)
- {
- npp_func(src, sc, dst, stream);
- return;
- }
+ npp_func(src, sc, dst, stream);
+ return;
}
- CV_Assert(src.channels() == 1);
+ CV_Assert( cn == 1 );
- const func_t func = funcs[src.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
////////////////////////////////////////////////////////////////////////
// subtract
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations of the launchers that cv::gpu::subtract(GpuMat, GpuMat) dispatches to
{
template <typename T, typename D>
- void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // no mask; the caller passes the row length divided by 4
template <typename T, typename D>
- void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-}}}
+ void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // no mask; the caller passes the row length divided by 2
+
+ template <typename T, typename D>
+ void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // generic path; receives the (possibly empty) mask
+}
void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm; // this function validates the arguments, allocates dst and dispatches to an arithm:: launcher chosen by source/destination depth
- typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // signature of the generic launchers; the mask may be empty
static const func_t funcs[7][7] =
{
- {subtract_gpu<unsigned char, unsigned char> , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short> , subtract_gpu<unsigned char, short> , subtract_gpu<unsigned char, int> , subtract_gpu<unsigned char, float> , subtract_gpu<unsigned char, double> },
- {0 /*subtract_gpu<signed char, unsigned char>*/ , 0 /*subtract_gpu<signed char, signed char>*/ , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/ , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/},
- {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short> , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int> , subtract_gpu<unsigned short, float> , subtract_gpu<unsigned short, double> },
- {0 /*subtract_gpu<short, unsigned char>*/ , 0 /*subtract_gpu<short, signed char>*/ , 0 /*subtract_gpu<short, unsigned short>*/ , subtract_gpu<short, short> , subtract_gpu<short, int> , subtract_gpu<short, float> , subtract_gpu<short, double> },
- {0 /*subtract_gpu<int, unsigned char>*/ , 0 /*subtract_gpu<int, signed char>*/ , 0 /*subtract_gpu<int, unsigned short>*/ , 0 /*subtract_gpu<int, short>*/ , subtract_gpu<int, int> , subtract_gpu<int, float> , subtract_gpu<int, double> },
- {0 /*subtract_gpu<float, unsigned char>*/ , 0 /*subtract_gpu<float, signed char>*/ , 0 /*subtract_gpu<float, unsigned short>*/ , 0 /*subtract_gpu<float, short>*/ , 0 /*subtract_gpu<float, int>*/ , subtract_gpu<float, float> , subtract_gpu<float, double> },
- {0 /*subtract_gpu<double, unsigned char>*/ , 0 /*subtract_gpu<double, signed char>*/ , 0 /*subtract_gpu<double, unsigned short>*/ , 0 /*subtract_gpu<double, short>*/ , 0 /*subtract_gpu<double, int>*/ , 0 /*subtract_gpu<double, float>*/ , subtract_gpu<double, double> }
+ { // rows are indexed by source depth, columns by destination depth; a 0 entry is rejected below with CV_StsUnsupportedFormat
+ subMat<unsigned char, unsigned char>,
+ subMat<unsigned char, signed char>,
+ subMat<unsigned char, unsigned short>,
+ subMat<unsigned char, short>,
+ subMat<unsigned char, int>,
+ subMat<unsigned char, float>,
+ subMat<unsigned char, double>
+ },
+ {
+ subMat<signed char, unsigned char>,
+ subMat<signed char, signed char>,
+ subMat<signed char, unsigned short>,
+ subMat<signed char, short>,
+ subMat<signed char, int>,
+ subMat<signed char, float>,
+ subMat<signed char, double>
+ },
+ {
+ 0 /*subMat<unsigned short, unsigned char>*/,
+ 0 /*subMat<unsigned short, signed char>*/,
+ subMat<unsigned short, unsigned short>,
+ subMat<unsigned short, short>,
+ subMat<unsigned short, int>,
+ subMat<unsigned short, float>,
+ subMat<unsigned short, double>
+ },
+ {
+ 0 /*subMat<short, unsigned char>*/,
+ 0 /*subMat<short, signed char>*/,
+ subMat<short, unsigned short>,
+ subMat<short, short>,
+ subMat<short, int>,
+ subMat<short, float>,
+ subMat<short, double>
+ },
+ {
+ 0 /*subMat<int, unsigned char>*/,
+ 0 /*subMat<int, signed char>*/,
+ 0 /*subMat<int, unsigned short>*/,
+ 0 /*subMat<int, short>*/,
+ subMat<int, int>,
+ subMat<int, float>,
+ subMat<int, double>
+ },
+ {
+ 0 /*subMat<float, unsigned char>*/,
+ 0 /*subMat<float, signed char>*/,
+ 0 /*subMat<float, unsigned short>*/,
+ 0 /*subMat<float, short>*/,
+ 0 /*subMat<float, int>*/,
+ subMat<float, float>,
+ subMat<float, double>
+ },
+ {
+ 0 /*subMat<double, unsigned char>*/,
+ 0 /*subMat<double, signed char>*/,
+ 0 /*subMat<double, unsigned short>*/,
+ 0 /*subMat<double, short>*/,
+ 0 /*subMat<double, int>*/,
+ 0 /*subMat<double, float>*/,
+ subMat<double, double>
+ }
};
- typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
- static const npp_func_t npp_funcs[6] =
+ typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // signature of the packed fast-path launchers (no mask parameter)
+ static const vfunc_t vfuncs4[4][4] = // indexed [sdepth][ddepth]; only depth indices 0-1 (the 8-bit types) have entries
{
- NppArithm<CV_8U , nppiSub_8u_C1RSfs>::call,
- 0,
- NppArithm<CV_16U, nppiSub_16u_C1RSfs>::call,
- NppArithm<CV_16S, nppiSub_16s_C1RSfs>::call,
- NppArithm<CV_32S, nppiSub_32s_C1RSfs>::call,
- NppArithm<CV_32F, nppiSub_32f_C1R >::call
+ {
+ vsub4<unsigned int, unsigned int>,
+ vsub4<unsigned int, int>,
+ 0,
+ 0
+ },
+ {
+ vsub4<int, unsigned int>,
+ vsub4<int, int>,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ }
+ };
+ static const vfunc_t vfuncs2[4][4] = // indexed [sdepth][ddepth]; only depth indices 2-3 (the 16-bit types) have entries
+ {
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ 0,
+ 0
+ },
+ {
+ 0,
+ 0,
+ vsub2<unsigned int, unsigned int>,
+ vsub2<unsigned int, int>
+ },
+ {
+ 0,
+ 0,
+ vsub2<int, unsigned int>,
+ vsub2<int, int>
+ }
};
if (dtype < 0)
dtype = src1.depth();
- CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
- CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
+ const int sdepth = src1.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype); // depth part of the requested output type (defaults to the source depth above)
+ const int cn = src1.channels();
- if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+ CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) ); // a mask is accepted only for single-channel input
+
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+ dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
cudaStream_t stream = StreamAccessor::getStream(s);
- if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F)
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels are flattened: each row is described as cols * cn elements
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+ if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) // packed fast path: no mask, and both depths below CV_32S
{
- npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), stream);
- return;
+ const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+ const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+ const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+ const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // all three base pointers 32-byte aligned. NOTE(review): the row steps are not tested here - confirm they keep every row aligned
+
+ if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+ {
+ const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
+ const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
+
+ if (vfunc4 != 0 && (src1_.cols & 3) == 0) // flattened row length must be a multiple of 4
+ {
+ const int vcols = src1_.cols >> 2; // the launcher sees one column per group of 4 elements
+
+ vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+
+ if (vfunc2 != 0 && (src1_.cols & 1) == 0) // flattened row length must be even
+ {
+ const int vcols = src1_.cols >> 1; // the launcher sees one column per pair of elements
+
+ vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+ }
}
- const func_t func = funcs[src1.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth]; // generic fallback, reached whenever no packed variant returned above
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream);
+ func(src1_, src2_, dst_, mask, stream); // the mask is forwarded as-is and may be empty
+}
+
+namespace arithm // forward declaration of the launcher used by cv::gpu::subtract(GpuMat, Scalar)
+{
+ template <typename T, typename S, typename D> // NOTE(review): judging by the dispatch table, T is the source type, S an intermediate type (float or double) and D the destination type - confirm against the definition
+ void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
- typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
static const func_t funcs[7][7] =
{
- {subtract_gpu<unsigned char, unsigned char> , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short> , subtract_gpu<unsigned char, short> , subtract_gpu<unsigned char, int> , subtract_gpu<unsigned char, float> , subtract_gpu<unsigned char, double> },
- {0 /*subtract_gpu<signed char, unsigned char>*/ , 0 /*subtract_gpu<signed char, signed char>*/ , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/ , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/},
- {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short> , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int> , subtract_gpu<unsigned short, float> , subtract_gpu<unsigned short, double> },
- {0 /*subtract_gpu<short, unsigned char>*/ , 0 /*subtract_gpu<short, signed char>*/ , 0 /*subtract_gpu<short, unsigned short>*/ , subtract_gpu<short, short> , subtract_gpu<short, int> , subtract_gpu<short, float> , subtract_gpu<short, double> },
- {0 /*subtract_gpu<int, unsigned char>*/ , 0 /*subtract_gpu<int, signed char>*/ , 0 /*subtract_gpu<int, unsigned short>*/ , 0 /*subtract_gpu<int, short>*/ , subtract_gpu<int, int> , subtract_gpu<int, float> , subtract_gpu<int, double> },
- {0 /*subtract_gpu<float, unsigned char>*/ , 0 /*subtract_gpu<float, signed char>*/ , 0 /*subtract_gpu<float, unsigned short>*/ , 0 /*subtract_gpu<float, short>*/ , 0 /*subtract_gpu<float, int>*/ , subtract_gpu<float, float> , subtract_gpu<float, double> },
- {0 /*subtract_gpu<double, unsigned char>*/ , 0 /*subtract_gpu<double, signed char>*/ , 0 /*subtract_gpu<double, unsigned short>*/ , 0 /*subtract_gpu<double, short>*/ , 0 /*subtract_gpu<double, int>*/ , 0 /*subtract_gpu<double, float>*/ , subtract_gpu<double, double> }
+ {
+ subScalar<unsigned char, float, unsigned char>,
+ subScalar<unsigned char, float, signed char>,
+ subScalar<unsigned char, float, unsigned short>,
+ subScalar<unsigned char, float, short>,
+ subScalar<unsigned char, float, int>,
+ subScalar<unsigned char, float, float>,
+ subScalar<unsigned char, double, double>
+ },
+ {
+ subScalar<signed char, float, unsigned char>,
+ subScalar<signed char, float, signed char>,
+ subScalar<signed char, float, unsigned short>,
+ subScalar<signed char, float, short>,
+ subScalar<signed char, float, int>,
+ subScalar<signed char, float, float>,
+ subScalar<signed char, double, double>
+ },
+ {
+ 0 /*subScalar<unsigned short, float, unsigned char>*/,
+ 0 /*subScalar<unsigned short, float, signed char>*/,
+ subScalar<unsigned short, float, unsigned short>,
+ subScalar<unsigned short, float, short>,
+ subScalar<unsigned short, float, int>,
+ subScalar<unsigned short, float, float>,
+ subScalar<unsigned short, double, double>
+ },
+ {
+ 0 /*subScalar<short, float, unsigned char>*/,
+ 0 /*subScalar<short, float, signed char>*/,
+ subScalar<short, float, unsigned short>,
+ subScalar<short, float, short>,
+ subScalar<short, float, int>,
+ subScalar<short, float, float>,
+ subScalar<short, double, double>
+ },
+ {
+ 0 /*subScalar<int, float, unsigned char>*/,
+ 0 /*subScalar<int, float, signed char>*/,
+ 0 /*subScalar<int, float, unsigned short>*/,
+ 0 /*subScalar<int, float, short>*/,
+ subScalar<int, float, int>,
+ subScalar<int, float, float>,
+ subScalar<int, double, double>
+ },
+ {
+ 0 /*subScalar<float, float, unsigned char>*/,
+ 0 /*subScalar<float, float, signed char>*/,
+ 0 /*subScalar<float, float, unsigned short>*/,
+ 0 /*subScalar<float, float, short>*/,
+ 0 /*subScalar<float, float, int>*/,
+ subScalar<float, float, float>,
+ subScalar<float, double, double>
+ },
+ {
+ 0 /*subScalar<double, double, unsigned char>*/,
+ 0 /*subScalar<double, double, signed char>*/,
+ 0 /*subScalar<double, double, unsigned short>*/,
+ 0 /*subScalar<double, double, short>*/,
+ 0 /*subScalar<double, double, int>*/,
+ 0 /*subScalar<double, double, float>*/,
+ subScalar<double, double, double>
+ }
};
typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
if (dtype < 0)
dtype = src.depth();
- CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src.channels() <= 4);
- CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U));
+ const int sdepth = src.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype);
+ const int cn = src.channels();
- if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( cn <= 4 );
+ CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
+
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
cudaStream_t stream = StreamAccessor::getStream(s);
- if (mask.empty() && dst.type() == src.type())
+ const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
+ if (ddepth == sdepth && cn > 1 && npp_func != 0)
{
- const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
-
- if (npp_func)
- {
- npp_func(src, sc, dst, stream);
- return;
- }
+ npp_func(src, sc, dst, stream);
+ return;
}
- CV_Assert(src.channels() == 1);
+ CV_Assert( cn == 1 );
- const func_t func = funcs[src.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
////////////////////////////////////////////////////////////////////////
// multiply
// Declarations of the matrix-by-matrix multiplication entry points used by
// cv::gpu::multiply(GpuMat, GpuMat). The patch moves them from cv::gpu::device
// into namespace arithm, renames them, and switches the parameters from
// const-reference to by-value. Definitions are not part of this chunk.
-namespace cv { namespace gpu { namespace device
+namespace arithm
{
- void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
- void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);
// Called for CV_8UC4 src1 with CV_32FC1 src2.
// NOTE(review): the unsigned int element type presumably packs the four 8-bit channels into one word - confirm.
+ void mulMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
- template <typename T, typename D>
- void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
// Called for CV_16SC4 src1 with CV_32FC1 src2.
+ void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream);
- template <typename T, typename D>
- void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-}}}
// Generic entry point; the caller instantiates it as
// <source element type, float or double, destination element type>.
+ template <typename T, typename S, typename D>
+ void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
// Element-wise multiplication of two GpuMat images on the given stream.
//
// Two fixed-type shortcuts come first: a CV_8UC4 or CV_16SC4 src1 combined with a
// CV_32FC1 src2 goes straight to mulMat_8uc4_32f / mulMat_16sc4_32f. In those
// branches dst takes src1's type and neither scale nor dtype is consulted.
//
// Every other combination requires src2 to match src1 in type and size and is
// dispatched through funcs[source depth][destination depth]; a null table entry
// raises CV_StsUnsupportedFormat. A negative dtype means "use src1's depth".
void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
cudaStream_t stream = StreamAccessor::getStream(s);
// Shortcut 1: 4-channel 8-bit image times a single-channel float image.
if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
{
- CV_Assert(src1.size() == src2.size());
+ CV_Assert( src1.size() == src2.size() );
dst.create(src1.size(), src1.type());
- multiply_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
+ mulMat_8uc4_32f(src1, src2, dst, stream);
}
// Shortcut 2: 4-channel signed 16-bit image times a single-channel float image.
else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
{
- CV_Assert(src1.size() == src2.size());
+ CV_Assert( src1.size() == src2.size() );
dst.create(src1.size(), src1.type());
- multiply_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
+ mulMat_16sc4_32f(src1, src2, dst, stream);
}
else
{
- typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
// Dispatch table: rows are selected by source depth, columns by destination depth
// (see funcs[sdepth][ddepth] below). In the new table the middle template argument
// is float everywhere except where source or destination is double. Entries that
// are 0 are unsupported combinations.
static const func_t funcs[7][7] =
{
- {multiply_gpu<unsigned char, unsigned char> , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short> , multiply_gpu<unsigned char, short> , multiply_gpu<unsigned char, int> , multiply_gpu<unsigned char, float> , multiply_gpu<unsigned char, double> },
- {0 /*multiply_gpu<signed char, unsigned char>*/ , 0 /*multiply_gpu<signed char, signed char>*/ , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/ , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, float>*/, 0 /*multiply_gpu<signed char, double>*/},
- {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short> , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int> , multiply_gpu<unsigned short, float> , multiply_gpu<unsigned short, double> },
- {0 /*multiply_gpu<short, unsigned char>*/ , 0 /*multiply_gpu<short, signed char>*/ , 0 /*multiply_gpu<short, unsigned short>*/ , multiply_gpu<short, short> , multiply_gpu<short, int> , multiply_gpu<short, float> , multiply_gpu<short, double> },
- {0 /*multiply_gpu<int, unsigned char>*/ , 0 /*multiply_gpu<int, signed char>*/ , 0 /*multiply_gpu<int, unsigned short>*/ , 0 /*multiply_gpu<int, short>*/ , multiply_gpu<int, int> , multiply_gpu<int, float> , multiply_gpu<int, double> },
- {0 /*multiply_gpu<float, unsigned char>*/ , 0 /*multiply_gpu<float, signed char>*/ , 0 /*multiply_gpu<float, unsigned short>*/ , 0 /*multiply_gpu<float, short>*/ , 0 /*multiply_gpu<float, int>*/ , multiply_gpu<float, float> , multiply_gpu<float, double> },
- {0 /*multiply_gpu<double, unsigned char>*/ , 0 /*multiply_gpu<double, signed char>*/ , 0 /*multiply_gpu<double, unsigned short>*/ , 0 /*multiply_gpu<double, short>*/ , 0 /*multiply_gpu<double, int>*/ , 0 /*multiply_gpu<double, float>*/ , multiply_gpu<double, double> }
- };
-
- typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
- static const npp_func_t npp_funcs[] =
- {
- NppArithm<CV_8U , nppiMul_8u_C1RSfs >::call,
- 0,
- NppArithm<CV_16U, nppiMul_16u_C1RSfs>::call,
- NppArithm<CV_16S, nppiMul_16s_C1RSfs>::call,
- NppArithm<CV_32S, nppiMul_32s_C1RSfs>::call,
- NppArithm<CV_32F, nppiMul_32f_C1R >::call
+ {
+ mulMat<unsigned char, float, unsigned char>,
+ mulMat<unsigned char, float, signed char>,
+ mulMat<unsigned char, float, unsigned short>,
+ mulMat<unsigned char, float, short>,
+ mulMat<unsigned char, float, int>,
+ mulMat<unsigned char, float, float>,
+ mulMat<unsigned char, double, double>
+ },
+ {
+ mulMat<signed char, float, unsigned char>,
+ mulMat<signed char, float, signed char>,
+ mulMat<signed char, float, unsigned short>,
+ mulMat<signed char, float, short>,
+ mulMat<signed char, float, int>,
+ mulMat<signed char, float, float>,
+ mulMat<signed char, double, double>
+ },
+ {
+ 0 /*mulMat<unsigned short, float, unsigned char>*/,
+ 0 /*mulMat<unsigned short, float, signed char>*/,
+ mulMat<unsigned short, float, unsigned short>,
+ mulMat<unsigned short, float, short>,
+ mulMat<unsigned short, float, int>,
+ mulMat<unsigned short, float, float>,
+ mulMat<unsigned short, double, double>
+ },
+ {
+ 0 /*mulMat<short, float, unsigned char>*/,
+ 0 /*mulMat<short, float, signed char>*/,
+ mulMat<short, float, unsigned short>,
+ mulMat<short, float, short>,
+ mulMat<short, float, int>,
+ mulMat<short, float, float>,
+ mulMat<short, double, double>
+ },
+ {
+ 0 /*mulMat<int, float, unsigned char>*/,
+ 0 /*mulMat<int, float, signed char>*/,
+ 0 /*mulMat<int, float, unsigned short>*/,
+ 0 /*mulMat<int, float, short>*/,
+ mulMat<int, float, int>,
+ mulMat<int, float, float>,
+ mulMat<int, double, double>
+ },
+ {
+ 0 /*mulMat<float, float, unsigned char>*/,
+ 0 /*mulMat<float, float, signed char>*/,
+ 0 /*mulMat<float, float, unsigned short>*/,
+ 0 /*mulMat<float, float, short>*/,
+ 0 /*mulMat<float, float, int>*/,
+ mulMat<float, float, float>,
+ mulMat<float, double, double>
+ },
+ {
+ 0 /*mulMat<double, double, unsigned char>*/,
+ 0 /*mulMat<double, double, signed char>*/,
+ 0 /*mulMat<double, double, unsigned short>*/,
+ 0 /*mulMat<double, double, short>*/,
+ 0 /*mulMat<double, double, int>*/,
+ 0 /*mulMat<double, double, float>*/,
+ mulMat<double, double, double>
+ }
};
// A negative dtype requests a destination with the same depth as src1.
if (dtype < 0)
dtype = src1.depth();
- CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
+ const int sdepth = src1.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype);
+ const int cn = src1.channels();
- if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
// Both depths index the 7x7 table above, so they are bounded by CV_64F first.
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
// Any use of double on either side is rejected when deviceSupports(NATIVE_DOUBLE) is false.
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+ dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
-#if (CUDA_VERSION <= 4020)
- if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F)
-#else
- if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F && src1.depth() > CV_8U)
-#endif
- {
- npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
- return;
- }
// The three views share src1's row count and pass cols * cn as the second dimension,
// taking over from the reshape(1) calls removed below.
// NOTE(review): presumably this makes each channel an independent element - confirm
// against the PtrStepSzb constructor's argument order.
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
- const func_t func = funcs[src1.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream);
+ func(src1_, src2_, dst_, scale, stream);
}
}
// The patch replaces the anonymous namespace holding the isIntScalar helper with a
// forward declaration of arithm::mulScalar. Unlike the removed multiply_gpu scalar
// overload, mulScalar has no scale parameter. Its definition is not part of this chunk.
-namespace
+namespace arithm
{
- inline bool isIntScalar(Scalar sc)
- {
- return sc.val[0] == static_cast<int>(sc.val[0]) && sc.val[1] == static_cast<int>(sc.val[1]) && sc.val[2] == static_cast<int>(sc.val[2]) && sc.val[3] == static_cast<int>(sc.val[3]);
- }
+ template <typename T, typename S, typename D>
+ void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
// Multiplies every element of src by a scalar on the given stream.
//
// scale is folded into the scalar up front (nsc = sc * scale), so neither the NPP
// routine nor the mulScalar kernel receives a separate scale argument.
// Multi-channel input is handled only by the NPP table, and only when the
// destination depth equals the source depth; everything else must be
// single-channel and goes through funcs[source depth][destination depth].
// A null table entry raises CV_StsUnsupportedFormat. A negative dtype means
// "use src's depth".
void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
- typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
// Dispatch table: rows are selected by source depth, columns by destination depth
// (see funcs[sdepth][ddepth] below). Entries that are 0 are unsupported combinations.
static const func_t funcs[7][7] =
{
- {multiply_gpu<unsigned char, unsigned char> , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short> , multiply_gpu<unsigned char, short> , multiply_gpu<unsigned char, int> , multiply_gpu<unsigned char, float> , multiply_gpu<unsigned char, double> },
- {0 /*multiply_gpu<signed char, unsigned char>*/ , 0 /*multiply_gpu<signed char, signed char>*/ , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/ , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, float>*/, 0 /*multiply_gpu<signed char, double>*/},
- {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short> , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int> , multiply_gpu<unsigned short, float> , multiply_gpu<unsigned short, double> },
- {0 /*multiply_gpu<short, unsigned char>*/ , 0 /*multiply_gpu<short, signed char>*/ , 0 /*multiply_gpu<short, unsigned short>*/ , multiply_gpu<short, short> , multiply_gpu<short, int> , multiply_gpu<short, float> , multiply_gpu<short, double> },
- {0 /*multiply_gpu<int, unsigned char>*/ , 0 /*multiply_gpu<int, signed char>*/ , 0 /*multiply_gpu<int, unsigned short>*/ , 0 /*multiply_gpu<int, short>*/ , multiply_gpu<int, int> , multiply_gpu<int, float> , multiply_gpu<int, double> },
- {0 /*multiply_gpu<float, unsigned char>*/ , 0 /*multiply_gpu<float, signed char>*/ , 0 /*multiply_gpu<float, unsigned short>*/ , 0 /*multiply_gpu<float, short>*/ , 0 /*multiply_gpu<float, int>*/ , multiply_gpu<float, float> , multiply_gpu<float, double> },
- {0 /*multiply_gpu<double, unsigned char>*/ , 0 /*multiply_gpu<double, signed char>*/ , 0 /*multiply_gpu<double, unsigned short>*/ , 0 /*multiply_gpu<double, short>*/ , 0 /*multiply_gpu<double, int>*/ , 0 /*multiply_gpu<double, float>*/ , multiply_gpu<double, double> }
+ {
+ mulScalar<unsigned char, float, unsigned char>,
+ mulScalar<unsigned char, float, signed char>,
+ mulScalar<unsigned char, float, unsigned short>,
+ mulScalar<unsigned char, float, short>,
+ mulScalar<unsigned char, float, int>,
+ mulScalar<unsigned char, float, float>,
+ mulScalar<unsigned char, double, double>
+ },
+ {
+ mulScalar<signed char, float, unsigned char>,
+ mulScalar<signed char, float, signed char>,
+ mulScalar<signed char, float, unsigned short>,
+ mulScalar<signed char, float, short>,
+ mulScalar<signed char, float, int>,
+ mulScalar<signed char, float, float>,
+ mulScalar<signed char, double, double>
+ },
+ {
+ 0 /*mulScalar<unsigned short, float, unsigned char>*/,
+ 0 /*mulScalar<unsigned short, float, signed char>*/,
+ mulScalar<unsigned short, float, unsigned short>,
+ mulScalar<unsigned short, float, short>,
+ mulScalar<unsigned short, float, int>,
+ mulScalar<unsigned short, float, float>,
+ mulScalar<unsigned short, double, double>
+ },
+ {
+ 0 /*mulScalar<short, float, unsigned char>*/,
+ 0 /*mulScalar<short, float, signed char>*/,
+ mulScalar<short, float, unsigned short>,
+ mulScalar<short, float, short>,
+ mulScalar<short, float, int>,
+ mulScalar<short, float, float>,
+ mulScalar<short, double, double>
+ },
+ {
+ 0 /*mulScalar<int, float, unsigned char>*/,
+ 0 /*mulScalar<int, float, signed char>*/,
+ 0 /*mulScalar<int, float, unsigned short>*/,
+ 0 /*mulScalar<int, float, short>*/,
+ mulScalar<int, float, int>,
+ mulScalar<int, float, float>,
+ mulScalar<int, double, double>
+ },
+ {
+ 0 /*mulScalar<float, float, unsigned char>*/,
+ 0 /*mulScalar<float, float, signed char>*/,
+ 0 /*mulScalar<float, float, unsigned short>*/,
+ 0 /*mulScalar<float, float, short>*/,
+ 0 /*mulScalar<float, float, int>*/,
+ mulScalar<float, float, float>,
+ mulScalar<float, double, double>
+ },
+ {
+ 0 /*mulScalar<double, double, unsigned char>*/,
+ 0 /*mulScalar<double, double, signed char>*/,
+ 0 /*mulScalar<double, double, unsigned short>*/,
+ 0 /*mulScalar<double, double, short>*/,
+ 0 /*mulScalar<double, double, int>*/,
+ 0 /*mulScalar<double, double, float>*/,
+ mulScalar<double, double, double>
+ }
};
// NOTE(review): the npp_funcs table indexed further down is declared with this
// signature, but its definition is not visible in this chunk.
typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
// A negative dtype requests a destination with the same depth as src.
if (dtype < 0)
dtype = src.depth();
- CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src.channels() <= 4);
+ const int sdepth = src.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype);
+ const int cn = src.channels();
+
// Depths index the 7x7 table above; cn - 1 is used as the second index into npp_funcs.
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( cn <= 4 );
- if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
cudaStream_t stream = StreamAccessor::getStream(s);
- if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc)))
- {
- const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
// Pre-multiply all four scalar components by scale; both paths below use nsc.
+ const Scalar nsc(sc.val[0] * scale, sc.val[1] * scale, sc.val[2] * scale, sc.val[3] * scale);
- if (npp_func)
- {
- npp_func(src, sc, dst, stream);
- return;
- }
// NPP path: taken only for multi-channel input whose destination depth equals the
// source depth, and only when the table holds an entry for this depth/channel count.
+ const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
+ if (ddepth == sdepth && cn > 1 && npp_func != 0)
+ {
+ npp_func(src, nsc, dst, stream);
+ return;
}
- CV_Assert(src.channels() == 1);
// Anything that did not take the NPP path must be single-channel.
+ CV_Assert( cn == 1 );
- const func_t func = funcs[src.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src, sc.val[0], dst, scale, stream);
// Only the first (already scaled) scalar component is passed on.
+ func(src, nsc.val[0], dst, stream);
}
////////////////////////////////////////////////////////////////////////
// divide
// Declarations of the matrix-by-matrix division entry points used by
// cv::gpu::divide(GpuMat, GpuMat). The patch moves them from cv::gpu::device into
// namespace arithm, renames them, and switches the parameters from const-reference
// to by-value. Definitions are not part of this chunk.
-namespace cv { namespace gpu { namespace device
+namespace arithm
{
- void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
- void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);
-
- template <typename T, typename D>
- void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
// Called for CV_8UC4 src1 with CV_32FC1 src2.
// NOTE(review): the unsigned int element type presumably packs the four 8-bit channels into one word - confirm.
+ void divMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
- template <typename T, typename D>
- void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
// Called for CV_16SC4 src1 with CV_32FC1 src2.
+ void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream);
- template <typename T, typename D>
- void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-}}}
// Generic entry point; the caller instantiates it as
// <source element type, float or double, destination element type>.
+ template <typename T, typename S, typename D>
+ void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
// Element-wise division of src1 by src2 on the given stream.
//
// Two fixed-type shortcuts come first: a CV_8UC4 or CV_16SC4 src1 combined with a
// CV_32FC1 src2 goes straight to divMat_8uc4_32f / divMat_16sc4_32f. In those
// branches dst takes src1's type and neither scale nor dtype is consulted.
//
// Every other combination requires src2 to match src1 in type and size and is
// dispatched through funcs[source depth][destination depth]; a null table entry
// raises CV_StsUnsupportedFormat. A negative dtype means "use src1's depth".
void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
cudaStream_t stream = StreamAccessor::getStream(s);
// Shortcut 1: 4-channel 8-bit image divided by a single-channel float image.
if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
{
- CV_Assert(src1.size() == src2.size());
+ CV_Assert( src1.size() == src2.size() );
dst.create(src1.size(), src1.type());
- divide_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
+ divMat_8uc4_32f(src1, src2, dst, stream);
}
// Shortcut 2: 4-channel signed 16-bit image divided by a single-channel float image.
else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
{
- CV_Assert(src1.size() == src2.size());
+ CV_Assert( src1.size() == src2.size() );
dst.create(src1.size(), src1.type());
- divide_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
+ divMat_16sc4_32f(src1, src2, dst, stream);
}
else
{
- typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
// Dispatch table: rows are selected by source depth, columns by destination depth
// (see funcs[sdepth][ddepth] below). In the new table the middle template argument
// is float everywhere except where source or destination is double. Entries that
// are 0 are unsupported combinations.
static const func_t funcs[7][7] =
{
- {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> },
- {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
- {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , divide_gpu<unsigned short, double> },
- {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> },
- {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 /*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> },
- {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> },
- {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> }
- };
-
- typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
- static const npp_func_t npp_funcs[6] =
- {
- NppArithm<CV_8U , nppiDiv_8u_C1RSfs >::call,
- 0,
- NppArithm<CV_16U, nppiDiv_16u_C1RSfs>::call,
- NppArithm<CV_16S, nppiDiv_16s_C1RSfs>::call,
- NppArithm<CV_32S, nppiDiv_32s_C1RSfs>::call,
- NppArithm<CV_32F, nppiDiv_32f_C1R >::call
+ {
+ divMat<unsigned char, float, unsigned char>,
+ divMat<unsigned char, float, signed char>,
+ divMat<unsigned char, float, unsigned short>,
+ divMat<unsigned char, float, short>,
+ divMat<unsigned char, float, int>,
+ divMat<unsigned char, float, float>,
+ divMat<unsigned char, double, double>
+ },
+ {
+ divMat<signed char, float, unsigned char>,
+ divMat<signed char, float, signed char>,
+ divMat<signed char, float, unsigned short>,
+ divMat<signed char, float, short>,
+ divMat<signed char, float, int>,
+ divMat<signed char, float, float>,
+ divMat<signed char, double, double>
+ },
+ {
+ 0 /*divMat<unsigned short, float, unsigned char>*/,
+ 0 /*divMat<unsigned short, float, signed char>*/,
+ divMat<unsigned short, float, unsigned short>,
+ divMat<unsigned short, float, short>,
+ divMat<unsigned short, float, int>,
+ divMat<unsigned short, float, float>,
+ divMat<unsigned short, double, double>
+ },
+ {
+ 0 /*divMat<short, float, unsigned char>*/,
+ 0 /*divMat<short, float, signed char>*/,
+ divMat<short, float, unsigned short>,
+ divMat<short, float, short>,
+ divMat<short, float, int>,
+ divMat<short, float, float>,
+ divMat<short, double, double>
+ },
+ {
+ 0 /*divMat<int, float, unsigned char>*/,
+ 0 /*divMat<int, float, signed char>*/,
+ 0 /*divMat<int, float, unsigned short>*/,
+ 0 /*divMat<int, float, short>*/,
+ divMat<int, float, int>,
+ divMat<int, float, float>,
+ divMat<int, double, double>
+ },
+ {
+ 0 /*divMat<float, float, unsigned char>*/,
+ 0 /*divMat<float, float, signed char>*/,
+ 0 /*divMat<float, float, unsigned short>*/,
+ 0 /*divMat<float, float, short>*/,
+ 0 /*divMat<float, float, int>*/,
+ divMat<float, float, float>,
+ divMat<float, double, double>
+ },
+ {
+ 0 /*divMat<double, double, unsigned char>*/,
+ 0 /*divMat<double, double, signed char>*/,
+ 0 /*divMat<double, double, unsigned short>*/,
+ 0 /*divMat<double, double, short>*/,
+ 0 /*divMat<double, double, int>*/,
+ 0 /*divMat<double, double, float>*/,
+ divMat<double, double, double>
+ }
};
// A negative dtype requests a destination with the same depth as src1.
if (dtype < 0)
dtype = src1.depth();
- CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
+ const int sdepth = src1.depth();
+ const int ddepth = CV_MAT_DEPTH(dtype);
+ const int cn = src1.channels();
- if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
// Both depths index the 7x7 table above, so they are bounded by CV_64F first.
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
// Any use of double on either side is rejected when deviceSupports(NATIVE_DOUBLE) is false.
+ if (sdepth == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+ dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
- if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F)
- {
- npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), stream);
- return;
- }
// The three views share src1's row count and pass cols * cn as the second dimension,
// taking over from the reshape(1) calls removed below.
// NOTE(review): presumably this makes each channel an independent element - confirm
// against the PtrStepSzb constructor's argument order.
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
- const func_t func = funcs[src1.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream);
+ func(src1_, src2_, dst_, scale, stream);
}
}
// Declaration only: the definition of divScalar is not part of this chunk.
// The dispatch table in cv::gpu::divide(GpuMat, Scalar) instantiates it with three
// template arguments; the signature carries a single double value and no scale.
+namespace arithm
+{
+ template <typename T, typename S, typename D>
+ void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm; // lets the table below name divScalar<> unqualified
- typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // unlike the removed signature, there is no separate scale argument
 static const func_t funcs[7][7] =
 {
- {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> },
- {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
- {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , divide_gpu<unsigned short, double> },
- {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> },
- {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 /*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> },
- {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> },
- {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> }
+ { // row 0 (selected by sdepth): unsigned char source; columns are selected by ddepth
+ divScalar<unsigned char, float, unsigned char>,
+ divScalar<unsigned char, float, signed char>,
+ divScalar<unsigned char, float, unsigned short>,
+ divScalar<unsigned char, float, short>,
+ divScalar<unsigned char, float, int>,
+ divScalar<unsigned char, float, float>,
+ divScalar<unsigned char, double, double>
+ },
+ { // row 1: signed char source
+ divScalar<signed char, float, unsigned char>,
+ divScalar<signed char, float, signed char>,
+ divScalar<signed char, float, unsigned short>,
+ divScalar<signed char, float, short>,
+ divScalar<signed char, float, int>,
+ divScalar<signed char, float, float>,
+ divScalar<signed char, double, double>
+ },
+ { // row 2: unsigned short source; the 0 entries are reported as unsupported further down
+ 0 /*divScalar<unsigned short, float, unsigned char>*/,
+ 0 /*divScalar<unsigned short, float, signed char>*/,
+ divScalar<unsigned short, float, unsigned short>,
+ divScalar<unsigned short, float, short>,
+ divScalar<unsigned short, float, int>,
+ divScalar<unsigned short, float, float>,
+ divScalar<unsigned short, double, double>
+ },
+ { // row 3: short source
+ 0 /*divScalar<short, float, unsigned char>*/,
+ 0 /*divScalar<short, float, signed char>*/,
+ divScalar<short, float, unsigned short>,
+ divScalar<short, float, short>,
+ divScalar<short, float, int>,
+ divScalar<short, float, float>,
+ divScalar<short, double, double>
+ },
+ { // row 4: int source
+ 0 /*divScalar<int, float, unsigned char>*/,
+ 0 /*divScalar<int, float, signed char>*/,
+ 0 /*divScalar<int, float, unsigned short>*/,
+ 0 /*divScalar<int, float, short>*/,
+ divScalar<int, float, int>,
+ divScalar<int, float, float>,
+ divScalar<int, double, double>
+ },
+ { // row 5: float source
+ 0 /*divScalar<float, float, unsigned char>*/,
+ 0 /*divScalar<float, float, signed char>*/,
+ 0 /*divScalar<float, float, unsigned short>*/,
+ 0 /*divScalar<float, float, short>*/,
+ 0 /*divScalar<float, float, int>*/,
+ divScalar<float, float, float>,
+ divScalar<float, double, double>
+ },
+ { // row 6: double source; only the double destination is provided
+ 0 /*divScalar<double, double, unsigned char>*/,
+ 0 /*divScalar<double, double, signed char>*/,
+ 0 /*divScalar<double, double, unsigned short>*/,
+ 0 /*divScalar<double, double, short>*/,
+ 0 /*divScalar<double, double, int>*/,
+ 0 /*divScalar<double, double, float>*/,
+ divScalar<double, double, double>
+ }
};
typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
if (dtype < 0)
dtype = src.depth();
- CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src.channels() <= 4);
+ const int sdepth = src.depth(); // row index into funcs
+ const int ddepth = CV_MAT_DEPTH(dtype); // column index into funcs
+ const int cn = src.channels();
- if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( cn <= 4 );
+
+ if (sdepth == CV_64F || ddepth == CV_64F) // double on either side requires NATIVE_DOUBLE support
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); // same size and channel count as src, depth taken from dtype
cudaStream_t stream = StreamAccessor::getStream(s);
- if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc)))
- {
- const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
+ const Scalar nsc(sc.val[0] / scale, sc.val[1] / scale, sc.val[2] / scale, sc.val[3] / scale); // each component of sc divided by scale; passed on in place of (sc, scale)
- if (npp_func)
- {
- npp_func(src, sc, dst, stream);
- return;
- }
+ const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; // NOTE(review): npp_funcs is not declared in this hunk - confirm its bounds cover sdepth and cn - 1
+ if (ddepth == sdepth && cn > 1 && npp_func != 0) // NPP path: same depth in and out, more than one channel, non-null entry
+ {
+ npp_func(src, nsc, dst, stream);
+ return;
}
- CV_Assert(src.channels() == 1);
+ CV_Assert( cn == 1 ); // anything that did not take the NPP path must be single-channel
- const func_t func = funcs[src.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(src, sc.val[0], dst, scale, stream);
+ func(src, nsc.val[0], dst, stream); // only the first scalar component is used on this path
+}
+
+namespace arithm // declaration only; no definition of divInv appears alongside it
+{
+ template <typename T, typename S, typename D> // presumably T = source type, S = working type, D = destination type - confirm against the definition
+ void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm; // lets the table below name divInv<> unqualified
- typedef void (*func_t)(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // argument order is now (matrix, value), the reverse of the removed signature
 static const func_t funcs[7][7] =
 {
- {divide_gpu<unsigned char, unsigned char> , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short> , divide_gpu<unsigned char, short> , divide_gpu<unsigned char, int> , divide_gpu<unsigned char, float> , divide_gpu<unsigned char, double> },
- {0 /*divide_gpu<signed char, unsigned char>*/ , 0 /*divide_gpu<signed char, signed char>*/ , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/ , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
- {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short> , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int> , divide_gpu<unsigned short, float> , divide_gpu<unsigned short, double> },
- {0 /*divide_gpu<short, unsigned char>*/ , 0 /*divide_gpu<short, signed char>*/ , 0 /*divide_gpu<short, unsigned short>*/ , divide_gpu<short, short> , divide_gpu<short, int> , divide_gpu<short, float> , divide_gpu<short, double> },
- {0 /*divide_gpu<int, unsigned char>*/ , 0 /*divide_gpu<int, signed char>*/ , 0 /*divide_gpu<int, unsigned short>*/ , 0 /*divide_gpu<int, short>*/ , divide_gpu<int, int> , divide_gpu<int, float> , divide_gpu<int, double> },
- {0 /*divide_gpu<float, unsigned char>*/ , 0 /*divide_gpu<float, signed char>*/ , 0 /*divide_gpu<float, unsigned short>*/ , 0 /*divide_gpu<float, short>*/ , 0 /*divide_gpu<float, int>*/ , divide_gpu<float, float> , divide_gpu<float, double> },
- {0 /*divide_gpu<double, unsigned char>*/ , 0 /*divide_gpu<double, signed char>*/ , 0 /*divide_gpu<double, unsigned short>*/ , 0 /*divide_gpu<double, short>*/ , 0 /*divide_gpu<double, int>*/ , 0 /*divide_gpu<double, float>*/ , divide_gpu<double, double> }
+ { // row 0 (selected by sdepth): unsigned char source; columns are selected by ddepth
+ divInv<unsigned char, float, unsigned char>,
+ divInv<unsigned char, float, signed char>,
+ divInv<unsigned char, float, unsigned short>,
+ divInv<unsigned char, float, short>,
+ divInv<unsigned char, float, int>,
+ divInv<unsigned char, float, float>,
+ divInv<unsigned char, double, double>
+ },
+ { // row 1: signed char source
+ divInv<signed char, float, unsigned char>,
+ divInv<signed char, float, signed char>,
+ divInv<signed char, float, unsigned short>,
+ divInv<signed char, float, short>,
+ divInv<signed char, float, int>,
+ divInv<signed char, float, float>,
+ divInv<signed char, double, double>
+ },
+ { // row 2: unsigned short source; the 0 entries are reported as unsupported further down
+ 0 /*divInv<unsigned short, float, unsigned char>*/,
+ 0 /*divInv<unsigned short, float, signed char>*/,
+ divInv<unsigned short, float, unsigned short>,
+ divInv<unsigned short, float, short>,
+ divInv<unsigned short, float, int>,
+ divInv<unsigned short, float, float>,
+ divInv<unsigned short, double, double>
+ },
+ { // row 3: short source
+ 0 /*divInv<short, float, unsigned char>*/,
+ 0 /*divInv<short, float, signed char>*/,
+ divInv<short, float, unsigned short>,
+ divInv<short, float, short>,
+ divInv<short, float, int>,
+ divInv<short, float, float>,
+ divInv<short, double, double>
+ },
+ { // row 4: int source
+ 0 /*divInv<int, float, unsigned char>*/,
+ 0 /*divInv<int, float, signed char>*/,
+ 0 /*divInv<int, float, unsigned short>*/,
+ 0 /*divInv<int, float, short>*/,
+ divInv<int, float, int>,
+ divInv<int, float, float>,
+ divInv<int, double, double>
+ },
+ { // row 5: float source
+ 0 /*divInv<float, float, unsigned char>*/,
+ 0 /*divInv<float, float, signed char>*/,
+ 0 /*divInv<float, float, unsigned short>*/,
+ 0 /*divInv<float, float, short>*/,
+ 0 /*divInv<float, float, int>*/,
+ divInv<float, float, float>,
+ divInv<float, double, double>
+ },
+ { // row 6: double source; only the double destination is provided
+ 0 /*divInv<double, double, unsigned char>*/,
+ 0 /*divInv<double, double, signed char>*/,
+ 0 /*divInv<double, double, unsigned short>*/,
+ 0 /*divInv<double, double, short>*/,
+ 0 /*divInv<double, double, int>*/,
+ 0 /*divInv<double, double, float>*/,
+ divInv<double, double, double>
+ }
};
if (dtype < 0)
dtype = src.depth();
- CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
- CV_Assert(src.channels() == 1);
+ const int sdepth = src.depth(); // row index into funcs
+ const int ddepth = CV_MAT_DEPTH(dtype); // column index into funcs
+ const int cn = src.channels();
+
+ CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+ CV_Assert( cn == 1 ); // single-channel input only
- if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ if (sdepth == CV_64F || ddepth == CV_64F) // double on either side requires NATIVE_DOUBLE support
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn)); // same size and channel count as src, depth taken from dtype
cudaStream_t stream = StreamAccessor::getStream(s);
- const func_t func = funcs[src.depth()][dst.depth()];
+ const func_t func = funcs[sdepth][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(scale, src, dst, stream);
+ func(src, scale, dst, stream); // scale travels in the 'val' slot of func_t
}
//////////////////////////////////////////////////////////////////////////////
// absdiff
-namespace cv { namespace gpu { namespace device
+namespace arithm // declarations only for the three matrix/matrix absdiff entry points
{
template <typename T>
- void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // used by the caller with the column count divided by 4
template <typename T>
- void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}}}
+ void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // used by the caller with the column count divided by 2
-namespace
+ template <typename T>
+ void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // used by the caller as the fallback for every depth
+}
+
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) // matrix/matrix variant: tries vabsDiff4/vabsDiff2 first, otherwise absDiffMat
{
- template <int DEPTH> struct NppAbsDiffFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ using namespace arithm;
- typedef NppStatus (*func_t)(const npp_t* src1, int src1_step, const npp_t* src2, int src2_step, npp_t* dst, int dst_step, NppiSize sz);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ static const func_t funcs[] = // fallback implementations, indexed by depth
+ {
+ absDiffMat<unsigned char>,
+ absDiffMat<signed char>,
+ absDiffMat<unsigned short>,
+ absDiffMat<short>,
+ absDiffMat<int>,
+ absDiffMat<float>,
+ absDiffMat<double>
};
-
- template <int DEPTH, typename NppAbsDiffFunc<DEPTH>::func_t func> struct NppAbsDiff
+ static const func_t vfuncs4[] = // indexed by depth; only the first two entries are non-null
{
- typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t;
-
- static void call(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize sz;
- sz.width = src1.cols;
- sz.height = src1.rows;
+ vabsDiff4<unsigned int>,
+ vabsDiff4<int>,
+ 0,
+ 0
+ };
+ static const func_t vfuncs2[] = // indexed by depth; only the last two entries are non-null
+ {
+ 0,
+ 0,
+ vabsDiff2<unsigned int>,
+ vabsDiff2<int>
+ };
- nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
- (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
+ const int depth = src1.depth();
+ const int cn = src1.channels();
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); // both inputs must match in type and size
- template <int DEPTH> struct NppAbsDiffCFunc
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
{
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
- typedef npp_t scalar_t;
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
- typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, npp_t nConstant);
- };
- template <> struct NppAbsDiffCFunc<CV_16U>
- {
- typedef NppTypeTraits<CV_16U>::npp_t npp_t;
- typedef Npp32u scalar_t;
+ dst.create(src1.size(), src1.type()); // output has the same size and type as src1
-#if (CUDA_VERSION <= 4020)
- typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant);
-#else
- typedef NppStatus (*func_t)(const Npp16u * pSrc1, int nSrc1Step, Npp16u * pDst, int nDstStep, NppiSize oSizeROI, Npp16u nConstant);
-#endif
- };
+ cudaStream_t stream = StreamAccessor::getStream(s);
+
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // views whose column count is cols * cn, i.e. channels are flattened into the row
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
- template <int DEPTH, typename NppAbsDiffCFunc<DEPTH>::func_t func> struct NppAbsDiffC
+ if (depth < CV_32S) // only these depths are looked up in vfuncs4 / vfuncs2
{
- typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t;
- typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t;
+ const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+ const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+ const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
- static void call(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
+ const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // all three base addresses are multiples of 32; the row step is not checked here
- NppiSize sz;
- sz.width = src1.cols;
- sz.height = src1.rows;
+ if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) // vector paths need this feature set plus the alignment above
+ {
+ const func_t vfunc4 = vfuncs4[depth];
+ const func_t vfunc2 = vfuncs2[depth];
- nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step),
- (npp_t*)dst.data, static_cast<int>(dst.step), sz, static_cast<scalar_t>(val)) );
+ if (vfunc4 != 0 && (src1_.cols & 3) == 0) // flattened column count must be a multiple of 4
+ {
+ const int vcols = src1_.cols >> 2; // one column per group of four elements
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
-}
+ vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-{
- using namespace cv::gpu::device;
+ return;
+ }
- typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- static const func_t funcs[] =
- {
- NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call,
- absdiff_gpu<signed char>,
- NppAbsDiff<CV_16U, nppiAbsDiff_16u_C1R>::call,
- absdiff_gpu<short>,
- absdiff_gpu<int>,
- NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call,
- absdiff_gpu<double>
- };
+ if (vfunc2 != 0 && (src1_.cols & 1) == 0) // flattened column count must be even
+ {
+ const int vcols = src1_.cols >> 1; // one column per pair of elements
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+ vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
- if (src1.depth() == CV_64F)
- {
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
- CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ return;
+ }
+ }
}
- dst.create(src1.size(), src1.type());
+ const func_t func = funcs[depth]; // reached whenever no vector path returned above
+
+ if (!func) // NOTE(review): every entry listed in funcs is non-null, so this looks purely defensive
+ CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+ func(src1_, src2_, dst_, stream);
+}
+
+namespace arithm // declaration only; no definition of absDiffScalar appears alongside it
+{
+ template <typename T, typename S> // presumably T = element type, S = working type - confirm against the definition
+ void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream)
{
- using namespace cv::gpu::device;
+ using namespace arithm; // lets the table below name absDiffScalar<> unqualified
- typedef void (*func_t)(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call,
- absdiff_gpu<signed char>,
- NppAbsDiffC<CV_16U, nppiAbsDiffC_16u_C1R>::call,
- absdiff_gpu<short>,
- absdiff_gpu<int>,
- NppAbsDiffC<CV_32F, nppiAbsDiffC_32f_C1R>::call,
- absdiff_gpu<double>
+ absDiffScalar<unsigned char, float>, // table is indexed by depth; second argument is float for every entry except the last
+ absDiffScalar<signed char, float>,
+ absDiffScalar<unsigned short, float>,
+ absDiffScalar<short, float>,
+ absDiffScalar<int, float>,
+ absDiffScalar<float, float>,
+ absDiffScalar<double, double>
};
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.channels() == 1);
+ const int depth = src1.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src1.channels() == 1 ); // single-channel input only
- if (src1.depth() == CV_64F)
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src1.size(), src1.type());
- funcs[src1.depth()](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
+ funcs[depth](src1, src2.val[0], dst, StreamAccessor::getStream(stream)); // only the first component of src2 is used
}
//////////////////////////////////////////////////////////////////////////////
// abs
-void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& s)
+namespace arithm // declaration only; no definition of absMat appears alongside it
{
- CV_Assert(src.depth() == CV_16S || src.depth() == CV_32F);
-
- dst.create(src.size(), src.type());
-
- cudaStream_t stream = StreamAccessor::getStream(s);
-
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols * src.channels();
- oSizeROI.height = src.rows;
-
- bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-
- if (src.depth() == CV_16S)
- {
- if (aligned && oSizeROI.width % 4 == 0)
- {
- oSizeROI.width /= 4;
- nppSafeCall( nppiAbs_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), oSizeROI) );
- }
- else
- {
- nppSafeCall( nppiAbs_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), oSizeROI) );
- }
- }
- else
- {
- if (aligned && oSizeROI.width % 4 == 0)
- {
- oSizeROI.width /= 4;
- nppSafeCall( nppiAbs_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) );
- }
- else
- {
- nppSafeCall( nppiAbs_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) );
- }
- }
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
+ template <typename T> // instantiated by the caller once per element type
+ void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
-//////////////////////////////////////////////////////////////////////////////
-// sqr
-
-namespace
+void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream) // picks absMat<T> by src depth and calls it on (src, dst)
{
- template <int DEPTH> struct NppSqrFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+ using namespace arithm;
- typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template <> struct NppSqrFunc<CV_32F>
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+ static const func_t funcs[] = // indexed by depth
{
- typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
+ absMat<unsigned char>,
+ absMat<signed char>,
+ absMat<unsigned short>,
+ absMat<short>,
+ absMat<int>,
+ absMat<float>,
+ absMat<double>
};
- template <int DEPTH, typename NppSqrFunc<DEPTH>::func_t func, typename NppSqrFunc<DEPTH>::func_t func_c4> struct NppSqr
- {
- typedef typename NppSqrFunc<DEPTH>::npp_t npp_t;
-
- static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols * src.channels();
- oSizeROI.height = src.rows;
-
- bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
+ const int depth = src.depth();
- if (aligned && oSizeROI.width % 4 == 0)
- {
- oSizeROI.width /= 4;
- nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
- }
- else
- {
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
- }
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 ); // single-channel input only
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template <typename NppSqrFunc<CV_32F>::func_t func, typename NppSqrFunc<CV_32F>::func_t func_c4> struct NppSqr<CV_32F, func, func_c4>
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
{
- typedef NppSqrFunc<CV_32F>::npp_t npp_t;
-
- static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
- NppiSize oSizeROI;
- oSizeROI.width = src.cols * src.channels();
- oSizeROI.height = src.rows;
+ dst.create(src.size(), src.type()); // output has the same size and type as src
- bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
+ funcs[depth](src, dst, StreamAccessor::getStream(stream));
+}
- if (aligned && oSizeROI.width % 4 == 0)
- {
- oSizeROI.width /= 4;
- nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
- }
- else
- {
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
- }
+//////////////////////////////////////////////////////////////////////////////
+// sqr
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
+namespace arithm // declaration only; no definition of sqrMat appears alongside it
+{
+ template <typename T> // instantiated by the caller once per element type
+ void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+ using namespace arithm; // lets the table below name sqrMat<> unqualified
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- NppSqr<CV_8U, nppiSqr_8u_C1RSfs, nppiSqr_8u_C4RSfs>::call,
- 0,
- NppSqr<CV_16U, nppiSqr_16u_C1RSfs, nppiSqr_16u_C4RSfs>::call,
- NppSqr<CV_16S, nppiSqr_16s_C1RSfs, nppiSqr_16s_C4RSfs>::call,
- 0,
- NppSqr<CV_32F, nppiSqr_32f_C1R, nppiSqr_32f_C4R>::call
+ sqrMat<unsigned char>, // table is indexed by depth; every entry is non-null, unlike the removed table
+ sqrMat<signed char>,
+ sqrMat<unsigned short>,
+ sqrMat<short>,
+ sqrMat<int>,
+ sqrMat<float>,
+ sqrMat<double>
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+ const int depth = src.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 ); // single-channel input only
+
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
+ {
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
dst.create(src.size(), src.type());
- funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, dst, StreamAccessor::getStream(stream)); // dispatch on depth
}
//////////////////////////////////////////////////////////////////////////////
// sqrt
-namespace
+namespace arithm // declaration only; no definition of sqrtMat appears alongside it
{
- template <int DEPTH> struct NppOneSourceFunc
- {
- typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
- };
- template <> struct NppOneSourceFunc<CV_32F>
- {
- typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
- typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
- };
-
- template <int DEPTH, typename NppOneSourceFunc<DEPTH>::func_t func> struct NppOneSource
- {
- typedef typename NppOneSourceFunc<DEPTH>::npp_t npp_t;
-
- static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols * src.channels();
- oSizeROI.height = src.rows;
-
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
- template <typename NppOneSourceFunc<CV_32F>::func_t func> struct NppOneSource<CV_32F, func>
- {
- typedef NppOneSourceFunc<CV_32F>::npp_t npp_t;
-
- static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
- {
- NppStreamHandler h(stream);
-
- NppiSize oSizeROI;
- oSizeROI.width = src.cols * src.channels();
- oSizeROI.height = src.rows;
-
- nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-
- if (stream == 0)
- cudaSafeCall( cudaDeviceSynchronize() );
- }
- };
+ template <typename T> // instantiated by the caller once per element type
+ void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+ using namespace arithm; // lets the table below name sqrtMat<> unqualified
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- NppOneSource<CV_8U, nppiSqrt_8u_C1RSfs>::call,
- 0,
- NppOneSource<CV_16U, nppiSqrt_16u_C1RSfs>::call,
- NppOneSource<CV_16S, nppiSqrt_16s_C1RSfs>::call,
- 0,
- NppOneSource<CV_32F, nppiSqrt_32f_C1R>::call
+ sqrtMat<unsigned char>, // table is indexed by depth; every entry is non-null, unlike the removed table
+ sqrtMat<signed char>,
+ sqrtMat<unsigned short>,
+ sqrtMat<short>,
+ sqrtMat<int>,
+ sqrtMat<float>,
+ sqrtMat<double>
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+ const int depth = src.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 ); // single-channel input only
+
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
+ {
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
dst.create(src.size(), src.type());
- funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, dst, StreamAccessor::getStream(stream)); // dispatch on depth
}
////////////////////////////////////////////////////////////////////////
// log
+namespace arithm // declaration only; no definition of logMat appears alongside it
+{
+ template <typename T> // instantiated by the caller once per element type
+ void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+ using namespace arithm; // lets the table below name logMat<> unqualified
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- NppOneSource<CV_8U, nppiLn_8u_C1RSfs>::call,
- 0,
- NppOneSource<CV_16U, nppiLn_16u_C1RSfs>::call,
- NppOneSource<CV_16S, nppiLn_16s_C1RSfs>::call,
- 0,
- NppOneSource<CV_32F, nppiLn_32f_C1R>::call
+ logMat<unsigned char>, // table is indexed by depth; every entry is non-null, unlike the removed table
+ logMat<signed char>,
+ logMat<unsigned short>,
+ logMat<short>,
+ logMat<int>,
+ logMat<float>,
+ logMat<double>
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+ const int depth = src.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 ); // single-channel input only
+
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
+ {
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
dst.create(src.size(), src.type());
- funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, dst, StreamAccessor::getStream(stream)); // dispatch on depth
}
////////////////////////////////////////////////////////////////////////
// exp
+namespace arithm // declaration only; no definition of expMat appears alongside it
+{
+ template <typename T> // instantiated by the caller once per element type
+ void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+ using namespace arithm; // lets the table below name expMat<> unqualified
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- NppOneSource<CV_8U, nppiExp_8u_C1RSfs>::call,
- 0,
- NppOneSource<CV_16U, nppiExp_16u_C1RSfs>::call,
- NppOneSource<CV_16S, nppiExp_16s_C1RSfs>::call,
- 0,
- NppOneSource<CV_32F, nppiExp_32f_C1R>::call
+ expMat<unsigned char>, // table is indexed by depth; every entry is non-null, unlike the removed table
+ expMat<signed char>,
+ expMat<unsigned short>,
+ expMat<short>,
+ expMat<int>,
+ expMat<float>,
+ expMat<double>
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+ const int depth = src.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 ); // single-channel input only
+
+ if (depth == CV_64F) // double input requires NATIVE_DOUBLE support
+ {
+ if (!deviceSupports(NATIVE_DOUBLE))
+ CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+ }
dst.create(src.size(), src.type());
- funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, dst, StreamAccessor::getStream(stream)); // dispatch on depth
}
//////////////////////////////////////////////////////////////////////////////
-// Comparison of two matrixes
+// compare
-namespace cv { namespace gpu { namespace device
+namespace arithm // declarations only; the removed scalar-comparison overloads have no counterpart in this namespace block
{
- template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-}}}
+ template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // presumably element-wise src1 == src2 - confirm against the definition
+ template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // presumably element-wise src1 != src2 - confirm
+ template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // presumably element-wise src1 < src2 - confirm
+ template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // presumably element-wise src1 <= src2 - confirm
+}
-void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
+void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[7][4] =
{
- {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
- {compare_eq<signed char> , compare_ne<signed char> , compare_lt<signed char> , compare_le<signed char> },
- {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>},
- {compare_eq<short> , compare_ne<short> , compare_lt<short> , compare_le<short> },
- {compare_eq<int> , compare_ne<int> , compare_lt<int> , compare_le<int> },
- {compare_eq<float> , compare_ne<float> , compare_lt<float> , compare_le<float> },
- {compare_eq<double> , compare_ne<double> , compare_lt<double> , compare_le<double> }
+ {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> },
+ {cmpMatEq<signed char> , cmpMatNe<signed char> , cmpMatLt<signed char> , cmpMatLe<signed char> },
+ {cmpMatEq<unsigned short>, cmpMatNe<unsigned short>, cmpMatLt<unsigned short>, cmpMatLe<unsigned short>},
+ {cmpMatEq<short> , cmpMatNe<short> , cmpMatLt<short> , cmpMatLe<short> },
+ {cmpMatEq<int> , cmpMatNe<int> , cmpMatLt<int> , cmpMatLe<int> },
+ {cmpMatEq<float> , cmpMatNe<float> , cmpMatLt<float> , cmpMatLe<float> },
+ {cmpMatEq<double> , cmpMatNe<double> , cmpMatLt<double> , cmpMatLe<double> }
};
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
- CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
+ const int depth = src1.depth();
+ const int cn = src1.channels();
- if (src1.depth() == CV_64F)
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+ CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
+ dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn));
+
+ cudaStream_t stream = StreamAccessor::getStream(s);
+
static const int codes[] =
{
0, 2, 3, 2, 3, 1
&src2, &src1, &src1, &src2, &src2, &src2
};
- dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, src1.channels()));
+ const int code = codes[cmpop];
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step);
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
- funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+ const func_t func = funcs[depth][code];
+
+ func(src1_, src2_, dst_, stream);
+}
+
+namespace arithm // forward declarations only; the definitions are not part of this chunk
+{
+ template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // matrix/scalar comparison entry points: cn is the channel count, val holds up to four per-channel constants
+ template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
}
namespace
{
- template <typename T>
- void castScalar(Scalar& sc)
+ template <typename T> void castScalar(Scalar& sc)
{
sc.val[0] = saturate_cast<T>(sc.val[0]);
sc.val[1] = saturate_cast<T>(sc.val[1]);
void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[7][6] =
{
- {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> },
- {compare_eq<signed char> , compare_gt<signed char> , compare_ge<signed char> , compare_lt<signed char> , compare_le<signed char> , compare_ne<signed char> },
- {compare_eq<unsigned short>, compare_gt<unsigned short>, compare_ge<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>, compare_ne<unsigned short>},
- {compare_eq<short> , compare_gt<short> , compare_ge<short> , compare_lt<short> , compare_le<short> , compare_ne<short> },
- {compare_eq<int> , compare_gt<int> , compare_ge<int> , compare_lt<int> , compare_le<int> , compare_ne<int> },
- {compare_eq<float> , compare_gt<float> , compare_ge<float> , compare_lt<float> , compare_le<float> , compare_ne<float> },
- {compare_eq<double> , compare_gt<double> , compare_ge<double> , compare_lt<double> , compare_le<double> , compare_ne<double> }
+ {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
+ {cmpScalarEq<signed char> , cmpScalarGt<signed char> , cmpScalarGe<signed char> , cmpScalarLt<signed char> , cmpScalarLe<signed char> , cmpScalarNe<signed char> },
+ {cmpScalarEq<unsigned short>, cmpScalarGt<unsigned short>, cmpScalarGe<unsigned short>, cmpScalarLt<unsigned short>, cmpScalarLe<unsigned short>, cmpScalarNe<unsigned short>},
+ {cmpScalarEq<short> , cmpScalarGt<short> , cmpScalarGe<short> , cmpScalarLt<short> , cmpScalarLe<short> , cmpScalarNe<short> },
+ {cmpScalarEq<int> , cmpScalarGt<int> , cmpScalarGe<int> , cmpScalarLt<int> , cmpScalarLe<int> , cmpScalarNe<int> },
+ {cmpScalarEq<float> , cmpScalarGt<float> , cmpScalarGe<float> , cmpScalarLt<float> , cmpScalarLe<float> , cmpScalarNe<float> },
+ {cmpScalarEq<double> , cmpScalarGt<double> , cmpScalarGe<double> , cmpScalarLt<double> , cmpScalarLe<double> , cmpScalarNe<double> }
};
typedef void (*cast_func_t)(Scalar& sc);
castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
};
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(src.channels() <= 4);
- CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
+ const int depth = src.depth();
+ const int cn = src.channels();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( cn <= 4 );
+ CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
- if (src.depth() == CV_64F)
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src.size(), CV_MAKE_TYPE(CV_8U, src.channels()));
+ dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
- cast_func[src.depth()](sc);
+ cast_func[depth](sc);
- funcs[src.depth()][cmpop](src, src.channels(), sc.val, dst, StreamAccessor::getStream(stream));
+ funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream));
}
-
//////////////////////////////////////////////////////////////////////////////
// Unary bitwise logical operations
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declaration only; the definition is not part of this chunk
{
- void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
-
- template <typename T>
- void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-}}}
+ template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // T is the unit the rows are processed in; a single entry point takes the mask argument (callers pass an empty mask when none is wanted)
+}
-namespace
+// Bitwise NOT of src into dst (dst is (re)allocated with src's size and type).
+//
+// The element type is irrelevant for a bitwise operation, so each row is
+// reinterpreted in the widest unit (4, 2 or 1 bytes) that evenly divides the
+// row length in bytes, and the matching bitMatNot<T> instantiation is called.
+// T must match the unit used to compute the column count handed to the
+// callee; otherwise the callee would address bytes outside the row.
+//
+// mask must be empty or CV_8UC1 with the same size as src.
+// NOTE(review): the mask is forwarded unchanged while the column count is in
+// wide units; how the callee maps a wide unit to a mask pixel is not visible
+// here - confirm masked calls behave as intended for multi-byte units.
+void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& s)
{
- void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
- {
- dst.create(src.size(), src.type());
+ using namespace arithm;
- cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
- }
-
- void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
- {
- using namespace cv::gpu::device;
+ const int depth = src.depth();
- typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- static func_t funcs[] =
- {
- bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
- bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
- bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>,
- bitwiseMaskNotCaller<unsigned int>
- };
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
+ dst.create(src.size(), src.type());
- dst.create(src.size(), src.type());
+ cudaStream_t stream = StreamAccessor::getStream(s);
- const func_t func = funcs[src.depth()];
+ // Row length in bytes (all channels included).
+ const int bcols = src.cols * src.elemSize();
- int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
+ if ((bcols & 3) == 0)
+ {
+ // Row is a whole number of 4-byte words.
+ const int vcols = bcols >> 2;
- func(src.rows, src.cols, cn, src, mask, dst, stream);
+ bitMatNot<unsigned int>(
+ PtrStepSzb(src.rows, vcols, src.data, src.step),
+ PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+ mask, stream);
}
-}
+ else if ((bcols & 1) == 0)
+ {
+ // Row is a whole number of 2-byte words.
+ const int vcols = bcols >> 1;
-void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
-{
- if (mask.empty())
- bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
+ bitMatNot<unsigned short>(
+ PtrStepSzb(src.rows, vcols, src.data, src.step),
+ PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+ mask, stream);
+ }
else
- bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
+ {
+ // Odd byte count: the column count is in bytes, so the unit must be a
+ // single byte (a 2-byte unit here would cover twice the row width).
+ bitMatNot<unsigned char>(
+ PtrStepSzb(src.rows, bcols, src.data, src.step),
+ PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+ mask, stream);
+ }
}
//////////////////////////////////////////////////////////////////////////////
// Binary bitwise logical operations
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations only; the definitions are not part of this chunk
{
- void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+ template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // T is the unit the rows are processed in; the same signature is shared by And, Or and Xor
+ template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+ template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
- template <typename T>
- void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
+// Bitwise AND of two matrices of identical size and type into dst (dst is
+// (re)allocated with src1's size and type).
+//
+// Each row is reinterpreted in the widest unit (4, 2 or 1 bytes) that evenly
+// divides the row length in bytes. The template argument of bitMatAnd must
+// match the unit used to compute the column count: with a wider type the
+// callee would address bytes beyond the end of each row.
+//
+// mask must be empty or CV_8UC1 with the same size as src1.
+// NOTE(review): the mask is forwarded unchanged while the column count is in
+// wide units; how the callee maps a wide unit to a mask pixel is not visible
+// here - confirm masked calls behave as intended for multi-byte units.
+void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
+{
+ using namespace arithm;
- void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+ const int depth = src1.depth();
- template <typename T>
- void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+ CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
- void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+ dst.create(src1.size(), src1.type());
- template <typename T>
- void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-}}}
+ cudaStream_t stream = StreamAccessor::getStream(s);
-namespace
-{
- void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
- {
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+ // Row length in bytes (all channels included).
+ const int bcols = src1.cols * src1.elemSize();
- dst.create(src1.size(), src1.type());
+ if ((bcols & 3) == 0)
+ {
+ // Row is a whole number of 4-byte words.
+ const int vcols = bcols >> 2;
- cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+ bitMatAnd<unsigned int>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
}
+ else if ((bcols & 1) == 0)
+ {
+ // Row is a whole number of 2-byte words: vcols counts 2-byte units.
+ const int vcols = bcols >> 1;
- void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+ bitMatAnd<unsigned short>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
+ }
+ else
{
- using namespace cv::gpu::device;
- typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- static func_t funcs[] =
- {
- bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
- bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
- bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>,
- bitwiseMaskOrCaller<unsigned int>
- };
+ // Odd byte count: the column count is in bytes, so process byte by byte.
+ bitMatAnd<unsigned char>(
+ PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+ mask, stream);
+ }
+}
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
- CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
+// Bitwise OR of two matrices of identical size and type into dst (dst is
+// (re)allocated with src1's size and type).
+//
+// Each row is reinterpreted in the widest unit (4, 2 or 1 bytes) that evenly
+// divides the row length in bytes. The template argument of bitMatOr must
+// match the unit used to compute the column count: with a wider type the
+// callee would address bytes beyond the end of each row.
+//
+// mask must be empty or CV_8UC1 with the same size as src1.
+// NOTE(review): the mask is forwarded unchanged while the column count is in
+// wide units; how the callee maps a wide unit to a mask pixel is not visible
+// here - confirm masked calls behave as intended for multi-byte units.
+void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
+{
+ using namespace arithm;
- dst.create(src1.size(), src1.type());
+ const int depth = src1.depth();
- const func_t func = funcs[src1.depth()];
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+ CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
- int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+ dst.create(src1.size(), src1.type());
- func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
- }
+ cudaStream_t stream = StreamAccessor::getStream(s);
+ // Row length in bytes (all channels included).
+ const int bcols = src1.cols * src1.elemSize();
- void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+ if ((bcols & 3) == 0)
{
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-
- dst.create(src1.size(), src1.type());
+ // Row is a whole number of 4-byte words.
+ const int vcols = bcols >> 2;
- cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+ bitMatOr<unsigned int>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
}
+ else if ((bcols & 1) == 0)
+ {
+ // Row is a whole number of 2-byte words: vcols counts 2-byte units.
+ const int vcols = bcols >> 1;
- void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+ bitMatOr<unsigned short>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
+ }
+ else
{
- using namespace cv::gpu::device;
- typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- static func_t funcs[] =
- {
- bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
- bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
- bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>,
- bitwiseMaskAndCaller<unsigned int>
- };
+ // Odd byte count: the column count is in bytes, so process byte by byte.
+ bitMatOr<unsigned char>(
+ PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+ mask, stream);
+ }
+}
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
- CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
+// Bitwise XOR of two matrices of identical size and type into dst (dst is
+// (re)allocated with src1's size and type).
+//
+// Each row is reinterpreted in the widest unit (4, 2 or 1 bytes) that evenly
+// divides the row length in bytes. The template argument of bitMatXor must
+// match the unit used to compute the column count: with a wider type the
+// callee would address bytes beyond the end of each row.
+//
+// mask must be empty or CV_8UC1 with the same size as src1.
+// NOTE(review): the mask is forwarded unchanged while the column count is in
+// wide units; how the callee maps a wide unit to a mask pixel is not visible
+// here - confirm masked calls behave as intended for multi-byte units.
+void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s)
+{
+ using namespace arithm;
- dst.create(src1.size(), src1.type());
+ const int depth = src1.depth();
- const func_t func = funcs[src1.depth()];
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+ CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) );
- int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
+ dst.create(src1.size(), src1.type());
- func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
- }
+ cudaStream_t stream = StreamAccessor::getStream(s);
+ // Row length in bytes (all channels included).
+ const int bcols = src1.cols * src1.elemSize();
- void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+ if ((bcols & 3) == 0)
{
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+ // Row is a whole number of 4-byte words.
+ const int vcols = bcols >> 2;
- dst.create(src1.size(), src1.type());
-
- cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+ bitMatXor<unsigned int>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
}
-
- void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+ else if ((bcols & 1) == 0)
{
- using namespace cv::gpu::device;
-
- typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
- static func_t funcs[] =
- {
- bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
- bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
- bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>,
- bitwiseMaskXorCaller<unsigned int>
- };
-
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
- CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
-
- dst.create(src1.size(), src1.type());
-
- const func_t func = funcs[src1.depth()];
+ // Row is a whole number of 2-byte words: vcols counts 2-byte units.
+ const int vcols = bcols >> 1;
- int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-
- func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+ bitMatXor<unsigned short>(
+ PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+ mask, stream);
}
-}
-
-void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
-{
- if (mask.empty())
- bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else
- bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
-}
+ {
-void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
-{
- if (mask.empty())
- bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
- else
- bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+ // Odd byte count: the column count is in bytes, so process byte by byte.
+ bitMatXor<unsigned char>(
+ PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+ PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+ PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+ mask, stream);
+ }
}
-void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
+//////////////////////////////////////////////////////////////////////////////
+// Binary bitwise logical operations with scalars
+
+namespace arithm // forward declarations only; the definitions are not part of this chunk
{
- if (mask.empty())
- bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
- else
- bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+ template <typename T> void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); // matrix/scalar bitwise entry points: the scalar operand always arrives as one unsigned int regardless of T
+ template <typename T> void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
}
namespace
{
+ typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
+
+ // Adapter that gives a bit_scalar_func_t the (GpuMat, Scalar, GpuMat, stream)
+ // signature used by the dispatch tables: only the first Scalar component is
+ // used, converted to unsigned int.
+ template <bit_scalar_func_t func> struct BitScalar
+ {
+ static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
+ {
+ const unsigned int scalarBits = static_cast<unsigned int>(sc.val[0]);
+
+ func(src, scalarBits, dst, stream);
+ }
+ };
+
+ // Adapter for four-channel 8-bit data: packs the four Scalar components into
+ // one 32-bit word (component 0 in the lowest byte, component 3 in the
+ // highest) and forwards it to func, so that a whole pixel can be combined
+ // with the constant in a single 32-bit operation.
+ template <bit_scalar_func_t func> struct BitScalar4
+ {
+ static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
+ {
+ Scalar_<unsigned int> isc = sc;
+
+ unsigned int packedVal = 0;
+
+ // Each component owns exactly one byte of the packed word, so it is
+ // masked to 8 bits; a wider mask would let a value above 255 spill
+ // into the byte of the next channel.
+ packedVal |= (isc.val[0] & 0xff);
+ packedVal |= (isc.val[1] & 0xff) << 8;
+ packedVal |= (isc.val[2] & 0xff) << 16;
+ packedVal |= (isc.val[3] & 0xff) << 24;
+
+ func(src, packedVal, dst, stream);
+ }
+ };
+
template <int DEPTH, int cn> struct NppBitwiseCFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
};
}
-void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) // matrix AND scalar: picks an implementation from the table below by depth and channel count
{
+ using namespace arithm;
+
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
- {NppBitwiseC<CV_8U , 1, nppiOrC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiOrC_8u_C4R >::call},
+ {BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
{0,0,0,0},
- {NppBitwiseC<CV_16U, 1, nppiOrC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+ {BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
{0,0,0,0},
- {NppBitwiseC<CV_32S, 1, nppiOrC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+ {BitScalar< bitScalarAnd<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
- CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+ const int depth = src.depth(); // row index into funcs
+ const int cn = src.channels(); // column index into funcs is cn - 1
+
+ CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // the all-zero rows of funcs belong to depths rejected here
+ CV_Assert( cn == 1 || cn == 3 || cn == 4 ); // the zero second column (two channels) is rejected here
dst.create(src.size(), src.type());
- funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+ funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // every entry reachable after the asserts is non-null
}
-void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
+void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) // matrix OR scalar: picks an implementation from the table below by depth and channel count
{
+ using namespace arithm;
+
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
- {NppBitwiseC<CV_8U , 1, nppiAndC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiAndC_8u_C4R >::call},
+ {BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
{0,0,0,0},
- {NppBitwiseC<CV_16U, 1, nppiAndC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+ {BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
{0,0,0,0},
- {NppBitwiseC<CV_32S, 1, nppiAndC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+ {BitScalar< bitScalarOr<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
- CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+ const int depth = src.depth(); // row index into funcs
+ const int cn = src.channels(); // column index into funcs is cn - 1
+
+ CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // the all-zero rows of funcs belong to depths rejected here
+ CV_Assert( cn == 1 || cn == 3 || cn == 4 ); // the zero second column (two channels) is rejected here
dst.create(src.size(), src.type());
- funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+ funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // every entry reachable after the asserts is non-null
}
void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
{
+ using namespace arithm; // matrix XOR scalar: the table below is indexed by depth (row) and channel count - 1 (column)
+
typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
- {NppBitwiseC<CV_8U , 1, nppiXorC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiXorC_8u_C4R >::call},
+ {BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
{0,0,0,0},
- {NppBitwiseC<CV_16U, 1, nppiXorC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+ {BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
{0,0,0,0},
- {NppBitwiseC<CV_32S, 1, nppiXorC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+ {BitScalar< bitScalarXor<unsigned int> >::call , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
};
- CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
- CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+ const int depth = src.depth(); // row index into funcs
+ const int cn = src.channels(); // column index into funcs is cn - 1
+
+ CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // the all-zero rows of funcs belong to depths rejected here
+ CV_Assert( cn == 1 || cn == 3 || cn == 4 ); // the zero second column (two channels) is rejected here
dst.create(src.size(), src.type());
- funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+ funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // every entry reachable after the asserts is non-null
}
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations only; the definitions are not part of this chunk
{
- template <typename T> void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
- template <typename T> void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
- template <typename T> void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
-}}}
+ template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // vmin4 / vmin2: variants that callers invoke with the column count divided by 4 / 2
+ template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // general per-depth matrix/matrix variant
+ template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // matrix/scalar variant; the scalar is passed as double for every T
+
+ template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // the max family mirrors the min family above
+ template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) // matrix/matrix min: dispatches to arithm::minMat<T> by depth, or to vmin4 / vmin2 when the checks below pass
{
- using namespace cv::gpu::device;
+ using namespace arithm;
- typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
- min_gpu<unsigned char>,
- min_gpu<signed char>,
- min_gpu<unsigned short>,
- min_gpu<short>,
- min_gpu<int>,
- min_gpu<float>,
- min_gpu<double>
+ minMat<unsigned char>,
+ minMat<signed char>,
+ minMat<unsigned short>,
+ minMat<short>,
+ minMat<int>,
+ minMat<float>,
+ minMat<double>
};
+ static const func_t vfuncs4[] = // indexed by depth; populated only for the first two depths
+ {
+ vmin4<unsigned int>,
+ vmin4<int>,
+ 0,
+ 0
+ };
+ static const func_t vfuncs2[] = // indexed by depth; populated only for the third and fourth depths
+ {
+ 0,
+ 0,
+ vmin2<unsigned int>,
+ vmin2<int>
+ };
+
+ const int depth = src1.depth();
+ const int cn = src1.channels();
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
- if (src1.depth() == CV_64F)
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src1.size(), src1.type());
- funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+ cudaStream_t stream = StreamAccessor::getStream(s);
+
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // single-channel view: the column count is cols * channels
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+ if (depth < CV_32S) // vfuncs4 / vfuncs2 have four entries, so only depths below CV_32S may index them
+ {
+ const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+ const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+ const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+ const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // all three base pointers are multiples of 32 bytes
+
+ if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+ {
+ const func_t vfunc4 = vfuncs4[depth];
+ const func_t vfunc2 = vfuncs2[depth];
+
+ if (vfunc4 != 0 && (src1_.cols & 3) == 0) // column count must be a multiple of 4
+ {
+ const int vcols = src1_.cols >> 2; // column count in groups of four elements
+
+ vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+
+ if (vfunc2 != 0 && (src1_.cols & 1) == 0) // column count must be even
+ {
+ const int vcols = src1_.cols >> 1; // column count in pairs of elements
+
+ vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+ }
+ }
+
+ const func_t func = funcs[depth]; // general path, taken whenever neither packed variant was used
+
+ if (!func) // defensive: every entry of funcs above is non-null
+ CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+ func(src1_, src2_, dst_, stream);
}
// Matrix/matrix overload of cv::gpu::max, shown as a patch ('-' = removed line, '+' = added line).
// The added version checks its inputs, allocates dst, views all three matrices with the
// channels folded into the column count, and dispatches by depth: to vmax4/vmax2 when the
// fast-path conditions below hold, otherwise to maxMat<T>.
// NOTE(review): maxMat, vmax4 and vmax2 are defined outside this excerpt; presumably vmax4/vmax2
// handle 4/2 elements per 32-bit word - confirm against the arithm kernels.
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
{
- using namespace cv::gpu::device;
+ using namespace arithm;
- typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
// General-purpose table, indexed by depth; one entry per supported depth.
static const func_t funcs[] =
{
- max_gpu<unsigned char>,
- max_gpu<signed char>,
- max_gpu<unsigned short>,
- max_gpu<short>,
- max_gpu<int>,
- max_gpu<float>,
- max_gpu<double>
+ maxMat<unsigned char>,
+ maxMat<signed char>,
+ maxMat<unsigned short>,
+ maxMat<short>,
+ maxMat<int>,
+ maxMat<float>,
+ maxMat<double>
+ };
+ static const func_t vfuncs4[] = // looked up by depth, only when depth < CV_32S; 0 = no such variant
+ {
+ vmax4<unsigned int>,
+ vmax4<int>,
+ 0,
+ 0
+ };
+ static const func_t vfuncs2[] = // looked up by depth, only when depth < CV_32S; 0 = no such variant
+ {
+ 0,
+ 0,
+ vmax2<unsigned int>,
+ vmax2<int>
};
- CV_Assert(src1.depth() <= CV_64F);
- CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+ const int depth = src1.depth();
+ const int cn = src1.channels();
- if (src1.depth() == CV_64F)
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src1.size(), src1.type());
- funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
-}
+ cudaStream_t stream = StreamAccessor::getStream(s);
-namespace
-{
- template <typename T> void minScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
// Views over the three matrices with cols * cn as the column count; src1's rows/cols are used
// for all of them (type and size equality of src1 and src2 was asserted above).
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+ if (depth < CV_32S) // the vfuncs4/vfuncs2 tables are only consulted for these depths
{
- cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream);
+ const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+ const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+ const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
// True when all three base addresses are multiples of 32.
// NOTE(review): only the base pointers are tested; the row steps are not examined here.
+ const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+ if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+ {
+ const func_t vfunc4 = vfuncs4[depth];
+ const func_t vfunc2 = vfuncs2[depth];
+
+ if (vfunc4 != 0 && (src1_.cols & 3) == 0) // variant exists and column count is a multiple of 4
+ {
+ const int vcols = src1_.cols >> 2; // column count passed to vfunc4: cols / 4
+
+ vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+
+ if (vfunc2 != 0 && (src1_.cols & 1) == 0) // variant exists and column count is even
+ {
+ const int vcols = src1_.cols >> 1; // column count passed to vfunc2: cols / 2
+
+ vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+ PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+ PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+ stream);
+
+ return;
+ }
+ }
}
- template <typename T> void maxScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
// Fallback: reached when no fast path above returned.
+ const func_t func = funcs[depth];
+
+ if (!func) // defensive: none of the funcs entries listed above is null
+ CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+ func(src1_, src2_, dst_, stream);
+}
+
+namespace
+{
// Converts val to T with saturate_cast and returns the result converted back to double.
// In the patch this replaces the removed minScalar/maxScalar wrappers, which applied
// saturate_cast<T>(val) themselves before calling min_gpu/max_gpu.
+ template <typename T> double castScalar(double val)
{
- cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream);
+ return saturate_cast<T>(val);
}
}
// Matrix/scalar overload of cv::gpu::min, shown as a patch ('-' = removed line, '+' = added line).
// The added version asserts a single-channel source of depth <= CV_64F, allocates dst with the
// source size and type, saturates val to the source depth on the host via cast_func, and calls
// the per-depth entry of funcs on the stream obtained from StreamAccessor.
void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ using namespace arithm;
+
+ typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
// NOTE(review): only the first and last entries of this table are visible in this excerpt.
static const func_t funcs[] =
{
minScalar<unsigned char>,
minScalar<double>
};
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(src.channels() == 1);
+ typedef double (*cast_func_t)(double sc);
+ static const cast_func_t cast_func[] = // indexed by depth, same order as the element types listed
+ {
+ castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+ };
+
+ const int depth = src.depth();
- if (src.depth() == CV_64F)
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 );
+
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src.size(), src.type());
- funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); // val is saturated to the source depth first
}
// Matrix/scalar overload of cv::gpu::max, shown as a patch ('-' = removed line, '+' = added line).
// The added version asserts a single-channel source of depth <= CV_64F, allocates dst with the
// source size and type, saturates val to the source depth on the host via cast_func, and calls
// the per-depth entry of funcs on the stream obtained from StreamAccessor.
void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
{
- typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+ using namespace arithm;
+
+ typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
// NOTE(review): only the first and last entries of this table are visible in this excerpt.
static const func_t funcs[] =
{
maxScalar<unsigned char>,
maxScalar<double>
};
- CV_Assert(src.depth() <= CV_64F);
- CV_Assert(src.channels() == 1);
+ typedef double (*cast_func_t)(double sc);
+ static const cast_func_t cast_func[] = // indexed by depth, same order as the element types listed
+ {
+ castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+ };
+
+ const int depth = src.depth();
+
+ CV_Assert( depth <= CV_64F );
+ CV_Assert( src.channels() == 1 );
- if (src.depth() == CV_64F)
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src.size(), src.type());
- funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+ funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); // val is saturated to the source depth first
}
////////////////////////////////////////////////////////////////////////
// threshold
// Forward declaration of the threshold entry point; its definition is not part of this excerpt.
// The patch moves it from cv::gpu::device to namespace arithm, changes thresh/maxVal from T to
// double, and drops the host-side threshold_caller wrapper that applied saturate_cast<T> to both.
-namespace cv { namespace gpu { namespace device
+namespace arithm
{
template <typename T>
- void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type, cudaStream_t stream);
-}}}
-
-namespace
-{
- template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
- {
- cv::gpu::device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
- }
+ void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
}
// cv::gpu::threshold, shown as a patch ('-' = removed line, '+' = added line).
// NOTE(review): this excerpt omits part of the body (the branch preceding the 'else' below,
// the definition of 'stream', and the closing brace); the comments cover only the visible lines.
double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
{
- CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
- CV_Assert(type <= THRESH_TOZERO_INV);
+ const int depth = src.depth();
+
+ CV_Assert( src.channels() == 1 && depth <= CV_64F );
+ CV_Assert( type <= THRESH_TOZERO_INV );
- if (src.depth() == CV_64F)
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
}
else
{
- typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+ typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
// Per-depth dispatch table, indexed by depth.
static const func_t funcs[] =
{
- threshold_caller<unsigned char>, threshold_caller<signed char>,
- threshold_caller<unsigned short>, threshold_caller<short>,
- threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
+ arithm::threshold<unsigned char>,
+ arithm::threshold<signed char>,
+ arithm::threshold<unsigned short>,
+ arithm::threshold<short>,
+ arithm::threshold<int>,
+ arithm::threshold<float>,
+ arithm::threshold<double>
};
- if (src.depth() != CV_32F && src.depth() != CV_64F)
+ if (depth != CV_32F && depth != CV_64F)
{
// Depths other than CV_32F/CV_64F: threshold goes through cvFloor, maxVal through cvRound.
thresh = cvFloor(thresh);
maxVal = cvRound(maxVal);
}
- funcs[src.depth()](src, dst, thresh, maxVal, type, stream);
+ funcs[depth](src, dst, thresh, maxVal, type, stream);
}
return thresh; // the threshold as used, i.e. after the cvFloor adjustment when that branch ran
////////////////////////////////////////////////////////////////////////
// pow
// Forward declaration of the per-type pow entry point; its definition is not part of this excerpt.
// The patch renames pow_caller to pow and moves it from cv::gpu::device to namespace arithm;
// the parameter list is unchanged.
-namespace cv { namespace gpu { namespace device
+namespace arithm
{
- template<typename T> void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-}}}
+ template<typename T> void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+}
// cv::gpu::pow, shown as a patch ('-' = removed line, '+' = added line).
// The added version asserts depth <= CV_64F, raises an error for CV_64F when the device lacks
// NATIVE_DOUBLE, allocates dst with the source size and type, and calls the per-depth
// arithm::pow<T> on views whose column count is cols * channels.
void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
{
- using namespace cv::gpu::device;
-
typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
// Per-depth dispatch table, indexed by depth.
static const func_t funcs[] =
{
- pow_caller<unsigned char>, pow_caller<signed char>,
- pow_caller<unsigned short>, pow_caller<short>,
- pow_caller<int>, pow_caller<float>, pow_caller<double>
+ arithm::pow<unsigned char>,
+ arithm::pow<signed char>,
+ arithm::pow<unsigned short>,
+ arithm::pow<short>,
+ arithm::pow<int>,
+ arithm::pow<float>,
+ arithm::pow<double>
};
- CV_Assert(src.depth() <= CV_64F);
+ const int depth = src.depth();
+ const int cn = src.channels();
- if (src.depth() == CV_64F)
+ CV_Assert(depth <= CV_64F);
+
+ if (depth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
dst.create(src.size(), src.type());
- funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
// Replaces the removed reshape(1) calls: explicit views with cols * cn as the column count.
+ PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step);
+ PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step);
+
+ funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
};
- CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4);
- CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
+ CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
+ CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
dst.create(img1.size(), img1.type());
////////////////////////////////////////////////////////////////////////
// addWeighted
// Forward declaration of the addWeighted entry point, templated on the two source element types
// (T1, T2) and the destination element type (D); its definition is not part of this excerpt.
// The patch moves it to namespace arithm, drops the _gpu suffix, and passes the PtrStepSzb
// arguments by value instead of by const reference.
-namespace cv { namespace gpu { namespace device
+namespace arithm
{
template <typename T1, typename T2, typename D>
- void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-}}}
+ void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+}
-void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
+void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int ddepth, Stream& stream)
{
- using namespace cv::gpu::device;
-
- typedef void (*func_t)(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
+ typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[7][7][7] =
{
{
{
- addWeighted_gpu<unsigned char, unsigned char, unsigned char >,
- addWeighted_gpu<unsigned char, unsigned char, signed char >,
- addWeighted_gpu<unsigned char, unsigned char, unsigned short>,
- addWeighted_gpu<unsigned char, unsigned char, short >,
- addWeighted_gpu<unsigned char, unsigned char, int >,
- addWeighted_gpu<unsigned char, unsigned char, float >,
- addWeighted_gpu<unsigned char, unsigned char, double>
+ arithm::addWeighted<unsigned char, unsigned char, unsigned char >,
+ arithm::addWeighted<unsigned char, unsigned char, signed char >,
+ arithm::addWeighted<unsigned char, unsigned char, unsigned short>,
+ arithm::addWeighted<unsigned char, unsigned char, short >,
+ arithm::addWeighted<unsigned char, unsigned char, int >,
+ arithm::addWeighted<unsigned char, unsigned char, float >,
+ arithm::addWeighted<unsigned char, unsigned char, double>
},
{
- addWeighted_gpu<unsigned char, signed char, unsigned char >,
- addWeighted_gpu<unsigned char, signed char, signed char >,
- addWeighted_gpu<unsigned char, signed char, unsigned short>,
- addWeighted_gpu<unsigned char, signed char, short >,
- addWeighted_gpu<unsigned char, signed char, int >,
- addWeighted_gpu<unsigned char, signed char, float >,
- addWeighted_gpu<unsigned char, signed char, double>
+ arithm::addWeighted<unsigned char, signed char, unsigned char >,
+ arithm::addWeighted<unsigned char, signed char, signed char >,
+ arithm::addWeighted<unsigned char, signed char, unsigned short>,
+ arithm::addWeighted<unsigned char, signed char, short >,
+ arithm::addWeighted<unsigned char, signed char, int >,
+ arithm::addWeighted<unsigned char, signed char, float >,
+ arithm::addWeighted<unsigned char, signed char, double>
},
{
- addWeighted_gpu<unsigned char, unsigned short, unsigned char >,
- addWeighted_gpu<unsigned char, unsigned short, signed char >,
- addWeighted_gpu<unsigned char, unsigned short, unsigned short>,
- addWeighted_gpu<unsigned char, unsigned short, short >,
- addWeighted_gpu<unsigned char, unsigned short, int >,
- addWeighted_gpu<unsigned char, unsigned short, float >,
- addWeighted_gpu<unsigned char, unsigned short, double>
+ arithm::addWeighted<unsigned char, unsigned short, unsigned char >,
+ arithm::addWeighted<unsigned char, unsigned short, signed char >,
+ arithm::addWeighted<unsigned char, unsigned short, unsigned short>,
+ arithm::addWeighted<unsigned char, unsigned short, short >,
+ arithm::addWeighted<unsigned char, unsigned short, int >,
+ arithm::addWeighted<unsigned char, unsigned short, float >,
+ arithm::addWeighted<unsigned char, unsigned short, double>
},
{
- addWeighted_gpu<unsigned char, short, unsigned char >,
- addWeighted_gpu<unsigned char, short, signed char >,
- addWeighted_gpu<unsigned char, short, unsigned short>,
- addWeighted_gpu<unsigned char, short, short >,
- addWeighted_gpu<unsigned char, short, int >,
- addWeighted_gpu<unsigned char, short, float >,
- addWeighted_gpu<unsigned char, short, double>
+ arithm::addWeighted<unsigned char, short, unsigned char >,
+ arithm::addWeighted<unsigned char, short, signed char >,
+ arithm::addWeighted<unsigned char, short, unsigned short>,
+ arithm::addWeighted<unsigned char, short, short >,
+ arithm::addWeighted<unsigned char, short, int >,
+ arithm::addWeighted<unsigned char, short, float >,
+ arithm::addWeighted<unsigned char, short, double>
},
{
- addWeighted_gpu<unsigned char, int, unsigned char >,
- addWeighted_gpu<unsigned char, int, signed char >,
- addWeighted_gpu<unsigned char, int, unsigned short>,
- addWeighted_gpu<unsigned char, int, short >,
- addWeighted_gpu<unsigned char, int, int >,
- addWeighted_gpu<unsigned char, int, float >,
- addWeighted_gpu<unsigned char, int, double>
+ arithm::addWeighted<unsigned char, int, unsigned char >,
+ arithm::addWeighted<unsigned char, int, signed char >,
+ arithm::addWeighted<unsigned char, int, unsigned short>,
+ arithm::addWeighted<unsigned char, int, short >,
+ arithm::addWeighted<unsigned char, int, int >,
+ arithm::addWeighted<unsigned char, int, float >,
+ arithm::addWeighted<unsigned char, int, double>
},
{
- addWeighted_gpu<unsigned char, float, unsigned char >,
- addWeighted_gpu<unsigned char, float, signed char >,
- addWeighted_gpu<unsigned char, float, unsigned short>,
- addWeighted_gpu<unsigned char, float, short >,
- addWeighted_gpu<unsigned char, float, int >,
- addWeighted_gpu<unsigned char, float, float >,
- addWeighted_gpu<unsigned char, float, double>
+ arithm::addWeighted<unsigned char, float, unsigned char >,
+ arithm::addWeighted<unsigned char, float, signed char >,
+ arithm::addWeighted<unsigned char, float, unsigned short>,
+ arithm::addWeighted<unsigned char, float, short >,
+ arithm::addWeighted<unsigned char, float, int >,
+ arithm::addWeighted<unsigned char, float, float >,
+ arithm::addWeighted<unsigned char, float, double>
},
{
- addWeighted_gpu<unsigned char, double, unsigned char >,
- addWeighted_gpu<unsigned char, double, signed char >,
- addWeighted_gpu<unsigned char, double, unsigned short>,
- addWeighted_gpu<unsigned char, double, short >,
- addWeighted_gpu<unsigned char, double, int >,
- addWeighted_gpu<unsigned char, double, float >,
- addWeighted_gpu<unsigned char, double, double>
+ arithm::addWeighted<unsigned char, double, unsigned char >,
+ arithm::addWeighted<unsigned char, double, signed char >,
+ arithm::addWeighted<unsigned char, double, unsigned short>,
+ arithm::addWeighted<unsigned char, double, short >,
+ arithm::addWeighted<unsigned char, double, int >,
+ arithm::addWeighted<unsigned char, double, float >,
+ arithm::addWeighted<unsigned char, double, double>
}
},
{
{
- 0/*addWeighted_gpu<signed char, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<signed char, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<signed char, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<signed char, unsigned char, short >*/,
- 0/*addWeighted_gpu<signed char, unsigned char, int >*/,
- 0/*addWeighted_gpu<signed char, unsigned char, float >*/,
- 0/*addWeighted_gpu<signed char, unsigned char, double>*/
+ 0/*arithm::addWeighted<signed char, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, short >*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, int >*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, float >*/,
+ 0/*arithm::addWeighted<signed char, unsigned char, double>*/
},
{
- addWeighted_gpu<signed char, signed char, unsigned char >,
- addWeighted_gpu<signed char, signed char, signed char >,
- addWeighted_gpu<signed char, signed char, unsigned short>,
- addWeighted_gpu<signed char, signed char, short >,
- addWeighted_gpu<signed char, signed char, int >,
- addWeighted_gpu<signed char, signed char, float >,
- addWeighted_gpu<signed char, signed char, double>
+ arithm::addWeighted<signed char, signed char, unsigned char >,
+ arithm::addWeighted<signed char, signed char, signed char >,
+ arithm::addWeighted<signed char, signed char, unsigned short>,
+ arithm::addWeighted<signed char, signed char, short >,
+ arithm::addWeighted<signed char, signed char, int >,
+ arithm::addWeighted<signed char, signed char, float >,
+ arithm::addWeighted<signed char, signed char, double>
},
{
- addWeighted_gpu<signed char, unsigned short, unsigned char >,
- addWeighted_gpu<signed char, unsigned short, signed char >,
- addWeighted_gpu<signed char, unsigned short, unsigned short>,
- addWeighted_gpu<signed char, unsigned short, short >,
- addWeighted_gpu<signed char, unsigned short, int >,
- addWeighted_gpu<signed char, unsigned short, float >,
- addWeighted_gpu<signed char, unsigned short, double>
+ arithm::addWeighted<signed char, unsigned short, unsigned char >,
+ arithm::addWeighted<signed char, unsigned short, signed char >,
+ arithm::addWeighted<signed char, unsigned short, unsigned short>,
+ arithm::addWeighted<signed char, unsigned short, short >,
+ arithm::addWeighted<signed char, unsigned short, int >,
+ arithm::addWeighted<signed char, unsigned short, float >,
+ arithm::addWeighted<signed char, unsigned short, double>
},
{
- addWeighted_gpu<signed char, short, unsigned char >,
- addWeighted_gpu<signed char, short, signed char >,
- addWeighted_gpu<signed char, short, unsigned short>,
- addWeighted_gpu<signed char, short, short >,
- addWeighted_gpu<signed char, short, int >,
- addWeighted_gpu<signed char, short, float >,
- addWeighted_gpu<signed char, short, double>
+ arithm::addWeighted<signed char, short, unsigned char >,
+ arithm::addWeighted<signed char, short, signed char >,
+ arithm::addWeighted<signed char, short, unsigned short>,
+ arithm::addWeighted<signed char, short, short >,
+ arithm::addWeighted<signed char, short, int >,
+ arithm::addWeighted<signed char, short, float >,
+ arithm::addWeighted<signed char, short, double>
},
{
- addWeighted_gpu<signed char, int, unsigned char >,
- addWeighted_gpu<signed char, int, signed char >,
- addWeighted_gpu<signed char, int, unsigned short>,
- addWeighted_gpu<signed char, int, short >,
- addWeighted_gpu<signed char, int, int >,
- addWeighted_gpu<signed char, int, float >,
- addWeighted_gpu<signed char, int, double>
+ arithm::addWeighted<signed char, int, unsigned char >,
+ arithm::addWeighted<signed char, int, signed char >,
+ arithm::addWeighted<signed char, int, unsigned short>,
+ arithm::addWeighted<signed char, int, short >,
+ arithm::addWeighted<signed char, int, int >,
+ arithm::addWeighted<signed char, int, float >,
+ arithm::addWeighted<signed char, int, double>
},
{
- addWeighted_gpu<signed char, float, unsigned char >,
- addWeighted_gpu<signed char, float, signed char >,
- addWeighted_gpu<signed char, float, unsigned short>,
- addWeighted_gpu<signed char, float, short >,
- addWeighted_gpu<signed char, float, int >,
- addWeighted_gpu<signed char, float, float >,
- addWeighted_gpu<signed char, float, double>
+ arithm::addWeighted<signed char, float, unsigned char >,
+ arithm::addWeighted<signed char, float, signed char >,
+ arithm::addWeighted<signed char, float, unsigned short>,
+ arithm::addWeighted<signed char, float, short >,
+ arithm::addWeighted<signed char, float, int >,
+ arithm::addWeighted<signed char, float, float >,
+ arithm::addWeighted<signed char, float, double>
},
{
- addWeighted_gpu<signed char, double, unsigned char >,
- addWeighted_gpu<signed char, double, signed char >,
- addWeighted_gpu<signed char, double, unsigned short>,
- addWeighted_gpu<signed char, double, short >,
- addWeighted_gpu<signed char, double, int >,
- addWeighted_gpu<signed char, double, float >,
- addWeighted_gpu<signed char, double, double>
+ arithm::addWeighted<signed char, double, unsigned char >,
+ arithm::addWeighted<signed char, double, signed char >,
+ arithm::addWeighted<signed char, double, unsigned short>,
+ arithm::addWeighted<signed char, double, short >,
+ arithm::addWeighted<signed char, double, int >,
+ arithm::addWeighted<signed char, double, float >,
+ arithm::addWeighted<signed char, double, double>
}
},
{
{
- 0/*addWeighted_gpu<unsigned short, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, short >*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, int >*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, float >*/,
- 0/*addWeighted_gpu<unsigned short, unsigned char, double>*/
+ 0/*arithm::addWeighted<unsigned short, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, short >*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, int >*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, float >*/,
+ 0/*arithm::addWeighted<unsigned short, unsigned char, double>*/
},
{
- 0/*addWeighted_gpu<unsigned short, signed char, unsigned char >*/,
- 0/*addWeighted_gpu<unsigned short, signed char, signed char >*/,
- 0/*addWeighted_gpu<unsigned short, signed char, unsigned short>*/,
- 0/*addWeighted_gpu<unsigned short, signed char, short >*/,
- 0/*addWeighted_gpu<unsigned short, signed char, int >*/,
- 0/*addWeighted_gpu<unsigned short, signed char, float >*/,
- 0/*addWeighted_gpu<unsigned short, signed char, double>*/
+ 0/*arithm::addWeighted<unsigned short, signed char, unsigned char >*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, signed char >*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, unsigned short>*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, short >*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, int >*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, float >*/,
+ 0/*arithm::addWeighted<unsigned short, signed char, double>*/
},
{
- addWeighted_gpu<unsigned short, unsigned short, unsigned char >,
- addWeighted_gpu<unsigned short, unsigned short, signed char >,
- addWeighted_gpu<unsigned short, unsigned short, unsigned short>,
- addWeighted_gpu<unsigned short, unsigned short, short >,
- addWeighted_gpu<unsigned short, unsigned short, int >,
- addWeighted_gpu<unsigned short, unsigned short, float >,
- addWeighted_gpu<unsigned short, unsigned short, double>
+ arithm::addWeighted<unsigned short, unsigned short, unsigned char >,
+ arithm::addWeighted<unsigned short, unsigned short, signed char >,
+ arithm::addWeighted<unsigned short, unsigned short, unsigned short>,
+ arithm::addWeighted<unsigned short, unsigned short, short >,
+ arithm::addWeighted<unsigned short, unsigned short, int >,
+ arithm::addWeighted<unsigned short, unsigned short, float >,
+ arithm::addWeighted<unsigned short, unsigned short, double>
},
{
- addWeighted_gpu<unsigned short, short, unsigned char >,
- addWeighted_gpu<unsigned short, short, signed char >,
- addWeighted_gpu<unsigned short, short, unsigned short>,
- addWeighted_gpu<unsigned short, short, short >,
- addWeighted_gpu<unsigned short, short, int >,
- addWeighted_gpu<unsigned short, short, float >,
- addWeighted_gpu<unsigned short, short, double>
+ arithm::addWeighted<unsigned short, short, unsigned char >,
+ arithm::addWeighted<unsigned short, short, signed char >,
+ arithm::addWeighted<unsigned short, short, unsigned short>,
+ arithm::addWeighted<unsigned short, short, short >,
+ arithm::addWeighted<unsigned short, short, int >,
+ arithm::addWeighted<unsigned short, short, float >,
+ arithm::addWeighted<unsigned short, short, double>
},
{
- addWeighted_gpu<unsigned short, int, unsigned char >,
- addWeighted_gpu<unsigned short, int, signed char >,
- addWeighted_gpu<unsigned short, int, unsigned short>,
- addWeighted_gpu<unsigned short, int, short >,
- addWeighted_gpu<unsigned short, int, int >,
- addWeighted_gpu<unsigned short, int, float >,
- addWeighted_gpu<unsigned short, int, double>
+ arithm::addWeighted<unsigned short, int, unsigned char >,
+ arithm::addWeighted<unsigned short, int, signed char >,
+ arithm::addWeighted<unsigned short, int, unsigned short>,
+ arithm::addWeighted<unsigned short, int, short >,
+ arithm::addWeighted<unsigned short, int, int >,
+ arithm::addWeighted<unsigned short, int, float >,
+ arithm::addWeighted<unsigned short, int, double>
},
{
- addWeighted_gpu<unsigned short, float, unsigned char >,
- addWeighted_gpu<unsigned short, float, signed char >,
- addWeighted_gpu<unsigned short, float, unsigned short>,
- addWeighted_gpu<unsigned short, float, short >,
- addWeighted_gpu<unsigned short, float, int >,
- addWeighted_gpu<unsigned short, float, float >,
- addWeighted_gpu<unsigned short, float, double>
+ arithm::addWeighted<unsigned short, float, unsigned char >,
+ arithm::addWeighted<unsigned short, float, signed char >,
+ arithm::addWeighted<unsigned short, float, unsigned short>,
+ arithm::addWeighted<unsigned short, float, short >,
+ arithm::addWeighted<unsigned short, float, int >,
+ arithm::addWeighted<unsigned short, float, float >,
+ arithm::addWeighted<unsigned short, float, double>
},
{
- addWeighted_gpu<unsigned short, double, unsigned char >,
- addWeighted_gpu<unsigned short, double, signed char >,
- addWeighted_gpu<unsigned short, double, unsigned short>,
- addWeighted_gpu<unsigned short, double, short >,
- addWeighted_gpu<unsigned short, double, int >,
- addWeighted_gpu<unsigned short, double, float >,
- addWeighted_gpu<unsigned short, double, double>
+ arithm::addWeighted<unsigned short, double, unsigned char >,
+ arithm::addWeighted<unsigned short, double, signed char >,
+ arithm::addWeighted<unsigned short, double, unsigned short>,
+ arithm::addWeighted<unsigned short, double, short >,
+ arithm::addWeighted<unsigned short, double, int >,
+ arithm::addWeighted<unsigned short, double, float >,
+ arithm::addWeighted<unsigned short, double, double>
}
},
{
{
- 0/*addWeighted_gpu<short, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<short, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<short, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<short, unsigned char, short >*/,
- 0/*addWeighted_gpu<short, unsigned char, int >*/,
- 0/*addWeighted_gpu<short, unsigned char, float >*/,
- 0/*addWeighted_gpu<short, unsigned char, double>*/
+ 0/*arithm::addWeighted<short, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<short, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<short, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<short, unsigned char, short >*/,
+ 0/*arithm::addWeighted<short, unsigned char, int >*/,
+ 0/*arithm::addWeighted<short, unsigned char, float >*/,
+ 0/*arithm::addWeighted<short, unsigned char, double>*/
},
{
- 0/*addWeighted_gpu<short, signed char, unsigned char >*/,
- 0/*addWeighted_gpu<short, signed char, signed char >*/,
- 0/*addWeighted_gpu<short, signed char, unsigned short>*/,
- 0/*addWeighted_gpu<short, signed char, short >*/,
- 0/*addWeighted_gpu<short, signed char, int >*/,
- 0/*addWeighted_gpu<short, signed char, float >*/,
- 0/*addWeighted_gpu<short, signed char, double>*/
+ 0/*arithm::addWeighted<short, signed char, unsigned char >*/,
+ 0/*arithm::addWeighted<short, signed char, signed char >*/,
+ 0/*arithm::addWeighted<short, signed char, unsigned short>*/,
+ 0/*arithm::addWeighted<short, signed char, short >*/,
+ 0/*arithm::addWeighted<short, signed char, int >*/,
+ 0/*arithm::addWeighted<short, signed char, float >*/,
+ 0/*arithm::addWeighted<short, signed char, double>*/
},
{
- 0/*addWeighted_gpu<short, unsigned short, unsigned char >*/,
- 0/*addWeighted_gpu<short, unsigned short, signed char >*/,
- 0/*addWeighted_gpu<short, unsigned short, unsigned short>*/,
- 0/*addWeighted_gpu<short, unsigned short, short >*/,
- 0/*addWeighted_gpu<short, unsigned short, int >*/,
- 0/*addWeighted_gpu<short, unsigned short, float >*/,
- 0/*addWeighted_gpu<short, unsigned short, double>*/
+ 0/*arithm::addWeighted<short, unsigned short, unsigned char >*/,
+ 0/*arithm::addWeighted<short, unsigned short, signed char >*/,
+ 0/*arithm::addWeighted<short, unsigned short, unsigned short>*/,
+ 0/*arithm::addWeighted<short, unsigned short, short >*/,
+ 0/*arithm::addWeighted<short, unsigned short, int >*/,
+ 0/*arithm::addWeighted<short, unsigned short, float >*/,
+ 0/*arithm::addWeighted<short, unsigned short, double>*/
},
{
- addWeighted_gpu<short, short, unsigned char >,
- addWeighted_gpu<short, short, signed char >,
- addWeighted_gpu<short, short, unsigned short>,
- addWeighted_gpu<short, short, short >,
- addWeighted_gpu<short, short, int >,
- addWeighted_gpu<short, short, float >,
- addWeighted_gpu<short, short, double>
+ arithm::addWeighted<short, short, unsigned char >,
+ arithm::addWeighted<short, short, signed char >,
+ arithm::addWeighted<short, short, unsigned short>,
+ arithm::addWeighted<short, short, short >,
+ arithm::addWeighted<short, short, int >,
+ arithm::addWeighted<short, short, float >,
+ arithm::addWeighted<short, short, double>
},
{
- addWeighted_gpu<short, int, unsigned char >,
- addWeighted_gpu<short, int, signed char >,
- addWeighted_gpu<short, int, unsigned short>,
- addWeighted_gpu<short, int, short >,
- addWeighted_gpu<short, int, int >,
- addWeighted_gpu<short, int, float >,
- addWeighted_gpu<short, int, double>
+ arithm::addWeighted<short, int, unsigned char >,
+ arithm::addWeighted<short, int, signed char >,
+ arithm::addWeighted<short, int, unsigned short>,
+ arithm::addWeighted<short, int, short >,
+ arithm::addWeighted<short, int, int >,
+ arithm::addWeighted<short, int, float >,
+ arithm::addWeighted<short, int, double>
},
{
- addWeighted_gpu<short, float, unsigned char >,
- addWeighted_gpu<short, float, signed char >,
- addWeighted_gpu<short, float, unsigned short>,
- addWeighted_gpu<short, float, short >,
- addWeighted_gpu<short, float, int >,
- addWeighted_gpu<short, float, float >,
- addWeighted_gpu<short, float, double>
+ arithm::addWeighted<short, float, unsigned char >,
+ arithm::addWeighted<short, float, signed char >,
+ arithm::addWeighted<short, float, unsigned short>,
+ arithm::addWeighted<short, float, short >,
+ arithm::addWeighted<short, float, int >,
+ arithm::addWeighted<short, float, float >,
+ arithm::addWeighted<short, float, double>
},
{
- addWeighted_gpu<short, double, unsigned char >,
- addWeighted_gpu<short, double, signed char >,
- addWeighted_gpu<short, double, unsigned short>,
- addWeighted_gpu<short, double, short >,
- addWeighted_gpu<short, double, int >,
- addWeighted_gpu<short, double, float >,
- addWeighted_gpu<short, double, double>
+ arithm::addWeighted<short, double, unsigned char >,
+ arithm::addWeighted<short, double, signed char >,
+ arithm::addWeighted<short, double, unsigned short>,
+ arithm::addWeighted<short, double, short >,
+ arithm::addWeighted<short, double, int >,
+ arithm::addWeighted<short, double, float >,
+ arithm::addWeighted<short, double, double>
}
},
{
{
- 0/*addWeighted_gpu<int, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<int, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<int, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<int, unsigned char, short >*/,
- 0/*addWeighted_gpu<int, unsigned char, int >*/,
- 0/*addWeighted_gpu<int, unsigned char, float >*/,
- 0/*addWeighted_gpu<int, unsigned char, double>*/
+ 0/*arithm::addWeighted<int, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<int, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<int, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<int, unsigned char, short >*/,
+ 0/*arithm::addWeighted<int, unsigned char, int >*/,
+ 0/*arithm::addWeighted<int, unsigned char, float >*/,
+ 0/*arithm::addWeighted<int, unsigned char, double>*/
},
{
- 0/*addWeighted_gpu<int, signed char, unsigned char >*/,
- 0/*addWeighted_gpu<int, signed char, signed char >*/,
- 0/*addWeighted_gpu<int, signed char, unsigned short>*/,
- 0/*addWeighted_gpu<int, signed char, short >*/,
- 0/*addWeighted_gpu<int, signed char, int >*/,
- 0/*addWeighted_gpu<int, signed char, float >*/,
- 0/*addWeighted_gpu<int, signed char, double>*/
+ 0/*arithm::addWeighted<int, signed char, unsigned char >*/,
+ 0/*arithm::addWeighted<int, signed char, signed char >*/,
+ 0/*arithm::addWeighted<int, signed char, unsigned short>*/,
+ 0/*arithm::addWeighted<int, signed char, short >*/,
+ 0/*arithm::addWeighted<int, signed char, int >*/,
+ 0/*arithm::addWeighted<int, signed char, float >*/,
+ 0/*arithm::addWeighted<int, signed char, double>*/
},
{
- 0/*addWeighted_gpu<int, unsigned short, unsigned char >*/,
- 0/*addWeighted_gpu<int, unsigned short, signed char >*/,
- 0/*addWeighted_gpu<int, unsigned short, unsigned short>*/,
- 0/*addWeighted_gpu<int, unsigned short, short >*/,
- 0/*addWeighted_gpu<int, unsigned short, int >*/,
- 0/*addWeighted_gpu<int, unsigned short, float >*/,
- 0/*addWeighted_gpu<int, unsigned short, double>*/
+ 0/*arithm::addWeighted<int, unsigned short, unsigned char >*/,
+ 0/*arithm::addWeighted<int, unsigned short, signed char >*/,
+ 0/*arithm::addWeighted<int, unsigned short, unsigned short>*/,
+ 0/*arithm::addWeighted<int, unsigned short, short >*/,
+ 0/*arithm::addWeighted<int, unsigned short, int >*/,
+ 0/*arithm::addWeighted<int, unsigned short, float >*/,
+ 0/*arithm::addWeighted<int, unsigned short, double>*/
},
{
- 0/*addWeighted_gpu<int, short, unsigned char >*/,
- 0/*addWeighted_gpu<int, short, signed char >*/,
- 0/*addWeighted_gpu<int, short, unsigned short>*/,
- 0/*addWeighted_gpu<int, short, short >*/,
- 0/*addWeighted_gpu<int, short, int >*/,
- 0/*addWeighted_gpu<int, short, float >*/,
- 0/*addWeighted_gpu<int, short, double>*/
+ 0/*arithm::addWeighted<int, short, unsigned char >*/,
+ 0/*arithm::addWeighted<int, short, signed char >*/,
+ 0/*arithm::addWeighted<int, short, unsigned short>*/,
+ 0/*arithm::addWeighted<int, short, short >*/,
+ 0/*arithm::addWeighted<int, short, int >*/,
+ 0/*arithm::addWeighted<int, short, float >*/,
+ 0/*arithm::addWeighted<int, short, double>*/
},
{
- addWeighted_gpu<int, int, unsigned char >,
- addWeighted_gpu<int, int, signed char >,
- addWeighted_gpu<int, int, unsigned short>,
- addWeighted_gpu<int, int, short >,
- addWeighted_gpu<int, int, int >,
- addWeighted_gpu<int, int, float >,
- addWeighted_gpu<int, int, double>
+ arithm::addWeighted<int, int, unsigned char >,
+ arithm::addWeighted<int, int, signed char >,
+ arithm::addWeighted<int, int, unsigned short>,
+ arithm::addWeighted<int, int, short >,
+ arithm::addWeighted<int, int, int >,
+ arithm::addWeighted<int, int, float >,
+ arithm::addWeighted<int, int, double>
},
{
- addWeighted_gpu<int, float, unsigned char >,
- addWeighted_gpu<int, float, signed char >,
- addWeighted_gpu<int, float, unsigned short>,
- addWeighted_gpu<int, float, short >,
- addWeighted_gpu<int, float, int >,
- addWeighted_gpu<int, float, float >,
- addWeighted_gpu<int, float, double>
+ arithm::addWeighted<int, float, unsigned char >,
+ arithm::addWeighted<int, float, signed char >,
+ arithm::addWeighted<int, float, unsigned short>,
+ arithm::addWeighted<int, float, short >,
+ arithm::addWeighted<int, float, int >,
+ arithm::addWeighted<int, float, float >,
+ arithm::addWeighted<int, float, double>
},
{
- addWeighted_gpu<int, double, unsigned char >,
- addWeighted_gpu<int, double, signed char >,
- addWeighted_gpu<int, double, unsigned short>,
- addWeighted_gpu<int, double, short >,
- addWeighted_gpu<int, double, int >,
- addWeighted_gpu<int, double, float >,
- addWeighted_gpu<int, double, double>
+ arithm::addWeighted<int, double, unsigned char >,
+ arithm::addWeighted<int, double, signed char >,
+ arithm::addWeighted<int, double, unsigned short>,
+ arithm::addWeighted<int, double, short >,
+ arithm::addWeighted<int, double, int >,
+ arithm::addWeighted<int, double, float >,
+ arithm::addWeighted<int, double, double>
}
},
{
{
- 0/*addWeighted_gpu<float, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<float, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<float, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<float, unsigned char, short >*/,
- 0/*addWeighted_gpu<float, unsigned char, int >*/,
- 0/*addWeighted_gpu<float, unsigned char, float >*/,
- 0/*addWeighted_gpu<float, unsigned char, double>*/
+ 0/*arithm::addWeighted<float, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<float, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<float, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<float, unsigned char, short >*/,
+ 0/*arithm::addWeighted<float, unsigned char, int >*/,
+ 0/*arithm::addWeighted<float, unsigned char, float >*/,
+ 0/*arithm::addWeighted<float, unsigned char, double>*/
},
{
- 0/*addWeighted_gpu<float, signed char, unsigned char >*/,
- 0/*addWeighted_gpu<float, signed char, signed char >*/,
- 0/*addWeighted_gpu<float, signed char, unsigned short>*/,
- 0/*addWeighted_gpu<float, signed char, short >*/,
- 0/*addWeighted_gpu<float, signed char, int >*/,
- 0/*addWeighted_gpu<float, signed char, float >*/,
- 0/*addWeighted_gpu<float, signed char, double>*/
+ 0/*arithm::addWeighted<float, signed char, unsigned char >*/,
+ 0/*arithm::addWeighted<float, signed char, signed char >*/,
+ 0/*arithm::addWeighted<float, signed char, unsigned short>*/,
+ 0/*arithm::addWeighted<float, signed char, short >*/,
+ 0/*arithm::addWeighted<float, signed char, int >*/,
+ 0/*arithm::addWeighted<float, signed char, float >*/,
+ 0/*arithm::addWeighted<float, signed char, double>*/
},
{
- 0/*addWeighted_gpu<float, unsigned short, unsigned char >*/,
- 0/*addWeighted_gpu<float, unsigned short, signed char >*/,
- 0/*addWeighted_gpu<float, unsigned short, unsigned short>*/,
- 0/*addWeighted_gpu<float, unsigned short, short >*/,
- 0/*addWeighted_gpu<float, unsigned short, int >*/,
- 0/*addWeighted_gpu<float, unsigned short, float >*/,
- 0/*addWeighted_gpu<float, unsigned short, double>*/
+ 0/*arithm::addWeighted<float, unsigned short, unsigned char >*/,
+ 0/*arithm::addWeighted<float, unsigned short, signed char >*/,
+ 0/*arithm::addWeighted<float, unsigned short, unsigned short>*/,
+ 0/*arithm::addWeighted<float, unsigned short, short >*/,
+ 0/*arithm::addWeighted<float, unsigned short, int >*/,
+ 0/*arithm::addWeighted<float, unsigned short, float >*/,
+ 0/*arithm::addWeighted<float, unsigned short, double>*/
},
{
- 0/*addWeighted_gpu<float, short, unsigned char >*/,
- 0/*addWeighted_gpu<float, short, signed char >*/,
- 0/*addWeighted_gpu<float, short, unsigned short>*/,
- 0/*addWeighted_gpu<float, short, short >*/,
- 0/*addWeighted_gpu<float, short, int >*/,
- 0/*addWeighted_gpu<float, short, float >*/,
- 0/*addWeighted_gpu<float, short, double>*/
+ 0/*arithm::addWeighted<float, short, unsigned char >*/,
+ 0/*arithm::addWeighted<float, short, signed char >*/,
+ 0/*arithm::addWeighted<float, short, unsigned short>*/,
+ 0/*arithm::addWeighted<float, short, short >*/,
+ 0/*arithm::addWeighted<float, short, int >*/,
+ 0/*arithm::addWeighted<float, short, float >*/,
+ 0/*arithm::addWeighted<float, short, double>*/
},
{
- 0/*addWeighted_gpu<float, int, unsigned char >*/,
- 0/*addWeighted_gpu<float, int, signed char >*/,
- 0/*addWeighted_gpu<float, int, unsigned short>*/,
- 0/*addWeighted_gpu<float, int, short >*/,
- 0/*addWeighted_gpu<float, int, int >*/,
- 0/*addWeighted_gpu<float, int, float >*/,
- 0/*addWeighted_gpu<float, int, double>*/
+ 0/*arithm::addWeighted<float, int, unsigned char >*/,
+ 0/*arithm::addWeighted<float, int, signed char >*/,
+ 0/*arithm::addWeighted<float, int, unsigned short>*/,
+ 0/*arithm::addWeighted<float, int, short >*/,
+ 0/*arithm::addWeighted<float, int, int >*/,
+ 0/*arithm::addWeighted<float, int, float >*/,
+ 0/*arithm::addWeighted<float, int, double>*/
},
{
- addWeighted_gpu<float, float, unsigned char >,
- addWeighted_gpu<float, float, signed char >,
- addWeighted_gpu<float, float, unsigned short>,
- addWeighted_gpu<float, float, short >,
- addWeighted_gpu<float, float, int >,
- addWeighted_gpu<float, float, float >,
- addWeighted_gpu<float, float, double>
+ arithm::addWeighted<float, float, unsigned char >,
+ arithm::addWeighted<float, float, signed char >,
+ arithm::addWeighted<float, float, unsigned short>,
+ arithm::addWeighted<float, float, short >,
+ arithm::addWeighted<float, float, int >,
+ arithm::addWeighted<float, float, float >,
+ arithm::addWeighted<float, float, double>
},
{
- addWeighted_gpu<float, double, unsigned char >,
- addWeighted_gpu<float, double, signed char >,
- addWeighted_gpu<float, double, unsigned short>,
- addWeighted_gpu<float, double, short >,
- addWeighted_gpu<float, double, int >,
- addWeighted_gpu<float, double, float >,
- addWeighted_gpu<float, double, double>
+ arithm::addWeighted<float, double, unsigned char >,
+ arithm::addWeighted<float, double, signed char >,
+ arithm::addWeighted<float, double, unsigned short>,
+ arithm::addWeighted<float, double, short >,
+ arithm::addWeighted<float, double, int >,
+ arithm::addWeighted<float, double, float >,
+ arithm::addWeighted<float, double, double>
}
},
{
{
- 0/*addWeighted_gpu<double, unsigned char, unsigned char >*/,
- 0/*addWeighted_gpu<double, unsigned char, signed char >*/,
- 0/*addWeighted_gpu<double, unsigned char, unsigned short>*/,
- 0/*addWeighted_gpu<double, unsigned char, short >*/,
- 0/*addWeighted_gpu<double, unsigned char, int >*/,
- 0/*addWeighted_gpu<double, unsigned char, float >*/,
- 0/*addWeighted_gpu<double, unsigned char, double>*/
+ 0/*arithm::addWeighted<double, unsigned char, unsigned char >*/,
+ 0/*arithm::addWeighted<double, unsigned char, signed char >*/,
+ 0/*arithm::addWeighted<double, unsigned char, unsigned short>*/,
+ 0/*arithm::addWeighted<double, unsigned char, short >*/,
+ 0/*arithm::addWeighted<double, unsigned char, int >*/,
+ 0/*arithm::addWeighted<double, unsigned char, float >*/,
+ 0/*arithm::addWeighted<double, unsigned char, double>*/
},
{
- 0/*addWeighted_gpu<double, signed char, unsigned char >*/,
- 0/*addWeighted_gpu<double, signed char, signed char >*/,
- 0/*addWeighted_gpu<double, signed char, unsigned short>*/,
- 0/*addWeighted_gpu<double, signed char, short >*/,
- 0/*addWeighted_gpu<double, signed char, int >*/,
- 0/*addWeighted_gpu<double, signed char, float >*/,
- 0/*addWeighted_gpu<double, signed char, double>*/
+ 0/*arithm::addWeighted<double, signed char, unsigned char >*/,
+ 0/*arithm::addWeighted<double, signed char, signed char >*/,
+ 0/*arithm::addWeighted<double, signed char, unsigned short>*/,
+ 0/*arithm::addWeighted<double, signed char, short >*/,
+ 0/*arithm::addWeighted<double, signed char, int >*/,
+ 0/*arithm::addWeighted<double, signed char, float >*/,
+ 0/*arithm::addWeighted<double, signed char, double>*/
},
{
- 0/*addWeighted_gpu<double, unsigned short, unsigned char >*/,
- 0/*addWeighted_gpu<double, unsigned short, signed char >*/,
- 0/*addWeighted_gpu<double, unsigned short, unsigned short>*/,
- 0/*addWeighted_gpu<double, unsigned short, short >*/,
- 0/*addWeighted_gpu<double, unsigned short, int >*/,
- 0/*addWeighted_gpu<double, unsigned short, float >*/,
- 0/*addWeighted_gpu<double, unsigned short, double>*/
+ 0/*arithm::addWeighted<double, unsigned short, unsigned char >*/,
+ 0/*arithm::addWeighted<double, unsigned short, signed char >*/,
+ 0/*arithm::addWeighted<double, unsigned short, unsigned short>*/,
+ 0/*arithm::addWeighted<double, unsigned short, short >*/,
+ 0/*arithm::addWeighted<double, unsigned short, int >*/,
+ 0/*arithm::addWeighted<double, unsigned short, float >*/,
+ 0/*arithm::addWeighted<double, unsigned short, double>*/
},
{
- 0/*addWeighted_gpu<double, short, unsigned char >*/,
- 0/*addWeighted_gpu<double, short, signed char >*/,
- 0/*addWeighted_gpu<double, short, unsigned short>*/,
- 0/*addWeighted_gpu<double, short, short >*/,
- 0/*addWeighted_gpu<double, short, int >*/,
- 0/*addWeighted_gpu<double, short, float >*/,
- 0/*addWeighted_gpu<double, short, double>*/
+ 0/*arithm::addWeighted<double, short, unsigned char >*/,
+ 0/*arithm::addWeighted<double, short, signed char >*/,
+ 0/*arithm::addWeighted<double, short, unsigned short>*/,
+ 0/*arithm::addWeighted<double, short, short >*/,
+ 0/*arithm::addWeighted<double, short, int >*/,
+ 0/*arithm::addWeighted<double, short, float >*/,
+ 0/*arithm::addWeighted<double, short, double>*/
},
{
- 0/*addWeighted_gpu<double, int, unsigned char >*/,
- 0/*addWeighted_gpu<double, int, signed char >*/,
- 0/*addWeighted_gpu<double, int, unsigned short>*/,
- 0/*addWeighted_gpu<double, int, short >*/,
- 0/*addWeighted_gpu<double, int, int >*/,
- 0/*addWeighted_gpu<double, int, float >*/,
- 0/*addWeighted_gpu<double, int, double>*/
+ 0/*arithm::addWeighted<double, int, unsigned char >*/,
+ 0/*arithm::addWeighted<double, int, signed char >*/,
+ 0/*arithm::addWeighted<double, int, unsigned short>*/,
+ 0/*arithm::addWeighted<double, int, short >*/,
+ 0/*arithm::addWeighted<double, int, int >*/,
+ 0/*arithm::addWeighted<double, int, float >*/,
+ 0/*arithm::addWeighted<double, int, double>*/
},
{
- 0/*addWeighted_gpu<double, float, unsigned char >*/,
- 0/*addWeighted_gpu<double, float, signed char >*/,
- 0/*addWeighted_gpu<double, float, unsigned short>*/,
- 0/*addWeighted_gpu<double, float, short >*/,
- 0/*addWeighted_gpu<double, float, int >*/,
- 0/*addWeighted_gpu<double, float, float >*/,
- 0/*addWeighted_gpu<double, float, double>*/
+ 0/*arithm::addWeighted<double, float, unsigned char >*/,
+ 0/*arithm::addWeighted<double, float, signed char >*/,
+ 0/*arithm::addWeighted<double, float, unsigned short>*/,
+ 0/*arithm::addWeighted<double, float, short >*/,
+ 0/*arithm::addWeighted<double, float, int >*/,
+ 0/*arithm::addWeighted<double, float, float >*/,
+ 0/*arithm::addWeighted<double, float, double>*/
},
{
- addWeighted_gpu<double, double, unsigned char >,
- addWeighted_gpu<double, double, signed char >,
- addWeighted_gpu<double, double, unsigned short>,
- addWeighted_gpu<double, double, short >,
- addWeighted_gpu<double, double, int >,
- addWeighted_gpu<double, double, float >,
- addWeighted_gpu<double, double, double>
+ arithm::addWeighted<double, double, unsigned char >,
+ arithm::addWeighted<double, double, signed char >,
+ arithm::addWeighted<double, double, unsigned short>,
+ arithm::addWeighted<double, double, short >,
+ arithm::addWeighted<double, double, int >,
+ arithm::addWeighted<double, double, float >,
+ arithm::addWeighted<double, double, double>
}
}
};
- CV_Assert(src1.size() == src2.size());
- CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
-
- dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
+ int sdepth1 = src1.depth();
+ int sdepth2 = src2.depth();
+ ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
+ const int cn = src1.channels();
- CV_Assert(src1.depth() <= CV_64F && src2.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
+ CV_Assert( src2.size() == src1.size() && src2.channels() == cn );
+ CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
- if (src1.depth() == CV_64F || src2.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+ if (sdepth1 == CV_64F || sdepth2 == CV_64F || ddepth == CV_64F)
{
- if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+ if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
}
- dst.create(src1.size(), dtype);
+ dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
- const GpuMat* psrc1 = &src1;
- const GpuMat* psrc2 = &src2;
+ PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+ PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+ PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
- if (src1.depth() > src2.depth())
+ if (sdepth1 > sdepth2)
{
- std::swap(psrc1, psrc2);
+ std::swap(src1_.data, src2_.data);
+ std::swap(src1_.step, src2_.step);
std::swap(alpha, beta);
+ std::swap(sdepth1, sdepth2);
}
- const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];
+ const func_t func = funcs[sdepth1][sdepth2][ddepth];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
- func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
+ func(src1_, alpha, src2_, beta, gamma, dst_, StreamAccessor::getStream(stream));
}
#endif