From: Roman Donchenko
Date: Tue, 27 Aug 2013 09:23:26 +0000 (+0400)
Subject: Merge commit '43aec5ad^' into merge-2.4
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3787^2~6
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4d06c4c7b6a46940cfc724d59e6e78670d96d0b1;p=platform%2Fupstream%2Fopencv.git

Merge commit '43aec5ad^' into merge-2.4

Conflicts:
	modules/contrib/src/inputoutput.cpp
	modules/gpu/perf/perf_imgproc.cpp
	modules/gpuarithm/perf/perf_element_operations.cpp
	modules/gpuarithm/src/element_operations.cpp
	modules/ts/src/precomp.hpp
---

4d06c4c7b6a46940cfc724d59e6e78670d96d0b1
diff --cc modules/contrib/src/inputoutput.cpp
index d0e947b,d6d514f..310dec7
--- a/modules/contrib/src/inputoutput.cpp
+++ b/modules/contrib/src/inputoutput.cpp
@@@ -10,11 -11,11 +10,11 @@@
  namespace cv
  {
- std::vector<String> Directory::GetListFiles( const String& path, const String & exten, bool addPath )
- std::vector<std::string> Directory::GetListFiles( const std::string& path, const std::string & exten, bool addPath )
++ std::vector<String> Directory::GetListFiles( const String& path, const String & exten, bool addPath )
  {
- std::vector<std::string> list;
+ std::vector<String> list;
  list.clear();
- std::string path_f = path + "/" + exten;
+ String path_f = path + "/" + exten;
  #ifdef WIN32
  #ifdef HAVE_WINRT
  WIN32_FIND_DATAW FindFileData;
diff --cc modules/gpuarithm/perf/perf_arithm.cpp
index dfeafa0,0000000..b18c8a8
mode 100644,000000..100644
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@@@ -1,307 -1,0 +1,307 @@@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" + +using namespace std; +using namespace testing; +using namespace perf; + +////////////////////////////////////////////////////////////////////// +// GEMM + +CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T) +#define ALL_GEMM_FLAGS Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), \ + GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T)) + +DEF_PARAM_TEST(Sz_Type_Flags, cv::Size, MatType, GemmFlags); + +PERF_TEST_P(Sz_Type_Flags, GEMM, + Combine(Values(cv::Size(512, 512), cv::Size(1024, 1024)), + Values(CV_32FC1, CV_32FC2, CV_64FC1), + ALL_GEMM_FLAGS)) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + const int flags = GET_PARAM(2); + + cv::Mat src1(size, type); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, type); + declare.in(src2, WARMUP_RNG); + + cv::Mat src3(size, type); + declare.in(src3, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + declare.time(5.0); + + const cv::gpu::GpuMat d_src1(src1); + const cv::gpu::GpuMat d_src2(src2); + const cv::gpu::GpuMat d_src3(src3); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags); + - GPU_SANITY_CHECK(dst, 1e-6); ++ GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + declare.time(50.0); + + cv::Mat dst; + + TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulSpectrums + +CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT) + +DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags); + +PERF_TEST_P(Sz_Flags, MulSpectrums, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS)))) +{ + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat a(size, CV_32FC2); + cv::Mat b(size, CV_32FC2); + declare.in(a, b, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_a(a); + const cv::gpu::GpuMat d_b(b); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag); + + GPU_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulAndScaleSpectrums + +PERF_TEST_P(Sz, MulAndScaleSpectrums, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + const float scale = 1.f / size.area(); + + cv::Mat src1(size, CV_32FC2); + cv::Mat src2(size, CV_32FC2); + declare.in(src1,src2, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src1(src1); + const cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); + + GPU_SANITY_CHECK(dst); + } + else + { + 
FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Dft + +PERF_TEST_P(Sz_Flags, Dft, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE)))) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag); + + GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::dft(src, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Convolve + +DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool); + +PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(17, 27, 32, 64), + Bool())) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int templ_size = GET_PARAM(1); + const bool ccorr = GET_PARAM(2); + + const cv::Mat image(size, CV_32FC1); + const cv::Mat templ(templ_size, templ_size, CV_32FC1); + declare.in(image, templ, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1); + d_image.upload(image); + + cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1); + d_templ.upload(templ); + + cv::Ptr convolution = cv::gpu::createConvolution(); + + cv::gpu::GpuMat dst; + + TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr); + - GPU_SANITY_CHECK(dst); ++ GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + if (ccorr) + FAIL_NO_CPU(); + + cv::Mat dst; + + TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Integral + +PERF_TEST_P(Sz, Integral, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + cv::gpu::GpuMat d_buf; + + TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf); + + GPU_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::integral(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// IntegralSqr + +PERF_TEST_P(Sz, IntegralSqr, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst, buf; + + TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf); + + GPU_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} diff --cc modules/gpuarithm/src/element_operations.cpp index 3ec4f84,0000000..20473de mode 100644,000000..100644 --- a/modules/gpuarithm/src/element_operations.cpp +++ b/modules/gpuarithm/src/element_operations.cpp @@@ -1,3147 -1,0 +1,3147 @@@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; + +#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) + +void cv::gpu::add(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); } +void cv::gpu::subtract(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); } +void cv::gpu::multiply(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); } +void cv::gpu::divide(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); } +void cv::gpu::absdiff(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::abs(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::sqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::sqrt(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::exp(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::log(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::pow(InputArray, double, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::compare(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } + +void cv::gpu::bitwise_not(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_or(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_and(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_xor(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::rshift(InputArray, Scalar_, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::lshift(InputArray, Scalar_, OutputArray, Stream&) { throw_no_cuda(); } + 
+void cv::gpu::min(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::max(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::addWeighted(InputArray, double, InputArray, double, double, OutputArray, int, Stream&) { throw_no_cuda(); } + +double cv::gpu::threshold(InputArray, OutputArray, double, double, int, Stream&) {throw_no_cuda(); return 0.0;} + +void cv::gpu::magnitude(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitude(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitudeSqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitudeSqr(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::phase(InputArray, InputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::cartToPolar(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::polarToCart(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } + +#else + +//////////////////////////////////////////////////////////////////////// +// arithm_op + +namespace +{ + typedef void (*mat_mat_func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op); + typedef void (*mat_scalar_func_t)(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op); + + void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, double scale, int dtype, Stream& stream, + mat_mat_func_t mat_mat_func, mat_scalar_func_t mat_scalar_func, int op = 0) + { + const int kind1 = _src1.kind(); + const int kind2 = _src2.kind(); + + const bool isScalar1 = (kind1 == _InputArray::MATX); + const bool isScalar2 = (kind2 == _InputArray::MATX); + CV_Assert( !isScalar1 || !isScalar2 ); + + GpuMat src1; + if (!isScalar1) + src1 = _src1.getGpuMat(); + + GpuMat src2; + if (!isScalar2) + src2 = _src2.getGpuMat(); + + Mat scalar; + if (isScalar1) + scalar = _src1.getMat(); + else if (isScalar2) + scalar = _src2.getMat(); + + Scalar val; + if (!scalar.empty()) + { + CV_Assert( scalar.total() <= 4 ); + scalar.convertTo(Mat_(scalar.rows, scalar.cols, &val[0]), CV_64F); + } + + GpuMat mask = _mask.getGpuMat(); + + const int sdepth = src1.empty() ? src2.depth() : src1.depth(); + const int cn = src1.empty() ? src2.channels() : src1.channels(); + const Size size = src1.empty() ? 
src2.size() : src1.size(); + + if (dtype < 0) + dtype = sdepth; + + const int ddepth = CV_MAT_DEPTH(dtype); + + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( !scalar.empty() || (src2.type() == src1.type() && src2.size() == src1.size()) ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == size && mask.type() == CV_8UC1) ); + + if (sdepth == CV_64F || ddepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(size, CV_MAKE_TYPE(ddepth, cn)); + GpuMat dst = _dst.getGpuMat(); + + if (isScalar1) + mat_scalar_func(src2, val, true, dst, mask, scale, stream, op); + else if (isScalar2) + mat_scalar_func(src1, val, false, dst, mask, scale, stream, op); + else + mat_mat_func(src1, src2, dst, mask, scale, stream, op); + } +} + + +//////////////////////////////////////////////////////////////////////// +// Basic arithmetical operations (add subtract multiply divide) + +namespace +{ + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; }; + + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants, + npp_complex_type* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32fc* pSrc1, int nSrc1Step, const Npp32fc pConstants, Npp32fc* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const npp_t pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), pConstants, (npp_t*)dst.data, static_cast(dst.step), sz, 0) ); + 
+ if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), saturate_cast(sc.val[0]), (npp_t*)dst.data, static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + npp_complex_type nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func((const npp_complex_type*)src.data, static_cast(src.step), nConstant, + (npp_complex_type*)dst.data, static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const Npp32f pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), pConstants, (npp_t*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), saturate_cast(sc.val[0]), (npp_t*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Npp32fc nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func((const npp_complex_type*)src.data, static_cast(src.step), nConstant, (npp_complex_type*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +//////////////////////////////////////////////////////////////////////// +// add + +namespace arithm +{ + void addMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void addMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void addMat(const GpuMat& src1, const 
GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::addMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::addMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 
arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 }, + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], dst, mask, stream); +} + +void cv::gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, addMat, addScalar); +} + +//////////////////////////////////////////////////////////////////////// +// subtract + +namespace arithm +{ + void subMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void subMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 
/*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::subMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::subMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 
/*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 }, + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], inv, dst, mask, stream); +} + +void cv::gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, subMat, subScalar); +} + +//////////////////////////////////////////////////////////////////////// +// multiply + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +static void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat + } + }; + + const int sdepth = 
src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], dst, stream); +} + +void cv::gpu::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) +{ + if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = 
_src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::mulMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::mulMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else + { + arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, mulMat, mulScalar); + } +} + +//////////////////////////////////////////////////////////////////////// +// divide + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +static void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace arithm +{ + template + void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); +} + +static void divScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + 
arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + if (inv) + { + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + } + else + { + val[0] /= scale; + val[1] /= scale; + val[2] /= scale; + val[3] /= scale; + } + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], inv, dst, stream); +} + +void cv::gpu::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) +{ + if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::divMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::divMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else + { + arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, divMat, divScalar); + } +} + +////////////////////////////////////////////////////////////////////////////// +// absdiff + +namespace arithm +{ + void absDiffMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void 
absDiffMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +static void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U || depth == CV_16U) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::absDiffMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (depth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::absDiffMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +static void absDiffScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar + }; + + const int depth = src.depth(); + + funcs[depth](src, val[0], dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, absDiffMat, absDiffScalar); +} + +////////////////////////////////////////////////////////////////////////////// +// abs + +namespace arithm +{ + template + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::abs(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + absMat, + absMat, + absMat, + absMat, + absMat, + absMat, + absMat + }; + + GpuMat src = 
_src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// sqr + +namespace arithm +{ + template + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::sqr(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// sqrt + +namespace arithm +{ + template + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::sqrt(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// exp + +namespace arithm +{ + template + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::exp(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + expMat, + expMat, + expMat, + expMat, + expMat, + expMat, + expMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// log + +namespace arithm +{ + template + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::log(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + 
static const func_t funcs[] = + { + logMat, + logMat, + logMat, + logMat, + logMat, + logMat, + logMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// pow + +namespace arithm +{ + template void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::pow(InputArray _src, double power, OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert(depth <= CV_64F); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step); + PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step); + + funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// compare + +namespace arithm +{ + void cmpMatEq_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatNe_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatLt_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatLe_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +static void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int cmpop) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][4] = + { + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq, cmpMatNe, cmpMatLt, cmpMatLe}, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe } + }; + + typedef void (*func_v4_t)(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + static const func_v4_t funcs_v4[] = + { + cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4 + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + static const int codes[] = + { + 0, 2, 3, 2, 3, 1 + }; + const GpuMat* psrc1[] = + { + &src1, 
&src2, &src2, &src1, &src1, &src1 + }; + const GpuMat* psrc2[] = + { + &src2, &src1, &src1, &src2, &src2, &src2 + }; + + const int code = codes[cmpop]; + PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + const int vcols = src1_.cols >> 2; + + funcs_v4[code](PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + + const func_t func = funcs[depth][code]; + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); +} + +namespace +{ + template void castScalar(Scalar& sc) + { + sc.val[0] = saturate_cast(sc.val[0]); + sc.val[1] = saturate_cast(sc.val[1]); + sc.val[2] = saturate_cast(sc.val[2]); + sc.val[3] = saturate_cast(sc.val[3]); + } +} + +static void cmpScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][6] = + { + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq, cmpScalarGt, cmpScalarGe, cmpScalarLt, cmpScalarLe, cmpScalarNe}, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe } + }; + + typedef void (*cast_func_t)(Scalar& sc); + static const cast_func_t cast_func[] = + { + castScalar, castScalar, castScalar, castScalar, castScalar, castScalar, castScalar + }; + + if (inv) + { + // src1 is a scalar; swap it with src2 + cmpop = cmpop == CMP_LT ? CMP_GT : cmpop == CMP_LE ? CMP_GE : + cmpop == CMP_GE ? CMP_LE : cmpop == CMP_GT ? 
CMP_LT : cmpop; + } + + const int depth = src.depth(); + const int cn = src.channels(); + + cast_func[depth](val); + + funcs[depth][cmpop](src, cn, val.val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, CV_8U, stream, cmpMat, cmpScalar, cmpop); +} + +////////////////////////////////////////////////////////////////////////////// +// bitwise_not + +namespace arithm +{ + template void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +void cv::gpu::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& _stream) +{ + using namespace arithm; + + GpuMat src = _src.getGpuMat(); + GpuMat mask = _mask.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const int bcols = (int) (src.cols * src.elemSize()); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + bitMatNot( + PtrStepSzb(src.rows, vcols, src.data, src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + bitMatNot( + PtrStepSzb(src.rows, vcols, src.data, src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } + else + { - bitMatNot( ++ bitMatNot( + PtrStepSzb(src.rows, bcols, src.data, src.step), + PtrStepSzb(src.rows, bcols, dst.data, dst.step), + mask, stream); + } +} + +////////////////////////////////////////////////////////////////////////////// +// Binary bitwise logical operations + +namespace +{ + enum + { + BIT_OP_AND, + BIT_OP_OR, + BIT_OP_XOR + }; +} + +namespace arithm +{ + template void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs32[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + static const func_t funcs16[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + static const func_t funcs8[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const int bcols = (int) (src1.cols * src1.elemSize()); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + funcs32[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + funcs16[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else + { + + funcs8[op](PtrStepSzb(src1.rows, bcols, src1.data, src1.step), + PtrStepSzb(src1.rows, bcols, 
src2.data, src2.step), + PtrStepSzb(src1.rows, bcols, dst.data, dst.step), + mask, stream); + } +} + +namespace arithm +{ + template void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); +} + +namespace +{ + typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + + template struct BitScalar + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + func(src, saturate_cast(sc.val[0]), dst, stream); + } + }; + + template struct BitScalar4 + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + unsigned int packedVal = 0; + + packedVal |= (saturate_cast(sc.val[0]) & 0xffff); + packedVal |= (saturate_cast(sc.val[1]) & 0xffff) << 8; + packedVal |= (saturate_cast(sc.val[2]) & 0xffff) << 16; + packedVal |= (saturate_cast(sc.val[3]) & 0xffff) << 24; + + func(src, packedVal, dst, stream); + } + }; + + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstant, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + const npp_t pConstants[] = {saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3])}; + + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), saturate_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +static void bitScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[3][5][4] = + { + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarAnd >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOr >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, + { + {BitScalar 
>::call , 0, NppBitwiseC::call, BitScalar4< bitScalarXor >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + } + }; + + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + CV_Assert( mask.empty() ); + + funcs[op][depth][cn - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_OR); +} + +void cv::gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_AND); +} + +void cv::gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_XOR); +} + +////////////////////////////////////////////////////////////////////////////// +// shift + +namespace +{ + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::rshift(InputArray _src, Scalar_ val, OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + GpuMat src = _src.getGpuMat(); + + CV_Assert( src.depth() < CV_32F ); + CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::lshift(InputArray _src, Scalar_ val, 
OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call , 0, NppShift::call , NppShift::call }, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + GpuMat src = _src.getGpuMat(); + + CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S ); + CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// Minimum and maximum operations + +namespace +{ + enum + { + MIN_OP, + MAX_OP + }; +} + +namespace arithm +{ + void minMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void minMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + template void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + + void maxMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void maxMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + template void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} + +void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[2][7] = + { + { + minMat, + minMat, + minMat, + minMat, + minMat, + minMat, + minMat + }, + { + maxMat, + maxMat, + maxMat, + maxMat, + maxMat, + maxMat, + maxMat + } + }; + + typedef void (*opt_func_t)(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + static const opt_func_t funcs_v4[2] = + { + minMat_v4, maxMat_v4 + }; + static const opt_func_t funcs_v2[2] = + { + minMat_v2, maxMat_v2 + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + CV_Assert( depth <= CV_64F ); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U || depth == CV_16U) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + funcs_v4[op](PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (depth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + funcs_v2[op](PtrStepSz(src1_.rows, 
vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[op][depth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +namespace +{ + template double castScalar(double val) + { + return saturate_cast(val); + } +} + +void minMaxScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[2][7] = + { + { + minScalar, + minScalar, + minScalar, + minScalar, + minScalar, + minScalar, + minScalar + }, + { + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar + } + }; + + typedef double (*cast_func_t)(double sc); + static const cast_func_t cast_func[] = + { + castScalar, castScalar, castScalar, castScalar, castScalar, castScalar, castScalar + }; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + funcs[op][depth](src, cast_func[depth](val[0]), dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MIN_OP); +} + +void cv::gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MAX_OP); +} + +//////////////////////////////////////////////////////////////////////// +// addWeighted + +namespace arithm +{ + template + void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream) +{ + typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7][7] = + { + { + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + 
{ + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, 
+ arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 
0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + } + }; + + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + int sdepth1 = src1.depth(); + int sdepth2 = src2.depth(); + ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2); + const int cn = src1.channels(); + + CV_Assert( src2.size() == src1.size() && src2.channels() == cn ); + CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); + + if (sdepth1 == CV_64F || sdepth2 == CV_64F || ddepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); + GpuMat dst = _dst.getGpuMat(); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (sdepth1 > sdepth2) + { + std::swap(src1_.data, src2_.data); + std::swap(src1_.step, src2_.step); + std::swap(alpha, beta); + std::swap(sdepth1, sdepth2); + } + + const func_t func = funcs[sdepth1][sdepth2][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, alpha, src2_, beta, gamma, dst_, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// threshold + +namespace arithm +{ + template + void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); +} + +double cv::gpu::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& _stream) +{ + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( src.channels() == 1 && depth <= CV_64F ); + CV_Assert( type <= 4/*THRESH_TOZERO_INV*/ ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + if (src.type() == CV_32FC1 && type == 2/*THRESH_TRUNC*/) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( nppiThreshold_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else + { + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::threshold, + arithm::threshold, + arithm::threshold, + arithm::threshold, + 
arithm::threshold, + arithm::threshold, + arithm::threshold + }; + + if (depth != CV_32F && depth != CV_64F) + { + thresh = cvFloor(thresh); + maxVal = cvRound(maxVal); + } + + funcs[depth](src, dst, thresh, maxVal, type, stream); + } + + return thresh; +} + +//////////////////////////////////////////////////////////////////////// +// NPP magnitide + +namespace +{ + typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + + void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream) + { + CV_Assert(src.type() == CV_32FC2); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +void cv::gpu::magnitude(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + + _dst.create(src.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream)); +} + +void cv::gpu::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + + _dst.create(src.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// Polar <-> Cart + +namespace cv { namespace gpu { namespace cudev +{ + namespace mathfunc + { + void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream); + void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream); + } +}}} + +namespace +{ + void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream) + { + using namespace ::cv::gpu::cudev::mathfunc; + + CV_Assert(x.size() == y.size() && x.type() == y.type()); + CV_Assert(x.depth() == CV_32F); + + GpuMat x1cn = x.reshape(1); + GpuMat y1cn = y.reshape(1); + GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat(); + GpuMat angle1cn = angle ? 
angle->reshape(1) : GpuMat(); + + cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream); + } + + void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream) + { + using namespace ::cv::gpu::cudev::mathfunc; + + CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type()); + CV_Assert(mag.depth() == CV_32F); + + GpuMat mag1cn = mag.reshape(1); + GpuMat angle1cn = angle.reshape(1); + GpuMat x1cn = x.reshape(1); + GpuMat y1cn = y.reshape(1); + + polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream); + } +} + +void cv::gpu::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); +} + +void cv::gpu::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); +} + +void cv::gpu::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, 0, false, &dst, angleInDegrees, StreamAccessor::getStream(stream)); +} + +void cv::gpu::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _mag.create(x.size(), CV_32FC1); + GpuMat mag = _mag.getGpuMat(); + + _angle.create(x.size(), CV_32FC1); + GpuMat angle = _angle.getGpuMat(); + + cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); +} + +void cv::gpu::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& stream) +{ + GpuMat mag = _mag.getGpuMat(); + GpuMat angle = _angle.getGpuMat(); + + _x.create(mag.size(), CV_32FC1); + GpuMat x = _x.getGpuMat(); + + _y.create(mag.size(), CV_32FC1); + GpuMat y = _y.getGpuMat(); + + polarToCart_caller(mag, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); +} + +#endif diff --cc modules/java/generator/gen_java.py index cf92955,d82a593..ef056a7 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@@ -542,10 -544,10 +542,10 @@@ JNIEXPORT jstring JNICALL Java_org_open { static const char method_name[] = "highgui::VideoCapture_getSupportedPreviewSizes_10()"; try { - LOGD(%s, method_name); + LOGD("%s", method_name); VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL union {double prop; const char* name;} u; - u.prop = me->get(CV_CAP_PROP_SUPPORTED_PREVIEW_SIZES_STRING); + u.prop = me->get(CAP_PROP_ANDROID_PREVIEW_SIZES_STRING); return env->NewStringUTF(u.name); } catch(const std::exception &e) { throwJavaException(env, &e, method_name); diff --cc modules/ts/src/precomp.hpp index d719472,1133978..fbb13ec --- a/modules/ts/src/precomp.hpp +++ b/modules/ts/src/precomp.hpp @@@ -1,6 -1,7 +1,7 @@@ -#include "opencv2/core/core_c.h" -#include "opencv2/core/internal.hpp" -#include "opencv2/ts/ts.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/private.hpp" 
+#include "opencv2/ts.hpp"
+ #include "cvconfig.h"
 
 #ifdef GTEST_LINKED_AS_SHARED_LIBRARY
 #error ts module should not have GTEST_LINKED_AS_SHARED_LIBRARY defined
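
For reference, a minimal host-side usage sketch of a few cv::gpu entry points whose definitions appear in the element_operations.cpp hunks above (compare, bitwise_not, min, threshold). This is illustrative only and not part of the patch; it assumes the gpuarithm public header is <opencv2/gpuarithm.hpp>, that <opencv2/imgproc.hpp> provides cv::THRESH_BINARY, and that the declarations default their mask/Stream parameters (noArray() / Stream::Null()), which this diff does not show.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>     // for cv::THRESH_BINARY (assumed location)
#include <opencv2/gpuarithm.hpp>   // assumed public header for the gpuarithm module

int main()
{
    // Host inputs filled with random data.
    cv::Mat h_a(480, 640, CV_8UC1), h_b(480, 640, CV_8UC1);
    cv::randu(h_a, 0, 255);
    cv::randu(h_b, 0, 255);

    // Upload to the device (GpuMat's Mat constructor performs the copy).
    cv::gpu::GpuMat d_a(h_a), d_b(h_b);
    cv::gpu::GpuMat d_mask, d_inv, d_min, d_thr;

    cv::gpu::compare(d_a, d_b, d_mask, cv::CMP_GT);   // 255 where a > b, 0 elsewhere
    cv::gpu::bitwise_not(d_mask, d_inv);              // invert the comparison mask
    cv::gpu::min(d_a, d_b, d_min);                    // element-wise minimum
    cv::gpu::threshold(d_a, d_thr, 128.0, 255.0, cv::THRESH_BINARY);

    // Download one result back to the host.
    cv::Mat result;
    d_thr.download(result);
    return 0;
}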