From: Roman Donchenko
Date: Tue, 27 Aug 2013 09:23:26 +0000 (+0400)
Subject: Merge commit '43aec5ad^' into merge-2.4
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3787^2~6
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=4d06c4c7b6a46940cfc724d59e6e78670d96d0b1;p=platform%2Fupstream%2Fopencv.git

Merge commit '43aec5ad^' into merge-2.4

Conflicts:
	modules/contrib/src/inputoutput.cpp
	modules/gpu/perf/perf_imgproc.cpp
	modules/gpuarithm/perf/perf_element_operations.cpp
	modules/gpuarithm/src/element_operations.cpp
	modules/ts/src/precomp.hpp
---

4d06c4c7b6a46940cfc724d59e6e78670d96d0b1
diff --cc modules/contrib/src/inputoutput.cpp
index d0e947b,d6d514f..310dec7
--- a/modules/contrib/src/inputoutput.cpp
+++ b/modules/contrib/src/inputoutput.cpp
@@@ -10,11 -11,11 +10,11 @@@
  namespace cv
  {
- std::vector<String> Directory::GetListFiles( const String& path, const String & exten, bool addPath )
- std::vector<std::string> Directory::GetListFiles( const std::string& path, const std::string & exten, bool addPath )
++ std::vector<String> Directory::GetListFiles( const String& path, const String & exten, bool addPath )
  {
- std::vector<std::string> list;
+ std::vector<String> list;
  list.clear();
- std::string path_f = path + "/" + exten;
+ String path_f = path + "/" + exten;
  #ifdef WIN32
  #ifdef HAVE_WINRT
  WIN32_FIND_DATAW FindFileData;
diff --cc modules/gpuarithm/perf/perf_arithm.cpp
index dfeafa0,0000000..b18c8a8
mode 100644,000000..100644
--- a/modules/gpuarithm/perf/perf_arithm.cpp
+++ b/modules/gpuarithm/perf/perf_arithm.cpp
@@@ -1,307 -1,0 +1,307 @@@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "perf_precomp.hpp" + +using namespace std; +using namespace testing; +using namespace perf; + +////////////////////////////////////////////////////////////////////// +// GEMM + +CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T) +#define ALL_GEMM_FLAGS Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), \ + GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T)) + +DEF_PARAM_TEST(Sz_Type_Flags, cv::Size, MatType, GemmFlags); + +PERF_TEST_P(Sz_Type_Flags, GEMM, + Combine(Values(cv::Size(512, 512), cv::Size(1024, 1024)), + Values(CV_32FC1, CV_32FC2, CV_64FC1), + ALL_GEMM_FLAGS)) +{ + const cv::Size size = GET_PARAM(0); + const int type = GET_PARAM(1); + const int flags = GET_PARAM(2); + + cv::Mat src1(size, type); + declare.in(src1, WARMUP_RNG); + + cv::Mat src2(size, type); + declare.in(src2, WARMUP_RNG); + + cv::Mat src3(size, type); + declare.in(src3, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + declare.time(5.0); + + const cv::gpu::GpuMat d_src1(src1); + const cv::gpu::GpuMat d_src2(src2); + const cv::gpu::GpuMat d_src3(src3); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags); + - GPU_SANITY_CHECK(dst, 1e-6); ++ GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + declare.time(50.0); + + cv::Mat dst; + + TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulSpectrums + +CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT) + +DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags); + +PERF_TEST_P(Sz_Flags, MulSpectrums, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS)))) +{ + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat a(size, CV_32FC2); + cv::Mat b(size, CV_32FC2); + declare.in(a, b, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_a(a); + const cv::gpu::GpuMat d_b(b); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulSpectrums(d_a, d_b, dst, flag); + + GPU_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// MulAndScaleSpectrums + +PERF_TEST_P(Sz, MulAndScaleSpectrums, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + const float scale = 1.f / size.area(); + + cv::Mat src1(size, CV_32FC2); + cv::Mat src2(size, CV_32FC2); + declare.in(src1,src2, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src1(src1); + const cv::gpu::GpuMat d_src2(src2); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false); + + GPU_SANITY_CHECK(dst); + } + else + { + 
FAIL_NO_CPU(); + } +} + +////////////////////////////////////////////////////////////////////// +// Dft + +PERF_TEST_P(Sz_Flags, Dft, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE)))) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int flag = GET_PARAM(1); + + cv::Mat src(size, CV_32FC2); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + + TEST_CYCLE() cv::gpu::dft(d_src, dst, size, flag); + + GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::dft(src, dst, flag); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Convolve + +DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool); + +PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve, + Combine(GPU_TYPICAL_MAT_SIZES, + Values(17, 27, 32, 64), + Bool())) +{ + declare.time(10.0); + + const cv::Size size = GET_PARAM(0); + const int templ_size = GET_PARAM(1); + const bool ccorr = GET_PARAM(2); + + const cv::Mat image(size, CV_32FC1); + const cv::Mat templ(templ_size, templ_size, CV_32FC1); + declare.in(image, templ, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + cv::gpu::GpuMat d_image = cv::gpu::createContinuous(size, CV_32FC1); + d_image.upload(image); + + cv::gpu::GpuMat d_templ = cv::gpu::createContinuous(templ_size, templ_size, CV_32FC1); + d_templ.upload(templ); + + cv::Ptr convolution = cv::gpu::createConvolution(); + + cv::gpu::GpuMat dst; + + TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr); + - GPU_SANITY_CHECK(dst); ++ GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE); + } + else + { + if (ccorr) + FAIL_NO_CPU(); + + cv::Mat dst; + + TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// Integral + +PERF_TEST_P(Sz, Integral, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst; + cv::gpu::GpuMat d_buf; + + TEST_CYCLE() cv::gpu::integral(d_src, dst, d_buf); + + GPU_SANITY_CHECK(dst); + } + else + { + cv::Mat dst; + + TEST_CYCLE() cv::integral(src, dst); + + CPU_SANITY_CHECK(dst); + } +} + +////////////////////////////////////////////////////////////////////// +// IntegralSqr + +PERF_TEST_P(Sz, IntegralSqr, + GPU_TYPICAL_MAT_SIZES) +{ + const cv::Size size = GetParam(); + + cv::Mat src(size, CV_8UC1); + declare.in(src, WARMUP_RNG); + + if (PERF_RUN_GPU()) + { + const cv::gpu::GpuMat d_src(src); + cv::gpu::GpuMat dst, buf; + + TEST_CYCLE() cv::gpu::sqrIntegral(d_src, dst, buf); + + GPU_SANITY_CHECK(dst); + } + else + { + FAIL_NO_CPU(); + } +} diff --cc modules/gpuarithm/src/element_operations.cpp index 3ec4f84,0000000..20473de mode 100644,000000..100644 --- a/modules/gpuarithm/src/element_operations.cpp +++ b/modules/gpuarithm/src/element_operations.cpp @@@ -1,3147 -1,0 +1,3147 @@@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +using namespace cv; +using namespace cv::gpu; + +#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) + +void cv::gpu::add(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); } +void cv::gpu::subtract(InputArray, InputArray, OutputArray, InputArray, int, Stream&) { throw_no_cuda(); } +void cv::gpu::multiply(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); } +void cv::gpu::divide(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); } +void cv::gpu::absdiff(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::abs(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::sqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::sqrt(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::exp(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::log(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::pow(InputArray, double, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::compare(InputArray, InputArray, OutputArray, int, Stream&) { throw_no_cuda(); } + +void cv::gpu::bitwise_not(InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_or(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_and(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::bitwise_xor(InputArray, InputArray, OutputArray, InputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::rshift(InputArray, Scalar_, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::lshift(InputArray, Scalar_, OutputArray, Stream&) { throw_no_cuda(); } + 
+void cv::gpu::min(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::max(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } + +void cv::gpu::addWeighted(InputArray, double, InputArray, double, double, OutputArray, int, Stream&) { throw_no_cuda(); } + +double cv::gpu::threshold(InputArray, OutputArray, double, double, int, Stream&) {throw_no_cuda(); return 0.0;} + +void cv::gpu::magnitude(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitude(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitudeSqr(InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::magnitudeSqr(InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); } +void cv::gpu::phase(InputArray, InputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::cartToPolar(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } +void cv::gpu::polarToCart(InputArray, InputArray, OutputArray, OutputArray, bool, Stream&) { throw_no_cuda(); } + +#else + +//////////////////////////////////////////////////////////////////////// +// arithm_op + +namespace +{ + typedef void (*mat_mat_func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op); + typedef void (*mat_scalar_func_t)(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream, int op); + + void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, double scale, int dtype, Stream& stream, + mat_mat_func_t mat_mat_func, mat_scalar_func_t mat_scalar_func, int op = 0) + { + const int kind1 = _src1.kind(); + const int kind2 = _src2.kind(); + + const bool isScalar1 = (kind1 == _InputArray::MATX); + const bool isScalar2 = (kind2 == _InputArray::MATX); + CV_Assert( !isScalar1 || !isScalar2 ); + + GpuMat src1; + if (!isScalar1) + src1 = _src1.getGpuMat(); + + GpuMat src2; + if (!isScalar2) + src2 = _src2.getGpuMat(); + + Mat scalar; + if (isScalar1) + scalar = _src1.getMat(); + else if (isScalar2) + scalar = _src2.getMat(); + + Scalar val; + if (!scalar.empty()) + { + CV_Assert( scalar.total() <= 4 ); + scalar.convertTo(Mat_(scalar.rows, scalar.cols, &val[0]), CV_64F); + } + + GpuMat mask = _mask.getGpuMat(); + + const int sdepth = src1.empty() ? src2.depth() : src1.depth(); + const int cn = src1.empty() ? src2.channels() : src1.channels(); + const Size size = src1.empty() ? 
src2.size() : src1.size(); + + if (dtype < 0) + dtype = sdepth; + + const int ddepth = CV_MAT_DEPTH(dtype); + + CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F ); + CV_Assert( !scalar.empty() || (src2.type() == src1.type() && src2.size() == src1.size()) ); + CV_Assert( mask.empty() || (cn == 1 && mask.size() == size && mask.type() == CV_8UC1) ); + + if (sdepth == CV_64F || ddepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(size, CV_MAKE_TYPE(ddepth, cn)); + GpuMat dst = _dst.getGpuMat(); + + if (isScalar1) + mat_scalar_func(src2, val, true, dst, mask, scale, stream, op); + else if (isScalar2) + mat_scalar_func(src1, val, false, dst, mask, scale, stream, op); + else + mat_mat_func(src1, src2, dst, mask, scale, stream, op); + } +} + + +//////////////////////////////////////////////////////////////////////// +// Basic arithmetical operations (add subtract multiply divide) + +namespace +{ + template struct NppTypeTraits; + template<> struct NppTypeTraits { typedef Npp8u npp_t; }; + template<> struct NppTypeTraits { typedef Npp8s npp_t; }; + template<> struct NppTypeTraits { typedef Npp16u npp_t; }; + template<> struct NppTypeTraits { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; }; + template<> struct NppTypeTraits { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; }; + + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_ptr)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstants, + npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + typedef NppStatus (*func_ptr)(const npp_complex_type* pSrc1, int nSrc1Step, const npp_complex_type pConstants, + npp_complex_type* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor); + }; + template struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f pConstants, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + }; + template<> struct NppArithmScalarFunc + { + typedef NppStatus (*func_ptr)(const Npp32fc* pSrc1, int nSrc1Step, const Npp32fc pConstants, Npp32fc* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const npp_t pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), pConstants, (npp_t*)dst.data, static_cast(dst.step), sz, 0) ); + 
+ if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), saturate_cast(sc.val[0]), (npp_t*)dst.data, static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + npp_complex_type nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func((const npp_complex_type*)src.data, static_cast(src.step), nConstant, + (npp_complex_type*)dst.data, static_cast(dst.step), sz, 0) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + const Npp32f pConstants[] = { saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3]) }; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), pConstants, (npp_t*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( func((const npp_t*)src.data, static_cast(src.step), saturate_cast(sc.val[0]), (npp_t*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template::func_ptr func> struct NppArithmScalar + { + typedef typename NppTypeTraits::npp_t npp_t; + typedef typename NppTypeTraits::npp_complex_type npp_complex_type; + + static void call(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + Npp32fc nConstant; + nConstant.re = saturate_cast(sc.val[0]); + nConstant.im = saturate_cast(sc.val[1]); + + nppSafeCall( func((const npp_complex_type*)src.data, static_cast(src.step), nConstant, (npp_complex_type*)dst.data, static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +//////////////////////////////////////////////////////////////////////// +// add + +namespace arithm +{ + void addMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void addMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void addMat(const GpuMat& src1, const 
GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat, + arithm::addMat + }, + { + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + 0 /*arithm::addMat*/, + arithm::addMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::addMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::addMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 
arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar, + arithm::addScalar + }, + { + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + 0 /*arithm::addScalar*/, + arithm::addScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 }, + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], dst, mask, stream); +} + +void cv::gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, addMat, addScalar); +} + +//////////////////////////////////////////////////////////////////////// +// subtract + +namespace arithm +{ + void subMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void subMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void subMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 
/*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat, + arithm::subMat + }, + { + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + 0 /*arithm::subMat*/, + arithm::subMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (sdepth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::subMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (sdepth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::subMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, mask, stream); +} + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar, + arithm::subScalar + }, + { + 0 
/*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar, + arithm::subScalar + }, + { + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + 0 /*arithm::subScalar*/, + arithm::subScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 }, + {NppArithmScalar::call, 0 , NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0 , 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], inv, dst, mask, stream); +} + +void cv::gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, dtype, stream, subMat, subScalar); +} + +//////////////////////////////////////////////////////////////////////// +// multiply + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +static void mulMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat, + arithm::mulMat + }, + { + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + 0 /*arithm::mulMat*/, + arithm::mulMat + } + }; + + const int sdepth = 
src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace arithm +{ + template + void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar, + arithm::mulScalar + }, + { + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + 0 /*arithm::mulScalar*/, + arithm::mulScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], dst, stream); +} + +void cv::gpu::multiply(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) +{ + if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = 
_src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::mulMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::mulMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else + { + arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, mulMat, mulScalar); + } +} + +//////////////////////////////////////////////////////////////////////// +// divide + +namespace arithm +{ + void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream); + + template + void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); +} + +static void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat, + arithm::divMat + }, + { + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + 0 /*arithm::divMat*/, + arithm::divMat + } + }; + + const int sdepth = src1.depth(); + const int ddepth = dst.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, scale, stream); +} + +namespace arithm +{ + template + void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); +} + +static void divScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7] = + { + { + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + 
arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar, + arithm::divScalar + }, + { + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + 0 /*arithm::divScalar*/, + arithm::divScalar + } + }; + + typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream); + static const npp_func_t npp_funcs[7][4] = + { + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {NppArithmScalar::call, 0, NppArithmScalar::call, 0 }, + {NppArithmScalar::call, 0, NppArithmScalar::call, NppArithmScalar::call}, + {0 , 0, 0 , 0 } + }; + + const int sdepth = src.depth(); + const int ddepth = dst.depth(); + const int cn = src.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + if (inv) + { + val[0] *= scale; + val[1] *= scale; + val[2] *= scale; + val[3] *= scale; + } + else + { + val[0] /= scale; + val[1] /= scale; + val[2] /= scale; + val[3] /= scale; + } + + const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; + if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) + { + npp_func(src, val, dst, stream); + return; + } + + CV_Assert( cn == 1 ); + + const func_t func = funcs[sdepth][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src, val[0], inv, dst, stream); +} + +void cv::gpu::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream) +{ + if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::divMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1) + { + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + CV_Assert( src1.size() == src2.size() ); + + _dst.create(src1.size(), src1.type()); + GpuMat dst = _dst.getGpuMat(); + + arithm::divMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); + } + else + { + arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, divMat, divScalar); + } +} + +////////////////////////////////////////////////////////////////////////////// +// absdiff + +namespace arithm +{ + void absDiffMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void 
absDiffMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template + void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +static void absDiffMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat, + arithm::absDiffMat + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U || depth == CV_16U) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + arithm::absDiffMat_v4(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (depth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + arithm::absDiffMat_v2(PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[depth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template + void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); +} + +static void absDiffScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int) +{ + typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar, + arithm::absDiffScalar + }; + + const int depth = src.depth(); + + funcs[depth](src, val[0], dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, absDiffMat, absDiffScalar); +} + +////////////////////////////////////////////////////////////////////////////// +// abs + +namespace arithm +{ + template + void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::abs(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + absMat, + absMat, + absMat, + absMat, + absMat, + absMat, + absMat + }; + + GpuMat src = 
_src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// sqr + +namespace arithm +{ + template + void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::sqr(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat, + sqrMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// sqrt + +namespace arithm +{ + template + void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::sqrt(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat, + sqrtMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// exp + +namespace arithm +{ + template + void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::exp(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + expMat, + expMat, + expMat, + expMat, + expMat, + expMat, + expMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// log + +namespace arithm +{ + template + void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::log(InputArray _src, OutputArray _dst, Stream& stream) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream); + 
static const func_t funcs[] = + { + logMat, + logMat, + logMat, + logMat, + logMat, + logMat, + logMat + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[depth](src, dst, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// pow + +namespace arithm +{ + template void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::pow(InputArray _src, double power, OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow, + arithm::pow + }; + + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert(depth <= CV_64F); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step); + PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step); + + funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// compare + +namespace arithm +{ + void cmpMatEq_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatNe_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatLt_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void cmpMatLe_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + + template void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); +} + +static void cmpMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int cmpop) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][4] = + { + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq, cmpMatNe, cmpMatLt, cmpMatLe}, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe }, + {cmpMatEq , cmpMatNe , cmpMatLt , cmpMatLe } + }; + + typedef void (*func_v4_t)(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + static const func_v4_t funcs_v4[] = + { + cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4 + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + static const int codes[] = + { + 0, 2, 3, 2, 3, 1 + }; + const GpuMat* psrc1[] = + { + &src1, 
&src2, &src2, &src1, &src1, &src1 + }; + const GpuMat* psrc2[] = + { + &src2, &src1, &src1, &src2, &src2, &src2 + }; + + const int code = codes[cmpop]; + PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + const int vcols = src1_.cols >> 2; + + funcs_v4[code](PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + + const func_t func = funcs[depth][code]; + + func(src1_, src2_, dst_, stream); +} + +namespace arithm +{ + template void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + template void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); +} + +namespace +{ + template void castScalar(Scalar& sc) + { + sc.val[0] = saturate_cast(sc.val[0]); + sc.val[1] = saturate_cast(sc.val[1]); + sc.val[2] = saturate_cast(sc.val[2]); + sc.val[3] = saturate_cast(sc.val[3]); + } +} + +static void cmpScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double, Stream& stream, int cmpop) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][6] = + { + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq, cmpScalarGt, cmpScalarGe, cmpScalarLt, cmpScalarLe, cmpScalarNe}, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe }, + {cmpScalarEq , cmpScalarGt , cmpScalarGe , cmpScalarLt , cmpScalarLe , cmpScalarNe } + }; + + typedef void (*cast_func_t)(Scalar& sc); + static const cast_func_t cast_func[] = + { + castScalar, castScalar, castScalar, castScalar, castScalar, castScalar, castScalar + }; + + if (inv) + { + // src1 is a scalar; swap it with src2 + cmpop = cmpop == CMP_LT ? CMP_GT : cmpop == CMP_LE ? CMP_GE : + cmpop == CMP_GE ? CMP_LE : cmpop == CMP_GT ? 
CMP_LT : cmpop; + } + + const int depth = src.depth(); + const int cn = src.channels(); + + cast_func[depth](val); + + funcs[depth][cmpop](src, cn, val.val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, CV_8U, stream, cmpMat, cmpScalar, cmpop); +} + +////////////////////////////////////////////////////////////////////////////// +// bitwise_not + +namespace arithm +{ + template void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +void cv::gpu::bitwise_not(InputArray _src, OutputArray _dst, InputArray _mask, Stream& _stream) +{ + using namespace arithm; + + GpuMat src = _src.getGpuMat(); + GpuMat mask = _mask.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const int bcols = (int) (src.cols * src.elemSize()); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + bitMatNot( + PtrStepSzb(src.rows, vcols, src.data, src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + bitMatNot( + PtrStepSzb(src.rows, vcols, src.data, src.step), + PtrStepSzb(src.rows, vcols, dst.data, dst.step), + mask, stream); + } + else + { - bitMatNot( ++ bitMatNot( + PtrStepSzb(src.rows, bcols, src.data, src.step), + PtrStepSzb(src.rows, bcols, dst.data, dst.step), + mask, stream); + } +} + +////////////////////////////////////////////////////////////////////////////// +// Binary bitwise logical operations + +namespace +{ + enum + { + BIT_OP_AND, + BIT_OP_OR, + BIT_OP_XOR + }; +} + +namespace arithm +{ + template void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +static void bitMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double, Stream& _stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + static const func_t funcs32[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + static const func_t funcs16[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + static const func_t funcs8[] = + { + bitMatAnd, + bitMatOr, + bitMatXor + }; + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + const int bcols = (int) (src1.cols * src1.elemSize()); + + if ((bcols & 3) == 0) + { + const int vcols = bcols >> 2; + + funcs32[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else if ((bcols & 1) == 0) + { + const int vcols = bcols >> 1; + + funcs16[op](PtrStepSzb(src1.rows, vcols, src1.data, src1.step), + PtrStepSzb(src1.rows, vcols, src2.data, src2.step), + PtrStepSzb(src1.rows, vcols, dst.data, dst.step), + mask, stream); + } + else + { + + funcs8[op](PtrStepSzb(src1.rows, bcols, src1.data, src1.step), + PtrStepSzb(src1.rows, bcols, 
src2.data, src2.step), + PtrStepSzb(src1.rows, bcols, dst.data, dst.step), + mask, stream); + } +} + +namespace arithm +{ + template void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + template void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); +} + +namespace +{ + typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream); + + template struct BitScalar + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + func(src, saturate_cast(sc.val[0]), dst, stream); + } + }; + + template struct BitScalar4 + { + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + unsigned int packedVal = 0; + + packedVal |= (saturate_cast(sc.val[0]) & 0xffff); + packedVal |= (saturate_cast(sc.val[1]) & 0xffff) << 8; + packedVal |= (saturate_cast(sc.val[2]) & 0xffff) << 16; + packedVal |= (saturate_cast(sc.val[3]) & 0xffff) << 24; + + func(src, packedVal, dst, stream); + } + }; + + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppBitwiseCFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t pConstant, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + const npp_t pConstants[] = {saturate_cast(sc.val[0]), saturate_cast(sc.val[1]), saturate_cast(sc.val[2]), saturate_cast(sc.val[3])}; + + nppSafeCall( func(src.ptr(), static_cast(src.step), pConstants, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppBitwiseC + { + typedef typename NppBitwiseCFunc::npp_t npp_t; + + static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), saturate_cast(sc.val[0]), dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +static void bitScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[3][5][4] = + { + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarAnd >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, + { + {BitScalar >::call , 0, NppBitwiseC::call, BitScalar4< bitScalarOr >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + }, + { + {BitScalar 
>::call , 0, NppBitwiseC::call, BitScalar4< bitScalarXor >::call}, + {0,0,0,0}, + {BitScalar >::call, 0, NppBitwiseC::call, NppBitwiseC::call}, + {0,0,0,0}, + {BitScalar >::call , 0, NppBitwiseC::call, NppBitwiseC::call} + } + }; + + const int depth = src.depth(); + const int cn = src.channels(); + + CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); + CV_Assert( cn == 1 || cn == 3 || cn == 4 ); + CV_Assert( mask.empty() ); + + funcs[op][depth][cn - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_OR); +} + +void cv::gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_AND); +} + +void cv::gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, Stream& stream) +{ + arithm_op(src1, src2, dst, mask, 1.0, -1, stream, bitMat, bitScalar, BIT_OP_XOR); +} + +////////////////////////////////////////////////////////////////////////////// +// shift + +namespace +{ + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u* pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + template struct NppShiftFunc + { + typedef typename NppTypeTraits::npp_t npp_t; + + typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const Npp32u pConstants, npp_t* pDst, int nDstStep, NppiSize oSizeROI); + }; + + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val, dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; + template ::func_t func> struct NppShift + { + typedef typename NppTypeTraits::npp_t npp_t; + + static void call(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream) + { + NppStreamHandler h(stream); + + NppiSize oSizeROI; + oSizeROI.width = src.cols; + oSizeROI.height = src.rows; + + nppSafeCall( func(src.ptr(), static_cast(src.step), sc.val[0], dst.ptr(), static_cast(dst.step), oSizeROI) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + }; +} + +void cv::gpu::rshift(InputArray _src, Scalar_ val, OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + GpuMat src = _src.getGpuMat(); + + CV_Assert( src.depth() < CV_32F ); + CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::lshift(InputArray _src, Scalar_ val, 
OutputArray _dst, Stream& stream) +{ + typedef void (*func_t)(const GpuMat& src, Scalar_ sc, GpuMat& dst, cudaStream_t stream); + static const func_t funcs[5][4] = + { + {NppShift::call , 0, NppShift::call , NppShift::call }, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + {0 , 0, 0 , 0 }, + {NppShift::call, 0, NppShift::call, NppShift::call}, + }; + + GpuMat src = _src.getGpuMat(); + + CV_Assert( src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S ); + CV_Assert( src.channels() == 1 || src.channels() == 3 || src.channels() == 4 ); + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + funcs[src.depth()][src.channels() - 1](src, val, dst, StreamAccessor::getStream(stream)); +} + +////////////////////////////////////////////////////////////////////////////// +// Minimum and maximum operations + +namespace +{ + enum + { + MIN_OP, + MAX_OP + }; +} + +namespace arithm +{ + void minMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void minMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + template void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + + void maxMat_v4(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + void maxMat_v2(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + template void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); +} + +void minMaxMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double, Stream& _stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[2][7] = + { + { + minMat, + minMat, + minMat, + minMat, + minMat, + minMat, + minMat + }, + { + maxMat, + maxMat, + maxMat, + maxMat, + maxMat, + maxMat, + maxMat + } + }; + + typedef void (*opt_func_t)(PtrStepSz src1, PtrStepSz src2, PtrStepSz dst, cudaStream_t stream); + static const opt_func_t funcs_v4[2] = + { + minMat_v4, maxMat_v4 + }; + static const opt_func_t funcs_v2[2] = + { + minMat_v2, maxMat_v2 + }; + + const int depth = src1.depth(); + const int cn = src1.channels(); + + CV_Assert( depth <= CV_64F ); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (depth == CV_8U || depth == CV_16U) + { + const intptr_t src1ptr = reinterpret_cast(src1_.data); + const intptr_t src2ptr = reinterpret_cast(src2_.data); + const intptr_t dstptr = reinterpret_cast(dst_.data); + + const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; + + if (isAllAligned) + { + if (depth == CV_8U && (src1_.cols & 3) == 0) + { + const int vcols = src1_.cols >> 2; + + funcs_v4[op](PtrStepSz(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + else if (depth == CV_16U && (src1_.cols & 1) == 0) + { + const int vcols = src1_.cols >> 1; + + funcs_v2[op](PtrStepSz(src1_.rows, 
vcols, (unsigned int*) src1_.data, src1_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step), + PtrStepSz(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step), + stream); + + return; + } + } + } + + const func_t func = funcs[op][depth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, src2_, dst_, stream); +} + +namespace +{ + template double castScalar(double val) + { + return saturate_cast(val); + } +} + +void minMaxScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double, Stream& stream, int op) +{ + using namespace arithm; + + typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[2][7] = + { + { + minScalar, + minScalar, + minScalar, + minScalar, + minScalar, + minScalar, + minScalar + }, + { + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar, + maxScalar + } + }; + + typedef double (*cast_func_t)(double sc); + static const cast_func_t cast_func[] = + { + castScalar, castScalar, castScalar, castScalar, castScalar, castScalar, castScalar + }; + + const int depth = src.depth(); + + CV_Assert( depth <= CV_64F ); + CV_Assert( src.channels() == 1 ); + + funcs[op][depth](src, cast_func[depth](val[0]), dst, StreamAccessor::getStream(stream)); +} + +void cv::gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MIN_OP); +} + +void cv::gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream) +{ + arithm_op(src1, src2, dst, noArray(), 1.0, -1, stream, minMaxMat, minMaxScalar, MAX_OP); +} + +//////////////////////////////////////////////////////////////////////// +// addWeighted + +namespace arithm +{ + template + void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); +} + +void cv::gpu::addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma, OutputArray _dst, int ddepth, Stream& stream) +{ + typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); + static const func_t funcs[7][7][7] = + { + { + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + 
{ + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, 
+ arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + }, + { + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 
0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/, + 0/*arithm::addWeighted*/ + }, + { + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted, + arithm::addWeighted + } + } + }; + + GpuMat src1 = _src1.getGpuMat(); + GpuMat src2 = _src2.getGpuMat(); + + int sdepth1 = src1.depth(); + int sdepth2 = src2.depth(); + ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2); + const int cn = src1.channels(); + + CV_Assert( src2.size() == src1.size() && src2.channels() == cn ); + CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F ); + + if (sdepth1 == CV_64F || sdepth2 == CV_64F || ddepth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn)); + GpuMat dst = _dst.getGpuMat(); + + PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); + PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); + PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); + + if (sdepth1 > sdepth2) + { + std::swap(src1_.data, src2_.data); + std::swap(src1_.step, src2_.step); + std::swap(alpha, beta); + std::swap(sdepth1, sdepth2); + } + + const func_t func = funcs[sdepth1][sdepth2][ddepth]; + + if (!func) + CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types"); + + func(src1_, alpha, src2_, beta, gamma, dst_, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// threshold + +namespace arithm +{ + template + void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); +} + +double cv::gpu::threshold(InputArray _src, OutputArray _dst, double thresh, double maxVal, int type, Stream& _stream) +{ + GpuMat src = _src.getGpuMat(); + + const int depth = src.depth(); + + CV_Assert( src.channels() == 1 && depth <= CV_64F ); + CV_Assert( type <= 4/*THRESH_TOZERO_INV*/ ); + + if (depth == CV_64F) + { + if (!deviceSupports(NATIVE_DOUBLE)) + CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double"); + } + + _dst.create(src.size(), src.type()); + GpuMat dst = _dst.getGpuMat(); + + cudaStream_t stream = StreamAccessor::getStream(_stream); + + if (src.type() == CV_32FC1 && type == 2/*THRESH_TRUNC*/) + { + NppStreamHandler h(stream); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + nppSafeCall( nppiThreshold_32f_C1R(src.ptr(), static_cast(src.step), + dst.ptr(), static_cast(dst.step), sz, static_cast(thresh), NPP_CMP_GREATER) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } + else + { + typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); + static const func_t funcs[] = + { + arithm::threshold, + arithm::threshold, + arithm::threshold, + arithm::threshold, + 
arithm::threshold, + arithm::threshold, + arithm::threshold + }; + + if (depth != CV_32F && depth != CV_64F) + { + thresh = cvFloor(thresh); + maxVal = cvRound(maxVal); + } + + funcs[depth](src, dst, thresh, maxVal, type, stream); + } + + return thresh; +} + +//////////////////////////////////////////////////////////////////////// +// NPP magnitide + +namespace +{ + typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI); + + void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream) + { + CV_Assert(src.type() == CV_32FC2); + + NppiSize sz; + sz.width = src.cols; + sz.height = src.rows; + + NppStreamHandler h(stream); + + nppSafeCall( func(src.ptr(), static_cast(src.step), dst.ptr(), static_cast(dst.step), sz) ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } +} + +void cv::gpu::magnitude(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + + _dst.create(src.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream)); +} + +void cv::gpu::magnitudeSqr(InputArray _src, OutputArray _dst, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + + _dst.create(src.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream)); +} + +//////////////////////////////////////////////////////////////////////// +// Polar <-> Cart + +namespace cv { namespace gpu { namespace cudev +{ + namespace mathfunc + { + void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream); + void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream); + } +}}} + +namespace +{ + void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream) + { + using namespace ::cv::gpu::cudev::mathfunc; + + CV_Assert(x.size() == y.size() && x.type() == y.type()); + CV_Assert(x.depth() == CV_32F); + + GpuMat x1cn = x.reshape(1); + GpuMat y1cn = y.reshape(1); + GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat(); + GpuMat angle1cn = angle ? 
angle->reshape(1) : GpuMat(); + + cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream); + } + + void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream) + { + using namespace ::cv::gpu::cudev::mathfunc; + + CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type()); + CV_Assert(mag.depth() == CV_32F); + + GpuMat mag1cn = mag.reshape(1); + GpuMat angle1cn = angle.reshape(1); + GpuMat x1cn = x.reshape(1); + GpuMat y1cn = y.reshape(1); + + polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream); + } +} + +void cv::gpu::magnitude(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); +} + +void cv::gpu::magnitudeSqr(InputArray _x, InputArray _y, OutputArray _dst, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); +} + +void cv::gpu::phase(InputArray _x, InputArray _y, OutputArray _dst, bool angleInDegrees, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _dst.create(x.size(), CV_32FC1); + GpuMat dst = _dst.getGpuMat(); + + cartToPolar_caller(x, y, 0, false, &dst, angleInDegrees, StreamAccessor::getStream(stream)); +} + +void cv::gpu::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, OutputArray _angle, bool angleInDegrees, Stream& stream) +{ + GpuMat x = _x.getGpuMat(); + GpuMat y = _y.getGpuMat(); + + _mag.create(x.size(), CV_32FC1); + GpuMat mag = _mag.getGpuMat(); + + _angle.create(x.size(), CV_32FC1); + GpuMat angle = _angle.getGpuMat(); + + cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); +} + +void cv::gpu::polarToCart(InputArray _mag, InputArray _angle, OutputArray _x, OutputArray _y, bool angleInDegrees, Stream& stream) +{ + GpuMat mag = _mag.getGpuMat(); + GpuMat angle = _angle.getGpuMat(); + + _x.create(mag.size(), CV_32FC1); + GpuMat x = _x.getGpuMat(); + + _y.create(mag.size(), CV_32FC1); + GpuMat y = _y.getGpuMat(); + + polarToCart_caller(mag, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); +} + +#endif diff --cc modules/java/generator/gen_java.py index cf92955,d82a593..ef056a7 --- a/modules/java/generator/gen_java.py +++ b/modules/java/generator/gen_java.py @@@ -542,10 -544,10 +542,10 @@@ JNIEXPORT jstring JNICALL Java_org_open { static const char method_name[] = "highgui::VideoCapture_getSupportedPreviewSizes_10()"; try { - LOGD(%s, method_name); + LOGD("%s", method_name); VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL union {double prop; const char* name;} u; - u.prop = me->get(CV_CAP_PROP_SUPPORTED_PREVIEW_SIZES_STRING); + u.prop = me->get(CAP_PROP_ANDROID_PREVIEW_SIZES_STRING); return env->NewStringUTF(u.name); } catch(const std::exception &e) { throwJavaException(env, &e, method_name); diff --cc modules/ts/src/precomp.hpp index d719472,1133978..fbb13ec --- a/modules/ts/src/precomp.hpp +++ b/modules/ts/src/precomp.hpp @@@ -1,6 -1,7 +1,7 @@@ -#include "opencv2/core/core_c.h" -#include "opencv2/core/internal.hpp" -#include "opencv2/ts/ts.hpp" +#include "opencv2/core/utility.hpp" +#include "opencv2/core/private.hpp" 
+#include "opencv2/ts.hpp"
+ #include "cvconfig.h"
 
 #ifdef GTEST_LINKED_AS_SHARED_LIBRARY
 #error ts module should not have GTEST_LINKED_AS_SHARED_LIBRARY defined
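
For reference, a minimal host-side usage sketch of a few cv::gpu entry points whose definitions appear in the element_operations.cpp hunks above (compare, bitwise_not, min, threshold). This is illustrative only and not part of the patch; it assumes the gpuarithm public header is <opencv2/gpuarithm.hpp>, that <opencv2/imgproc.hpp> provides cv::THRESH_BINARY, and that the declarations default their mask/Stream parameters (noArray() / Stream::Null()), which this diff does not show.

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>     // for cv::THRESH_BINARY (assumed location)
#include <opencv2/gpuarithm.hpp>   // assumed public header for the gpuarithm module

int main()
{
    // Host inputs filled with random data.
    cv::Mat h_a(480, 640, CV_8UC1), h_b(480, 640, CV_8UC1);
    cv::randu(h_a, 0, 255);
    cv::randu(h_b, 0, 255);

    // Upload to the device (GpuMat's Mat constructor performs the copy).
    cv::gpu::GpuMat d_a(h_a), d_b(h_b);
    cv::gpu::GpuMat d_mask, d_inv, d_min, d_thr;

    cv::gpu::compare(d_a, d_b, d_mask, cv::CMP_GT);   // 255 where a > b, 0 elsewhere
    cv::gpu::bitwise_not(d_mask, d_inv);              // invert the comparison mask
    cv::gpu::min(d_a, d_b, d_min);                    // element-wise minimum
    cv::gpu::threshold(d_a, d_thr, 128.0, 255.0, cv::THRESH_BINARY);

    // Download one result back to the host.
    cv::Mat result;
    d_thr.download(result);
    return 0;
}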