From: Vladislav Vinogradov Date: Wed, 23 Jan 2013 10:43:36 +0000 (+0400) Subject: optimizations: X-Git-Tag: accepted/2.0/20130307.220821~188^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=281d036fcf8ccb355300b122d987149a78ff93cd;p=profile%2Fivi%2Fopencv.git optimizations: - new reduce implementation (with kepler optimizations) - saturate_cast via asm command - video SIMD instructions in element operations - float arithmetics instead of double - new deviceSupports function --- diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp index dffb344..c2bf124 100644 --- a/modules/core/include/opencv2/core/gpumat.hpp +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -79,6 +79,8 @@ namespace cv { namespace gpu WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30 }; + CV_EXPORTS bool deviceSupports(FeatureSet feature_set); + // Gives information about what GPU archs this OpenCV GPU module was // compiled for class CV_EXPORTS TargetArchs diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu index 9e830e5..60aa073 100644 --- a/modules/core/src/cuda/matrix_operations.cu +++ b/modules/core/src/cuda/matrix_operations.cu @@ -44,6 +44,7 @@ #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/type_traits.hpp" namespace cv { namespace gpu { namespace device { @@ -54,6 +55,7 @@ namespace cv { namespace gpu { namespace device void writeScalar(const int*); void writeScalar(const float*); void writeScalar(const double*); + void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream); void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t); }}} @@ -226,16 +228,16 @@ namespace cv { namespace gpu { namespace device //////////////////////////////// ConvertTo //////////////////////////////// /////////////////////////////////////////////////////////////////////////// - template struct Convertor : unary_function + template struct Convertor : unary_function { - Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} + Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {} - __device__ __forceinline__ D operator()(const T& src) const + __device__ __forceinline__ D operator()(typename TypeTraits::ParameterType src) const { return saturate_cast(alpha * src + beta); } - double alpha, beta; + S alpha, beta; }; namespace detail @@ -282,16 +284,16 @@ namespace cv { namespace gpu { namespace device }; } - template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > + template struct TransformFunctorTraits< Convertor > : detail::ConvertTraits< Convertor > { }; - template + template void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream) { cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); cudaSafeCall( cudaSetDoubleForDevice(&beta) ); - Convertor op(alpha, beta); + Convertor op(static_cast(alpha), static_cast(beta)); cv::gpu::device::transform((PtrStepSz)src, (PtrStepSz)dst, op, WithOutMask(), stream); } @@ -304,36 +306,74 @@ namespace cv { namespace gpu { namespace device { typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream); - static const caller_t tab[8][8] = + static const caller_t tab[7][7] = { - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, 
cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {cvt_, cvt_, cvt_, - cvt_, cvt_, cvt_, cvt_, 0}, - - {0,0,0,0,0,0,0,0} + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + }, + { + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_, + cvt_ + } }; caller_t func = tab[sdepth][ddepth]; - if (!func) - cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__, "convert_gpu"); - func(src, dst, alpha, beta, stream); } diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 8c7a470..01c613e 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -45,8 +45,7 @@ #include #ifdef HAVE_CUDA - #include - #include + #include #include #define CUDART_MINIMUM_REQUIRED_VERSION 4010 @@ -69,33 +68,89 @@ using namespace cv::gpu; namespace { - // Compares value to set using the given comparator. Returns true if - // there is at least one element x in the set satisfying to: x cmp value - // predicate. - template - bool compareToSet(const std::string& set_as_str, int value, Comparer cmp) + class CudaArch + { + public: + CudaArch(); + + bool builtWith(FeatureSet feature_set) const; + bool hasPtx(int major, int minor) const; + bool hasBin(int major, int minor) const; + bool hasEqualOrLessPtx(int major, int minor) const; + bool hasEqualOrGreaterPtx(int major, int minor) const; + bool hasEqualOrGreaterBin(int major, int minor) const; + + private: + static void fromStr(const string& set_as_str, vector& arr); + + vector bin; + vector ptx; + vector features; + }; + + const CudaArch cudaArch; + + CudaArch::CudaArch() + { + #ifdef HAVE_CUDA + fromStr(CUDA_ARCH_BIN, bin); + fromStr(CUDA_ARCH_PTX, ptx); + fromStr(CUDA_ARCH_FEATURES, features); + #endif + } + + bool CudaArch::builtWith(FeatureSet feature_set) const + { + return !features.empty() && (features.back() >= feature_set); + } + + bool CudaArch::hasPtx(int major, int minor) const + { + return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); + } + + bool CudaArch::hasBin(int major, int minor) const + { + return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); + } + + bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.front() <= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + { + return !ptx.empty() && (ptx.back() >= major * 10 + minor); + } + + bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + { + return !bin.empty() && (bin.back() >= major * 10 + minor); + } + + void CudaArch::fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) - return false; + return; - std::stringstream stream(set_as_str); + istringstream stream(set_as_str); int cur_value; while (!stream.eof()) { stream >> cur_value; - if (cmp(cur_value, value)) - return true; + arr.push_back(cur_value); } - return false; + sort(arr.begin(), arr.end()); } } bool cv::gpu::TargetArchs::builtWith(cv::gpu::FeatureSet feature_set) { #if defined (HAVE_CUDA) - return 
::compareToSet(CUDA_ARCH_FEATURES, feature_set, std::greater_equal()); + return cudaArch.builtWith(feature_set); #else (void)feature_set; return false; @@ -110,7 +165,7 @@ bool cv::gpu::TargetArchs::has(int major, int minor) bool cv::gpu::TargetArchs::hasPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::equal_to()); + return cudaArch.hasPtx(major, minor); #else (void)major; (void)minor; @@ -121,7 +176,7 @@ bool cv::gpu::TargetArchs::hasPtx(int major, int minor) bool cv::gpu::TargetArchs::hasBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, std::equal_to()); + return cudaArch.hasBin(major, minor); #else (void)major; (void)minor; @@ -132,8 +187,7 @@ bool cv::gpu::TargetArchs::hasBin(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, - std::less_equal()); + return cudaArch.hasEqualOrLessPtx(major, minor); #else (void)major; (void)minor; @@ -143,14 +197,13 @@ bool cv::gpu::TargetArchs::hasEqualOrLessPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { - return hasEqualOrGreaterPtx(major, minor) || - hasEqualOrGreaterBin(major, minor); + return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_PTX, major * 10 + minor, std::greater_equal()); + return cudaArch.hasEqualOrGreaterPtx(major, minor); #else (void)major; (void)minor; @@ -161,8 +214,7 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { #if defined (HAVE_CUDA) - return ::compareToSet(CUDA_ARCH_BIN, major * 10 + minor, - std::greater_equal()); + return cudaArch.hasEqualOrGreaterBin(major, minor); #else (void)major; (void)minor; @@ -170,6 +222,31 @@ bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) #endif } +bool cv::gpu::deviceSupports(FeatureSet feature_set) +{ + static int versions[] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + static const int cache_size = static_cast(sizeof(versions) / sizeof(versions[0])); + + const int devId = getDevice(); + + int version; + + if (devId < cache_size && versions[devId] >= 0) + version = versions[devId]; + else + { + DeviceInfo dev(devId); + version = dev.majorVersion() * 10 + dev.minorVersion(); + if (devId < cache_size) + versions[devId] = version; + } + + return TargetArchs::builtWith(feature_set) && (version >= feature_set); +} + #if !defined (HAVE_CUDA) #define throw_nogpu CV_Error(CV_GpuNotSupported, "The library is compiled without CUDA support") diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp new file mode 100644 index 0000000..091a160 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce.hpp @@ -0,0 +1,361 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(val); + + For::loadToSmem(smem, val, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid) + { + thrust::get(val) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, val, tid); + } + + template + static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op) + { + typename GetType::type>::type reg = thrust::get(smem)[tid + delta]; + thrust::get(smem)[tid] = thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::merge(smem, val, tid, delta, op); + } + template + static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op) + { + typename GetType::type>::type reg = shfl_down(thrust::get(val), delta, width); + thrust::get(val) = thrust::get(op)(thrust::get(val), reg); + + For::mergeShfl(val, delta, width, op); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int) + { 
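+            // tuple recursion terminator: no elements left to load or merge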
+ } + + template + static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + template + static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&) + { + } + }; + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid) + { + smem[tid] = val; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid) + { + val = smem[tid]; + } + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, val, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, val, tid); + } + + template + __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op) + { + T reg = smem[tid + delta]; + smem[tid] = val = op(val, reg); + } + template + __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op) + { + T reg = shfl_down(val, delta, width); + val = op(val, reg); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + unsigned int delta, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::merge(smem, val, tid, delta, op); + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& val, + unsigned int delta, + unsigned int width, + const thrust::tuple& op) + { + For<0, thrust::tuple_size >::value>::mergeShfl(val, delta, width, op); + } + + template struct Generic + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + loadToSmem(smem, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(smem, val, tid, 1024, op); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(smem, val, tid, 512, op); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(smem, val, tid, 256, op); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(smem, val, tid, 128, op); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(smem, val, tid, 64, op); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(smem, val, tid, 32, op); + } + + if (tid < 16) + { + merge(smem, val, tid, 16, op); + merge(smem, val, tid, 8, op); + merge(smem, val, tid, 4, op); + merge(smem, val, tid, 2, op); + merge(smem, val, tid, 1, op); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(Reference val, Op op, unsigned int N) + { + mergeShfl(val, I, N, op); + Unroll::loopShfl(val, op, N); + } + static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op) + { + merge(smem, val, tid, I, op); + Unroll::loop(smem, val, tid, op); + } + }; + template + struct Unroll<0, Pointer, Reference, Op> + { + static __device__ void loopShfl(Reference, Op, unsigned int) + { + } + static __device__ void loop(Pointer, Reference, unsigned int, Op) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + #if __CUDA_ARCH__ >= 300 + (void) smem; + (void) tid; + + Unroll::loopShfl(val, op, N); + #else + 
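+            // no warp shuffle below compute 3.0: stage the values in shared memory and let the first N/2 threads merge them with halving strides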
loadToSmem(smem, val, tid); + + if (tid < N / 2) + Unroll::loop(smem, val, tid, op); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) + { + const unsigned int laneId = Warp::laneId(); + + #if __CUDA_ARCH__ >= 300 + Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #else + loadToSmem(smem, val, tid); + + if (laneId < 16) + Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op); + + __syncthreads(); + + if (laneId == 0) + loadToSmem(smem, val, tid / 32); + #endif + + __syncthreads(); + + loadFromSmem(smem, val, tid); + + if (tid < 32) + { + #if __CUDA_ARCH__ >= 300 + Unroll::loopShfl(val, op, M); + #else + Unroll::loop(smem, val, tid, op); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, + WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp new file mode 100644 index 0000000..a84e0c2 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/detail/reduce_key_val.hpp @@ -0,0 +1,498 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ +#define __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ + +#include +#include "../warp.hpp" +#include "../warp_shuffle.hpp" + +namespace cv { namespace gpu { namespace device +{ + namespace reduce_key_val_detail + { + template struct GetType; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + template struct GetType + { + typedef T type; + }; + + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(smem)[tid] = thrust::get(data); + + For::loadToSmem(smem, data, tid); + } + template + static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid) + { + thrust::get(data) = thrust::get(smem)[tid]; + + For::loadFromSmem(smem, data, tid); + } + + template + static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width) + { + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + + For::copyShfl(val, delta, width); + } + template + static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta) + { + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + + For::copy(svals, val, tid, delta); + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width) + { + typename GetType::type>::type reg = shfl_down(thrust::get(key), delta, width); + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(key) = reg; + thrust::get(val) = shfl_down(thrust::get(val), delta, width); + } + + For::mergeShfl(key, val, cmp, delta, width); + } + template + static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key, + const ValPointerTuple& svals, const ValReferenceTuple& val, + const CmpTuple& cmp, + unsigned int tid, unsigned int delta) + { + typename GetType::type>::type reg = thrust::get(skeys)[tid + delta]; + + if (thrust::get(cmp)(reg, thrust::get(key))) + { + thrust::get(skeys)[tid] = thrust::get(key) = reg; + thrust::get(svals)[tid] = thrust::get(val) = thrust::get(svals)[tid + delta]; + } + + For::merge(skeys, key, svals, val, cmp, tid, delta); + } + }; + template + struct For + { + template + static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + template + static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int) + { + } + + template + static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int) + { + } + template + static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int) + { + } + + template + static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int) + { + 
} + template + static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&, + const ValPointerTuple&, const ValReferenceTuple&, + const CmpTuple&, + unsigned int, unsigned int) + { + } + }; + + ////////////////////////////////////////////////////// + // loadToSmem + + template + __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid) + { + smem[tid] = data; + } + template + __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid) + { + data = smem[tid]; + } + template + __device__ __forceinline__ void loadToSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadToSmem(smem, data, tid); + } + template + __device__ __forceinline__ void loadFromSmem(const thrust::tuple& smem, + const thrust::tuple& data, + unsigned int tid) + { + For<0, thrust::tuple_size >::value>::loadFromSmem(smem, data, tid); + } + + ////////////////////////////////////////////////////// + // copyVals + + template + __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width) + { + val = shfl_down(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta) + { + svals[tid] = val = svals[tid + delta]; + } + template + __device__ __forceinline__ void copyValsShfl(const thrust::tuple& val, + unsigned int delta, + int width) + { + For<0, thrust::tuple_size >::value>::copyShfl(val, delta, width); + } + template + __device__ __forceinline__ void copyVals(const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::copy(svals, val, tid, delta); + } + + ////////////////////////////////////////////////////// + // merge + + template + __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(K& key, + const thrust::tuple& val, + const Cmp& cmp, + unsigned int delta, int width) + { + K reg = shfl_down(key, delta, width); + + if (cmp(reg, key)) + { + key = reg; + copyValsShfl(val, delta, width); + } + } + template + __device__ __forceinline__ void merge(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const Cmp& cmp, unsigned int tid, unsigned int delta) + { + K reg = skeys[tid + delta]; + + if (cmp(reg, key)) + { + skeys[tid] = key = reg; + copyVals(svals, val, tid, delta); + } + } + template + __device__ __forceinline__ void mergeShfl(const thrust::tuple& key, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int delta, int width) + { + For<0, thrust::tuple_size >::value>::mergeShfl(key, val, cmp, delta, width); + } + template + __device__ __forceinline__ void merge(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + const thrust::tuple& cmp, + unsigned int tid, unsigned int delta) + { + For<0, thrust::tuple_size >::value>::merge(skeys, key, svals, val, cmp, tid, delta); + 
} + + ////////////////////////////////////////////////////// + // Generic + + template struct Generic + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + loadToSmem(skeys, key, tid); + loadValsToSmem(svals, val, tid); + if (N >= 32) + __syncthreads(); + + if (N >= 2048) + { + if (tid < 1024) + merge(skeys, key, svals, val, cmp, tid, 1024); + + __syncthreads(); + } + if (N >= 1024) + { + if (tid < 512) + merge(skeys, key, svals, val, cmp, tid, 512); + + __syncthreads(); + } + if (N >= 512) + { + if (tid < 256) + merge(skeys, key, svals, val, cmp, tid, 256); + + __syncthreads(); + } + if (N >= 256) + { + if (tid < 128) + merge(skeys, key, svals, val, cmp, tid, 128); + + __syncthreads(); + } + if (N >= 128) + { + if (tid < 64) + merge(skeys, key, svals, val, cmp, tid, 64); + + __syncthreads(); + } + if (N >= 64) + { + if (tid < 32) + merge(skeys, key, svals, val, cmp, tid, 32); + } + + if (tid < 16) + { + merge(skeys, key, svals, val, cmp, tid, 16); + merge(skeys, key, svals, val, cmp, tid, 8); + merge(skeys, key, svals, val, cmp, tid, 4); + merge(skeys, key, svals, val, cmp, tid, 2); + merge(skeys, key, svals, val, cmp, tid, 1); + } + } + }; + + template + struct Unroll + { + static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N) + { + mergeShfl(key, val, cmp, I, N); + Unroll::loopShfl(key, val, cmp, N); + } + static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + merge(skeys, key, svals, val, cmp, tid, I); + Unroll::loop(skeys, key, svals, val, tid, cmp); + } + }; + template + struct Unroll<0, KP, KR, VP, VR, Cmp> + { + static __device__ void loopShfl(KR, VR, Cmp, unsigned int) + { + } + static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) + { + } + }; + + template struct WarpOptimized + { + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + #if 0 // __CUDA_ARCH__ >= 300 + (void) skeys; + (void) svals; + (void) tid; + + Unroll::loopShfl(key, val, cmp, N); + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (tid < N / 2) + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + }; + + template struct GenericOptimized32 + { + enum { M = N / 32 }; + + template + static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) + { + const unsigned int laneId = Warp::laneId(); + + #if 0 // __CUDA_ARCH__ >= 300 + Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #else + loadToSmem(skeys, key, tid); + loadToSmem(svals, val, tid); + + if (laneId < 16) + Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp); + + __syncthreads(); + + if (laneId == 0) + { + loadToSmem(skeys, key, tid / 32); + loadToSmem(svals, val, tid / 32); + } + #endif + + __syncthreads(); + + loadFromSmem(skeys, key, tid); + + if (tid < 32) + { + #if 0 // __CUDA_ARCH__ >= 300 + loadFromSmem(svals, val, tid); + + Unroll::loopShfl(key, val, cmp, M); + #else + Unroll::loop(skeys, key, svals, val, tid, cmp); + #endif + } + } + }; + + template struct StaticIf; + template struct StaticIf + { + typedef T1 type; + }; + template struct StaticIf + { + typedef T2 type; + }; + + template struct IsPowerOf2 + { + enum { value = ((N != 0) && !(N & (N - 1))) }; + }; + + template struct Dispatcher + { + typedef typename StaticIf< + (N <= 32) && IsPowerOf2::value, 
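+            // power-of-two N up to 32 selects the warp-optimized reductor, power-of-two N up to 1024 the 32-thread-granular one, anything else the generic shared-memory reduction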
+ WarpOptimized, + typename StaticIf< + (N <= 1024) && IsPowerOf2::value, + GenericOptimized32, + Generic + >::type + >::type reductor; + }; + } +}}} + +#endif // __OPENCV_GPU_PRED_VAL_REDUCE_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp b/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp deleted file mode 100644 index 0274f20..0000000 --- a/modules/gpu/include/opencv2/gpu/device/detail/reduction_detail.hpp +++ /dev/null @@ -1,841 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#ifndef __OPENCV_GPU_REDUCTION_DETAIL_HPP__ -#define __OPENCV_GPU_REDUCTION_DETAIL_HPP__ - -namespace cv { namespace gpu { namespace device -{ - namespace utility_detail - { - /////////////////////////////////////////////////////////////////////////////// - // Reductor - - template struct WarpReductor - { - template static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - if (n > 32) __syncthreads(); - - if (n > 32) - { - if (tid < n - 32) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 16) - { - if (tid < n - 16) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 8) - { - if (tid < n - 8) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 4) - { - if (tid < n - 4) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - else if (n > 2) - { - if (tid < n - 2) - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - if (tid < 2) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - } - }; - template <> struct WarpReductor<64> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - __syncthreads(); - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<32> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 16) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid 
+ 1 ]); - } - } - }; - template <> struct WarpReductor<16> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 8) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - template <> struct WarpReductor<8> - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - data[tid] = partial_reduction; - - if (tid < 4) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); - } - } - }; - - template struct ReductionDispatcher; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - WarpReductor::reduce(data, partial_reduction, tid, op); - } - }; - template <> struct ReductionDispatcher - { - template static __device__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - if (tid < n) - data[tid] = partial_reduction; - __syncthreads(); - - - if (n == 512) { if (tid < 256) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 256]); } __syncthreads(); } - if (n >= 256) { if (tid < 128) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 128]); } __syncthreads(); } - if (n >= 128) { if (tid < 64) { data[tid] = partial_reduction = op(partial_reduction, data[tid + 64]); } __syncthreads(); } - - if (tid < 32) - { - data[tid] = partial_reduction = op(partial_reduction, data[tid + 32]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 8]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 4]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 2]); - data[tid] = partial_reduction = op(partial_reduction, data[tid + 1]); - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredValWarpReductor - - template struct PredValWarpReductor; - template <> struct PredValWarpReductor<64> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<32> - { - template - static __device__ void reduce(T& 
myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template <> struct PredValWarpReductor<16> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - template <> struct PredValWarpReductor<8> - { - template - static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal = sval[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - }; - - template struct PredValReductionDispatcher; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - PredValWarpReductor::reduce(myData, myVal, sdata, sval, tid, pred); - } - }; - template <> struct PredValReductionDispatcher - { - template static __device__ void reduce(T& myData, V& myVal, volatile T* sdata, V* sval, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal = sval[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = 
sval[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval[tid] = myVal = sval[tid + 1]; - } - } - } - } - }; - - /////////////////////////////////////////////////////////////////////////////// - // PredVal2WarpReductor - - template struct PredVal2WarpReductor; - template <> struct PredVal2WarpReductor<64> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 32) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 32]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - - reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<32> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 16) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = sdata[tid + 16]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - - reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template <> struct PredVal2WarpReductor<16> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 8) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = 
reg = sdata[tid + 8]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - - reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - template <> struct PredVal2WarpReductor<8> - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - if (tid < 4) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - T reg = reg = sdata[tid + 4]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - - reg = sdata[tid + 2]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - - reg = sdata[tid + 1]; - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - }; - - template struct PredVal2ReductionDispatcher; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - PredVal2WarpReductor::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - }; - template <> struct PredVal2ReductionDispatcher - { - template - static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred) - { - myData = sdata[tid]; - myVal1 = sval1[tid]; - myVal2 = sval2[tid]; - - if (n >= 512 && tid < 256) - { - T reg = sdata[tid + 256]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 256]; - sval2[tid] = myVal2 = sval2[tid + 256]; - } - __syncthreads(); - } - if (n >= 256 && tid < 128) - { - T reg = sdata[tid + 128]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 128]; - sval2[tid] = myVal2 = sval2[tid + 128]; - } - __syncthreads(); - } - if (n >= 128 && tid < 64) - { - T reg = sdata[tid + 64]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 64]; - sval2[tid] = myVal2 = sval2[tid + 64]; - } - __syncthreads(); - } - - if (tid < 32) - { - if (n >= 64) - { - T reg = sdata[tid + 32]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 32]; - sval2[tid] = myVal2 = sval2[tid + 32]; - } - } - if (n >= 32) - { - T reg = sdata[tid + 16]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 16]; - sval2[tid] = myVal2 = sval2[tid + 16]; - } - } - if (n >= 16) - { - T reg = sdata[tid + 8]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 8]; - sval2[tid] = myVal2 = sval2[tid + 8]; - } - } - if (n >= 8) - { - T reg = sdata[tid + 4]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 
4]; - sval2[tid] = myVal2 = sval2[tid + 4]; - } - } - if (n >= 4) - { - T reg = sdata[tid + 2]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 2]; - sval2[tid] = myVal2 = sval2[tid + 2]; - } - } - if (n >= 2) - { - T reg = sdata[tid + 1]; - - if (pred(reg, myData)) - { - sdata[tid] = myData = reg; - sval1[tid] = myVal1 = sval1[tid + 1]; - sval2[tid] = myVal2 = sval2[tid + 1]; - } - } - } - } - }; - } // namespace utility_detail -}}} // namespace cv { namespace gpu { namespace device - -#endif // __OPENCV_GPU_REDUCTION_DETAIL_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/reduce.hpp b/modules/gpu/include/opencv2/gpu/device/reduce.hpp new file mode 100644 index 0000000..2161b06 --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/reduce.hpp @@ -0,0 +1,197 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_GPU_REDUCE_HPP__ +#define __OPENCV_GPU_REDUCE_HPP__ + +#include +#include "detail/reduce.hpp" +#include "detail/reduce_key_val.hpp" + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ void reduce(volatile T* smem, T& val, unsigned int tid, const Op& op) + { + reduce_detail::Dispatcher::reductor::template reduce(smem, val, tid, op); + } + template + __device__ __forceinline__ void reduce(const thrust::tuple& smem, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& op) + { + reduce_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&>(smem, val, tid, op); + } + + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, volatile V* svals, V& val, unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(volatile K* skeys, K& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, const Cmp& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce&, + const thrust::tuple&, + const Cmp&>(skeys, key, svals, val, tid, cmp); + } + template + __device__ __forceinline__ void reduceKeyVal(const thrust::tuple& skeys, + const thrust::tuple& key, + const thrust::tuple& svals, + const thrust::tuple& val, + unsigned int tid, + const thrust::tuple& cmp) + { + reduce_key_val_detail::Dispatcher::reductor::template reduce< + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple&, + const thrust::tuple& + >(skeys, key, svals, val, tid, cmp); + } + + // smem_tuple + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0) + { + return thrust::make_tuple((volatile T0*) t0); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile 
T5*) t5, (volatile T6*) t6, (volatile T7*) t7); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8); + } + + template + __device__ __forceinline__ + thrust::tuple + smem_tuple(T0* t0, T1* t1, T2* t2, T3* t3, T4* t4, T5* t5, T6* t6, T7* t7, T8* t8, T9* t9) + { + return thrust::make_tuple((volatile T0*) t0, (volatile T1*) t1, (volatile T2*) t2, (volatile T3*) t3, (volatile T4*) t4, (volatile T5*) t5, (volatile T6*) t6, (volatile T7*) t7, (volatile T8*) t8, (volatile T9*) t9); + } +}}} + +#endif // __OPENCV_GPU_UTILITY_HPP__ diff --git a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp index 7bb1da7..7a2799f 100644 --- a/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp +++ b/modules/gpu/include/opencv2/gpu/device/saturate_cast.hpp @@ -58,35 +58,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ uchar saturate_cast(schar v) { - return (uchar) ::max((int)v, 0); + uint res = 0; + int vi = v; + asm("cvt.sat.u8.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uchar saturate_cast(short v) + { + uint res = 0; + asm("cvt.sat.u8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(ushort v) { - return (uchar) ::min((uint)v, (uint)UCHAR_MAX); + uint res = 0; + asm("cvt.sat.u8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(int v) { - return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0); + uint res = 0; + asm("cvt.sat.u8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(uint v) { - return (uchar) ::min(v, (uint)UCHAR_MAX); + uint res = 0; + asm("cvt.sat.u8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ uchar saturate_cast(short v) - { - return saturate_cast((uint)v); - } - template<> __device__ __forceinline__ uchar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ uchar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.u8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -94,35 +106,47 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ schar saturate_cast(uchar v) { - return (schar) ::min((int)v, SCHAR_MAX); + uint res = 0; + uint vi = v; + asm("cvt.sat.s8.u8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(ushort v) + template<> __device__ __forceinline__ schar saturate_cast(short v) { - return (schar) ::min((uint)v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(int v) + template<> __device__ __forceinline__ schar saturate_cast(ushort v) { - return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); + uint res = 0; + asm("cvt.sat.s8.u16 %0, %1;" : "=r"(res) : "h"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(short v) + template<> __device__ __forceinline__ schar saturate_cast(int v) { - return saturate_cast((int)v); + uint res = 0; + asm("cvt.sat.s8.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(uint v) { - return (schar) ::min(v, (uint)SCHAR_MAX); + uint res = 0; + asm("cvt.sat.s8.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; } - template<> __device__ __forceinline__ schar saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + uint res = 0; + asm("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ schar saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + uint res = 0; + asm("cvt.rni.sat.s8.f64 %0, %1;" : "=r"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -130,30 +154,41 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ ushort saturate_cast(schar v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + int vi = v; + asm("cvt.sat.u16.s8 %0, %1;" : "=h"(res) : "r"(vi)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(short v) { - return (ushort) ::max((int)v, 0); + ushort res = 0; + asm("cvt.sat.u16.s16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(int v) { - return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? 
USHRT_MAX : 0); + ushort res = 0; + asm("cvt.sat.u16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(uint v) { - return (ushort) ::min(v, (uint)USHRT_MAX); + ushort res = 0; + asm("cvt.sat.u16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + ushort res = 0; + asm("cvt.rni.sat.u16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ ushort saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + ushort res = 0; + asm("cvt.rni.sat.u16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif @@ -161,31 +196,45 @@ namespace cv { namespace gpu { namespace device template<> __device__ __forceinline__ short saturate_cast(ushort v) { - return (short) ::min((int)v, SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u16 %0, %1;" : "=h"(res) : "h"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(int v) { - return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); + short res = 0; + asm("cvt.sat.s16.s32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(uint v) { - return (short) ::min(v, (uint)SHRT_MAX); + short res = 0; + asm("cvt.sat.s16.u32 %0, %1;" : "=h"(res) : "r"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(float v) { - int iv = __float2int_rn(v); - return saturate_cast(iv); + short res = 0; + asm("cvt.rni.sat.s16.f32 %0, %1;" : "=h"(res) : "f"(v)); + return res; } template<> __device__ __forceinline__ short saturate_cast(double v) { - #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130 - int iv = __double2int_rn(v); - return saturate_cast(iv); + #if __CUDA_ARCH__ >= 130 + short res = 0; + asm("cvt.rni.sat.s16.f64 %0, %1;" : "=h"(res) : "d"(v)); + return res; #else return saturate_cast((float)v); #endif } + template<> __device__ __forceinline__ int saturate_cast(uint v) + { + int res = 0; + asm("cvt.sat.s32.u32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ int saturate_cast(float v) { return __float2int_rn(v); @@ -199,6 +248,25 @@ namespace cv { namespace gpu { namespace device #endif } + template<> __device__ __forceinline__ uint saturate_cast(schar v) + { + uint res = 0; + int vi = v; + asm("cvt.sat.u32.s8 %0, %1;" : "=r"(res) : "r"(vi)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(short v) + { + uint res = 0; + asm("cvt.sat.u32.s16 %0, %1;" : "=r"(res) : "h"(v)); + return res; + } + template<> __device__ __forceinline__ uint saturate_cast(int v) + { + uint res = 0; + asm("cvt.sat.u32.s32 %0, %1;" : "=r"(res) : "r"(v)); + return res; + } template<> __device__ __forceinline__ uint saturate_cast(float v) { return __float2uint_rn(v); diff --git a/modules/gpu/include/opencv2/gpu/device/utility.hpp b/modules/gpu/include/opencv2/gpu/device/utility.hpp index 88a73a1..83eaaa2 100644 --- a/modules/gpu/include/opencv2/gpu/device/utility.hpp +++ b/modules/gpu/include/opencv2/gpu/device/utility.hpp @@ -45,7 +45,6 @@ #include "saturate_cast.hpp" #include "datamov_utils.hpp" -#include "detail/reduction_detail.hpp" namespace cv { namespace gpu { namespace device { @@ -157,29 +156,6 @@ namespace cv { namespace gpu { 
namespace device }; /////////////////////////////////////////////////////////////////////////////// - // Reduction - - template __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::ReductionDispatcher::reduce(data, partial_reduction, tid, op); - } - - template - __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredValReductionDispatcher::reduce(myData, myVal, sdata, sval, tid, pred); - } - - template - __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred) - { - StaticAssert= 8 && n <= 512>::check(); - utility_detail::PredVal2ReductionDispatcher::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred); - } - - /////////////////////////////////////////////////////////////////////////////// // Solve linear system // solve 2x2 linear system Ax=b diff --git a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp index b7861bc..d5b4bb2 100644 --- a/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp +++ b/modules/gpu/include/opencv2/gpu/device/vec_distance.hpp @@ -43,7 +43,7 @@ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__ -#include "utility.hpp" +#include "reduce.hpp" #include "functional.hpp" #include "detail/vec_distance_detail.hpp" @@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const @@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(float* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator float() const @@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device template __device__ __forceinline__ void reduceAll(int* smem, int tid) { - reduce(smem, mySum, tid, plus()); + reduce(smem, mySum, tid, plus()); } __device__ __forceinline__ operator int() const diff --git a/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp new file mode 100644 index 0000000..8b4479a --- /dev/null +++ b/modules/gpu/include/opencv2/gpu/device/warp_shuffle.hpp @@ -0,0 +1,145 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPU_WARP_SHUFFLE_HPP__ +#define __OPENCV_GPU_WARP_SHUFFLE_HPP__ + +namespace cv { namespace gpu { namespace device +{ + template + __device__ __forceinline__ T shfl(T val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl(val, srcLane, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl(unsigned int val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl((int) val, srcLane, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl(double val, int srcLane, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl(lo, srcLane, width); + hi = __shfl(hi, srcLane, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_down(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_down(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_down(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_down((int) val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_down(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_down(lo, delta, width); + hi = __shfl_down(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } + + template + __device__ __forceinline__ T shfl_up(T val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return __shfl_up(val, delta, width); + #else + return T(); + #endif + } + __device__ __forceinline__ unsigned int shfl_up(unsigned int val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + return (unsigned int) __shfl_up((int) 
val, delta, width); + #else + return 0; + #endif + } + __device__ __forceinline__ double shfl_up(double val, unsigned int delta, int width = warpSize) + { + #if __CUDA_ARCH__ >= 300 + int lo = __double2loint(val); + int hi = __double2hiint(val); + + lo = __shfl_up(lo, delta, width); + hi = __shfl_up(hi, delta, width); + + return __hiloint2double(hi, lo); + #else + return 0.0; + #endif + } +}}} + +#endif // __OPENCV_GPU_WARP_SHUFFLE_HPP__ diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 1a10bc3..242febd 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -68,11 +68,16 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream) { #ifndef HAVE_CUBLAS - (void)src1; (void)src2; (void)alpha; (void)src3; (void)beta; (void)dst; (void)flags; (void)stream; + (void)src1; + (void)src2; + (void)alpha; + (void)src3; + (void)beta; + (void)dst; + (void)flags; + (void)stream; CV_Error(CV_StsNotImplemented, "The library was build without CUBLAS"); - #else - // CUBLAS works with column-major matrices CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2); @@ -80,7 +85,7 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G if (src1.depth() == CV_64F) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } @@ -188,7 +193,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G } cublasSafeCall( cublasDestroy_v2(handle) ); - #endif } @@ -227,7 +231,7 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s) } else // if (src.elemSize() == 8) { - if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE)) + if (!deviceSupports(NATIVE_DOUBLE)) CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); NppStStreamHandler h(stream); diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index bc18188..5643c8b 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -88,71 +88,71 @@ namespace cv { namespace gpu { namespace device { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + 
cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_knnmatch { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); } namespace bf_radius_match { template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); } }}} @@ -198,11 +198,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& 
query, const if (query.empty() || train.empty()) return; - using namespace ::cv::gpu::device::bf_match; + using namespace cv::gpu::device::bf_match; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -234,10 +234,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, train, mask, trainIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector& matches) @@ -340,11 +337,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c if (query.empty() || trainCollection.empty()) return; - using namespace ::cv::gpu::device::bf_match; + using namespace cv::gpu::device::bf_match; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -376,10 +373,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector& matches) @@ -451,11 +445,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co if (query.empty() || train.empty()) return; - using namespace ::cv::gpu::device::bf_knnmatch; + using namespace cv::gpu::device::bf_knnmatch; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -502,10 +496,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream)); + func(query, train, k, mask, trainIdx, distance, allDist, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -580,11 +571,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer if (query.empty() || trainCollection.empty()) return; - using namespace ::cv::gpu::device::bf_knnmatch; + using namespace cv::gpu::device::bf_knnmatch; typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, 
const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -621,10 +612,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream)); + func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, @@ -765,7 +753,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -786,12 +774,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, } }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; const int nTrain = train.rows; @@ -814,7 +796,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, caller_t func = callers[distType][query.depth()]; CV_Assert(func != 0); - func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + func(query, train, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, @@ -897,7 +879,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu typedef void (*caller_t)(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream); + cudaStream_t stream); static const caller_t callers[3][6] = { @@ -918,12 +900,6 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu } }; - DeviceInfo info; - int cc = info.majorVersion() * 10 + info.minorVersion(); - - if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS)) - CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics"); - const int nQuery = query.rows; CV_Assert(query.channels() == 1 && query.depth() < CV_64F); @@ -949,7 +925,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu vector masks_(masks.begin(), masks.end()); func(query, &trains_[0], static_cast(trains_.size()), maxDistance, masks_.size() == 0 ? 
0 : &masks_[0], - trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream)); + trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream)); } void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index 6a77873..49bc1dfc 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -42,10 +42,13 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" +#include "opencv2/gpu/device/warp_shuffle.hpp" namespace cv { namespace gpu { namespace device { @@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device int& bestTrainIdx1, int& bestTrainIdx2, float* s_distance, int* s_trainIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + + float d1, d2; + int i1, i2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx2 = myBestTrainIdx2; + #endif } template @@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device int& bestImgIdx1, int& bestImgIdx2, float* s_distance, int* s_trainIdx, int* s_imgIdx) { + #if __CUDA_ARCH__ >= 300 + (void) s_distance; + (void) s_trainIdx; + (void) s_imgIdx; + + float d1, d2; + int i1, i2; + int j1, j2; + + #pragma unroll + for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2) + { + d1 = shfl_down(bestDistance1, i, BLOCK_SIZE); + d2 = shfl_down(bestDistance2, i, BLOCK_SIZE); + i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE); + i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE); + j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE); + j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE); + + if (bestDistance1 < d1) + { + if (d1 < bestDistance2) + { + bestDistance2 = d1; + bestTrainIdx2 = i1; + bestImgIdx2 = j1; + } + } + else + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + bestImgIdx2 = bestImgIdx1; + + bestDistance1 = d1; + bestTrainIdx1 = i1; + bestImgIdx1 = j1; + + if (d2 < bestDistance2) + { + bestDistance2 = d2; + bestTrainIdx2 = i2; + bestImgIdx2 = j2; + } + } + } + #else float myBestDistance1 = numeric_limits::max(); float myBestDistance2 = numeric_limits::max(); int myBestTrainIdx1 = -1; @@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device bestImgIdx1 = myBestImgIdx1; bestImgIdx2 = myBestImgIdx2; + #endif } /////////////////////////////////////////////////////////////////////////////// @@ -748,9 +839,8 @@ namespace cv { namespace gpu { namespace 
device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz > (distance), stream); @@ -780,9 +870,8 @@ namespace cv { namespace gpu { namespace device template void match2Dispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz >(trainIdx), static_cast< PtrStepSz >(imgIdx), static_cast< PtrStepSz > (distance), stream); @@ -945,9 +1034,8 @@ namespace cv { namespace gpu { namespace device template void calcDistanceDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); @@ -1005,7 +1093,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx[threadIdx.x] = bestIdx; __syncthreads(); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + reduceKeyVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); if (threadIdx.x == 0) { @@ -1034,7 +1122,7 @@ namespace cv { namespace gpu { namespace device cudaSafeCall( cudaDeviceSynchronize() ); } - void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream) + void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream) { findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); } @@ -1045,16 +1133,16 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, int k, const Mask& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (k == 2) { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); + match2Dispatcher(query, train, mask, trainIdx, distance, stream); } else { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + calcDistanceDispatcher(query, train, mask, allDist, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream); } } @@ -1063,105 +1151,105 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, 
WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream); else - matchDispatcher(static_cast< PtrStepSz 
>(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, SingleMask(mask), trainIdx, distance, allDist, 
stream); else - matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), k, WithOutMask(), trainIdx, distance, allDist, stream); } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream); template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, 
distance, stream); } - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const 
PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream); else - match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream); } - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& 
distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + //template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); + template void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream); } // namespace bf_knnmatch }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index f50089e..5e64e31 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -42,7 +42,9 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device s_distance += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - - __syncthreads(); - - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + reduceKeyVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); } template @@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device s_trainIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; - - __syncthreads(); - - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + 
reduceKeyVal(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less()); } /////////////////////////////////////////////////////////////////////////////// @@ -567,9 +558,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); @@ -599,9 +589,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); @@ -633,153 +622,153 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const 
PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if 
(mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), SingleMask(mask), trainIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), WithOutMask(), trainIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& 
distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const 
PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (masks.data) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, 
const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); } // namespace bf_match }}} // namespace cv { namespace gpu { namespace device { -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 934b8fe..19ee94e 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -42,7 +42,8 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/utility.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" @@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device __global__ void matchUnrolled(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device __global__ void match(const PtrStepSz query, int imgIdx, const PtrStepSz train, float maxDistance, const Mask mask, PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) - extern __shared__ int smem[]; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; @@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device bestDistance.ptr(queryIdx)[ind] = distVal; } } - - #endif } template @@ -281,9 +274,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz& train, float maxDistance, const Mask& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); @@ -313,9 +305,8 @@ namespace cv { namespace gpu { namespace device template void matchDispatcher(const PtrStepSz& query, const PtrStepSz* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { - (void)cc; if (query.cols <= 64) { matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, 
trainIdx, imgIdx, distance, nMatches, stream); @@ -347,126 +338,126 @@ namespace cv { namespace gpu { namespace device template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const 
PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const 
PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { if (mask.data) { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, SingleMask(mask), trainIdx, distance, nMatches, - cc, stream); + stream); } else { matchDispatcher(static_cast< PtrStepSz >(query), static_cast< PtrStepSz >(train), maxDistance, WithOutMask(), trainIdx, distance, nMatches, - cc, stream); + stream); } } - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher< L1Dist >(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, 
masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { 
matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, 
const PtrStepSzf& distance, const PtrStepSz& nMatches, - int cc, cudaStream_t stream) + cudaStream_t stream) { matchDispatcher(static_cast< PtrStepSz >(query), (const PtrStepSz*)trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, - cc, stream); + stream); } - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + //template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); + template void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz& nMatches, cudaStream_t stream); } // namespace bf_radius_match }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 40c8475..0fd482c 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -42,9 +42,10 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" 
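The calib3d.cu hunk that continues below switches the hypothesis-scoring kernel to the shared reduction primitive: it pulls in opencv2/gpu/device/reduce.hpp, makes the block size a compile-time parameter of computeHypothesisScoresKernel (launched as <256>), and replaces the hand-rolled shared-memory loop over s_num_inliers with a single reduce(...) call using a plus functor on a statically sized __shared__ buffer. For readers who have not seen that pattern, here is a minimal, self-contained sketch of the classic shared-memory tree reduction that such a utility encapsulates; the kernel and buffer names are illustrative only and are not part of OpenCV.

    #include <cuda_runtime.h>

    // Illustrative sketch: per-block sum reduction in shared memory.
    // BLOCK_SIZE is assumed to be a power of two and equal to blockDim.x.
    template <int BLOCK_SIZE>
    __global__ void blockSumKernel(const int* in, int* blockSums, int n)
    {
        __shared__ int smem[BLOCK_SIZE];

        const int tid = threadIdx.x;
        const int idx = blockIdx.x * BLOCK_SIZE + tid;

        // Each thread loads one element (0 when out of range).
        smem[tid] = (idx < n) ? in[idx] : 0;
        __syncthreads();

        // Tree reduction: halve the number of active threads each step.
        for (int step = BLOCK_SIZE / 2; step > 0; step >>= 1)
        {
            if (tid < step)
                smem[tid] += smem[tid + step];
            __syncthreads();
        }

        // Thread 0 writes the per-block total.
        if (tid == 0)
            blockSums[blockIdx.x] = smem[0];
    }

Launched as, for example, blockSumKernel<256><<<(n + 255) / 256, 256>>>(d_in, d_partials, n), each block emits one partial result, which mirrors the one-value-per-block output (g_num_inliers[blockIdx.x]) that the rewritten scoring kernel produces.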
+#include "opencv2/gpu/device/reduce.hpp" namespace cv { namespace gpu { namespace device { @@ -66,6 +67,8 @@ namespace cv { namespace gpu { namespace device crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } + __device__ __forceinline__ TransformOp() {} + __device__ __forceinline__ TransformOp(const TransformOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -103,6 +106,8 @@ namespace cv { namespace gpu { namespace device (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } + __device__ __forceinline__ ProjectOp() {} + __device__ __forceinline__ ProjectOp(const ProjectOp&) {} }; void call(const PtrStepSz src, const float* rot, @@ -134,6 +139,7 @@ namespace cv { namespace gpu { namespace device return x * x; } + template __global__ void computeHypothesisScoresKernel( const int num_points, const float3* object, const float2* image, const float dist_threshold, int* g_num_inliers) @@ -156,19 +162,11 @@ namespace cv { namespace gpu { namespace device ++num_inliers; } - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; - __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } + __shared__ int s_num_inliers[BLOCK_SIZE]; + reduce(s_num_inliers, num_inliers, threadIdx.x, plus()); if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; + g_num_inliers[blockIdx.x] = num_inliers; } void computeHypothesisScores( @@ -181,9 +179,8 @@ namespace cv { namespace gpu { namespace device dim3 threads(256); dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); - computeHypothesisScoresKernel<<>>( + computeHypothesisScoresKernel<256><<>>( num_points, object, image, dist_threshold, hypothesis_scores); cudaSafeCall( cudaGetLastError() ); @@ -193,4 +190,4 @@ namespace cv { namespace gpu { namespace device }}} // namespace cv { namespace gpu { namespace device -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 3dc0486..0a5daeb 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -43,459 +43,451 @@ #if !defined CUDA_DISABLER #include -#include -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" +#include "opencv2/gpu/device/emulation.hpp" +#include "opencv2/gpu/device/transform.hpp" +#include "opencv2/gpu/device/functional.hpp" +#include "opencv2/gpu/device/utility.hpp" -namespace cv { namespace gpu { namespace device +using namespace cv::gpu; +using namespace cv::gpu::device; + +namespace canny { - namespace canny + struct L1 : binary_function { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + __device__ __forceinline__ float operator ()(int x, int y) const { - __shared__ int smem[16][18]; - - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < rows) - { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; - } - __syncthreads(); - - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - 
dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } - } + return ::abs(x) + ::abs(y); } - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) + __device__ __forceinline__ L1() {} + __device__ __forceinline__ L1(const L1&) {} + }; + struct L2 : binary_function + { + __device__ __forceinline__ float operator ()(int x, int y) const { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + return ::sqrtf(x * x + y * y); + } - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); + __device__ __forceinline__ L2() {} + __device__ __forceinline__ L2(const L2&) {} + }; +} - cudaSafeCall( cudaDeviceSynchronize() ); - } +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} - struct L1 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::abs(x) + ::abs(y); - } - }; - struct L2 - { - static __device__ __forceinline__ float calc(int x, int y) - { - return ::sqrtf(x * x + y * y); - } - }; +namespace canny +{ + texture tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); + struct SrcTex + { + const int xoff; + const int yoff; + __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) + __device__ __forceinline__ int operator ()(int y, int x) const { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; + return tex2D(tex_src, x + xoff, y + yoff); + } + }; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + template __global__ + void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) + { + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; - if (j < cols) - { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; + if (y >= mag.rows || x >= mag.cols) + return; - sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; - } - __syncthreads(); + int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); + int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; + dx(y, x) = dxVal; + dy(y, x) = dyVal; - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; + mag(y, x) = norm(dxVal, dyVal); + } - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } - } - } + void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + const dim3 block(16, 16); + const dim3 grid(divUp(mag.cols, 
block.x), divUp(mag.rows, block.y)); - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + bindTexture(&tex_src, srcWhole); + SrcTex src(xoff, yoff); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + if (L2Grad) + { + L2 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } + else + { + L1 norm; + calcMagnitudeKernel<<>>(src, dx, dy, mag, norm); + } - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); + } - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) + void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad) + { + if (L2Grad) { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); + L2 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); } - - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) + else { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + L1 norm; + transform(dx, dy, mag, norm, WithOutMask(), 0); + } + } +} - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); +////////////////////////////////////////////////////////////////////////////////////////// - cudaSafeCall( cudaGetLastError() ); +namespace canny +{ + texture tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); - cudaSafeCall( cudaDeviceSynchronize() ); - } + __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) + { + const int CANNY_SHIFT = 15; + const int TG22 = (int)(0.4142135623730950488016887242097*(1<= dx.cols - 1 || y == 0 || y >= dx.rows - 1) + return; - __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - __shared__ float smem[18][18]; + int dxVal = dx(y, x); + int dyVal = dy(y, x); - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; + const float m = tex2D(tex_mag, x, y); - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + dxVal = ::abs(dxVal); + dyVal = ::abs(dyVal); - if (ly < 14) - smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + // 0 - the pixel can not belong to an edge + // 1 - the pixel might belong to an edge + // 2 - the pixel does belong to an edge + int edge_type = 0; - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + if (m > low_thresh) + { + const int tg22x = dxVal * TG22; + const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); - __syncthreads(); + dyVal <<= CANNY_SHIFT; - if (i < rows && j < cols) + if (dyVal < tg22x) { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? 
-1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; + if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) + edge_type = 1 + (int)(m > high_thresh); + } + else if(dyVal > tg67x) + { + if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) + edge_type = 1 + (int)(m > high_thresh); + } + } - x = ::abs(x); - y = ::abs(y); + map(y, x) = edge_type; + } - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; + void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh) + { + const dim3 block(16, 16); + const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); - if (m > low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - - y <<= CANNY_SHIFT; - - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } + bindTexture(&tex_mag, mag); - map.ptr(i + 1)[j + 1] = edge_type; - } - } + calcMapKernel<<>>(dx, dy, map, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); - #undef CANNY_SHIFT - #undef TG22 + cudaSafeCall( cudaDeviceSynchronize() ); + } +} - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) +////////////////////////////////////////////////////////////////////////////////////////// + +namespace canny +{ + __device__ int counter = 0; + + __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st) + { + __shared__ volatile int smem[18][18]; + + const int x = blockIdx.x * blockDim.x + threadIdx.x; + const int y = blockIdx.y * blockDim.y + threadIdx.y; + + smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0; + if (threadIdx.y == 0) + smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0; + if (threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0; + if (threadIdx.x == 0) + smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1) + smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == 0) + smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) + smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0; + if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0; + if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) + smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? 
map(y + 1, x + 1) : 0; + + __syncthreads(); + + if (x >= map.cols || y >= map.rows) + return; + + int n; + + #pragma unroll + for (int k = 0; k < 16; ++k) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + n = 0; - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - cudaSafeCall( cudaDeviceSynchronize() ); - } + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - ////////////////////////////////////////////////////////////////////////////////////////// + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } - __device__ unsigned int counter = 0; + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120) + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - __shared__ int smem[18][18]; + map(y, x) = e; - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + n = 0; - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } - __syncthreads(); + if (n > 0) + { + const int ind = ::atomicAdd(&counter, 1); + st[ind] = make_ushort2(x, y); + } + } - if (i < rows && j < cols) - { - int n; + void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + const dim3 block(16, 16); + const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + edgesHysteresisLocalKernel<<>>(map, st1); + cudaSafeCall( cudaGetLastError() ); - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } + cudaSafeCall( cudaDeviceSynchronize() ); + } +} - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } +////////////////////////////////////////////////////////////////////////////////////////// - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; +namespace canny +{ + __constant__ int 
c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - map.ptr(i + 1)[j + 1] = e; + __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count) + { + const int stack_size = 512; - n = 0; + __shared__ int s_counter; + __shared__ int s_ind; + __shared__ ushort2 s_st[stack_size]; - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + if (threadIdx.x == 0) + s_counter = 0; - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + __syncthreads(); - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } + int ind = blockIdx.y * gridDim.x + blockIdx.x; - if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } + if (ind >= count) + return; - #endif - } + ushort2 pos = st1[ind]; - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) + if (threadIdx.x < 8) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) + { + map(pos.y, pos.x) = 2; - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); + ind = Emulation::smem::atomicAdd(&s_counter, 1); - cudaSafeCall( cudaDeviceSynchronize() ); + s_st[ind] = pos; + } } - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + __syncthreads(); - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) { - #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120 + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); - const int stack_size = 512; + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; + __syncthreads(); if (threadIdx.x == 0) - s_counter = 0; - __syncthreads(); + s_counter -= portion; - int ind = blockIdx.y * gridDim.x + blockIdx.x; + __syncthreads(); - if (ind < count) + if (subTaskIdx < portion) { - ushort2 pos = st1[ind]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - 
s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); - } - - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; - - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } - } + map(pos.y, pos.x) = 2; + + ind = Emulation::smem::atomicAdd(&s_counter, 1); + + s_st[ind] = pos; } } - #endif + __syncthreads(); } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + if (s_counter > 0) { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - - while (count > 0) + if (threadIdx.x == 0) { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - - dim3 block(128, 1, 1); - dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); + ind = ::atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; + } - cudaSafeCall( cudaDeviceSynchronize() ); + __syncthreads(); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + ind = s_ind; - std::swap(st1, st2); - } + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + st2[ind + i] = s_st[i]; } + } - __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) - { - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) ); - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - } + int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); - void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) + while (count > 0) { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + + const dim3 block(128); + const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); - getEdges<<>>(map, dst, rows, cols); + edgesHysteresisGlobalKernel<<>>(map, st1, st2, count); cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaDeviceSynchronize() ); + + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); + + std::swap(st1, st2); } - } // namespace canny -}}} // namespace cv { namespace gpu { namespace device + } +} + +////////////////////////////////////////////////////////////////////////////////////////// +namespace canny +{ + struct GetEdges : unary_function + { + __device__ __forceinline__ uchar operator ()(int e) const + { + return (uchar)(-(e >> 1)); + } + + __device__ __forceinline__ GetEdges() {} + __device__ __forceinline__ GetEdges(const GetEdges&) {} + }; +} + +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits + { + enum { smart_shift = 4 }; + }; +}}} + +namespace canny +{ + void 
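// The wrapper defined next runs GetEdges over the whole map through the
// generic transform() framework (default stream, no mask). GetEdges collapses
// the 3-state map into the final 8-bit edge image without branching; a quick
// check of (uchar)(-(e >> 1)), added here only as illustration:
//   e = 0 : 0 >> 1 = 0, -(0) = 0x00          (cannot be an edge)
//   e = 1 : 1 >> 1 = 0, -(0) = 0x00          (weak candidate, not confirmed)
//   e = 2 : 2 >> 1 = 1, (uchar)(-1) = 0xFF   (confirmed edge)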
getEdges(PtrStepSzi map, PtrStepSzb dst) + { + transform(map, dst, GetEdges(), WithOutMask(), 0); + } +} -#endif /* CUDA_DISABLER */ \ No newline at end of file +#endif /* CUDA_DISABLER */ diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index c61601d..4b52cc7 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -42,1851 +42,2398 @@ #if !defined CUDA_DISABLER -#include "internal_shared.hpp" +#include "opencv2/gpu/device/common.hpp" #include "opencv2/gpu/device/functional.hpp" #include "opencv2/gpu/device/vec_math.hpp" #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" -namespace cv { namespace gpu { namespace device -{ - ////////////////////////////////////////////////////////////////////////// - // add +using namespace cv::gpu; +using namespace cv::gpu::device; - template struct Add : binary_function +namespace arithm +{ + template struct ArithmFuncTraits { - __device__ __forceinline__ D operator ()(T a, T b) const - { - return saturate_cast(a + b); - } + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 1 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct ArithmFuncTraits<1, 1> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct ArithmFuncTraits<1, 2> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + template <> struct ArithmFuncTraits<1, 4> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Add > : DefaultTransformFunctorTraits< Add > + + template <> struct ArithmFuncTraits<2, 1> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template <> struct ArithmFuncTraits<2, 2> { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Add(), WithOutMask(), stream); - } + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& 
mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct AddScalar : unary_function - { - AddScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const - { - return saturate_cast(a + val); - } - const double val; + enum { smart_block_dim_x = 32 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; }; - - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<2, 4> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< 
AddScalar > + + template <> struct ArithmFuncTraits<4, 1> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<4, 2> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< AddScalar > : DefaultTransformFunctorTraits< AddScalar > + template <> struct ArithmFuncTraits<4, 4> { + enum { simple_block_dim_x = 32 }; + enum { simple_block_dim_y = 8 }; + + enum { smart_block_dim_x = 32 }; enum { smart_block_dim_y = 8 }; enum { smart_shift = 4 }; }; +} - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) - { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AddScalar op(val); - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); - } +////////////////////////////////////////////////////////////////////////// +// addMat - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - 
//template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, 
const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // subtract - - template struct Subtract : binary_function +namespace arithm +{ + template struct VAdd4; + template <> struct VAdd4 : binary_function { - __device__ __forceinline__ D operator ()(T a, T b) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return saturate_cast(a - b); + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} + }; + template <> struct VAdd4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct VAdd4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} }; - template <> struct 
TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + template <> struct VAdd4 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ int operator ()(int a, int b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd4() {} + __device__ __forceinline__ VAdd4(const VAdd4& other) {} }; - template <> struct TransformFunctorTraits< Subtract > : DefaultTransformFunctorTraits< Subtract > + + //////////////////////////////////// + + template struct VAdd2; + template <> struct VAdd2 : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} }; + template <> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(uint a, uint b) const + { + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template <> struct VAdd2 : binary_function { - if (mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), SingleMask(mask), stream); - else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Subtract(), WithOutMask(), stream); - } + __device__ __forceinline__ uint operator ()(int a, int b) const + { + uint res = 0; - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, 
const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t 
stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - template struct SubtractScalar : unary_function - { - SubtractScalar(double val_) : val(val_) {} - __device__ __forceinline__ D operator ()(T a) const + #if __CUDA_ARCH__ >= 300 + asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} + }; + template 
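// The last VAdd2 specialization, below, follows the same pattern as the ones
// above. On sm_30+ the vadd2/vadd4 video instructions perform a saturating
// add on two packed 16-bit halves / four packed 8-bit bytes per 32-bit
// register, so a single thread handles several pixels per instruction; on
// sm_20 the per-lane vadd .sat forms emulate this, and on older archs res is
// left at 0 (presumably these functors are only dispatched where supported).
// Worked example for the unsigned per-byte form, assuming the semantics of
// PTX vadd4.u32.u32.u32.sat (not restated in this patch):
//   a = 0x01FF7F80, b = 0x01017F80  ->  0x02FFFEFF
//   (01+01=02, FF+01 saturates to FF, 7F+7F=FE, 80+80 saturates to FF)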
<> struct VAdd2 : binary_function + { + __device__ __forceinline__ int operator ()(int a, int b) const { - return saturate_cast(a - val); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double val; + + __device__ __forceinline__ VAdd2() {} + __device__ __forceinline__ VAdd2(const VAdd2& other) {} }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + //////////////////////////////////// + + template struct AddMat : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a + b); + } + + __device__ __forceinline__ AddMat() {} + __device__ __forceinline__ AddMat(const AddMat& other) {} }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< SubtractScalar > : DefaultTransformFunctorTraits< SubtractScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::AddMat > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} + +namespace arithm +{ + template + void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd4(), WithOutMask(), stream); + } + + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + + template + void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) + { + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VAdd2(), WithOutMask(), stream); + } + + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream) + template + void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - SubtractScalar op(val); if 
(mask.data) - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, SingleMask(mask), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), mask, stream); else - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, AddMat(), WithOutMask(), stream); } - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const 
PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - //template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - template void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // multiply - - struct multiply_8uc4_32f : binary_function + template void addMat(PtrStepSzb src1, 
PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb 
src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// addScalar + +namespace arithm +{ + template struct AddScalar : unary_function { - __device__ __forceinline__ uint operator ()(uint a, float b) const - { - uint res = 0; + S val; - res |= (saturate_cast((0xffu & (a )) * b) ); - res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); - res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); - res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + explicit AddScalar(S val_) : val(val_) {} - return res; + __device__ __forceinline__ D operator ()(T a) const + { + return saturate_cast(a + val); } }; +} - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f) +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::AddScalar > : arithm::ArithmFuncTraits { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; }; +}}} - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) +namespace arithm +{ + template + void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_8uc4_32f(), WithOutMask(), stream); + AddScalar op(static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); } - struct multiply_16sc4_32f : binary_function + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void 
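// The explicit instantiations that follow cover the depth combinations
// reachable from the host wrappers. addScalar() still takes the scalar as
// double for a uniform interface, but converts it to S exactly once via
// static_cast, so instantiations with a float-typed S keep the per-pixel
// a + val in single precision. Minimal usage sketch (hypothetical call; a
// default-constructed PtrStepb is assumed to mean "no mask", and 0 selects
// the default stream):
//   arithm::addScalar(src8u, 5.0, dst8u, PtrStepb(), 0);  // dst = saturate(src + 5)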
addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, 
cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subMat + +namespace arithm +{ + template struct VSub4; + template <> struct VSub4 : binary_function { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const + __device__ __forceinline__ uint operator ()(uint a, uint b) const { - return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), - saturate_cast(a.z * b), saturate_cast(a.w * b)); - } - }; + uint res = 0; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f) - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; + #if __CUDA_ARCH__ >= 300 + asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif - void multiply_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) - { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), multiply_16sc4_32f(), WithOutMask(), stream); - } + return res; + } - template struct Multiply : binary_function + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} + }; + template <> struct VSub4 : binary_function { - Multiply(float scale_) : scale(scale_) {} - __device__ __forceinline__ D operator ()(T a, T b) const + __device__ __forceinline__ uint operator ()(int a, int b) const { - return saturate_cast(scale * a * b); + uint res = 0; + + #if 
__CUDA_ARCH__ >= 300 + asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const float scale; + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} }; - template struct Multiply : binary_function + template <> struct VSub4 : binary_function { - Multiply(double scale_) : scale(scale_) {} - __device__ __forceinline__ double operator ()(T a, T b) const + __device__ __forceinline__ int operator ()(uint a, uint b) const { - return scale * a * b; + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; + + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} }; - template <> struct Multiply : binary_function + template <> struct VSub4 : binary_function { - Multiply(double scale_) : scale(scale_) {} __device__ __forceinline__ int operator ()(int a, int b) const { - return saturate_cast(scale * a * b); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double scale; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< Multiply > : DefaultTransformFunctorTraits< Multiply > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ VSub4() {} + __device__ __forceinline__ VSub4(const VSub4& other) {} }; - template struct MultiplyCaller + //////////////////////////////////// + + template struct VSub2; + template <> struct VSub2 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + 
__device__ __forceinline__ uint operator ()(uint a, uint b) const { - Multiply op(static_cast(scale)); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} }; - template struct MultiplyCaller + template <> struct VSub2 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ int operator ()(uint a, uint b) const { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} }; - template <> struct MultiplyCaller + template <> struct VSub2 : binary_function { - static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ uint operator ()(int a, int b) const { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Multiply op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + uint res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - }; - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} + }; + template <> struct VSub2 : binary_function { - MultiplyCaller::call(src1, src2, dst, scale, stream); - } - - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const 
PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const 
PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct MultiplyScalar : unary_function - { - MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + __device__ __forceinline__ int operator ()(int a, int b) const { - return saturate_cast(scale * a * val); + int res = 0; + + #if __CUDA_ARCH__ >= 300 + asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; } - const double val; - const double scale; + + __device__ __forceinline__ VSub2() {} + __device__ __forceinline__ VSub2(const VSub2& other) {} }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + //////////////////////////////////// + + template struct SubMat : binary_function { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; + __device__ __forceinline__ D operator 
()(T a, T b) const + { + return saturate_cast(a - b); + } + + __device__ __forceinline__ SubMat() {} + __device__ __forceinline__ SubMat(const SubMat& other) {} }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > +} + +namespace cv { namespace gpu { namespace device +{ + template struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< MultiplyScalar > : DefaultTransformFunctorTraits< MultiplyScalar > + + //////////////////////////////////// + + template struct TransformFunctorTraits< arithm::SubMat > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; +}}} - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream) +namespace arithm +{ + template + void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - MultiplyScalar op(val, scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub4(), WithOutMask(), stream); } - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - 
//template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const 
PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - ////////////////////////////////////////////////////////////////////////// - // divide - - struct divide_8uc4_32f : binary_function - { - __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const - { - return b != 0 ? make_uchar4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_uchar4(0,0,0,0); - } - }; + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f) + template + void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; - }; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, VSub2(), WithOutMask(), stream); + } + + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); + template void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template + void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_8uc4_32f(), WithOutMask(), stream); + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, SubMat(), WithOutMask(), stream); } - - struct divide_16sc4_32f : binary_function + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, 
PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + +////////////////////////////////////////////////////////////////////////// +// subScalar + +namespace arithm +{ + template + void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) { - __device__ __forceinline__ short4 operator ()(short4 a, float b) const + AddScalar op(-static_cast(val)); + + if (mask.data) + transform((PtrStepSz) src1, (PtrStepSz) dst, op, mask, stream); + else + transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream); + } + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void 
subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + //template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); + template void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); +} + 
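
The subtraction paths above are where the commit's "video SIMD instructions" show up: on compute capability 3.0 and higher a single vsub4.u32.u32.u32.sat (or vsub2 for packed 16-bit pairs) subtracts packed values with per-lane saturation, compute 2.x falls back to per-lane vsub instructions, and subScalar simply reuses AddScalar with a negated value held in the functor's S parameter (float arithmetic instead of double). The following minimal, self-contained CUDA sketch is not part of the patch; the helper vsub4_u8 and the kernel subBytes are illustrative names only. It shows the same packed-byte saturated subtraction in isolation, with a plain per-byte fallback where the SIMD instruction is unavailable.

// Illustrative sketch, not part of the patch: per-byte saturated subtraction
// of four uchar values packed into one 32-bit word, as used by the VSub4
// functor above. Compile with nvcc.
#include <cuda_runtime.h>
#include <cstdio>

__device__ __forceinline__ unsigned int vsub4_u8(unsigned int a, unsigned int b)
{
    unsigned int res = 0;
#if __CUDA_ARCH__ >= 300
    // One instruction subtracts all four bytes with unsigned saturation (Kepler+).
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
#else
    // Portable fallback: unpack each byte, clamp at zero, repack.
    for (int i = 0; i < 4; ++i)
    {
        int va = (a >> (8 * i)) & 0xff;
        int vb = (b >> (8 * i)) & 0xff;
        int d  = va - vb;
        if (d < 0) d = 0;
        res |= (unsigned int)d << (8 * i);
    }
#endif
    return res;
}

__global__ void subBytes(const unsigned int* src1, const unsigned int* src2,
                         unsigned int* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = vsub4_u8(src1[i], src2[i]); // each 32-bit word carries 4 uchar pixels
}

int main()
{
    const int n = 256; // 256 words = 1024 packed uchar values
    unsigned int h_a[n], h_b[n], h_c[n];
    for (int i = 0; i < n; ++i) { h_a[i] = 0x10203040u; h_b[i] = 0x20202020u; }

    unsigned int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, n * sizeof(unsigned int));
    cudaMalloc(&d_b, n * sizeof(unsigned int));
    cudaMalloc(&d_c, n * sizeof(unsigned int));
    cudaMemcpy(d_a, h_a, n * sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(unsigned int), cudaMemcpyHostToDevice);

    subBytes<<<(n + 127) / 128, 128>>>(d_a, d_b, d_c, n);
    cudaMemcpy(h_c, d_c, n * sizeof(unsigned int), cudaMemcpyDeviceToHost);

    // 0x10-0x20 clamps to 0x00, 0x20-0x20 is 0x00, 0x30-0x20 is 0x10, 0x40-0x20 is 0x20,
    // so the expected result word is 0x00001020.
    printf("0x%08x - 0x%08x = 0x%08x (per-byte saturated)\n", h_a[0], h_b[0], h_c[0]);

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

The vsub2 specializations in the patch apply the same idea to two packed 16-bit lanes (the .h0/.h1 selectors) instead of four bytes; only the mask-free, packed-data paths can use these instructions, which is why subMat keeps the generic SubMat functor for the masked and multi-type cases.
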
+////////////////////////////////////////////////////////////////////////// +// mulMat + +namespace arithm +{ + struct Mul_8uc4_32f : binary_function + { + __device__ __forceinline__ uint operator ()(uint a, float b) const { - return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b), - saturate_cast(a.z / b), saturate_cast(a.w / b)) - : make_short4(0,0,0,0); + uint res = 0; + + res |= (saturate_cast((0xffu & (a )) * b) ); + res |= (saturate_cast((0xffu & (a >> 8)) * b) << 8); + res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16); + res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24); + + return res; } + + __device__ __forceinline__ Mul_8uc4_32f() {} + __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {} }; - OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f) + struct Mul_16sc4_32f : binary_function { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 8 }; + __device__ __forceinline__ short4 operator ()(short4 a, float b) const + { + return make_short4(saturate_cast(a.x * b), saturate_cast(a.y * b), + saturate_cast(a.z * b), saturate_cast(a.w * b)); + } + + __device__ __forceinline__ Mul_16sc4_32f() {} + __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {} }; - void divide_gpu(const PtrStepSz& src1, const PtrStepSzf& src2, const PtrStepSz& dst, cudaStream_t stream) + template struct Mul : binary_function { - cv::gpu::device::transform(static_cast< PtrStepSz >(src1), src2, static_cast< PtrStepSz >(dst), divide_16sc4_32f(), WithOutMask(), stream); - } + __device__ __forceinline__ D operator ()(T a, T b) const + { + return saturate_cast(a * b); + } - template struct Divide : binary_function + __device__ __forceinline__ Mul() {} + __device__ __forceinline__ Mul(const Mul& other) {} + }; + + template struct MulScale : binary_function { - Divide(double scale_) : scale(scale_) {} + S scale; + + explicit MulScale(S scale_) : scale(scale_) {} + __device__ __forceinline__ D operator ()(T a, T b) const { - return b != 0 ? 
saturate_cast(a * scale / b) : 0; + return saturate_cast(scale * a * b); } - const double scale; }; +} - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > +namespace cv { namespace gpu { namespace device +{ + template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + + template struct TransformFunctorTraits< arithm::Mul > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > + + template struct TransformFunctorTraits< arithm::MulScale > : arithm::ArithmFuncTraits { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; }; - template <> struct TransformFunctorTraits< Divide > : DefaultTransformFunctorTraits< Divide > +}}} + +namespace arithm +{ + void mulMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; + transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream); + } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream) + void mulMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream) { - cudaSafeCall( cudaSetDoubleForDevice(&scale) ); - Divide op(scale); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream); + transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream); } - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& 
src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, 
double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - //template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - template void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream); - - template struct DivideScalar : unary_function - { - DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {} - __device__ __forceinline__ D operator ()(T a) const + template + void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) + { + if (scale == 1) { - return saturate_cast(scale * a / val); + Mul op; + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); } - const double val; - const double scale; - }; + else + { + MulScale op(static_cast(scale)); + transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream); + } + } - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits< DivideScalar > : DefaultTransformFunctorTraits< DivideScalar > + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void 
mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); + template void mulMat(PtrStepSzb src1, 
 PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// mulScalar
+
+namespace arithm
+{
+    template struct MulScalar : unary_function
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
+        S val;
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-    {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-        DivideScalar op(val, scale);
-        cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream);
-    }
+        explicit MulScalar(S val_) : val(val_) {}
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    template struct Reciprocal : unary_function
-    {
-        Reciprocal(double scale_) : scale(scale_) {}
         __device__ __forceinline__ D operator ()(T a) const
         {
-            return a != 0 ? saturate_cast(scale / a) : 0;
+            return saturate_cast(a * val);
         }
-        const double scale;
     };
+}
-    template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal >
+namespace cv { namespace gpu { namespace device
+{
+    template struct TransformFunctorTraits< arithm::MulScalar > : arithm::ArithmFuncTraits
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal >
+}}}
+
+namespace arithm
+{
+    template
+    void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Reciprocal > : DefaultTransformFunctorTraits< Reciprocal >
+        MulScalar op(static_cast(val));
+        transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream);
+    }
+
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
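For reference, a minimal standalone sketch of the per-element work that MulScalar expresses, written as a plain CUDA kernel. This is illustrative only and not part of the patch: the real code hands the functor to cv::gpu::device::transform instead of writing a kernel by hand, and saturate_u8 here stands in for OpenCV's saturate_cast; the kernel and helper names are hypothetical.

    #include <cuda_runtime.h>

    __device__ __forceinline__ unsigned char saturate_u8(float v)
    {
        // round to nearest and clamp to [0, 255], mirroring saturate_cast for uchar
        return (unsigned char) min(max(__float2int_rn(v), 0), 255);
    }

    __global__ void mulScalarSketch(const unsigned char* src, unsigned char* dst,
                                    int n, float val)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = saturate_u8(src[i] * val);   // dst = saturate(src * val)
    }

The scale is passed as float rather than double, in line with the commit's switch to float arithmetic in these element operations.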
+
+//////////////////////////////////////////////////////////////////////////
+// divMat
+
+namespace arithm
+{
+    struct Div_8uc4_32f : binary_function
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ uint operator ()(uint a, float b) const
+        {
+            uint res = 0;
+
+            if (b != 0)
+            {
+                b = 1.0f / b;
+                res |= (saturate_cast((0xffu & (a      )) * b)      );
+                res |= (saturate_cast((0xffu & (a >>  8)) * b) <<  8);
+                res |= (saturate_cast((0xffu & (a >> 16)) * b) << 16);
+                res |= (saturate_cast((0xffu & (a >> 24)) * b) << 24);
+            }
+
+            return res;
+        }
     };
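The functor above divides a packed 8UC4 pixel by a single float: one reciprocal per pixel, four multiplies, per-byte saturation, and a repack into the 32-bit word. A standalone sketch of the same trick, for illustration (it reuses the hypothetical saturate_u8 helper from the mulScalar sketch above and is not part of the patch):

    // Same unpack / multiply-by-reciprocal / saturate / repack idea as Div_8uc4_32f.
    __device__ unsigned int div_8uc4_by_float(unsigned int a, float b)
    {
        unsigned int res = 0;
        if (b != 0.0f)
        {
            const float inv = 1.0f / b;                                    // one division per pixel
            res |=  (unsigned int) saturate_u8((float)( a        & 0xffu) * inv);
            res |= ((unsigned int) saturate_u8((float)((a >>  8) & 0xffu) * inv)) <<  8;
            res |= ((unsigned int) saturate_u8((float)((a >> 16) & 0xffu) * inv)) << 16;
            res |= ((unsigned int) saturate_u8((float)((a >> 24) & 0xffu) * inv)) << 24;
        }
        return res;                                                        // b == 0 maps the pixel to 0
    }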
-
+    struct Div_16sc4_32f : binary_function
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ short4 operator ()(short4 a, float b) const
+        {
+            return b != 0 ? make_short4(saturate_cast(a.x / b), saturate_cast(a.y / b),
+                                        saturate_cast(a.z / b), saturate_cast(a.w / b))
+                          : make_short4(0,0,0,0);
+        }
     };
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream)
+    template struct Div : binary_function
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&scalar) );
-        Reciprocal op(scalar);
-        cv::gpu::device::transform((PtrStepSz)src2, (PtrStepSz)dst, op, WithOutMask(), stream);
-    }
-
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////////////////
-    // absdiff
-
-    template struct Absdiff : binary_function
-    {
-        static __device__ __forceinline__ int abs(int a)
+        __device__ __forceinline__ D operator ()(T a, T b) const
         {
-            return ::abs(a);
+            return b != 0 ? saturate_cast(a / b) : 0;
         }
-        static __device__ __forceinline__ float abs(float a)
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
+    };
+    template struct Div : binary_function
+    {
+        __device__ __forceinline__ float operator ()(T a, T b) const
         {
-            return ::fabsf(a);
+            return b != 0 ? static_cast(a) / b : 0;
         }
-        static __device__ __forceinline__ double abs(double a)
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
+    };
+    template struct Div : binary_function
+    {
+        __device__ __forceinline__ double operator ()(T a, T b) const
         {
-            return ::fabs(a);
+            return b != 0 ? static_cast(a) / b : 0;
         }
-        __device__ __forceinline__ T operator ()(T a, T b) const
+
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div& other) {}
+    };
+
+    template struct DivScale : binary_function
+    {
+        S scale;
+
+        explicit DivScale(S scale_) : scale(scale_) {}
+
+        __device__ __forceinline__ D operator ()(T a, T b) const
         {
-            return saturate_cast(::abs(a - b));
+            return b != 0 ? saturate_cast(scale * a / b) : 0;
         }
     };
+}
-    template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff >
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits : arithm::ArithmFuncTraits
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff >
+
+    template struct TransformFunctorTraits< arithm::Div > : arithm::ArithmFuncTraits
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff >
+
+    template struct TransformFunctorTraits< arithm::DivScale > : arithm::ArithmFuncTraits
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< Absdiff > : DefaultTransformFunctorTraits< Absdiff >
+}}}
+
+namespace arithm
+{
+    void divMat_8uc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
+        transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
+    }

-    template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void divMat_16sc4_32f(PtrStepSz src1, PtrStepSzf src2, PtrStepSz dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)src2, (PtrStepSz)dst, Absdiff(), WithOutMask(), stream);
+        transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
     }

-    //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);

+    template
+    void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
+    {
+        if (scale == 1)
+        {
+            Div op;
+            transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream);
+        }
+        else
+        {
+            DivScale op(static_cast(scale));
+            transform((PtrStepSz) src1, (PtrStepSz) src2, (PtrStepSz) dst, op, WithOutMask(), stream);
+        }
+    }
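A host-side sketch of the two paths chosen by divMat above, for illustration only (the functor and type names below are stand-ins, not the OpenCV ones). When scale == 1 the plain Div functor is picked, so no per-element multiply is issued; otherwise the scale is folded into DivScale. Both keep the cv::divide convention that division by zero yields 0, which all the Div variants above implement with the b != 0 guard.

    #include <cstdio>

    template <typename T, typename D> struct DivSketch
    {
        D operator()(T a, T b) const { return b != 0 ? (D)(a / b) : (D)0; }
    };

    template <typename T, typename S, typename D> struct DivScaleSketch
    {
        S scale;
        explicit DivScaleSketch(S s) : scale(s) {}
        D operator()(T a, T b) const { return b != 0 ? (D)(scale * a / b) : (D)0; }
    };

    int main()
    {
        DivSketch<int, int> d;                       // scale == 1 path: no extra multiply
        DivScaleSketch<int, float, float> ds(0.5f);  // scaled path
        std::printf("%d %d %.2f\n", d(7, 2), d(7, 0), ds(9, 2));  // prints: 3 0 2.25
    }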
-    template struct AbsdiffScalar : unary_function
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divScalar
+
+namespace arithm
+{
+    template
+    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        AbsdiffScalar(double val_) : val(val_) {}
-        __device__ __forceinline__ T operator ()(T a) const
+        MulScalar op(static_cast(1.0 / val));
+        transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream);
+    }
+
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
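Note how divScalar above reuses MulScalar: the division by a constant becomes one host-side reciprocal, taken in double and narrowed once to float, plus a single multiply per element on the device. An illustrative sketch of the arithmetic involved (not part of the patch):

    #include <cstdio>

    int main()
    {
        const double val = 3.0;
        const float inv = static_cast<float>(1.0 / val);  // computed once, not per element

        const unsigned char src = 200;
        const float dst = src * inv;                      // per-element work: one multiply
        std::printf("%.4f\n", dst);                       // ~66.6667, vs. exact 200/3 = 66.666...
    }

The reciprocal is formed in double before narrowing, so the only precision given up is that of the float constant itself, which is in keeping with the commit's move to float arithmetic in these kernels.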
+
+//////////////////////////////////////////////////////////////////////////
+// divInv
+
+namespace arithm
+{
+    template struct DivInv : unary_function
+    {
+        S val;
+
+        explicit DivInv(S val_) : val(val_) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
         {
-            return saturate_cast(::fabs(a - val));
+            return a != 0 ? saturate_cast(val / a) : 0;
         }
-        double val;
     };
+}
-    template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar >
+namespace cv { namespace gpu { namespace device
+{
+    template struct TransformFunctorTraits< arithm::DivInv > : arithm::ArithmFuncTraits
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar >
+}}}
+
+namespace arithm
+{
+    template
+    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< AbsdiffScalar > : DefaultTransformFunctorTraits< AbsdiffScalar >
+        DivInv op(static_cast(val));
+        transform((PtrStepSz) src1, (PtrStepSz) dst, op, WithOutMask(), stream);
+    }
+
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// absDiffMat
+
+namespace arithm
+{
+    template struct VAbsDiff4;
+    template <> struct VAbsDiff4 : binary_function
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0;
+
+        #if __CUDA_ARCH__ >= 300
+            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VAbsDiff4() {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
     };
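This is the "video SIMD instructions" part of the commit: one vabsdiff4 instruction handles all four bytes of the word on sm_30+, while sm_20 issues the per-byte vabsdiff form four times. The same idea as a standalone device function, for illustration; the plain-C branch for older targets is an assumption of this sketch and is not in the patch (the original simply leaves res at 0 there, since the non-SIMD path is chosen at a higher level).

    __device__ __forceinline__ unsigned int vabsdiff4_u8(unsigned int a, unsigned int b)
    {
        unsigned int res = 0;
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    #elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
    #else
        for (int i = 0; i < 32; i += 8)                       // byte-wise fallback (illustrative)
        {
            const int x = (a >> i) & 0xff, y = (b >> i) & 0xff;
            res |= (unsigned int)(x > y ? x - y : y - x) << i;
        }
    #endif
        return res;
    }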
"r"(a), "r"(b), "r"(res)); + asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif + + return res; + } + + __device__ __forceinline__ VAbsDiff4() {} + __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {} }; - template void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) + //////////////////////////////////// + + template struct VAbsDiff2; + template <> struct VAbsDiff2 : binary_function { - cudaSafeCall( cudaSetDoubleForDevice(&val) ); - AbsdiffScalar op(val); - cv::gpu::device::transform((PtrStepSz)src1, (PtrStepSz)dst, op, WithOutMask(), stream); - } + __device__ __forceinline__ uint operator ()(uint a, uint b) const + { + uint res = 0; - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - //template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); - template void absdiff_gpu(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); + #if __CUDA_ARCH__ >= 300 + asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #elif __CUDA_ARCH__ >= 200 + asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res)); + #endif - ////////////////////////////////////////////////////////////////////////////////////// - // Compare + return res; + } - template