From deac5d972e115398a1467a7808710174091f91d1 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 14 Feb 2011 15:50:17 +0000
Subject: [PATCH] fixed errors in gpu on old video cards (SURF_GPU, BruteForceMatcher_GPU, min/max, setTo, convertTo); added assertions after all kernel calls
---
 modules/gpu/include/opencv2/gpu/gpu.hpp            |    4 +-
 modules/gpu/src/brute_force_matcher.cpp            |   47 +-
 modules/gpu/src/cuda/brute_force_matcher.cu        |  136 +++--
 modules/gpu/src/cuda/color.cu                      |   30 +-
 modules/gpu/src/cuda/element_operations.cu         |  204 +++---
 modules/gpu/src/cuda/filters.cu                    |  193 +-------
 modules/gpu/src/cuda/hog.cu                        |   17 +
 modules/gpu/src/cuda/imgproc.cu                    |   25 +
 modules/gpu/src/cuda/match_template.cu             |   32 ++
 modules/gpu/src/cuda/mathfunc.cu                   |    2 +
 modules/gpu/src/cuda/matrix_operations.cu          |  529 +++++++++++----------
 modules/gpu/src/cuda/matrix_reductions.cu          |   79 +++
 modules/gpu/src/cuda/split_merge.cu                |   12 +
 modules/gpu/src/cuda/stereobm.cu                   |    4 +
 modules/gpu/src/cuda/stereobp.cu                   |   13 +
 modules/gpu/src/cuda/stereocsbp.cu                 |   20 +-
 modules/gpu/src/cuda/surf.cu                       |  298 ++++++++++++
 modules/gpu/src/cudastream.cpp                     |   44 +-
 modules/gpu/src/element_operations.cpp             |   97 ++--
 modules/gpu/src/imgproc_gpu.cpp                    |    4 +
 modules/gpu/src/matrix_operations.cpp              |   42 +-
 modules/gpu/src/mssegmentation.cpp                 |    2 +
 .../src/opencv2/gpu/device/border_interpolate.hpp  |  123 ++++-
 modules/gpu/src/opencv2/gpu/device/transform.hpp   |    4 +
 modules/gpu/src/surf.cpp                           |   25 +-
 tests/gpu/src/brute_force_matcher.cpp              |   35 +-
 tests/gpu/src/features2d.cpp                       |   17 +-
 tests/gpu/src/gputest_main.cpp                     |   12 +-
 tests/gpu/src/meanshift.cpp                        |   19 +
 tests/gpu/src/mssegmentation.cpp                   |    8 +
 tests/gpu/src/operator_convert_to.cpp              |    6 +-
 31 files changed, 1363 insertions(+), 720 deletions(-)

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index a18e88b..bb152c3 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -435,8 +435,8 @@ namespace cv void enqueueCopy(const GpuMat& src, GpuMat& dst); - void enqueueMemSet(const GpuMat& src, Scalar val); - void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask); + void enqueueMemSet(GpuMat& src, Scalar val); + void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); // converts matrix type, ex from float to uchar depending on type void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 2c835cd..222feb6 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -76,18 +76,22 @@ namespace cv { namespace gpu { namespace bfmatcher { template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12); template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12); template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df&
distance); + const DevMem2Df& distance, + bool cc_12); template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance); + const DevMem2Df& distance, + bool cc_12); template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, @@ -160,17 +164,20 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, using namespace cv::gpu::bfmatcher; typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12); static const match_caller_t match_callers[2][8] = { { - matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, - matchSingleL1_gpu, matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 + matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, matchSingleL1_gpu, + matchSingleL1_gpu, matchSingleL1_gpu, 0, 0 }, { - matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, - matchSingleL2_gpu, matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 + matchSingleL2_gpu, matchSingleL2_gpu, + matchSingleL2_gpu, matchSingleL2_gpu, + matchSingleL2_gpu, matchSingleL2_gpu, 0, 0 } }; @@ -185,9 +192,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); + bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12); + // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx. // trainIdx store after imgIdx, so we doesn't lose it value. 
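The capability check introduced just above is the heart of the old-card fix: the wider kernel configuration is only taken when both the build targets and the installed device support compute capability 1.2. A minimal host-side sketch of that gating, assuming the COMPUTE_12 feature flag and the TargetArchs/DeviceInfo helpers used in this patch; launchWideKernel and launchCompatKernel are invented placeholders for the real matchCached_caller variants:

// Hedged sketch of the cc_12 gating pattern; launchWideKernel and
// launchCompatKernel are illustrative stand-ins, not OpenCV functions.
#include <opencv2/gpu/gpu.hpp>

namespace
{
    void launchWideKernel()   { /* e.g. the 256-column cached kernel variant */ }
    void launchCompatKernel() { /* fallback that also runs on compute 1.0/1.1 */ }
}

void dispatchMatch()
{
    using namespace cv::gpu;

    // Both halves matter: the library must have been built with code for
    // compute 1.2, and the device actually present must support it.
    bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);

    if (cc_12)
        launchWideKernel();
    else
        launchCompatKernel();
}

In the patch this flag is threaded down into match_chooser, where it decides whether the queryDescs.cols == 256 cached path may be selected.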
- func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance); + func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, @@ -284,17 +293,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, - const DevMem2Df& distance); + const DevMem2Df& distance, bool cc_12); static const match_caller_t match_callers[2][8] = { { - matchCollectionL1_gpu, matchCollectionL1_gpu, + matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, matchCollectionL1_gpu, 0, 0 }, { - matchCollectionL2_gpu, matchCollectionL2_gpu, + matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, matchCollectionL2_gpu, 0, 0 } @@ -311,7 +320,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes match_caller_t func = match_callers[distType][queryDescs.depth()]; CV_Assert(func != 0); - func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance); + bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12); + + func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12); } void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, @@ -383,11 +394,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con static const match_caller_t match_callers[2][8] = { { - knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, + knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, knnMatchL1_gpu, 0, 0 }, { - knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, + knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, knnMatchL2_gpu, 0, 0 } }; @@ -522,11 +533,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, static const radiusMatch_caller_t radiusMatch_callers[2][8] = { { - radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, + radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, radiusMatchL1_gpu, 0, 0 }, { - radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, + radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, radiusMatchL2_gpu, 0, 0 } }; diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu index 44f823d..6ebf5a4 100644 --- a/modules/gpu/src/cuda/brute_force_matcher.cu +++ b/modules/gpu/src/cuda/brute_force_matcher.cu @@ -555,6 +555,7 @@ namespace cv { namespace gpu { namespace bfmatcher match, Dist, T> <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); } @@ -575,6 +576,7 @@ namespace cv { namespace gpu { namespace bfmatcher Dist, T> <<>>(queryDescs, train, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); } @@ -584,7 +586,8 @@ namespace cv { namespace gpu { namespace bfmatcher template void match_chooser(const DevMem2D_& queryDescs, const Train& train, - const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const 
DevMem2Df& distance) + const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12) { if (queryDescs.cols < 64) matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); @@ -596,7 +599,7 @@ namespace cv { namespace gpu { namespace bfmatcher matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); else if (queryDescs.cols < 256) matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); - else if (queryDescs.cols == 256) + else if (queryDescs.cols == 256 && cc_12) matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); else matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance); @@ -606,95 +609,99 @@ namespace cv { namespace gpu { namespace bfmatcher template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12) { SingleTrain train((DevMem2D_)trainDescs); if (mask.data) { SingleMask m(mask); - match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& 
trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, - const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + bool cc_12) { SingleTrain train((DevMem2D_)trainDescs); if (mask.data) { SingleMask m(mask); - match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const 
DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance, bool cc_12) { TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); if (maskCollection.data) { MaskCollection mask(maskCollection.data); - match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); 
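The long runs of explicit template instantiations here exist so the host-side .cpp (see the match_callers tables earlier in this patch) can dispatch by matrix depth through a plain function-pointer table, with null entries for unsupported depths; adding the bool cc_12 parameter therefore touches every instantiation. A self-contained sketch of that pattern, using invented names (match_func_t, matchImpl, callers, runMatch) rather than the library's:

// Hedged sketch of the depth-indexed dispatch used by the host code in this patch.
#include <cassert>

typedef void (*match_func_t)(const void* query, const void* train, bool cc_12);

template <typename T>
void matchImpl(const void* /*query*/, const void* /*train*/, bool /*cc_12*/)
{
    // In the real code this wraps the kernel launch for element type T
    // and lives in the .cu translation unit, explicitly instantiated there.
}

static const match_func_t callers[8] =
{
    matchImpl<unsigned char>,  matchImpl<signed char>,
    matchImpl<unsigned short>, matchImpl<short>,
    matchImpl<int>,            matchImpl<float>,
    0, 0   // double and user depths unsupported, mirroring the zeros in match_callers
};

void runMatch(int depth, const void* query, const void* train, bool cc_12)
{
    match_func_t func = callers[depth];
    assert(func != 0);          // the library uses CV_Assert(func != 0) here
    func(query, train, cc_12);
}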
template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, - const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance) + const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, + const DevMem2Df& distance, bool cc_12) { TrainCollection train((DevMem2D_*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols); if (maskCollection.data) { MaskCollection mask(maskCollection.data); - match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12); } else { - match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance); + match_chooser((DevMem2D_)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12); } } - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); - template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); + template void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12); 
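The other change applied throughout these .cu files is an error check immediately after every kernel launch, which is what the commit message means by assertions after all kernel calls: cudaGetLastError() reports invalid launch configurations right away (the failure mode seen on older cards), whereas the existing synchronize call only surfaces errors later, and only when no stream is supplied. A minimal sketch of the pattern on a toy kernel, with checkCudaCall and scaleKernel as illustrative stand-ins for OpenCV's cudaSafeCall and its real kernels:

// Minimal CUDA sketch of the post-launch checking pattern this patch adds.
#include <cstdio>
#include <cuda_runtime.h>

#define checkCudaCall(expr)                                             \
    do {                                                                \
        cudaError_t err__ = (expr);                                     \
        if (err__ != cudaSuccess)                                       \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",             \
                    cudaGetErrorString(err__), __FILE__, __LINE__);     \
    } while (0)

__global__ void scaleKernel(float* data, int n, float s)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= s;
}

void scale(float* d_data, int n, float s, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 grid((n + threads.x - 1) / threads.x);

    scaleKernel<<<grid, threads, 0, stream>>>(d_data, n, s);
    checkCudaCall( cudaGetLastError() );          // did the launch itself succeed?

    if (stream == 0)                              // same convention as the patch:
        checkCudaCall( cudaThreadSynchronize() ); // block only for the default stream
}

cudaThreadSynchronize() is kept here to match the CUDA runtime of the patch's era; newer toolkits spell it cudaDeviceSynchronize().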
/////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////// Knn Match //////////////////////////////////// @@ -748,6 +755,7 @@ namespace cv { namespace gpu { namespace bfmatcher calcDistance<<>>( queryDescs, trainDescs, mask, distance); + cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); } @@ -923,7 +931,10 @@ namespace cv { namespace gpu { namespace bfmatcher dim3 grid(trainIdx.rows, 1, 1); for (int i = 0; i < knn; ++i) + { findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + } cudaSafeCall( cudaThreadSynchronize() ); } @@ -949,12 +960,12 @@ namespace cv { namespace gpu { namespace bfmatcher findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); } - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, @@ -974,12 +985,12 @@ namespace cv { namespace gpu { namespace bfmatcher findKnnMatch_caller<256>(knn, trainIdx, distance, allDist); } - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, 
const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); - template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); + template void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist); /////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////// Radius Match ////////////////////////////////// @@ -1044,6 +1055,7 @@ namespace cv { namespace gpu { namespace bfmatcher radiusMatch<<>>( queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance); + cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaThreadSynchronize() ); } @@ -1067,12 +1079,12 @@ namespace cv { namespace gpu { namespace bfmatcher } } - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const 
DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, @@ -1090,10 +1102,10 @@ namespace cv { namespace gpu { namespace bfmatcher } } - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); - template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const 
DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); + template void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance); }}} diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 00ebb91..6228fb7 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -43,6 +43,7 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/saturate_cast.hpp" #include "opencv2/gpu/device/vecmath.hpp" +#include "opencv2/gpu/device/limits_gpu.hpp" using namespace cv::gpu; using namespace cv::gpu::device; @@ -51,13 +52,9 @@ using namespace cv::gpu::device; #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) #endif -#ifndef FLT_EPSILON - #define FLT_EPSILON 1.192092896e-07F -#endif - namespace cv { namespace gpu { namespace color { - template struct ColorChannel {}; + template struct ColorChannel; template<> struct ColorChannel { typedef float worktype_f; @@ -133,6 +130,7 @@ namespace cv { namespace gpu { namespace color RGB2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -276,6 +274,7 @@ namespace cv { namespace gpu { namespace color RGB5x52RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -304,6 +303,7 @@ namespace cv { namespace gpu { namespace color RGB2RGB5x5<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -385,6 +385,7 @@ namespace cv { namespace gpu { namespace color Gray2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -425,6 +426,7 @@ namespace cv { namespace gpu { namespace color Gray2RGB5x5<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -533,6 +535,7 @@ namespace cv { namespace gpu { namespace color RGB2Gray<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -573,6 +576,7 @@ namespace cv { namespace gpu { namespace color RGB5x52Gray<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -698,6 +702,7 @@ namespace cv { namespace gpu { namespace color RGB2YCrCb<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) 
cudaSafeCall( cudaThreadSynchronize() ); @@ -756,6 +761,7 @@ namespace cv { namespace gpu { namespace color YCrCb2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace color RGB2XYZ<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -960,6 +967,7 @@ namespace cv { namespace gpu { namespace color XYZ2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); @@ -1063,8 +1071,8 @@ namespace cv { namespace gpu { namespace color vmin = fmin(vmin, b); diff = v - vmin; - s = diff / (float)(fabs(v) + FLT_EPSILON); - diff = (float)(60. / (diff + FLT_EPSILON)); + s = diff / (float)(fabs(v) + numeric_limits_gpu::epsilon()); + diff = (float)(60. / (diff + numeric_limits_gpu::epsilon())); if (v == r) h = (g - b) * diff; @@ -1199,6 +1207,8 @@ namespace cv { namespace gpu { namespace color RGB2HSV<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); + if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } @@ -1281,6 +1291,8 @@ namespace cv { namespace gpu { namespace color HSV2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); + if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } @@ -1342,7 +1354,7 @@ namespace cv { namespace gpu { namespace color diff = vmax - vmin; l = (vmax + vmin) * 0.5f; - if (diff > FLT_EPSILON) + if (diff > numeric_limits_gpu::epsilon()) { s = l < 0.5f ? 
diff / (vmax + vmin) : diff / (2.0f - vmax - vmin); diff = 60.f / diff; @@ -1550,6 +1562,8 @@ namespace cv { namespace gpu { namespace color HLS2RGB<<>>(src.data, src.step, dst.data, dst.step, src.rows, src.cols, bidx); + cudaSafeCall( cudaGetLastError() ); + if (stream == 0) cudaSafeCall( cudaThreadSynchronize() ); } diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu index 4d20525..30b6e05 100644 --- a/modules/gpu/src/cuda/element_operations.cu +++ b/modules/gpu/src/cuda/element_operations.cu @@ -130,6 +130,7 @@ namespace cv { namespace gpu { namespace mathfunc divUp(rows, threads.y)); bitwiseUnOpKernel<<>>(rows, width, src, dst); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); @@ -161,6 +162,7 @@ namespace cv { namespace gpu { namespace mathfunc dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); bitwiseUnOpKernel<<>>(rows, cols, cn, src, mask, dst); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); @@ -251,6 +253,7 @@ namespace cv { namespace gpu { namespace mathfunc dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y)); bitwiseBinOpKernel<<>>(rows, width, src1, src2, dst); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); @@ -283,7 +286,8 @@ namespace cv { namespace gpu { namespace mathfunc dim3 threads(16, 16); dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - bitwiseBinOpKernel<<>>(rows, cols, cn, src1, src2, mask, dst); + bitwiseBinOpKernel<<>>(rows, cols, cn, src1, src2, mask, dst); + cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall(cudaThreadSynchronize()); @@ -384,29 +388,71 @@ namespace cv { namespace gpu { namespace mathfunc } }; - struct ScalarMinOp + template struct ScalarMinOp + { + T s; + + explicit ScalarMinOp(T s_) : s(s_) {} + + __device__ T operator()(T a) + { + return min(a, s); + } + }; + template <> struct ScalarMinOp + { + float s; + + explicit ScalarMinOp(float s_) : s(s_) {} + + __device__ float operator()(float a) + { + return fmin(a, s); + } + }; + template <> struct ScalarMinOp { double s; explicit ScalarMinOp(double s_) : s(s_) {} - template - __device__ T operator()(T a) + __device__ double operator()(double a) { - return saturate_cast(fmin((double)a, s)); + return fmin(a, s); } }; - struct ScalarMaxOp + template struct ScalarMaxOp + { + T s; + + explicit ScalarMaxOp(T s_) : s(s_) {} + + __device__ T operator()(T a) + { + return max(a, s); + } + }; + template <> struct ScalarMaxOp + { + float s; + + explicit ScalarMaxOp(float s_) : s(s_) {} + + __device__ float operator()(float a) + { + return fmax(a, s); + } + }; + template <> struct ScalarMaxOp { double s; explicit ScalarMaxOp(double s_) : s(s_) {} - template - __device__ T operator()(T a) + __device__ double operator()(double a) { - return saturate_cast(fmax((double)a, s)); + return fmax(a, s); } }; @@ -418,7 +464,7 @@ namespace cv { namespace gpu { namespace mathfunc } template void min_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template void min_gpu(const DevMem2D_& src1, const DevMem2D_& 
src2, const DevMem2D_& dst, cudaStream_t stream); template void min_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); @@ -433,7 +479,7 @@ namespace cv { namespace gpu { namespace mathfunc } template void max_gpu(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); @@ -441,122 +487,145 @@ namespace cv { namespace gpu { namespace mathfunc template void max_gpu(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, cudaStream_t stream); template - void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) + void min_gpu(const DevMem2D_& src1, T src2, const DevMem2D_& dst, cudaStream_t stream) { - ScalarMinOp op(src2); + ScalarMinOp op(src2); transform(src1, dst, op, stream); } - template void min_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, schar src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, ushort src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, short src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, int src2, const DevMem2D_& dst, cudaStream_t stream); + template void min_gpu(const DevMem2D_& src1, float src2, const DevMem2D_& dst, cudaStream_t stream); template void min_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); template - void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream) + void max_gpu(const DevMem2D_& src1, T src2, const DevMem2D_& dst, cudaStream_t stream) { - ScalarMaxOp op(src2); + ScalarMaxOp op(src2); transform(src1, dst, op, stream); } - template void max_gpu(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); - template void max_gpu(const DevMem2D_& src1, 
double src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, schar src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, ushort src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, short src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, int src2, const DevMem2D_& dst, cudaStream_t stream); + template void max_gpu(const DevMem2D_& src1, float src2, const DevMem2D_& dst, cudaStream_t stream); template void max_gpu(const DevMem2D_& src1, double src2, const DevMem2D_& dst, cudaStream_t stream); ////////////////////////////////////////////////////////////////////////// // threshold - class ThreshOp + template struct ThreshBinary { - public: - ThreshOp(float thresh_, float maxVal_) : thresh(thresh_), maxVal(maxVal_) {} + ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - protected: - float thresh; - float maxVal; + __device__ T operator()(const T& src) const + { + return src > thresh ? maxVal : 0; + } + + private: + T thresh; + T maxVal; }; - class ThreshBinary : public ThreshOp + template struct ThreshBinaryInv { - public: - ThreshBinary(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {} + ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {} - template __device__ T operator()(const T& src) const { - return (float)src > thresh ? saturate_cast(maxVal) : 0; + return src > thresh ? 0 : maxVal; } + + private: + T thresh; + T maxVal; }; - class ThreshBinaryInv : public ThreshOp + template struct ThreshTrunc { - public: - ThreshBinaryInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {} + ThreshTrunc(T thresh_, T) : thresh(thresh_) {} - template __device__ T operator()(const T& src) const { - return (float)src > thresh ? 0 : saturate_cast(maxVal); + return min(src, thresh); } + + private: + T thresh; }; + template <> struct ThreshTrunc + { + ThreshTrunc(float thresh_, float) : thresh(thresh_) {} + + __device__ float operator()(const float& src) const + { + return fmin(src, thresh); + } - class ThreshTrunc : public ThreshOp + private: + float thresh; + }; + template <> struct ThreshTrunc { - public: - ThreshTrunc(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {} + ThreshTrunc(double thresh_, double) : thresh(thresh_) {} - template - __device__ T operator()(const T& src) const + __device__ double operator()(const double& src) const { - return saturate_cast(fmin((float)src, thresh)); + return fmin(src, thresh); } + + private: + double thresh; }; - class ThreshToZero : public ThreshOp + template struct ThreshToZero { public: - ThreshToZero(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {} + ThreshToZero(T thresh_, T) : thresh(thresh_) {} - template __device__ T operator()(const T& src) const { - return (float)src > thresh ? src : 0; + return src > thresh ? src : 0; } + + private: + T thresh; }; - class ThreshToZeroInv : public ThreshOp + template struct ThreshToZeroInv { public: - ThreshToZeroInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {} + ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {} - template __device__ T operator()(const T& src) const { - return (float)src > thresh ? 0 : src; + return src > thresh ? 
0 : src; } + + private: + T thresh; }; - template - void threshold_caller(const DevMem2D_& src, const DevMem2D_& dst, float thresh, float maxVal, + template