From 0f53f2993e8fba18a93d9a80be85a44e7c756553 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Mon, 14 Nov 2011 09:02:06 +0000
Subject: [PATCH] removed BEGIN_OPENCV_DEVICE_NAMESPACE macros

---
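Note: every file below gets the same mechanical rewrite: the device-side
namespace macros are replaced by the nested namespaces they expanded to, and
qualified uses of OPENCV_DEVICE_NAMESPACE_ become explicit ::cv::gpu::device::
paths. A minimal before/after sketch (the macro bodies shown here are inferred
from how this patch rewrites their call sites, not quoted from
internal_shared.hpp):

    // Before: declarations hidden behind the namespace macros.
    #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
    #define END_OPENCV_DEVICE_NAMESPACE   }}}
    #define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::

    BEGIN_OPENCV_DEVICE_NAMESPACE
    namespace mathfunc
    {
        void cartToPolar_gpu(/* ... */);
    }
    END_OPENCV_DEVICE_NAMESPACE

    // After: the same scope spelled out, so readers and tools no longer need
    // the macro definitions to see which namespace a declaration lives in.
    namespace cv { namespace gpu { namespace device
    {
        namespace mathfunc
        {
            void cartToPolar_gpu(/* ... */);
        }
    }}}

    // Call sites change from
    //     using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
    // to
    using namespace ::cv::gpu::device::mathfunc;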
 modules/gpu/src/arithm.cpp | 19 +-
 modules/gpu/src/bilateral_filter.cpp | 19 +-
 modules/gpu/src/blend.cpp | 19 +-
 modules/gpu/src/brute_force_matcher.cpp | 153 +-
 modules/gpu/src/calib3d.cpp | 39 +-
 modules/gpu/src/color.cpp | 275 +-
 modules/gpu/src/cuda/bf_knnmatch.cu | 1816 +++++-----
 modules/gpu/src/cuda/bf_match.cu | 1190 ++++---
 modules/gpu/src/cuda/bf_radius_match.cu | 712 ++--
 modules/gpu/src/cuda/bilateral_filter.cu | 308 +-
 modules/gpu/src/cuda/blend.cu | 124 +-
 modules/gpu/src/cuda/calib3d.cu | 243 +-
 modules/gpu/src/cuda/canny.cu | 690 ++--
 modules/gpu/src/cuda/color.cu | 621 ++--
 modules/gpu/src/cuda/column_filter.cu | 352 +-
 modules/gpu/src/cuda/copy_make_border.cu | 162 +-
 modules/gpu/src/cuda/element_operations.cu | 3642 ++++++++++----------
 modules/gpu/src/cuda/hist.cu | 284 +-
 modules/gpu/src/cuda/hog.cu | 1274 ++++---
 modules/gpu/src/cuda/imgproc.cu | 1568 +++++----
 modules/gpu/src/cuda/internal_shared.hpp | 111 +-
 modules/gpu/src/cuda/match_template.cu | 1536 +++++----
 modules/gpu/src/cuda/mathfunc.cu | 294 +-
 modules/gpu/src/cuda/matrix_operations.cu | 495 ++-
 modules/gpu/src/cuda/matrix_reductions.cu | 3408 +++++++++---------
 modules/gpu/src/cuda/pyr_down.cu | 260 +-
 modules/gpu/src/cuda/pyr_up.cu | 200 +-
 modules/gpu/src/cuda/remap.cu | 398 ++-
 modules/gpu/src/cuda/resize.cu | 428 ++-
 modules/gpu/src/cuda/row_filter.cu | 388 ++-
 modules/gpu/src/cuda/safe_call.hpp | 63 +-
 modules/gpu/src/cuda/split_merge.cu | 922 +++--
 modules/gpu/src/cuda/stereobm.cu | 800 +++--
 modules/gpu/src/cuda/stereobp.cu | 782 +++--
 modules/gpu/src/cuda/stereocsbp.cu | 1334 ++++---
 modules/gpu/src/cuda/surf.cu | 1570 +++++----
 modules/gpu/src/cudastream.cpp | 21 +-
 modules/gpu/src/element_operations.cpp | 250 +-
 modules/gpu/src/filtering.cpp | 29 +-
 modules/gpu/src/hog.cpp | 71 +-
 modules/gpu/src/imgproc.cpp | 341 +-
 modules/gpu/src/initialization.cpp | 29 +-
 modules/gpu/src/match_template.cpp | 163 +-
 modules/gpu/src/matrix_reductions.cpp | 162 +-
 .../src/opencv2/gpu/device/border_interpolate.hpp | 1141 +++---
 modules/gpu/src/opencv2/gpu/device/color.hpp | 345 +-
 .../gpu/src/opencv2/gpu/device/datamov_utils.hpp | 46 +-
 .../src/opencv2/gpu/device/detail/color_detail.hpp | 1787 +++++-----
 .../opencv2/gpu/device/detail/transform_detail.hpp | 577 ++--
 .../gpu/device/detail/type_traits_detail.hpp | 257 +-
 .../opencv2/gpu/device/detail/utility_detail.hpp | 1255 ++++---
 .../gpu/device/detail/vec_distance_detail.hpp | 103 +-
 modules/gpu/src/opencv2/gpu/device/emulation.hpp | 33 +-
 modules/gpu/src/opencv2/gpu/device/filters.hpp | 131 +-
 modules/gpu/src/opencv2/gpu/device/funcattrib.hpp | 39 +-
 modules/gpu/src/opencv2/gpu/device/functional.hpp | 742 ++--
 modules/gpu/src/opencv2/gpu/device/limits.hpp | 370 +-
 .../gpu/src/opencv2/gpu/device/saturate_cast.hpp | 319 +-
 .../gpu/src/opencv2/gpu/device/static_check.hpp | 29 +-
 modules/gpu/src/opencv2/gpu/device/transform.hpp | 45 +-
 modules/gpu/src/opencv2/gpu/device/type_traits.hpp | 55 +-
 modules/gpu/src/opencv2/gpu/device/utility.hpp | 255 +-
 .../gpu/src/opencv2/gpu/device/vec_distance.hpp | 263 +-
 modules/gpu/src/opencv2/gpu/device/vec_math.hpp | 295 +-
 modules/gpu/src/opencv2/gpu/device/vec_traits.hpp | 319 +-
 modules/gpu/src/opencv2/gpu/device/warp.hpp | 111 +-
 modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp | 39 +-
 modules/gpu/src/split_merge.cpp | 21 +-
 modules/gpu/src/stereobm.cpp | 19 +-
 modules/gpu/src/stereobp.cpp | 41 +-
 modules/gpu/src/stereocsbp.cpp | 63 +-
 modules/gpu/src/surf.cpp | 41 +-
 modules/gpu/test/test_video.cpp | 2 +-
 73 files changed, 18038 insertions(+), 18270 deletions(-)

diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index a47d222..1f40156 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -425,21 +425,20 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 ////////////////////////////////////////////////////////////////////////
 // Polar <-> Cart
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace mathfunc
+namespace cv { namespace gpu { namespace device
 {
-    void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
-    void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
-}
-
-END_OPENCV_DEVICE_NAMESPACE
+    namespace mathfunc
+    {
+        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
+        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
+    }
+}}}
 
 namespace
 {
     inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
     {
-        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
+        using namespace ::cv::gpu::device::mathfunc;
 
         CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
         CV_Assert(x.depth() == CV_32F);
@@ -459,7 +458,7 @@ namespace
 
     inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
     {
-        using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
+        using namespace ::cv::gpu::device::mathfunc;
 
         CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
         CV_Assert(mag.depth() == CV_32F);
diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp
index 12c159a..d24adee 100644
--- a/modules/gpu/src/bilateral_filter.cpp
+++ b/modules/gpu/src/bilateral_filter.cpp
@@ -55,19 +55,18 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
 
 #else /* !defined (HAVE_CUDA) */
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bilateral_filter
+namespace cv { namespace gpu { namespace device
 {
-    void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
-
-    void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
-    void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
-}
+    namespace bilateral_filter
+    {
+        void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
 
-END_OPENCV_DEVICE_NAMESPACE
+        void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
+        void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
+    }
+}}}
 
-using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
+using namespace ::cv::gpu::device::bilateral_filter;
 
 namespace
 {
diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp
index 4c4afc5..7c2a86e 100644
--- a/modules/gpu/src/blend.cpp
+++ b/modules/gpu/src/blend.cpp
@@ -52,19 +52,18 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
 
 #else
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace blend
+namespace cv { namespace gpu { namespace device
 {
-    template <typename T>
-    void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
-
-    void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
-}
+    namespace blend
+    {
+        template <typename T>
+        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
 
-END_OPENCV_DEVICE_NAMESPACE
+        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
+    }
+}}}
 
-using namespace OPENCV_DEVICE_NAMESPACE_ blend;
+using namespace ::cv::gpu::device::blend;
 
 void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
                           GpuMat& result, Stream& stream)
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 1d93146..7f11282 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -82,80 +82,79 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
 
 #else /* !defined (HAVE_CUDA) */
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bf_match
-{
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
-        int cc, cudaStream_t stream);
-}
 
-namespace bf_knnmatch
+namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
-        const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
-        const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
-        int cc, cudaStream_t stream);
-}
+    namespace bf_match
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+            int cc, cudaStream_t stream);
+    }
 
-namespace bf_radius_match
-{
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
-        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-
-    template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
-        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
-        int cc, cudaStream_t stream);
-}
+    namespace bf_knnmatch
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
+            const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+        template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
+            const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+            int cc, cudaStream_t stream);
+    }
 
-END_OPENCV_DEVICE_NAMESPACE
+    namespace bf_radius_match
+    {
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+            int cc, cudaStream_t stream);
+    }
+}}}
 
 ////////////////////////////////////////////////////////////////////
 // Train collection
@@ -199,7 +198,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
                              const DevMem2Di& trainIdx, const DevMem2Df& distance,
@@ -341,7 +340,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
+    using namespace ::cv::gpu::device::bf_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                              const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
@@ -452,7 +451,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
                              const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
@@ -581,7 +580,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
     if (query.empty() || trainCollection.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
+    using namespace ::cv::gpu::device::bf_knnmatch;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
                              const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
@@ -762,7 +761,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
     if (query.empty() || train.empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
+    using namespace ::cv::gpu::device::bf_radius_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
                              const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
@@ -893,7 +892,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
     if (query.empty() || empty())
         return;
 
-    using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
+    using namespace ::cv::gpu::device::bf_radius_match;
 
     typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
                              const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp
index 8e6e838..bc522f3 100644
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -56,31 +56,30 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
 
 #else
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace transform_points
-{
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
-}
-
-namespace project_points
+namespace cv { namespace gpu { namespace device
 {
-    void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
-}
+    namespace transform_points
+    {
+        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
+    }
 
-namespace solve_pnp_ransac
-{
-    int maxNumIters();
+    namespace project_points
+    {
+        void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
+    }
 
-    void computeHypothesisScores(
-        const int num_hypotheses, const int num_points, const float* rot_matrices,
-        const float3* transl_vectors, const float3* object, const float2* image,
-        const float dist_threshold, int* hypothesis_scores);
-}
+    namespace solve_pnp_ransac
+    {
+        int maxNumIters();
 
-END_OPENCV_DEVICE_NAMESPACE
+        void computeHypothesisScores(
+            const int num_hypotheses, const int num_points, const float* rot_matrices,
+            const float3* transl_vectors, const float3* object, const float2* image,
+            const float dist_threshold, int* hypothesis_scores);
+    }
+}}}
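The host-side .cpp changes above all reduce to one pattern: the .cpp file
forward-declares the launcher that is defined in the matching .cu file, then
calls it through the explicit namespace. A self-contained sketch (plain
pointers stand in for DevMem2D_<float3> so the snippet compiles on its own;
the names mirror the calib3d declarations above but are illustrative):

    #include <cuda_runtime.h>

    namespace cv { namespace gpu { namespace device
    {
        namespace transform_points
        {
            // Defined in the corresponding .cu translation unit; the host
            // side only declares it and links against the compiled kernel.
            void call(const float* src, const float* rot, const float* transl, float* dst, int count, cudaStream_t stream);
        }
    }}}

    static void transformPointsCaller(const float* src, const float* rot, const float* transl, float* dst, int count, cudaStream_t stream)
    {
        // After this patch the qualification is spelled out instead of going
        // through OPENCV_DEVICE_NAMESPACE_.
        ::cv::gpu::device::transform_points::call(src, rot, transl, dst, count, stream);
    }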
-using namespace OPENCV_DEVICE_NAMESPACE; +using namespace ::cv::gpu::device; namespace { diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index c4f8b60..d52d797 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -51,8 +51,8 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu( #else /* !defined (HAVE_CUDA) */ -BEGIN_OPENCV_DEVICE_NAMESPACE - +namespace cv { namespace gpu { namespace device +{ #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \ void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); @@ -67,142 +67,141 @@ BEGIN_OPENCV_DEVICE_NAMESPACE OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \ OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) - -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) 
-OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) - -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) - -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) -OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) - -#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE -#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL -#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F - -END_OPENCV_DEVICE_NAMESPACE - -using namespace OPENCV_DEVICE_NAMESPACE; + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) + 
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) + + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) + + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) + 
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) + + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) + OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) + + #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE + #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL + #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F +}}} + +using namespace ::cv::gpu::device; namespace { diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu index c8b1171..f59cef0 100644 --- a/modules/gpu/src/cuda/bf_knnmatch.cu +++ b/modules/gpu/src/cuda/bf_knnmatch.cu @@ -45,1117 +45,1115 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bf_knnmatch { - -/////////////////////////////////////////////////////////////////////////////// -// Reduction - -template -__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - float* s_distance, int* s_trainIdx) +namespace cv { namespace gpu { namespace device { - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + namespace bf_knnmatch + { + /////////////////////////////////////////////////////////////////////////////// + // Reduction - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + float* s_distance, int* s_trainIdx) + { + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - s_distance[threadIdx.x] = bestDistance1; - s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; - __syncthreads(); + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance1) + if (threadIdx.x == 0) { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - - myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; 
+ } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } } - else if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - } - } - } - - __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; + __syncthreads(); - __syncthreads(); + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + } + } } - } - } - - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; -} + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; -template -__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2, - float* s_distance, int* s_trainIdx, int* s_imgIdx) -{ - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; + } - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + template + __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2, + float* s_distance, int* s_trainIdx, int* s_imgIdx) + { + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - s_distance[threadIdx.x] = bestDistance1; - s_trainIdx[threadIdx.x] = bestTrainIdx1; - s_imgIdx[threadIdx.x] = bestImgIdx1; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - __syncthreads(); + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_imgIdx[threadIdx.x] = bestImgIdx1; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; - - if (val < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestImgIdx2 = myBestImgIdx1; + __syncthreads(); - myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; - myBestImgIdx1 = s_imgIdx[i]; - } - else if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestImgIdx2 = myBestImgIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; + myBestImgIdx1 = s_imgIdx[i]; + } + else if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } 
+ } } - } - } - - __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; - s_imgIdx[threadIdx.x] = bestImgIdx2; + __syncthreads(); - __syncthreads(); + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; + s_imgIdx[threadIdx.x] = bestImgIdx2; - if (threadIdx.x == 0) - { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + __syncthreads(); - if (val < myBestDistance2) + if (threadIdx.x == 0) { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) + { + float val = s_distance[i]; + + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; + } + } } - } - } - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; - bestImgIdx1 = myBestImgIdx1; - bestImgIdx2 = myBestImgIdx2; -} + bestImgIdx1 = myBestImgIdx1; + bestImgIdx2 = myBestImgIdx2; + } -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled Cached + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled Cached -template -__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) -{ - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; - } -} - -template -__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; - - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + template + __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < train.cols) + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { - T val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; } - - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - - __syncthreads(); } - typename Dist::result_type distVal = dist; - - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < train.cols) + { + T val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } } - } -} - -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; - - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); - - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - - __syncthreads(); + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + const int queryIdx = blockIdx.x * BLOCK_SIZE + 
threadIdx.y; - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + loadQueryToSmem(queryIdx, query, s_query); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + __syncthreads(); -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - loadQueryToSmem(queryIdx, query, s_query); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? 
MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - Mask m = mask; + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } - - __syncthreads(); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + loadQueryToSmem(queryIdx, query, s_query); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + Mask m = mask; -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } -template -__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + __syncthreads(); - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (loadX < query.cols) + if (queryIdx < query.rows && threadIdx.x == 0) { - T val; - - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } + } - __syncthreads(); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - __syncthreads(); - } + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - typename Dist::result_type distVal = dist; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } } } - } -} -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float 
myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int 
myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -/////////////////////////////////////////////////////////////////////////////// -// Match + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - 
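
The kernels in the hunks above all share one bookkeeping idiom: for k = 2 matching, each thread tracks its best and second-best candidate in registers instead of sorting. A minimal self-contained sketch of that update rule (the function name and parameter grouping are illustrative, not the library's API):

    // Sketch: maintain the two smallest distances seen so far, per thread.
    __device__ __forceinline__ void updateTop2(float distVal, int trainIdx, int imgIdx,
                                               float& bestDistance1, float& bestDistance2,
                                               int& bestTrainIdx1, int& bestTrainIdx2,
                                               int& bestImgIdx1, int& bestImgIdx2)
    {
        if (distVal < bestDistance1)
        {
            // The old best is demoted to second best before being replaced.
            bestDistance2 = bestDistance1; bestTrainIdx2 = bestTrainIdx1; bestImgIdx2 = bestImgIdx1;
            bestDistance1 = distVal;       bestTrainIdx1 = trainIdx;      bestImgIdx1 = imgIdx;
        }
        else if (distVal < bestDistance2)
        {
            bestDistance2 = distVal; bestTrainIdx2 = trainIdx; bestImgIdx2 = imgIdx;
        }
    }

Initializing both distances to FLT_MAX and both indices to -1, as the kernels do, lets the first two valid candidates slot in without special cases.
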
int& bestImgIdx1, int& bestImgIdx2) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + /////////////////////////////////////////////////////////////////////////////// + // Match - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + template + __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < query.cols) + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - T val; + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } + + typename Dist::result_type distVal = dist; + + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) + { + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; + } + } + } + } - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + template + __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - __syncthreads(); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); __syncthreads(); - } - typename Dist::result_type distVal = dist; + float* s_distance = 
(float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) - { - if (distVal < bestDistance1) + if (queryIdx < query.rows && threadIdx.x == 0) { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) - { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } } - } -} -template -__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + template + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) + { + extern __shared__ int smem[]; - __syncthreads(); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + Mask m = mask; - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + for (int imgIdx = 0; 
imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + __syncthreads(); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); -template -__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) -{ - extern __shared__ int smem[]; + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); + } + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - Mask m = mask; + match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - __syncthreads(); + /////////////////////////////////////////////////////////////////////////////// + // knnMatch 2 dispatcher - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, 
Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + } - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + template + void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + /////////////////////////////////////////////////////////////////////////////// + // Calc distance kernel - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + __global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) + { + extern __shared__ int smem[]; - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); -/////////////////////////////////////////////////////////////////////////////// -// knnMatch 2 dispatcher + Dist dist; 
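
match2Dispatcher above selects a kernel instantiation by descriptor width: descriptors of at most 64 or 128 columns get a tile loop fully unrolled at compile time, while wider ones (the 256/512/1024 cases are commented out) fall back to the kernel with a runtime loop bound. A compilable sketch of the pattern, with hypothetical kernel names:

    // Hypothetical kernels: the unrolled variant bakes the maximum descriptor
    // length into the type, so MAX_DESC_LEN / BLOCK_SIZE iterations unroll fully.
    template <int BLOCK_SIZE, int MAX_DESC_LEN>
    __global__ void runUnrolled(int cols) { /* #pragma unroll tile loop */ }

    template <int BLOCK_SIZE>
    __global__ void runGeneric(int cols) { /* runtime-bounded tile loop */ }

    void dispatchByDescriptorWidth(int cols)
    {
        if (cols <= 64)
            runUnrolled<16, 64><<<1, 16 * 16>>>(cols);
        else if (cols <= 128)
            runUnrolled<16, 128><<<1, 16 * 16>>>(cols);
        else
            runGeneric<16><<<1, 16 * 16>>>(cols);
    }

The cost is one compiled kernel per supported width, which is presumably why the larger instantiations stay commented out until needed.
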
-template -void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } -} - -template -void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } -} + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } -/////////////////////////////////////////////////////////////////////////////// -// Calc distance kernel + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = 
numeric_limits::max(); -template -__global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) -{ - extern __shared__ int smem[]; + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + allDist.ptr(queryIdx)[trainIdx] = distVal; + } + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + template + void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - Dist dist; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + calcDistanceUnrolled<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - if (loadX < query.cols) - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - - __syncthreads(); - } + template + __global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) + { + extern __shared__ int smem[]; - if (queryIdx < query.rows && trainIdx < train.rows) - { - float distVal = numeric_limits::max(); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - allDist.ptr(queryIdx)[trainIdx] = distVal; - } -} + Dist dist; -template -void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + 
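
For k > 2 the calcDistance kernels above take a different route: they materialize the whole query-by-train distance matrix, writing FLT_MAX for masked-out pairs so the later selection passes skip them naturally. A reduced one-thread-per-pair sketch (row-major layout, squared L2, and a 0/1 byte mask are assumptions of this sketch, not the library's types):

    #include <cfloat>

    __global__ void calcDistanceSketch(const float* query, const float* train,
                                       const unsigned char* mask, float* allDist,
                                       int queryRows, int trainRows, int cols)
    {
        const int queryIdx = blockIdx.y * blockDim.y + threadIdx.y;
        const int trainIdx = blockIdx.x * blockDim.x + threadIdx.x;

        if (queryIdx < queryRows && trainIdx < trainRows)
        {
            float distVal = FLT_MAX;               // masked pairs stay "infinitely far"

            if (mask == 0 || mask[queryIdx * trainRows + trainIdx])
            {
                float sum = 0.f;                   // squared L2, for illustration
                for (int j = 0; j < cols; ++j)
                {
                    const float d = query[queryIdx * cols + j] - train[trainIdx * cols + j];
                    sum += d * d;
                }
                distVal = sum;
            }

            allDist[queryIdx * trainRows + trainIdx] = distVal;
        }
    }
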
__syncthreads(); + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + if (queryIdx < query.rows && trainIdx < train.rows) + { + float distVal = numeric_limits::max(); - calcDistanceUnrolled<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + allDist.ptr(queryIdx)[trainIdx] = distVal; + } + } -template -__global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) -{ - extern __shared__ int smem[]; + template + void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + calcDistance<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - Dist dist; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + /////////////////////////////////////////////////////////////////////////////// + // Calc Distance dispatcher - if (loadX < query.cols) + template + void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (query.cols <= 64) + { + calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 128) + { + calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); + } + /*else if (query.cols <= 256) + { + calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 512) + { + calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 1024) + { + calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); + }*/ + else + { + calcDistance<16, Dist>(query, train, mask, allDist, stream); + } } - __syncthreads(); - - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + /////////////////////////////////////////////////////////////////////////////// + // find knn match kernel - __syncthreads(); - } + template + __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) + { + const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; + __shared__ float s_dist[SMEM_SIZE]; + __shared__ int s_trainIdx[SMEM_SIZE]; - if (queryIdx < query.rows && trainIdx < train.rows) - { - float distVal = numeric_limits::max(); + const int queryIdx = blockIdx.x; - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + float* allDistRow = allDist.ptr(queryIdx); - allDist.ptr(queryIdx)[trainIdx] = distVal; - } -} + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) + { + float reg = allDistRow[i]; + if (reg < dist) + { + dist = reg; + bestIdx = i; + } + } -template -void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + s_dist[threadIdx.x] = dist; + s_trainIdx[threadIdx.x] = bestIdx; + __syncthreads(); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); - calcDistance<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (threadIdx.x == 0) + { + if (dist < numeric_limits::max()) + { + allDistRow[bestIdx] = numeric_limits::max(); + trainIdx.ptr(queryIdx)[i] = bestIdx; + distance.ptr(queryIdx)[i] = dist; + } + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + template + void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, 1, 1); + const dim3 grid(trainIdx.rows, 1, 1); -/////////////////////////////////////////////////////////////////////////////// -// Calc Distance dispatcher + for (int i = 0; i < k; ++i) + { + findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); + } -template -void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 128) - { - calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); - } - /*else if (query.cols <= 256) - { - calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 512) - { - calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 1024) - { - calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); - }*/ - else - { - calcDistance<16, Dist>(query, train, mask, allDist, stream); - } -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -/////////////////////////////////////////////////////////////////////////////// -// find knn match kernel + void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) + { + findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); + } -template -__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) -{ - const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; - __shared__ float s_dist[SMEM_SIZE]; - __shared__ int s_trainIdx[SMEM_SIZE]; + /////////////////////////////////////////////////////////////////////////////// + // knn match Dispatcher - const int queryIdx = blockIdx.x; + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) + { + if (k == 2) + { + match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); + } + else + { + calcDistanceDispatcher(query, train, mask, allDist, cc, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + } + } - float* allDistRow = allDist.ptr(queryIdx); + /////////////////////////////////////////////////////////////////////////////// + // knn match caller - float dist = numeric_limits::max(); - int bestIdx = -1; - - for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) - { - float reg = allDistRow[i]; - if (reg < dist) + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - dist = reg; - bestIdx = i; + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - } - s_dist[threadIdx.x] = dist; - s_trainIdx[threadIdx.x] = bestIdx; - __syncthreads(); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); - - if (threadIdx.x == 0) - { - if (dist < numeric_limits::max()) + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) { - allDistRow[bestIdx] = numeric_limits::max(); - trainIdx.ptr(queryIdx)[i] = bestIdx; - distance.ptr(queryIdx)[i] = dist; + 
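
The findBestMatch kernel above is launched k times from findKnnMatch: each pass finds the minimum of a query's allDist row, records it in output column i, then overwrites the winning cell with FLT_MAX so the next pass yields the next-best neighbour. A sketch of one pass, with the block reduction collapsed to a serial scan for brevity (names are illustrative):

    #include <cfloat>

    // One block per query row; pass i of k.
    __global__ void selectPassSketch(float* allDist, int cols, int k,
                                     int* trainIdxOut, float* distOut, int i)
    {
        float* row = allDist + blockIdx.x * cols;

        if (threadIdx.x == 0)   // serial scan stands in for the shared-memory reduction
        {
            float best = FLT_MAX;
            int bestIdx = -1;

            for (int c = 0; c < cols; ++c)
                if (row[c] < best) { best = row[c]; bestIdx = c; }

            if (bestIdx >= 0 && best < FLT_MAX)
            {
                row[bestIdx] = FLT_MAX;                 // exclude from later passes
                trainIdxOut[blockIdx.x * k + i] = bestIdx;
                distOut[blockIdx.x * k + i]     = best;
            }
        }
    }

    // Host side, as in findKnnMatch above:
    // for (int i = 0; i < k; ++i)
    //     selectPassSketch<<<numQueries, 256>>>(allDist, cols, k, trainIdx, distance, i);
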
if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } - } -} - -template -void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, 1, 1); - const dim3 grid(trainIdx.rows, 1, 1); - - for (int i = 0; i < k; ++i) - { - findBestMatch<<>>(allDist, i, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); - } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) -{ - findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); -} + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) + { + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// knn match Dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int 
k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (k == 2) - { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); - } - else - { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); - } -} + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// knn match caller + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, 
distance, allDist, cc, stream); -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) -{ - if (mask.data) - 
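
The long template-instantiation lists above (with unused element types commented out) are not decoration: the kernel templates are defined only in this .cu file compiled by nvcc, so every element type reachable from the host-side API must be instantiated explicitly here or the linker has nothing to resolve. The mechanism in miniature:

    // In a header visible to callers: declaration only.
    template <typename T> void matchSketch_gpu(const T* query, const T* train, int n);

    // In the .cu file: the definition...
    template <typename T> void matchSketch_gpu(const T* query, const T* train, int n)
    {
        // ... configure and launch the kernels for element type T ...
    }

    // ...plus one explicit instantiation per supported type.
    template void matchSketch_gpu<unsigned char>(const unsigned char*, const unsigned char*, int);
    template void matchSketch_gpu<unsigned short>(const unsigned short*, const unsigned short*, int);
    template void matchSketch_gpu<float>(const float*, const float*, int);
    // template void matchSketch_gpu<signed char>(const signed char*, const signed char*, int); // unused: omitted to cut compile time and binary size
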
matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} - -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - -template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - 
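
Each caller above branches once on mask.data and then commits to a mask type (SingleMask, MaskCollection, or WithOutMask) as a template argument, so the unmasked kernels carry no per-pair branch at all. The idea, with simplified stand-in types rather than the library's own:

    // Compile-time mask polymorphism: the mask is a functor type, so the
    // no-mask case inlines to a constant `true` and vanishes entirely.
    struct WithOutMaskSketch
    {
        __device__ __forceinline__ bool operator()(int, int) const { return true; }
    };

    struct SingleMaskSketch
    {
        const unsigned char* data;
        int step;   // row stride in elements

        __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
        {
            return data[queryIdx * step + trainIdx] != 0;
        }
    };

    template <typename Mask>
    __global__ void kernelSketch(Mask mask, int rows, int cols, int* out)
    {
        const int q = blockIdx.y * blockDim.y + threadIdx.y;
        const int t = blockIdx.x * blockDim.x + threadIdx.x;

        if (q < rows && t < cols && mask(q, t))
            out[q * cols + t] = 1;
    }
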
match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} - -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); -} + template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const 
DevMem2Db& distance, int cc, cudaStream_t stream); -template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -} // namespace bf_knnmatch + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); + else + match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + } -END_OPENCV_DEVICE_NAMESPACE + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); + } // namespace bf_knnmatch +}}} // namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu index 0ab56be..7d6d62b 100644 --- a/modules/gpu/src/cuda/bf_match.cu +++ b/modules/gpu/src/cuda/bf_match.cu @@ -45,736 +45,734 @@ 
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE +namespace cv { namespace gpu { namespace device +{ + namespace bf_match + { + /////////////////////////////////////////////////////////////////////////////// + // Reduction -namespace bf_match { + template + __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) + { + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; -/////////////////////////////////////////////////////////////////////////////// -// Reduction + s_distance[threadIdx.x] = bestDistance; + s_trainIdx[threadIdx.x] = bestTrainIdx; -template -__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx) -{ - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; + __syncthreads(); - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; + reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); + } - __syncthreads(); + template + __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) + { + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); -} + s_distance[threadIdx.x] = bestDistance; + s_trainIdx[threadIdx.x] = bestTrainIdx; + s_imgIdx [threadIdx.x] = bestImgIdx; -template -__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) -{ - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + __syncthreads(); - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; + reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); + } - __syncthreads(); + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled Cached - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); -} + template + __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) + { + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled Cached + template + __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; -template -__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) -{ - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; - } -} - -template -__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < train.cols) + { + T val; - if (loadX < train.cols) - { - T val; + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + typename Dist::result_type distVal = dist; - typename Dist::result_type distVal = dist; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } + } + } - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; - } - } -} + extern __shared__ int smem[]; -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + loadQueryToSmem(queryIdx, query, s_query); - loadQueryToSmem(queryIdx, query, s_query); + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + 
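One subtlety in the single-image kernel just above: loopUnrolledCached takes separate bestTrainIdx and bestImgIdx out-parameters, and the caller passes myBestTrainIdx for both. That is benign only because the update writes the image index before the train index, so when the two references alias, the train index wins. A reduced sketch; updateBest is a hypothetical name, but the store order mirrors the code here.

    // Sketch: why passing one lvalue for both out-params is safe in the 1-image case.
    __device__ __forceinline__ void updateBest(float d, int t, int img,
                                               float& bestDist, int& bestTrain, int& bestImg)
    {
        bestImg   = img;  // written first; a dead store when bestImg aliases bestTrain
        bestDist  = d;
        bestTrain = t;    // final value observed by the caller
    }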
__syncthreads(); - __syncthreads(); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, 
myBestImgIdx); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? 
MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + template + __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - if (loadX < query.cols) - { - T val; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + + if (loadX < query.cols) + { + T val; + + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } + + __syncthreads(); + + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); + } - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + typename Dist::result_type distVal = dist; - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } } + } - __syncthreads(); + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) + { + extern __shared__ int smem[]; - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE 
* BLOCK_SIZE); + + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); - } - typename Dist::result_type distVal = dist; + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } + } + + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + __syncthreads(); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 
grid(divUp(query.rows, BLOCK_SIZE)); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); -template -__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + /////////////////////////////////////////////////////////////////////////////// + // Match - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + template + __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) + { + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - Mask m = mask; - - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - __syncthreads(); + if (loadX < query.cols) + { + T val; - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 
block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + __syncthreads(); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + __syncthreads(); + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + typename Dist::result_type distVal = dist; -/////////////////////////////////////////////////////////////////////////////// -// Match + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; -template -__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) -{ - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; + } + } + } - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + template + __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + extern __shared__ int smem[]; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - if (loadX < query.cols) - { - T val; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - - ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } } - typename Dist::result_type distVal = dist; + template + void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (queryIdx < query.rows && trainIdx < train.rows 
&& distVal < bestDistance && mask(queryIdx, trainIdx)) - { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + template + __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) + { + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + Mask m = mask; + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + __syncthreads(); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); -template -__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) -{ - extern __shared__ int smem[]; + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + match<<>>(query, trains, n, mask, 
trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match dispatcher - Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); + }*/ + else + { + match<16, Dist>(query, train, mask, trainIdx, distance, stream); + } + } - __syncthreads(); + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + } - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match caller - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } - if (queryIdx < 
query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); 
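Every launcher in this file sizes its dynamic shared memory from the same two formulas, so one worked example covers them all: with the dispatcher's BLOCK_SIZE of 16 and MAX_DESC_LEN of 64, the cached single-image variant requests (16*64 + 16*16) * sizeof(int) = 5120 bytes per block, while the plain variants request two (2048 bytes) or three (3072 bytes) square tiles. A standalone check of that arithmetic; the constants come from this file, the program itself is illustrative.

    #include <cstdio>

    int main()
    {
        const size_t BLOCK_SIZE   = 16;  // the dispatcher hardcodes 16
        const size_t MAX_DESC_LEN = 64;  // chosen when query.cols <= 64

        // matchUnrolledCached: cached query rows plus one train tile, int-sized slots
        const size_t cached = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE)
                               + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
        // match / matchUnrolled, single image: query tile + train tile
        const size_t plain2 = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
        // multi-image variants add a third tile for the reduction's imgIdx slots
        const size_t plain3 = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        std::printf("%zu %zu %zu\n", cached, plain2, plain3);  // prints: 5120 2048 3072
        return 0;
    }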
-} + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, train, mask, trainIdx, distance, stream); - } -} - -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } -} + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const 
DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -/////////////////////////////////////////////////////////////////////////////// -// Match caller + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, 
cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, 
MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const 
DevMem2Df& distance, - int cc, cudaStream_t stream) -{ - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } -} + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -} // namespace bf_match + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const 
DevMem2Df& distance, + int cc, cudaStream_t stream) + { + if (masks.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } + } -END_OPENCV_DEVICE_NAMESPACE + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + } // namespace bf_match +}}} // namespace cv { namespace gpu { namespace device { diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu index 519ed7f..39b721a 100644 --- a/modules/gpu/src/cuda/bf_radius_match.cu +++ b/modules/gpu/src/cuda/bf_radius_match.cu @@ -45,423 +45,421 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bf_radius_match { - -/////////////////////////////////////////////////////////////////////////////// -// Match Unrolled - -template -__global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) +namespace cv { namespace gpu { namespace device { - #if __CUDA_ARCH__ >= 110 - - extern __shared__ int smem[]; - - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + namespace bf_radius_match + { + /////////////////////////////////////////////////////////////////////////////// + // Match Unrolled - Dist dist; + template + __global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) + { + #if __CUDA_ARCH__ >= 110 - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + extern __shared__ int smem[]; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = 
blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (loadX < query.cols) - { - T val; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + Dist dist; - ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - __syncthreads(); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + if (loadX < query.cols) + { + T val; - __syncthreads(); - } + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - float distVal = (typename Dist::result_type)dist; + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) - { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; - } - } + __syncthreads(); - #endif -} + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + __syncthreads(); + } - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + float distVal = (typename Dist::result_type)dist; - matchUnrolled<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; + } + } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + #endif + } -template -void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t 
stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - for (int i = 0; i < n; ++i) - { - const DevMem2D_ train = trains[i]; + matchUnrolled<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - if (masks != 0 && masks[i].data) - { - matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - else + template + void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) { - matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + for (int i = 0; i < n; ++i) + { + const DevMem2D_ train = trains[i]; + + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + if (masks != 0 && masks[i].data) + { + matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + else + { + matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - cudaSafeCall( cudaGetLastError() ); - } - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + /////////////////////////////////////////////////////////////////////////////// + // Match -/////////////////////////////////////////////////////////////////////////////// -// Match + template + __global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) + { + #if __CUDA_ARCH__ >= 110 -template -__global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) -{ - #if __CUDA_ARCH__ >= 110 + extern __shared__ int smem[]; - extern __shared__ int smem[]; + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + Dist dist; - Dist dist; + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = 
threadIdx.x + i * BLOCK_SIZE; - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < query.cols) + { + T val; - if (loadX < query.cols) - { - T val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + float distVal = (typename Dist::result_type)dist; - float distVal = (typename Dist::result_type)dist; + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; + } + } - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) - { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; + #endif } - } - #endif -} + template + void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); -template -void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + match<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - match<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } - if (stream == 0) - cudaSafeCall( 
cudaDeviceSynchronize() ); -} + template + void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) + { + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + + for (int i = 0; i < n; ++i) + { + const DevMem2D_ train = trains[i]; + + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + + if (masks != 0 && masks[i].data) + { + match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + else + { + match<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); + } + + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) -{ - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + /////////////////////////////////////////////////////////////////////////////// + // Match dispatcher - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + } - for (int i = 0; i < n; ++i) - { - const DevMem2D_ train = trains[i]; + template + void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (query.cols <= 64) + { + matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, 
masks, trainIdx, imgIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + } + + /////////////////////////////////////////////////////////////////////////////// + // Radius Match caller + + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + if (mask.data) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } + } - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - if (masks != 0 && masks[i].data) + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } } - else + + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const 
DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) { - match<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } } - cudaSafeCall( cudaGetLastError() ); - } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} -/////////////////////////////////////////////////////////////////////////////// -// Match dispatcher + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolled<16, 64, Dist>(query, train, 
maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ - else - { - match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } -} - -template -void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (query.cols <= 64) - { - matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } -} + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -/////////////////////////////////////////////////////////////////////////////// -// Radius Match caller + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, 
const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const 
DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } -} - -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} - -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void 
matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) -{ - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} - -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int 
cc, cudaStream_t stream) -{ - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, - cc, stream); -} + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -} // 
namespace bf_radius_match + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); + } -END_OPENCV_DEVICE_NAMESPACE + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + } // namespace bf_radius_match +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu index 4d3d9bc..0e2aa28 100644 --- a/modules/gpu/src/cuda/bilateral_filter.cu +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -43,186 +43,184 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace bilateral_filter { - -__constant__ float* ctable_color; -__constant__ float* ctable_space; -__constant__ size_t ctable_space_step; - -__constant__ int cndisp; -__constant__ int cradius; - -__constant__ short cedge_disc; -__constant__ short cmax_disc; - -void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc) -{ - cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); - size_t table_space_step = table_space.step / sizeof(float); - cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) ); - - cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) ); - - cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) ); - cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) ); -} - -template -struct DistRgbMax -{ - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) - { - uchar x = ::abs(a[0] - b[0]); - uchar y = ::abs(a[1] - b[1]); - uchar z = ::abs(a[2] - b[2]); - return (::max(::max(x, y), z)); - } -}; - 
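// [editor's note] DistRgbMax above computes a Chebyshev-style colour distance: the
// largest absolute per-channel difference between two pixels. The <1> specialization
// that follows drops the redundant max() chain for single-channel images. A minimal
// host-side sketch of the same idea (hypothetical helper, for illustration only):
//
//     #include <algorithm>
//     #include <cstdlib>
//
//     // Largest absolute per-channel difference between two CN-channel pixels.
//     template <int CN>
//     inline unsigned char dist_rgb_max(const unsigned char* a, const unsigned char* b)
//     {
//         int d = std::abs(a[0] - b[0]);
//         for (int c = 1; c < CN; ++c)
//             d = std::max(d, std::abs(a[c] - b[c]));
//         return static_cast<unsigned char>(d);
//     }
//
// Making the channel count a template parameter lets the compiler unroll the loop
// and, as in the kernel below, specialize away the extra work when CN == 1.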
-template <> -struct DistRgbMax<1> +namespace cv { namespace gpu { namespace device { - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + namespace bilateral_filter { - return ::abs(a[0] - b[0]); - } -}; + __constant__ float* ctable_color; + __constant__ float* ctable_space; + __constant__ size_t ctable_space_step; -template -__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) -{ - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - - T dp[5]; + __constant__ int cndisp; + __constant__ int cradius; - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) - { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); + __constant__ short cedge_disc; + __constant__ short cmax_disc; - if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) + void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc) { - const int ymin = ::max(0, y - cradius); - const int xmin = ::max(0, x - cradius); - const int ymax = ::min(h - 1, y + cradius); - const int xmax = ::min(w - 1, x + cradius); + cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) ); - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) ); - const uchar* ic = img + y * img_step + channels * x; + cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) ); + } - for(int yi = ymin; yi <= ymax; yi++) + template + struct DistRgbMax + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { - const T* disp_y = disp + yi * disp_step; + uchar x = ::abs(a[0] - b[0]); + uchar y = ::abs(a[1] - b[1]); + uchar z = ::abs(a[2] - b[2]); + return (::max(::max(x, y), z)); + } + }; - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; + template <> + struct DistRgbMax<1> + { + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) + { + return ::abs(a[0] - b[0]); + } + }; - uchar dist_rgb = DistRgbMax::calc(in, ic); + template + __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; + T dp[5]; - const T disp_reg = disp_y[xi]; + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * 
disp_step + x + 1); - cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; - cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight; - cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight; - cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight; - cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight; + if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) + { + const int ymin = ::max(0, y - cradius); + const int xmin = ::max(0, x - cradius); + const int ymax = ::min(h - 1, y + cradius); + const int xmax = ::min(w - 1, x + cradius); + + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + const uchar* ic = img + y * img_step + channels * x; + + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; + + for(int xi = xmin; xi <= xmax; xi++) + { + const uchar* in = img + yi * img_step + channels * xi; + + uchar dist_rgb = DistRgbMax::calc(in, ic); + + const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; + + const T disp_reg = disp_y[xi]; + + cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; + cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight; + cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight; + cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight; + cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight; + } + } + + float minimum = numeric_limits::max(); + int id = 0; + + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; + } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; } } + } - float minimum = numeric_limits::max(); - int id = 0; + template + void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) + { + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); - if (cost[0] < minimum) + switch (channels) { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = cost[4]; - id = 4; + case 1: + for (int i = 0; i < iters; ++i) + { + bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + + bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + + bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } - *(disp + y * disp_step + x) = dp[id]; + if (stream != 0) + cudaSafeCall( cudaDeviceSynchronize() ); } - } -} -template -void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, 
int iters, cudaStream_t stream) -{ - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); - - switch (channels) - { - case 1: - for (int i = 0; i < iters; ++i) + void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) { - bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - - bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); + bilateral_filter_caller(disp, img, channels, iters, stream); } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); + void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) + { + bilateral_filter_caller(disp, img, channels, iters, stream); } - break; - default: - cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); - } - - if (stream != 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} - -void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) -{ - bilateral_filter_caller(disp, img, channels, iters, stream); -} - -void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) -{ - bilateral_filter_caller(disp, img, channels, iters, stream); -} - -} // namespace bilateral_filter - -END_OPENCV_DEVICE_NAMESPACE + } // namespace bilateral_filter +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu index fca1b96..02e9649 100644 --- a/modules/gpu/src/cuda/blend.cu +++ b/modules/gpu/src/cuda/blend.cu @@ -42,77 +42,75 @@ #include "internal_shared.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -namespace blend { - -template -__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep result) +namespace cv { namespace gpu { namespace device { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (y < rows && x < cols) + namespace blend { - int x_ = x / cn; - float w1 = weights1.ptr(y)[x_]; - float w2 = weights2.ptr(y)[x_]; - T p1 = img1.ptr(y)[x]; - T p2 = img2.ptr(y)[x]; - result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); - } -} - -template -void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) -{ - dim3 threads(16, 16); - dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); - - blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + template + __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStep result) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); -} + if (y < rows && x < cols) + { + int x_ = x / cn; + float w1 = 
weights1.ptr(y)[x_]; + float w2 = weights2.ptr(y)[x_]; + T p1 = img1.ptr(y)[x]; + T p2 = img2.ptr(y)[x]; + result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); + } + } -template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); -template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); + template + void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); + + blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); + } -__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; + template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); + template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); - if (y < rows && x < cols) - { - float w1 = weights1.ptr(y)[x]; - float w2 = weights2.ptr(y)[x]; - float sum_inv = 1.f / (w1 + w2 + 1e-5f); - w1 *= sum_inv; - w2 *= sum_inv; - uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; - uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; - ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, - p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); - } -} -void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) -{ - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); -} + if (y < rows && x < cols) + { + float w1 = weights1.ptr(y)[x]; + float w2 = weights2.ptr(y)[x]; + float sum_inv = 1.f / (w1 + w2 + 1e-5f); + w1 *= sum_inv; + w2 *= sum_inv; + uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; + uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; + ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, + p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); + } + } -} // namespace blend + void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) + { + dim3 threads(16, 16); + dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + + blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); -END_OPENCV_DEVICE_NAMESPACE + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); + } + } // namespace blend +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 
1cdf191..27c2afb 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -44,149 +44,148 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 - -namespace transform_points +namespace cv { namespace gpu { namespace device { - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; + #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 - struct TransformOp : unary_function + namespace transform_points { - __device__ __forceinline__ float3 operator()(const float3& p) const + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; + + struct TransformOp : unary_function { - return make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); + __device__ __forceinline__ float3 operator()(const float3& p) const + { + return make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, + crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); + } + }; + + void call(const DevMem2D_ src, const float* rot, + const float* transl, DevMem2D_ dst, + cudaStream_t stream) + { + cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); + ::cv::gpu::device::transform(src, dst, TransformOp(), stream); } - }; + } // namespace transform_points - void call(const DevMem2D_ src, const float* rot, - const float* transl, DevMem2D_ dst, - cudaStream_t stream) - { - cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); - OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream); - } -} // namespace transform_points - -namespace project_points -{ - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; - __constant__ float3 cproj0; - __constant__ float3 cproj1; - - struct ProjectOp : unary_function + namespace project_points { - __device__ __forceinline__ float2 operator()(const float3& p) const + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; + __constant__ float3 cproj0; + __constant__ float3 cproj1; + + struct ProjectOp : unary_function { - // Rotate and translate in 3D - float3 t = make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); - // Project on 2D plane - return make_float2( - (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, - (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); + __device__ __forceinline__ float2 operator()(const float3& p) const + { + // Rotate and translate in 3D + float3 t = make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, + 
-
-namespace project_points
-{
-    __constant__ float3 crot0;
-    __constant__ float3 crot1;
-    __constant__ float3 crot2;
-    __constant__ float3 ctransl;
-    __constant__ float3 cproj0;
-    __constant__ float3 cproj1;
-
-    struct ProjectOp : unary_function<float3, float2>
-    {
-        __device__ __forceinline__ float2 operator()(const float3& p) const
-        {
-            // Rotate and translate in 3D
-            float3 t = make_float3(
-                crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-            // Project on 2D plane
-            return make_float2(
-                (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
-                (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
-        }
-    };
-
-    void call(const DevMem2D_<float3> src, const float* rot,
-              const float* transl, const float* proj, DevMem2D_<float2> dst,
-              cudaStream_t stream)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
-    }
-} // namespace project_points
+
+    namespace project_points
+    {
+        __constant__ float3 crot0;
+        __constant__ float3 crot1;
+        __constant__ float3 crot2;
+        __constant__ float3 ctransl;
+        __constant__ float3 cproj0;
+        __constant__ float3 cproj1;
+
+        struct ProjectOp : unary_function<float3, float2>
+        {
+            __device__ __forceinline__ float2 operator()(const float3& p) const
+            {
+                // Rotate and translate in 3D
+                float3 t = make_float3(
+                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+                // Project on 2D plane
+                return make_float2(
+                    (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
+                    (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+            }
+        };
+
+        void call(const DevMem2D_<float3> src, const float* rot,
+                  const float* transl, const float* proj, DevMem2D_<float2> dst,
+                  cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
+            ::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
+        }
+    } // namespace project_points
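For reference, ProjectOp is the standard pinhole projection with the two intrinsic rows packed as float3: taking cproj0 = (fx, 0, cx) and cproj1 = (0, fy, cy), the returned pair reduces to u = fx*X/Z + cx and v = fy*Y/Z + cy for a camera-space point t = (X, Y, Z). A tiny CPU check of that identity — the numbers are made up for illustration:

#include <cstdio>

int main()
{
    const float fx = 500.f, fy = 500.f, cx = 320.f, cy = 240.f;
    const float X = 1.f, Y = 2.f, Z = 4.f;        // camera-space point
    const float u = (fx * X + 0.f * Y) / Z + cx;  // cproj0 = (fx, 0, cx)
    const float v = (0.f * X + fy * Y) / Z + cy;  // cproj1 = (0, fy, cy)
    std::printf("u=%.1f v=%.1f\n", u, v);         // prints u=445.0 v=490.0
    return 0;
}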
-
-namespace solve_pnp_ransac
-{
-    __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
-    __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
-
-    int maxNumIters()
-    {
-        return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
-    }
-
-    __device__ __forceinline__ float sqr(float x)
-    {
-        return x * x;
-    }
-
-    __global__ void computeHypothesisScoresKernel(
-        const int num_points, const float3* object, const float2* image,
-        const float dist_threshold, int* g_num_inliers)
-    {
-        const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
-        const float3 &transl_vec = ctransl_vectors[blockIdx.x];
-        int num_inliers = 0;
-
-        for (int i = threadIdx.x; i < num_points; i += blockDim.x)
-        {
-            float3 p = object[i];
-            p = make_float3(
-                rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
-                rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
-                rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
-            p.x /= p.z;
-            p.y /= p.z;
-            float2 image_p = image[i];
-            if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
-                ++num_inliers;
-        }
-
-        extern __shared__ float s_num_inliers[];
-        s_num_inliers[threadIdx.x] = num_inliers;
-        __syncthreads();
-
-        for (int step = blockDim.x / 2; step > 0; step >>= 1)
-        {
-            if (threadIdx.x < step)
-                s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
-            __syncthreads();
-        }
-
-        if (threadIdx.x == 0)
-            g_num_inliers[blockIdx.x] = s_num_inliers[0];
-    }
-
-    void computeHypothesisScores(
-        const int num_hypotheses, const int num_points, const float* rot_matrices,
-        const float3* transl_vectors, const float3* object, const float2* image,
-        const float dist_threshold, int* hypothesis_scores)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
-
-        dim3 threads(256);
-        dim3 grid(num_hypotheses);
-        int smem_size = threads.x * sizeof(float);
-
-        computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
-            num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-} // namespace solvepnp_ransac
-
-END_OPENCV_DEVICE_NAMESPACE
+
+    namespace solve_pnp_ransac
+    {
+        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
+        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
+
+        int maxNumIters()
+        {
+            return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
+        }
+
+        __device__ __forceinline__ float sqr(float x)
+        {
+            return x * x;
+        }
+
+        __global__ void computeHypothesisScoresKernel(
+            const int num_points, const float3* object, const float2* image,
+            const float dist_threshold, int* g_num_inliers)
+        {
+            const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
+            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+            int num_inliers = 0;
+
+            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+            {
+                float3 p = object[i];
+                p = make_float3(
+                    rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+                    rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+                    rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+                p.x /= p.z;
+                p.y /= p.z;
+                float2 image_p = image[i];
+                if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+                    ++num_inliers;
+            }
+
+            extern __shared__ float s_num_inliers[];
+            s_num_inliers[threadIdx.x] = num_inliers;
+            __syncthreads();
+
+            for (int step = blockDim.x / 2; step > 0; step >>= 1)
+            {
+                if (threadIdx.x < step)
+                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+                __syncthreads();
+            }
+
+            if (threadIdx.x == 0)
+                g_num_inliers[blockIdx.x] = s_num_inliers[0];
+        }
+
+        void computeHypothesisScores(
+            const int num_hypotheses, const int num_points, const float* rot_matrices,
+            const float3* transl_vectors, const float3* object, const float2* image,
+            const float dist_threshold, int* hypothesis_scores)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+
+            dim3 threads(256);
+            dim3 grid(num_hypotheses);
+            int smem_size = threads.x * sizeof(float);
+
+            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+                num_points, object, image, dist_threshold, hypothesis_scores);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    } // namespace solvepnp_ransac
+}}} // namespace cv { namespace gpu { namespace device
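The solve_pnp_ransac code above scores one pose hypothesis per block: each thread accumulates a private inlier count over a thread-strided range of points, and the per-thread counts are folded with a power-of-two tree reduction in shared memory, leaving the block total in element 0. A stripped-down sketch of just that reduction, assuming a fixed block size of 256; the names are hypothetical and the sketch is for illustration only:

#include <cuda_runtime.h>

// One block per hypothesis; flags[i] is 1 if point i is an inlier.
// Launch as: countReduce<<<num_hypotheses, 256>>>(flags, n, block_sums);
__global__ void countReduce(const int* flags, int n, int* block_sums)
{
    __shared__ int s[256];          // one slot per thread

    int count = 0;
    for (int i = threadIdx.x; i < n; i += blockDim.x) // thread-strided loop
        count += flags[i];

    s[threadIdx.x] = count;
    __syncthreads();

    // Tree reduction: halve the active range each step.
    for (int step = blockDim.x / 2; step > 0; step >>= 1)
    {
        if (threadIdx.x < step)
            s[threadIdx.x] += s[threadIdx.x + step];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        block_sums[blockIdx.x] = s[0]; // one score per block
}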
diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index 5f31fa7..bf31eee 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -44,450 +44,448 @@
 #include
 #include "internal_shared.hpp"
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace canny {
-
-__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-{
-    __shared__ int smem[16][18];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows)
-    {
-        smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
-        if (threadIdx.x == 0)
-        {
-            smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
-            smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
-        }
-        __syncthreads();
-
-        if (j < cols)
-        {
-            dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
-            dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
-        }
-    }
-}
-
-void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+namespace cv { namespace gpu { namespace device
+{
+    namespace canny
+    {
+        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        {
+            __shared__ int smem[16][18];
+
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (i < rows)
+            {
+                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
+                if (threadIdx.x == 0)
+                {
+                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
+                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
+                }
+                __syncthreads();
+
+                if (j < cols)
+                {
+                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
+                }
+            }
+        }
+
+        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-struct L1
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::abs(x) + ::abs(y);
-    }
-};
-struct L2
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::sqrtf(x * x + y * y);
-    }
-};
+        struct L1
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                return ::abs(x) + ::abs(y);
+            }
+        };
+        struct L2
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                return ::sqrtf(x * x + y * y);
+            }
+        };
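The row pass above and the column half of calcMagnitude below implement the separable 3x3 Sobel factorization: Gx = [1 2 1]^T x [-1 0 1] and Gy = [-1 0 1]^T x [1 2 1], so the row kernel emits the horizontal factors and the column step applies the complementary vertical ones before L1 or L2 selects the gradient norm. A small CPU check of the equivalence — the sample values are arbitrary:

#include <cstdio>

int main()
{
    int p[3][3] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };

    // Row pass (as in calcSobelRowPass), one result per row:
    int dxr[3], dyr[3];
    for (int y = 0; y < 3; ++y)
    {
        dxr[y] = -p[y][0] + p[y][2];               // (-1 0 1) horizontally
        dyr[y] =  p[y][0] + 2 * p[y][1] + p[y][2]; // (1 2 1) horizontally
    }

    // Column pass (as in the first calcMagnitude):
    int dx = dxr[0] + 2 * dxr[1] + dxr[2];         // (1 2 1) vertically
    int dy = -dyr[0] + dyr[2];                     // (-1 0 1) vertically

    // Direct 3x3 Sobel at the center pixel, for comparison:
    int gx = -p[0][0] + p[0][2] - 2 * p[1][0] + 2 * p[1][2] - p[2][0] + p[2][2];
    int gy = -p[0][0] - 2 * p[0][1] - p[0][2] + p[2][0] + 2 * p[2][1] + p[2][2];

    std::printf("separable: %d %d  direct: %d %d\n", dx, dy, gx, gy); // 8 24  8 24
    return 0;
}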
-template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
-    PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    __shared__ int sdx[18][16];
-    __shared__ int sdy[18][16];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (j < cols)
-    {
-        sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
-        sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
-        if (threadIdx.y == 0)
-        {
-            sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
-            sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
-
-            sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
-            sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
-        }
-        __syncthreads();
-
-        if (i < rows)
-        {
-            int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
-            int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
-
-            dx.ptr(i)[j] = x;
-            dy.ptr(i)[j] = y;
-
-            mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
-        }
-    }
-}
+        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
+            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            __shared__ int sdx[18][16];
+            __shared__ int sdy[18][16];
+
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (j < cols)
+            {
+                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
+                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
+                if (threadIdx.y == 0)
+                {
+                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
+                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
+
+                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
+                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
+                }
+                __syncthreads();
+
+                if (i < rows)
+                {
+                    int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
+                    int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+
+                    dx.ptr(i)[j] = x;
+                    dy.ptr(i)[j] = y;
+
+                    mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
+                }
+            }
+        }
+
-void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            if (L2Grad)
+                calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows && j < cols)
-        mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
-}
+        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (i < rows && j < cols)
+                mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
+        }
+
-void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
+        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            if (L2Grad)
+                calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+
+            cudaSafeCall(cudaThreadSynchronize());
+        }
+
-//////////////////////////////////////////////////////////////////////////////////////////
-
-#define CANNY_SHIFT 15
-#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-
-__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-{
-    __shared__ float smem[18][18];
-
-    const int j = blockIdx.x * 16 + threadIdx.x;
-    const int i = blockIdx.y * 16 + threadIdx.y;
-
-    const int tid = threadIdx.y * 16 + threadIdx.x;
-    const int lx = tid % 18;
-    const int ly = tid / 18;
-
-    if (ly < 14)
-        smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-        smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-    __syncthreads();
-
+        //////////////////////////////////////////////////////////////////////////////////////////
+
+        #define CANNY_SHIFT 15
+        #define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+        {
+            __shared__ float smem[18][18];
+
+            const int j = blockIdx.x * 16 + threadIdx.x;
+            const int i = blockIdx.y * 16 + threadIdx.y;
+
+            const int tid = threadIdx.y * 16 + threadIdx.x;
+            const int lx = tid % 18;
+            const int ly = tid / 18;
+
+            if (ly < 14)
+                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
+            __syncthreads();
+
+            if (i < rows && j < cols)
+            {
+                int x = dx.ptr(i)[j];
+                int y = dy.ptr(i)[j];
+                const int s = (x ^ y) < 0 ? -1 : 1;
+                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+                x = ::abs(x);
+                y = ::abs(y);
+
+                // 0 - the pixel can not belong to an edge
+                // 1 - the pixel might belong to an edge
+                // 2 - the pixel does belong to an edge
+                int edge_type = 0;
+
+                if (m > low_thresh)
+                {
+                    const int tg22x = x * TG22;
+ const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - if (i < rows && j < cols) - { - int x = dx.ptr(i)[j]; - int y = dy.ptr(i)[j]; - const int s = (x ^ y) < 0 ? -1 : 1; - const float m = smem[threadIdx.y + 1][threadIdx.x + 1]; + y <<= CANNY_SHIFT; - x = ::abs(x); - y = ::abs(y); + if (y < tg22x) + { + if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) + edge_type = 1 + (int)(m > high_thresh); + } + else if( y > tg67x ) + { + if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) + edge_type = 1 + (int)(m > high_thresh); + } + } + + map.ptr(i + 1)[j + 1] = edge_type; + } + } - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; + #undef CANNY_SHIFT + #undef TG22 - if (m > low_thresh) + void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - y <<= CANNY_SHIFT; + calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } + cudaSafeCall(cudaThreadSynchronize()); } - - map.ptr(i + 1)[j + 1] = edge_type; - } -} -#undef CANNY_SHIFT -#undef TG22 - -void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + ////////////////////////////////////////////////////////////////////////////////////////// - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + __device__ unsigned int counter = 0; - cudaSafeCall(cudaThreadSynchronize()); -} - -////////////////////////////////////////////////////////////////////////////////////////// + __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) + { + #if __CUDA_ARCH__ >= 120 -__device__ unsigned int counter = 0; + __shared__ int smem[18][18]; -__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) -{ - #if __CUDA_ARCH__ >= 120 + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - __shared__ int smem[18][18]; + const int tid = threadIdx.y * 16 + threadIdx.x; + const int lx = tid % 18; + const int ly = tid / 18; - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + if (ly < 14) + smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) + 
smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + __syncthreads(); - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + if (i < rows && j < cols) + { + int n; - __syncthreads(); + #pragma unroll + for (int k = 0; k < 16; ++k) + { + n = 0; - if (i < rows && j < cols) - { - int n; + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) + { + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + } - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; + if (n > 0) + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + } - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } + map.ptr(i + 1)[j + 1] = e; - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; + n = 0; - map.ptr(i + 1)[j + 1] = e; + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } - n = 0; + if (n > 0) + { + const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); + st[ind] = make_ushort2(j + 1, i + 1); + } + } - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + #endif } - if (n > 0) + void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } - } - - #endif -} - -void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); - - cudaSafeCall(cudaThreadSynchronize()); -} - -__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; -__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - -__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) -{ 
- #if __CUDA_ARCH__ >= 120 + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; + edgesHysteresisLocal<<>>(map, st1, rows, cols); + cudaSafeCall( cudaGetLastError() ); - if (threadIdx.x == 0) - s_counter = 0; - __syncthreads(); + cudaSafeCall(cudaThreadSynchronize()); + } - int ind = blockIdx.y * gridDim.x + blockIdx.x; + __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; + __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - if (ind < count) - { - ushort2 pos = st1[ind]; - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) { - if (threadIdx.x < 8) - { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + #if __CUDA_ARCH__ >= 120 - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; - - ind = atomicInc(&s_counter, (unsigned int)(-1)); + const int stack_size = 512; + + __shared__ unsigned int s_counter; + __shared__ unsigned int s_ind; + __shared__ ushort2 s_st[stack_size]; - s_st[ind] = pos; - } - } + if (threadIdx.x == 0) + s_counter = 0; __syncthreads(); - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) - { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = ::min(s_counter, blockDim.x >> 3); + int ind = blockIdx.y * gridDim.x + blockIdx.x; - pos.x = pos.y = 0; + if (ind < count) + { + ushort2 pos = st1[ind]; - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) + if (threadIdx.x < 8) { - map.ptr(pos.y)[pos.x] = 2; + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; - ind = atomicInc(&s_counter, (unsigned int)(-1)); + if (map.ptr(pos.y)[pos.x] == 1) + { + map.ptr(pos.y)[pos.x] = 2; - s_st[ind] = pos; - } - } - __syncthreads(); - } + ind = atomicInc(&s_counter, (unsigned int)(-1)); - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); + s_st[ind] = pos; + } + } + __syncthreads(); - ind = s_ind; + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + { + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + pos.x = pos.y = 0; + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; + __syncthreads(); + + if (threadIdx.x == 0) + s_counter -= portion; + __syncthreads(); + + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + { + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; + + if (map.ptr(pos.y)[pos.x] == 1) + { + map.ptr(pos.y)[pos.x] = 2; + + ind = atomicInc(&s_counter, (unsigned int)(-1)); + + s_st[ind] = pos; + } + } + __syncthreads(); + } - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; + if (s_counter > 0) + { + if (threadIdx.x == 0) + { + ind = atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; + } + __syncthreads(); + + ind = s_ind; + + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + { + st2[ind + i] = s_st[i]; + } + } } } - } - } - - #endif -} - -void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, 
ushort2* st2, int rows, int cols) -{ - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - while (count > 0) - { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); + #endif + } - dim3 block(128, 1, 1); - dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); + void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) + { + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); + + unsigned int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - cudaSafeCall(cudaThreadSynchronize()); + while (count > 0) + { + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + dim3 block(128, 1, 1); + dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1); + edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); + cudaSafeCall( cudaGetLastError() ); - std::swap(st1, st2); - } -} + cudaSafeCall(cudaThreadSynchronize()); -__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) -{ - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); -} + std::swap(st1, st2); + } + } -void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) -{ - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) + { + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - getEdges<<>>(map, dst, rows, cols); - cudaSafeCall( cudaGetLastError() ); + if (i < rows && j < cols) + dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); + } - cudaSafeCall(cudaThreadSynchronize()); -} + void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) + { + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); -} // namespace canny + getEdges<<>>(map, dst, rows, cols); + cudaSafeCall( cudaGetLastError() ); -END_OPENCV_DEVICE_NAMESPACE + cudaSafeCall(cudaThreadSynchronize()); + } + } // namespace canny +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index 4da3f77..9384ea6 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -44,181 +44,181 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/color.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { 
smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) -{ - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) -{ - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 
4 }; -}; - -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; -DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) -{ - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; -}; +namespace cv { namespace gpu { namespace device +{ + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) + { + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) + { + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) + { + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + 
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; + OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) + { + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; + }; #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \ @@ -226,7 +226,7 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) traits::functor_type functor = traits::create_functor(); \ typedef typename traits::functor_type::argument_type src_t; \ typedef typename traits::functor_type::result_type dst_t; \ - OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ + ::cv::gpu::device::transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ } #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ @@ -243,138 +243,137 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits) \ OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) 
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) 
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) - -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) -OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) - -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL -#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F - -END_OPENCV_DEVICE_NAMESPACE + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) + 
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) + + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) + OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) + + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL + #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F +}}} // namespace cv { namespace gpu { namespace device diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index c16ca82..df85641 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -47,203 +47,201 @@ 
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" -BEGIN_OPENCV_DEVICE_NAMESPACE - -#define MAX_KERNEL_SIZE 16 -#define BLOCK_DIM_X 16 -#define BLOCK_DIM_Y 4 -#define RESULT_STEPS 8 -#define HALO_STEPS 1 - -namespace column_filter { +namespace cv { namespace gpu { namespace device +{ + #define MAX_KERNEL_SIZE 16 + #define BLOCK_DIM_X 16 + #define BLOCK_DIM_Y 4 + #define RESULT_STEPS 8 + #define HALO_STEPS 1 -__constant__ float c_kernel[MAX_KERNEL_SIZE]; + namespace column_filter + { + __constant__ float c_kernel[MAX_KERNEL_SIZE]; -void loadKernel(const float kernel[], int ksize) -{ - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); -} + void loadKernel(const float kernel[], int ksize) + { + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); + } -template -__global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) -{ - typedef typename TypeVec::cn>::vec_type sum_t; + template + __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) + { + typedef typename TypeVec::cn>::vec_type sum_t; - __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; + __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; - //Offset to the upper halo edge - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; + //Offset to the upper halo edge + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; - if (x < src.cols) - { - const T* src_col = src.ptr() + x; + if (x < src.cols) + { + const T* src_col = src.ptr() + x; - //Main data - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Main data + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - //Upper halo - #pragma unroll - for(int i = 0; i < HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); + //Upper halo + #pragma unroll + for(int i = 0; i < HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); - //Lower halo - #pragma unroll - for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Lower halo + #pragma unroll + for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - __syncthreads(); + __syncthreads(); - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - { - sum_t sum = VecTraits::all(0); + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + { + sum_t sum = VecTraits::all(0); - #pragma unroll - for(int j = 0; j < KERNEL_SIZE; ++j) - sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; + #pragma unroll + for(int j = 0; j < KERNEL_SIZE; ++j) + sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * 
c_kernel[j]; - int dstY = y + i * BLOCK_DIM_Y; + int dstY = y + i * BLOCK_DIM_Y; - if (dstY < src.rows) - dst.ptr(dstY)[x] = saturate_cast(sum); + if (dstY < src.rows) + dst.ptr(dstY)[x] = saturate_cast(sum); + } + } } - } -} -template class B> -void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) -{ - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); + template class B> + void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) + { + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); - B b(src.rows); + B b(src.rows); - linearColumnFilter<<>>(src, dst, anchor, b); - cudaSafeCall( cudaGetLastError() ); + linearColumnFilter<<>>(src, dst, anchor, b); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); -} + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); + } -template -void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) -{ - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); - static const caller_t callers[5][17] = - { - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect101>, - linearColumnFilter_caller<2 , T, D, BrdColReflect101>, - linearColumnFilter_caller<3 , T, D, BrdColReflect101>, - linearColumnFilter_caller<4 , T, D, BrdColReflect101>, - linearColumnFilter_caller<5 , T, D, BrdColReflect101>, - linearColumnFilter_caller<6 , T, D, BrdColReflect101>, - linearColumnFilter_caller<7 , T, D, BrdColReflect101>, - linearColumnFilter_caller<8 , T, D, BrdColReflect101>, - linearColumnFilter_caller<9 , T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReplicate>, - linearColumnFilter_caller<2 , T, D, BrdColReplicate>, - linearColumnFilter_caller<3 , T, D, BrdColReplicate>, - linearColumnFilter_caller<4 , T, D, BrdColReplicate>, - linearColumnFilter_caller<5 , T, D, BrdColReplicate>, - linearColumnFilter_caller<6 , T, D, BrdColReplicate>, - linearColumnFilter_caller<7 , T, D, BrdColReplicate>, - linearColumnFilter_caller<8 , T, D, BrdColReplicate>, - linearColumnFilter_caller<9 , T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate> - }, + template + void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) { - 0, - linearColumnFilter_caller<1 , T, D, BrdColConstant>, - linearColumnFilter_caller<2 , T, D, BrdColConstant>, - 
linearColumnFilter_caller<3 , T, D, BrdColConstant>, - linearColumnFilter_caller<4 , T, D, BrdColConstant>, - linearColumnFilter_caller<5 , T, D, BrdColConstant>, - linearColumnFilter_caller<6 , T, D, BrdColConstant>, - linearColumnFilter_caller<7 , T, D, BrdColConstant>, - linearColumnFilter_caller<8 , T, D, BrdColConstant>, - linearColumnFilter_caller<9 , T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - linearColumnFilter_caller<16, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect>, - linearColumnFilter_caller<2 , T, D, BrdColReflect>, - linearColumnFilter_caller<3 , T, D, BrdColReflect>, - linearColumnFilter_caller<4 , T, D, BrdColReflect>, - linearColumnFilter_caller<5 , T, D, BrdColReflect>, - linearColumnFilter_caller<6 , T, D, BrdColReflect>, - linearColumnFilter_caller<7 , T, D, BrdColReflect>, - linearColumnFilter_caller<8 , T, D, BrdColReflect>, - linearColumnFilter_caller<9 , T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColWrap>, - linearColumnFilter_caller<2 , T, D, BrdColWrap>, - linearColumnFilter_caller<3 , T, D, BrdColWrap>, - linearColumnFilter_caller<4 , T, D, BrdColWrap>, - linearColumnFilter_caller<5 , T, D, BrdColWrap>, - linearColumnFilter_caller<6 , T, D, BrdColWrap>, - linearColumnFilter_caller<7 , T, D, BrdColWrap>, - linearColumnFilter_caller<8 , T, D, BrdColWrap>, - linearColumnFilter_caller<9 , T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); + static const caller_t callers[5][17] = + { + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, BrdColReflect101>, + linearColumnFilter_caller<6 , T, D, BrdColReflect101>, + linearColumnFilter_caller<7 , T, D, BrdColReflect101>, + linearColumnFilter_caller<8 , T, D, BrdColReflect101>, + linearColumnFilter_caller<9 , T, D, BrdColReflect101>, + linearColumnFilter_caller<10, T, D, BrdColReflect101>, + linearColumnFilter_caller<11, T, D, BrdColReflect101>, + linearColumnFilter_caller<12, T, D, BrdColReflect101>, + linearColumnFilter_caller<13, T, D, BrdColReflect101>, + linearColumnFilter_caller<14, T, D, BrdColReflect101>, + linearColumnFilter_caller<15, T, D, BrdColReflect101>, + linearColumnFilter_caller<16, T, D, BrdColReflect101> + }, + { + 
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<10, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<11, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<12, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<13, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<14, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<15, T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<16, T, D, BrdColReplicate>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<3 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<4 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<5 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<6 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<7 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<8 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<9 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<10, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<11, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<12, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<13, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<14, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<15, T, D, BrdColConstant>,
+                    linearColumnFilter_caller<16, T, D, BrdColConstant>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<2 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<4 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<5 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<6 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<7 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<8 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<9 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<10, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<11, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<12, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<13, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<14, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<15, T, D, BrdColReflect>,
+                    linearColumnFilter_caller<16, T, D, BrdColReflect>
+                },
+                {
+                    0,
+                    linearColumnFilter_caller<1 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<2 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<3 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<4 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<5 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<6 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<7 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<8 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<9 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<10, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<11, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<12, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<13, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<14, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<15, T, D, BrdColWrap>,
+                    linearColumnFilter_caller<16, T, D, BrdColWrap>,
+                }
+            };
+
+            loadKernel(kernel, ksize);
+
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+        }
-    };
-
-    loadKernel(kernel, ksize);
-
-    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-}
-
-template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-
-} // namespace column_filter
-END_OPENCV_DEVICE_NAMESPACE
+        template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    } // namespace column_filter
+}}} // namespace cv { namespace gpu { namespace device
diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu
index aafcdf5..3397672 100644
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -43,87 +43,85 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-
-template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
-{
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (x < dst.cols && y < dst.rows)
-        dst.ptr(y)[x] = src(y - top, x - left);
-}
-
-template