From 19c87d1c9d75b348bb1027afe6fc29cf7457a3c0 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Mon, 12 Nov 2012 14:17:48 +0400 Subject: [PATCH] ORB --- modules/gpu/src/cuda/orb.cu | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/gpu/src/cuda/orb.cu b/modules/gpu/src/cuda/orb.cu index 91c5709..d66b3e9 100644 --- a/modules/gpu/src/cuda/orb.cu +++ b/modules/gpu/src/cuda/orb.cu @@ -50,7 +50,7 @@ #include #include "opencv2/gpu/device/common.hpp" -#include "opencv2/gpu/device/utility.hpp" +#include "opencv2/gpu/device/reduce.hpp" #include "opencv2/gpu/device/functional.hpp" namespace cv { namespace gpu { namespace device @@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k) { - __shared__ int smem[8 * 32]; - - volatile int* srow = smem + threadIdx.y * blockDim.x; + __shared__ int smem0[8 * 32]; + __shared__ int smem1[8 * 32]; + __shared__ int smem2[8 * 32]; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; @@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device c += Ix * Iy; } - reduce_old<32>(srow, a, threadIdx.x, plus<int>()); - reduce_old<32>(srow, b, threadIdx.x, plus<int>()); - reduce_old<32>(srow, c, threadIdx.x, plus<int>()); + int* srow0 = smem0 + threadIdx.y * blockDim.x; + int* srow1 = smem1 + threadIdx.y * blockDim.x; + int* srow2 = smem2 + threadIdx.y * blockDim.x; + + plus<int> op; + reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op)); if (threadIdx.x == 0) { @@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k) { - __shared__ int smem[8 * 32]; + __shared__ int smem0[8 * 32]; + __shared__ int smem1[8 * 32]; + + int* srow0 = smem0 + threadIdx.y 
* blockDim.x; + int* srow1 = smem1 + threadIdx.y * blockDim.x; - volatile int* srow = smem + threadIdx.y * blockDim.x; + plus<int> op; const int ptidx = blockIdx.x * blockDim.y + threadIdx.y; @@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x) m_10 += u * image(loc.y, loc.x + u); - reduce_old<32>(srow, m_10, threadIdx.x, plus<int>()); + reduce<32>(srow0, m_10, threadIdx.x, op); for (int v = 1; v <= half_k; ++v) { @@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device m_sum += u * (val_plus + val_minus); } - reduce_old<32>(srow, v_sum, threadIdx.x, plus<int>()); - reduce_old<32>(srow, m_sum, threadIdx.x, plus<int>()); + reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op)); m_10 += m_sum; m_01 += v * v_sum; -- 2.7.4