ORB

author Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Mon, 12 Nov 2012 10:17:48 +0000 (14:17 +0400)

committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>

Mon, 26 Nov 2012 07:37:38 +0000 (11:37 +0400)
author Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Mon, 12 Nov 2012 10:17:48 +0000 (14:17 +0400)
committer Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Mon, 26 Nov 2012 07:37:38 +0000 (11:37 +0400)
diff --git a/modules/gpu/src/cuda/orb.cu b/modules/gpu/src/cuda/orb.cu
index 91c5709..d66b3e9 100644
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@@ -50,7 +50,7 @@
  #include <thrust/sort.h>
  
  #include "opencv2/gpu/device/common.hpp"
-#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
  #include "opencv2/gpu/device/functional.hpp"
  
  namespace cv { namespace gpu { namespace device
@@ -75,9 +75,9 @@ namespace cv { namespace gpu { namespace device
  
          __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
          {
-            __shared__ int smem[8 * 32];
-
-            volatile int* srow = smem + threadIdx.y * blockDim.x;
+            __shared__ int smem0[8 * 32];
+            __shared__ int smem1[8 * 32];
+            __shared__ int smem2[8 * 32];
  
              const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
  
@@ -109,9 +109,12 @@ namespace cv { namespace gpu { namespace device
                      c += Ix * Iy;
                  }
  
-                reduce_old<32>(srow, a, threadIdx.x, plus<volatile int>());
-                reduce_old<32>(srow, b, threadIdx.x, plus<volatile int>());
-                reduce_old<32>(srow, c, threadIdx.x, plus<volatile int>());
+                int* srow0 = smem0 + threadIdx.y * blockDim.x;
+                int* srow1 = smem1 + threadIdx.y * blockDim.x;
+                int* srow2 = smem2 + threadIdx.y * blockDim.x;
+
+                plus<int> op;
+                reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
  
                  if (threadIdx.x == 0)
                  {
@@ -151,9 +154,13 @@ namespace cv { namespace gpu { namespace device
  
          __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
          {
-            __shared__ int smem[8 * 32];
+            __shared__ int smem0[8 * 32];
+            __shared__ int smem1[8 * 32];
+
+            int* srow0 = smem0 + threadIdx.y * blockDim.x;
+            int* srow1 = smem1 + threadIdx.y * blockDim.x;
  
-            volatile int* srow = smem + threadIdx.y * blockDim.x;
+            plus<int> op;
  
              const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
  
@@ -167,7 +174,7 @@ namespace cv { namespace gpu { namespace device
                  for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                      m_10 += u * image(loc.y, loc.x + u);
  
-                reduce_old<32>(srow, m_10, threadIdx.x, plus<volatile int>());
+                reduce<32>(srow0, m_10, threadIdx.x, op);
  
                  for (int v = 1; v <= half_k; ++v)
                  {
@@ -185,8 +192,7 @@ namespace cv { namespace gpu { namespace device
                          m_sum += u * (val_plus + val_minus);
                      }
  
-                    reduce_old<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
-                    reduce_old<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
+                    reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
  
                      m_10 += m_sum;
                      m_01 += v * v_sum;
author	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Mon, 12 Nov 2012 10:17:48 +0000 (14:17 +0400)
committer	Vladislav Vinogradov <vlad.vinogradov@itseez.com>
	Mon, 26 Nov 2012 07:37:38 +0000 (11:37 +0400)