FastNonLocalMeans
author     Vladislav Vinogradov <vlad.vinogradov@itseez.com>
           Mon, 12 Nov 2012 10:14:48 +0000 (14:14 +0400)
committer  Vladislav Vinogradov <vlad.vinogradov@itseez.com>
           Mon, 26 Nov 2012 07:37:37 +0000 (11:37 +0400)
modules/gpu/src/cuda/nlm.cu

index e267c73..cd3f0b5 100644
 
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
-
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
-#include "opencv2/gpu/device/block.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
 using namespace cv::gpu;
@@ -184,6 +184,85 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
+
+        template <int cn> struct Unroll;
+        template <> struct Unroll<1>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
+            {
+                return thrust::tie(val1, val2);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op);
+            }
+        };
+        template <> struct Unroll<2>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op);
+            }
+        };
+        template <> struct Unroll<3>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op);
+            }
+        };
+        template <> struct Unroll<4>
+        {
+            template <int BLOCK_SIZE>
+            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
+            {
+                return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
+            }
+
+            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
+            {
+                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
+            }
+
+            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
+            {
+                plus<float> op;
+                return thrust::make_tuple(op, op, op, op, op);
+            }
+        };
+
         __device__ __forceinline__ int calcDist(const uchar&  a, const uchar&  b) { return (a-b)*(a-b); }
         __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
         __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@@ -340,30 +419,15 @@ namespace cv { namespace gpu { namespace device
                     sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
                 }
 
-                volatile __shared__ float cta_buffer[CTA_SIZE];
-
-                int tid = threadIdx.x;
+                __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
 
-                cta_buffer[tid] = weights_sum;
-                __syncthreads();
-                Block::reduce<CTA_SIZE>(cta_buffer, plus());
-                weights_sum = cta_buffer[0];
-
-                __syncthreads();
-
-
-                for(int n = 0; n < VecTraits<T>::cn; ++n)
-                {
-                    cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
-                    __syncthreads();
-                    Block::reduce<CTA_SIZE>(cta_buffer, plus());
-                    reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
-
-                    __syncthreads();
-                }
+                reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
+                                 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
+                                 threadIdx.x,
+                                 Unroll<VecTraits<T>::cn>::op());
 
-                if (tid == 0)
-                    dst = saturate_cast<T>(sum/weights_sum);
+                if (threadIdx.x == 0)
+                    dst = saturate_cast<T>(sum / weights_sum);
             }
 
             __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
@@ -503,4 +567,4 @@ namespace cv { namespace gpu { namespace device
 }}}
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
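
Note on the change above: the old code ran Block::reduce once for the weight and once per colour channel, with a __syncthreads() round-trip for each. The new code enlarges the shared buffer to CTA_SIZE * (cn + 1) floats and folds the weight and every channel of the running sum into a single tuple-based reduce call, driven by the Unroll<cn> helpers. The snippet below is a minimal, self-contained sketch of that fused block reduction for the 3-channel case. It does not use the opencv2/gpu/device headers; blockReduce4 is only a hand-rolled stand-in for cv::gpu::device::reduce with the Unroll<3> smem_tuple/tie/op tuples, and all names in it (blockReduce4, demo, CTA_SIZE) are illustrative, not part of the patch.

// Sketch only: fused block reduction of a weight plus a float3 sum,
// mirroring the cta_buffer[CTA_SIZE * (cn + 1)] layout used in the patch.
#include <cuda_runtime.h>
#include <cstdio>

#define CTA_SIZE 256

// Reduce four per-thread floats (weight + x/y/z of the sum) in one pass
// over a single shared buffer laid out as four CTA_SIZE-sized slices.
__device__ void blockReduce4(float* smem, float& w, float3& s, int tid)
{
    smem[tid]                = w;
    smem[tid + CTA_SIZE]     = s.x;
    smem[tid + 2 * CTA_SIZE] = s.y;
    smem[tid + 3 * CTA_SIZE] = s.z;
    __syncthreads();

    for (int offset = CTA_SIZE / 2; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            smem[tid]                += smem[tid + offset];
            smem[tid + CTA_SIZE]     += smem[tid + CTA_SIZE + offset];
            smem[tid + 2 * CTA_SIZE] += smem[tid + 2 * CTA_SIZE + offset];
            smem[tid + 3 * CTA_SIZE] += smem[tid + 3 * CTA_SIZE + offset];
        }
        __syncthreads();
    }

    w = smem[0];
    s = make_float3(smem[CTA_SIZE], smem[2 * CTA_SIZE], smem[3 * CTA_SIZE]);
}

__global__ void demo(float* out)
{
    __shared__ float cta_buffer[CTA_SIZE * 4];

    // Dummy per-thread values standing in for the per-pixel weight and
    // weighted colour sum computed by the non-local means kernel.
    float  weights_sum = 1.0f;
    float3 sum = make_float3(threadIdx.x, 2.0f * threadIdx.x, 3.0f * threadIdx.x);

    blockReduce4(cta_buffer, weights_sum, sum, threadIdx.x);

    // Only thread 0 writes the normalised result, as in dst = sum / weights_sum.
    if (threadIdx.x == 0)
    {
        out[0] = sum.x / weights_sum;
        out[1] = sum.y / weights_sum;
        out[2] = sum.z / weights_sum;
    }
}

int main()
{
    float* d_out;
    cudaMalloc(&d_out, 3 * sizeof(float));
    demo<<<1, CTA_SIZE>>>(d_out);

    float h_out[3];
    cudaMemcpy(h_out, d_out, 3 * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f %f %f\n", h_out[0], h_out[1], h_out[2]);

    cudaFree(d_out);
    return 0;
}

The benefit of the fused form is the same as in the patch: one shared-memory reduction pass and one pair of barriers per pixel instead of cn + 1 separate reductions.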