fixed backward compatibility with less than 1.2 CUDA capability

author Marina Kolpakova <no@email>

Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)

committer Marina Kolpakova <no@email>

Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)
author Marina Kolpakova <no@email>
Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)
committer Marina Kolpakova <no@email>
Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu

index cd46945..9981fa6 100644 (file)
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -86,8 +86,11 @@ namespace cv { namespace gpu { namespace device
              rect.y = roundf(y * scale);
              rect.z = roundf(clWidth);
              rect.w = roundf(clHeight);
-
-            int res = atomicInc(n, 100);
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            int res = __atomicInc(n, 100U);
+#else
+            int res = atomicInc(n, 100U);
+#endif
              objects(0, res) = rect;
          }
  
@@ -111,14 +114,24 @@ namespace cv { namespace gpu { namespace device
              __syncthreads();
  
              int cls = labels[tid];
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
+            __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
+            __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
+            __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+#else
              atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
              atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
              atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
              atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
+#endif
              labels[tid] = 0;
              __syncthreads();
-
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+            __atomicInc((unsigned int*)labels + cls, n);
+#else
              atomicInc((unsigned int*)labels + cls, n);
+#endif
              *nclasses = 0;
  
              int active = labels[tid];
@@ -154,7 +167,11 @@ namespace cv { namespace gpu { namespace device
                      }
                      if( j == n)
                      {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                        objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+#else
                          objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
+#endif
                      }
                  }
              }
diff --git a/modules/gpu/src/opencv2/gpu/device/lbp.hpp b/modules/gpu/src/opencv2/gpu/device/lbp.hpp

index f4ec78b..8a7624d 100644 (file)
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -48,6 +48,46 @@
  namespace cv { namespace gpu { namespace device {
  
  namespace lbp{
+
+    #define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )
+template<typename T>
+__device__ __forceinline__ T __atomicInc(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + 1);
+        *address = count;
+    } while (*address != count);
+    return (count & TAG_MASK) - 1;
+}
+
+template<typename T>
+__device__ __forceinline__ void __atomicAdd(T* address, T val)
+{
+    T count;
+    unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
+    do
+    {
+        count = *address & TAG_MASK;
+        count = tag | (count + val);
+        *address = count;
+    } while (*address != count);
+}
+
+template<typename T>
+__device__ __forceinline__ T __atomicMin(T* address, T val)
+{
+    T count = min(*address, val);
+    do
+    {
+        *address = count;
+    } while (*address > count);
+    return count;
+}
+
      struct Stage
      {
          int    first;
@@ -94,11 +134,19 @@ namespace lbp{
  
                  if (p < q)
                  {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + id, p);
+#else
                      atomicMin(labels + id, p);
+#endif
                  }
                  else if (p > q)
                  {
+#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
+                    __atomicMin(labels + tid, q);
+#else
                      atomicMin(labels + tid, q);
+#endif
                  }
              }
          }
author	Marina Kolpakova <no@email>
	Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)
committer	Marina Kolpakova <no@email>
	Sat, 7 Jul 2012 21:48:53 +0000 (21:48 +0000)
modules/gpu/src/cuda/lbp.cu		patch \| blob \| history
modules/gpu/src/opencv2/gpu/device/lbp.hpp		patch \| blob \| history