[*] Workaround for the integral image calculation bug on SM_2.0 (Fermi): manually unroll the loop in warpScanInclusive

author Anton Obukhov <no@email>

Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)

committer Anton Obukhov <no@email>

Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)
author Anton Obukhov <no@email>
Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)
committer Anton Obukhov <no@email>
Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)
diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu

index 87320557bc517b97adc30902af82f8f5030b687e..03ecb57ec00a4e025e5c2f68f6cc94733d85a4a0 100644 (file)
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -71,6 +71,9 @@
  //==============================================================================\r
  \r
  \r
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive\r
+\r
+\r
  //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
  //assuming size <= WARP_SIZE and size is power of 2\r
  template <class T>\r
@@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
      pos += K_WARP_SIZE;\r
      s_Data[pos] = idata;\r
  \r
-    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
-    {\r
-        s_Data[pos] += s_Data[pos - offset];\r
-    }\r
+    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+    //{\r
+    //    s_Data[pos] += s_Data[pos - offset];\r
+    //}\r
+\r
+    s_Data[pos] += s_Data[pos - 1];\r
+    s_Data[pos] += s_Data[pos - 2];\r
+    s_Data[pos] += s_Data[pos - 4];\r
+    s_Data[pos] += s_Data[pos - 8];\r
+    s_Data[pos] += s_Data[pos - 16];\r
  \r
      return s_Data[pos];\r
  }\r
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu

index 3434f0d32c8815d9af0715658d983f677c6ca4a8..f7cdfc68878ed5ce1543d5bf3ef727d8d9920da0 100644 (file)
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
  //==============================================================================\r
  \r
  \r
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive\r
+\r
+\r
  //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
  //assuming size <= WARP_SIZE and size is power of 2\r
  template <class T>\r
@@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
      pos += K_WARP_SIZE;\r
      s_Data[pos] = idata;\r
  \r
-    for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
-    {\r
-        s_Data[pos] += s_Data[pos - offset];\r
-    }\r
+    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+    //{\r
+    //    s_Data[pos] += s_Data[pos - offset];\r
+    //}\r
+\r
+    s_Data[pos] += s_Data[pos - 1];\r
+    s_Data[pos] += s_Data[pos - 2];\r
+    s_Data[pos] += s_Data[pos - 4];\r
+    s_Data[pos] += s_Data[pos - 8];\r
+    s_Data[pos] += s_Data[pos - 16];\r
  \r
      return s_Data[pos];\r
  }\r
author	Anton Obukhov <no@email>
	Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)
committer	Anton Obukhov <no@email>
	Tue, 14 Jun 2011 17:34:00 +0000 (17:34 +0000)
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu		patch \| blob \| history
modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu		patch \| blob \| history