fixed bug #1640
authorVladislav Vinogradov <no@email>
Wed, 28 Mar 2012 14:25:41 +0000 (14:25 +0000)
committerVladislav Vinogradov <no@email>
Wed, 28 Mar 2012 14:25:41 +0000 (14:25 +0000)
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
modules/gpu/test/main.cpp
modules/gpu/test/test_nvidia.cpp

index c4e70a4..fded861 100644 (file)
@@ -77,110 +77,52 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th
 \r
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
 //assuming size <= WARP_SIZE and size is power of 2\r
-//template <class T>\r
-//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)\r
-//{\r
-//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
-//    s_Data[pos] = 0;\r
-//    pos += K_WARP_SIZE;\r
-//    s_Data[pos] = idata;\r
-//\r
-//    s_Data[pos] += s_Data[pos - 1];\r
-//    s_Data[pos] += s_Data[pos - 2];\r
-//    s_Data[pos] += s_Data[pos - 4];\r
-//    s_Data[pos] += s_Data[pos - 8];\r
-//    s_Data[pos] += s_Data[pos - 16];\r
-//\r
-//    return s_Data[pos];\r
-//}\r
-\r
-\r
-//template <class T>\r
-//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)\r
-//{\r
-//    return warpScanInclusive(idata, s_Data) - idata;\r
-//}\r
-//\r
-//\r
-//template <class T, Ncv32u tiNumScanThreads>\r
-//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)\r
-//{\r
-//    if (tiNumScanThreads > K_WARP_SIZE)\r
-//    {\r
-//        //Bottom-level inclusive warp scan\r
-//        T warpResult = warpScanInclusive(idata, s_Data);\r
-//\r
-//        //Save top elements of each warp for exclusive warp scan\r
-//        //sync to wait for warp scans to complete (because s_Data is being overwritten)\r
-//        __syncthreads();\r
-//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
-//        {\r
-//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
-//        }\r
-//\r
-//        //wait for warp scans to complete\r
-//        __syncthreads();\r
-//\r
-//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
-//        {\r
-//            //grab top warp elements\r
-//            T val = s_Data[threadIdx.x];\r
-//            //calculate exclusive scan and write back to shared memory\r
-//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
-//        }\r
-//\r
-//        //return updated warp scans with exclusive scan results\r
-//        __syncthreads();\r
-//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];\r
-//    }\r
-//    else\r
-//    {\r
-//        return warpScanInclusive(idata, s_Data);\r
-//    }\r
-//}\r
-\r
-template <Ncv32u size>\r
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)\r
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
 {\r
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));\r
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
     s_Data[pos] = 0;\r
-    pos += size;\r
+    pos += K_WARP_SIZE;\r
     s_Data[pos] = idata;\r
 \r
-    for(Ncv32u offset = 1; offset < size; offset <<= 1)\r
-        s_Data[pos] += s_Data[pos - offset];\r
+    s_Data[pos] += s_Data[pos - 1];\r
+    s_Data[pos] += s_Data[pos - 2];\r
+    s_Data[pos] += s_Data[pos - 4];\r
+    s_Data[pos] += s_Data[pos - 8];\r
+    s_Data[pos] += s_Data[pos - 16];\r
 \r
     return s_Data[pos];\r
 }\r
 \r
-template <Ncv32u size>\r
-__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
+__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
 {\r
-    return warpScanInclusive<size>(idata, s_Data) - idata;\r
+    return warpScanInclusive(idata, s_Data) - idata;\r
 }\r
 \r
-template <Ncv32u size, Ncv32u tiNumScanThreads>\r
+template <Ncv32u tiNumScanThreads>\r
 __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
 {\r
-    if(size > K_WARP_SIZE)\r
+    if (tiNumScanThreads > K_WARP_SIZE)\r
     {\r
         //Bottom-level inclusive warp scan\r
-        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);\r
+        Ncv32u warpResult = warpScanInclusive(idata, s_Data);\r
 \r
         //Save top elements of each warp for exclusive warp scan\r
         //sync to wait for warp scans to complete (because s_Data is being overwritten)\r
         __syncthreads();\r
         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
+        {\r
             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
+        }\r
 \r
         //wait for warp scans to complete\r
         __syncthreads();\r
+\r
         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
         {\r
             //grab top warp elements\r
             Ncv32u val = s_Data[threadIdx.x];\r
-            //calculate exclsive scan and write back to shared memory\r
-            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);\r
+            //calculate exclusive scan and write back to shared memory\r
+            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
         }\r
 \r
         //return updated warp scans with exclusive scan results\r
@@ -189,7 +131,7 @@ __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
     }\r
     else\r
     {\r
-        return warpScanInclusive<size>(idata, s_Data);\r
+        return warpScanInclusive(idata, s_Data);\r
     }\r
 }\r
 \r
@@ -295,7 +237,7 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
     __shared__ Ncv32u numPassed;\r
     __shared__ Ncv32u outMaskOffset;\r
 \r
-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);\r
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);\r
     __syncthreads();\r
 \r
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)\r
@@ -391,11 +333,14 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
 \r
     NcvBool bPass = true;\r
 \r
-    if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)\r
+    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)\r
     {\r
-        Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+        Ncv32f pixelStdDev = 0.0f;\r
 \r
-        for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)\r
+        if (!bInactiveThread)\r
+            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];\r
+\r
+        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)\r
         {\r
             Ncv32f curStageSum = 0.0f;\r
 \r
@@ -409,67 +354,70 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                 NcvBool bMoreNodesToTraverse = true;\r
                 Ncv32u iNode = curRootNodeOffset;\r
 \r
-                while (bMoreNodesToTraverse)\r
+                if (bPass && !bInactiveThread)\r
                 {\r
-                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
-                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
-                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
-                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
-\r
-                    Ncv32f curNodeVal = 0.0f;\r
-\r
-                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+                    while (bMoreNodesToTraverse)\r
                     {\r
-                        Ncv32f rectWeight;\r
-                        Ncv32u rectX, rectY, rectWidth, rectHeight;\r
-                        getFeature<tbCacheTextureCascade>\r
-                            (iFeature + iRect, d_Features,\r
-                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
-\r
-                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
-                        Ncv32u iioffsTR = iioffsTL + rectWidth;\r
-                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
-                        Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);\r
+                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();\r
+                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();\r
+                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();\r
 \r
-                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
-                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
-                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
-                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+                        Ncv32f curNodeVal = 0.0f;\r
 \r
-#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
-                    curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
-#else\r
-                    curNodeVal += (Ncv32f)rectSum * rectWeight;\r
-#endif\r
-                    }\r
+                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)\r
+                        {\r
+                            Ncv32f rectWeight;\r
+                            Ncv32u rectX, rectY, rectWidth, rectHeight;\r
+                            getFeature<tbCacheTextureCascade>\r
+                                (iFeature + iRect, d_Features,\r
+                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);\r
+\r
+                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);\r
+                            Ncv32u iioffsTR = iioffsTL + rectWidth;\r
+                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;\r
+                            Ncv32u iioffsBR = iioffsBL + rectWidth;\r
+\r
+                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -\r
+                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +\r
+                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -\r
+                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);\r
+\r
+    #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY\r
+                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);\r
+    #else\r
+                        curNodeVal += (Ncv32f)rectSum * rectWeight;\r
+    #endif\r
+                        }\r
 \r
-                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
-                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
-                    Ncv32f nodeThreshold = curNode.getThreshold();\r
+                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();\r
+                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();\r
+                        Ncv32f nodeThreshold = curNode.getThreshold();\r
 \r
-                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
-                    NcvBool nextNodeIsLeaf;\r
+                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;\r
+                        NcvBool nextNodeIsLeaf;\r
 \r
-                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)\r
-                    {\r
-                        nextNodeDescriptor = nodeLeft;\r
-                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();\r
-                    }\r
-                    else\r
-                    {\r
-                        nextNodeDescriptor = nodeRight;\r
-                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();\r
-                    }\r
+                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)\r
+                        {\r
+                            nextNodeDescriptor = nodeLeft;\r
+                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();\r
+                        }\r
+                        else\r
+                        {\r
+                            nextNodeDescriptor = nodeRight;\r
+                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();\r
+                        }\r
 \r
-                    if (nextNodeIsLeaf)\r
-                    {\r
-                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
-                        curStageSum += tmpLeafValue;\r
-                        bMoreNodesToTraverse = false;\r
-                    }\r
-                    else\r
-                    {\r
-                        iNode = nextNodeDescriptor.getNextNodeOffset();\r
+                        if (nextNodeIsLeaf)\r
+                        {\r
+                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();\r
+                            curStageSum += tmpLeafValue;\r
+                            bMoreNodesToTraverse = false;\r
+                        }\r
+                        else\r
+                        {\r
+                            iNode = nextNodeDescriptor.getNextNodeOffset();\r
+                        }\r
                     }\r
                 }\r
 \r
@@ -481,7 +429,6 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
             {\r
                 bPass = false;\r
                 outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;\r
-                break;\r
             }\r
         }\r
     }\r
@@ -1100,7 +1047,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
 \r
     NcvBool bTexCacheCascade = devProp.major < 2;\r
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far\r
-    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);\r
+    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);\r
 \r
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;\r
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;\r
index 3370fbc..4d9d380 100644 (file)
@@ -116,7 +116,7 @@ int main(int argc, char** argv)
     TS::ptr()->init("gpu");\r
     InitGoogleTest(&argc, argv);\r
 \r
-    const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }";\r
+    const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";\r
 \r
     CommandLineParser parser(argc, (const char**)argv, keys);\r
 \r
index 3142f68..4c4aa6d 100644 (file)
@@ -84,7 +84,7 @@ struct NVidiaTest : TestWithParam<cv::gpu::DeviceInfo>
 struct NPPST : NVidiaTest {};\r
 struct NCV : NVidiaTest {};\r
 \r
-OutputLevel nvidiaTestOutputLevel = OutputLevelNone;\r
+OutputLevel nvidiaTestOutputLevel = OutputLevelCompact;\r
 \r
 TEST_P(NPPST, Integral)\r
 {\r