From 5aae21c037576ce488fa7be5eb700c186f875167 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 28 Mar 2012 14:25:41 +0000 Subject: [PATCH] fixed bug #1640 --- modules/gpu/src/nvidia/NCVHaarObjectDetection.cu | 219 +++++++++-------------- modules/gpu/test/main.cpp | 2 +- modules/gpu/test/test_nvidia.cpp | 2 +- 3 files changed, 85 insertions(+), 138 deletions(-) diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu index c4e70a4..fded861 100644 --- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu @@ -77,110 +77,52 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of th //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() //assuming size <= WARP_SIZE and size is power of 2 -//template -//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) -//{ -// Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); -// s_Data[pos] = 0; -// pos += K_WARP_SIZE; -// s_Data[pos] = idata; -// -// s_Data[pos] += s_Data[pos - 1]; -// s_Data[pos] += s_Data[pos - 2]; -// s_Data[pos] += s_Data[pos - 4]; -// s_Data[pos] += s_Data[pos - 8]; -// s_Data[pos] += s_Data[pos - 16]; -// -// return s_Data[pos]; -//} - - -//template -//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data) -//{ -// return warpScanInclusive(idata, s_Data) - idata; -//} -// -// -//template -//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data) -//{ -// if (tiNumScanThreads > K_WARP_SIZE) -// { -// //Bottom-level inclusive warp scan -// T warpResult = warpScanInclusive(idata, s_Data); -// -// //Save top elements of each warp for exclusive warp scan -// //sync to wait for warp scans to complete (because s_Data is being overwritten) -// __syncthreads(); -// if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) -// { -// s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; -// } -// -// //wait for warp scans to complete -// __syncthreads(); -// -// if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) -// { -// //grab top warp elements -// T val = s_Data[threadIdx.x]; -// //calculate exclusive scan and write back to shared memory -// s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); -// } -// -// //return updated warp scans with exclusive scan results -// __syncthreads(); -// return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; -// } -// else -// { -// return warpScanInclusive(idata, s_Data); -// } -//} - -template -__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data) +__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data) { - Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); s_Data[pos] = 0; - pos += size; + pos += K_WARP_SIZE; s_Data[pos] = idata; - for(Ncv32u offset = 1; offset < size; offset <<= 1) - s_Data[pos] += s_Data[pos - offset]; + s_Data[pos] += s_Data[pos - 1]; + s_Data[pos] += s_Data[pos - 2]; + s_Data[pos] += s_Data[pos - 4]; + s_Data[pos] += s_Data[pos - 8]; + s_Data[pos] += s_Data[pos - 16]; return s_Data[pos]; } -template -__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data) +__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data) { - return warpScanInclusive(idata, s_Data) - idata; + return warpScanInclusive(idata, s_Data) - idata; } -template +template __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data) { - if(size > K_WARP_SIZE) + if (tiNumScanThreads > K_WARP_SIZE) { //Bottom-level inclusive warp scan - Ncv32u warpResult = warpScanInclusive(idata, s_Data); + Ncv32u warpResult = warpScanInclusive(idata, s_Data); //Save top elements of each warp for exclusive warp scan //sync to wait for warp scans to complete (because s_Data is being overwritten) __syncthreads(); if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) + { s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; + } //wait for warp scans to complete __syncthreads(); + if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) { //grab top warp elements Ncv32u val = s_Data[threadIdx.x]; - //calculate exclsive scan and write back to shared memory - s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data); + //calculate exclusive scan and write back to shared memory + s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); } //return updated warp scans with exclusive scan results @@ -189,7 +131,7 @@ __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data) } else { - return warpScanInclusive(idata, s_Data); + return warpScanInclusive(idata, s_Data); } } @@ -295,7 +237,7 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u __shared__ Ncv32u numPassed; __shared__ Ncv32u outMaskOffset; - Ncv32u incScan = scan1Inclusive(threadPassFlag, shmem); + Ncv32u incScan = scan1Inclusive(threadPassFlag, shmem); __syncthreads(); if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1) @@ -391,11 +333,14 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr NcvBool bPass = true; - if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) + if (!tbDoAtomicCompaction || tbDoAtomicCompaction) { - Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs]; + Ncv32f pixelStdDev = 0.0f; - for (Ncv32u iStage = startStageInc; iStage(iNode, d_ClassifierNodes); - HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); - Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); - Ncv32u iFeature = featuresDesc.getFeaturesOffset(); - - Ncv32f curNodeVal = 0.0f; - - for (Ncv32u iRect=0; iRect - (iFeature + iRect, d_Features, - &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); - - Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); - Ncv32u iioffsTR = iioffsTL + rectWidth; - Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; - Ncv32u iioffsBR = iioffsBL + rectWidth; + HaarClassifierNode128 curNode = getClassifierNode(iNode, d_ClassifierNodes); + HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); + Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); + Ncv32u iFeature = featuresDesc.getFeaturesOffset(); - Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - - getElemIImg(iioffsBL, d_IImg) + - getElemIImg(iioffsTL, d_IImg) - - getElemIImg(iioffsTR, d_IImg); + Ncv32f curNodeVal = 0.0f; -#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY - curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); -#else - curNodeVal += (Ncv32f)rectSum * rectWeight; -#endif - } + for (Ncv32u iRect=0; iRect + (iFeature + iRect, d_Features, + &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); + + Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); + Ncv32u iioffsTR = iioffsTL + rectWidth; + Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; + Ncv32u iioffsBR = iioffsBL + rectWidth; + + Ncv32u rectSum = getElemIImg(iioffsBR, d_IImg) - + getElemIImg(iioffsBL, d_IImg) + + getElemIImg(iioffsTL, d_IImg) - + getElemIImg(iioffsTR, d_IImg); + + #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY + curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); + #else + curNodeVal += (Ncv32f)rectSum * rectWeight; + #endif + } - HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); - HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); - Ncv32f nodeThreshold = curNode.getThreshold(); + HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); + HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); + Ncv32f nodeThreshold = curNode.getThreshold(); - HaarClassifierNodeDescriptor32 nextNodeDescriptor; - NcvBool nextNodeIsLeaf; + HaarClassifierNodeDescriptor32 nextNodeDescriptor; + NcvBool nextNodeIsLeaf; - if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) - { - nextNodeDescriptor = nodeLeft; - nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf(); - } - else - { - nextNodeDescriptor = nodeRight; - nextNodeIsLeaf = featuresDesc.isRightNodeLeaf(); - } + if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) + { + nextNodeDescriptor = nodeLeft; + nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf(); + } + else + { + nextNodeDescriptor = nodeRight; + nextNodeIsLeaf = featuresDesc.isRightNodeLeaf(); + } - if (nextNodeIsLeaf) - { - Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); - curStageSum += tmpLeafValue; - bMoreNodesToTraverse = false; - } - else - { - iNode = nextNodeDescriptor.getNextNodeOffset(); + if (nextNodeIsLeaf) + { + Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); + curStageSum += tmpLeafValue; + bMoreNodesToTraverse = false; + } + else + { + iNode = nextNodeDescriptor.getNextNodeOffset(); + } } } @@ -481,7 +429,6 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr { bPass = false; outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U; - break; } } } @@ -1100,7 +1047,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag NcvBool bTexCacheCascade = devProp.major < 2; NcvBool bTexCacheIImg = true; //this works better even on Fermi so far - NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3); + NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3); NCVVector *d_ptrNowData = &d_vecPixelMask; NCVVector *d_ptrNowTmp = &d_vecPixelMaskTmp; diff --git a/modules/gpu/test/main.cpp b/modules/gpu/test/main.cpp index 3370fbc..4d9d380 100644 --- a/modules/gpu/test/main.cpp +++ b/modules/gpu/test/main.cpp @@ -116,7 +116,7 @@ int main(int argc, char** argv) TS::ptr()->init("gpu"); InitGoogleTest(&argc, argv); - const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }"; + const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }"; CommandLineParser parser(argc, (const char**)argv, keys); diff --git a/modules/gpu/test/test_nvidia.cpp b/modules/gpu/test/test_nvidia.cpp index 3142f68..4c4aa6d 100644 --- a/modules/gpu/test/test_nvidia.cpp +++ b/modules/gpu/test/test_nvidia.cpp @@ -84,7 +84,7 @@ struct NVidiaTest : TestWithParam struct NPPST : NVidiaTest {}; struct NCV : NVidiaTest {}; -OutputLevel nvidiaTestOutputLevel = OutputLevelNone; +OutputLevel nvidiaTestOutputLevel = OutputLevelCompact; TEST_P(NPPST, Integral) { -- 2.7.4