//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= K_WARP_SIZE and size is a power of 2
-//template <class T>
-//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-//{
-//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-//    s_Data[pos] = 0;
-//    pos += K_WARP_SIZE;
-//    s_Data[pos] = idata;
-//
-//    s_Data[pos] += s_Data[pos - 1];
-//    s_Data[pos] += s_Data[pos - 2];
-//    s_Data[pos] += s_Data[pos - 4];
-//    s_Data[pos] += s_Data[pos - 8];
-//    s_Data[pos] += s_Data[pos - 16];
-//
-//    return s_Data[pos];
-//}
-
-
-//template <class T>
-//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-//{
-//    return warpScanInclusive(idata, s_Data) - idata;
-//}
-//
-//
-//template <class T, Ncv32u tiNumScanThreads>
-//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-//{
-//    if (tiNumScanThreads > K_WARP_SIZE)
-//    {
-//        //Bottom-level inclusive warp scan
-//        T warpResult = warpScanInclusive(idata, s_Data);
-//
-//        //Save top elements of each warp for exclusive warp scan
-//        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-//        __syncthreads();
-//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-//        {
-//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-//        }
-//
-//        //wait for warp scans to complete
-//        __syncthreads();
-//
-//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-//        {
-//            //grab top warp elements
-//            T val = s_Data[threadIdx.x];
-//            //calculate exclusive scan and write back to shared memory
-//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-//        }
-//
-//        //return updated warp scans with exclusive scan results
-//        __syncthreads();
-//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-//    }
-//    else
-//    {
-//        return warpScanInclusive(idata, s_Data);
-//    }
-//}
-
-template <Ncv32u size>
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
-    pos += size;
+    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

-    for(Ncv32u offset = 1; offset < size; offset <<= 1)
-        s_Data[pos] += s_Data[pos - offset];
+    s_Data[pos] += s_Data[pos - 1];
+    s_Data[pos] += s_Data[pos - 2];
+    s_Data[pos] += s_Data[pos - 4];
+    s_Data[pos] += s_Data[pos - 8];
+    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
}
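
// A minimal usage sketch (an illustrative assumption, not part of this patch):
// launch with a single warp of K_WARP_SIZE threads; the shared buffer needs
// 2 * K_WARP_SIZE entries because the scan zero-pads the lower half to avoid
// bounds checks during the offset accumulation.
//
// __global__ void warpScanDemo(Ncv32u *d_out)          //hypothetical kernel
// {
//     __shared__ Ncv32u s_scan[2 * K_WARP_SIZE];
//     Ncv32u inclusive = warpScanInclusive(1, s_scan); //all-ones input
//     d_out[threadIdx.x] = inclusive;                  //yields laneId + 1
// }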
\r
-template <Ncv32u size>
-__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
+__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
-    return warpScanInclusive<size>(idata, s_Data) - idata;
+    return warpScanInclusive(idata, s_Data) - idata;
}
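
// For example (illustration only): with idata == 1 in every lane,
// warpScanExclusive returns the lane index.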
\r
-template <Ncv32u size, Ncv32u tiNumScanThreads>
+template <Ncv32u tiNumScanThreads>
__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
-    if(size > K_WARP_SIZE)
+    if (tiNumScanThreads > K_WARP_SIZE)
    {
        //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);
+        Ncv32u warpResult = warpScanInclusive(idata, s_Data);

        //Save top elements of each warp for exclusive warp scan
        //sync to wait for warp scans to complete (because s_Data is being overwritten)
        __syncthreads();
        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+        {
            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+        }

        //wait for warp scans to complete
        __syncthreads();
+
        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
        {
            //grab top warp elements
            Ncv32u val = s_Data[threadIdx.x];
-            //calculate exclsive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);
+            //calculate exclusive scan and write back to shared memory
+            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();
        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
    }
    else
    {
-        return warpScanInclusive<size>(idata, s_Data);
+        return warpScanInclusive(idata, s_Data);
    }
}
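
// Note (an inference from the indexing above, not stated in the patch):
// s_Data must hold at least 2 * tiNumScanThreads elements, since each
// per-warp scan writes into a zero-padded window twice the warp width.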
\r
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)

    NcvBool bPass = true;

-    if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
+    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
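+    // As written, this condition is always true. Presumably every thread must
+    // enter the block so the __syncthreads() calls that follow execute
+    // uniformly once atomic compaction is enabled; inactive threads are
+    // filtered out inside via bInactiveThread (an inference, not stated here).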
    {
-        Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
+        Ncv32f pixelStdDev = 0.0f;

-        for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
+        if (!bInactiveThread)
+            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
+
+        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            NcvBool bMoreNodesToTraverse = true;
            Ncv32u iNode = curRootNodeOffset;

-            while (bMoreNodesToTraverse)
+            if (bPass && !bInactiveThread)
            {
-                HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
-                HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
-                Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
-                Ncv32u iFeature = featuresDesc.getFeaturesOffset();
-
-                Ncv32f curNodeVal = 0.0f;
-
-                for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
+                while (bMoreNodesToTraverse)
                {
-                    Ncv32f rectWeight;
-                    Ncv32u rectX, rectY, rectWidth, rectHeight;
-                    getFeature<tbCacheTextureCascade>
-                        (iFeature + iRect, d_Features,
-                        &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
-
-                    Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
-                    Ncv32u iioffsTR = iioffsTL + rectWidth;
-                    Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
-                    Ncv32u iioffsBR = iioffsBL + rectWidth;
+                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
+                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
+                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
+                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

-                    Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
-                                     getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
-                                     getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
-                                     getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
+                    Ncv32f curNodeVal = 0.0f;

-#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
-                    curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
-#else
-                    curNodeVal += (Ncv32f)rectSum * rectWeight;
-#endif
-                }
+                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
+                    {
+                        Ncv32f rectWeight;
+                        Ncv32u rectX, rectY, rectWidth, rectHeight;
+                        getFeature<tbCacheTextureCascade>
+                            (iFeature + iRect, d_Features,
+                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
+
+                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
+                        Ncv32u iioffsTR = iioffsTL + rectWidth;
+                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
+                        Ncv32u iioffsBR = iioffsBL + rectWidth;
+
+                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
+                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
+                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
+
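+                        // __fmul_rn forces a plain round-to-nearest multiply,
+                        // preventing contraction into MAD/FMA so the GPU sum can
+                        // bit-match a CPU reference (an inference from the macro
+                        // names CPU_FP_COMPLIANCE / DISABLE_MAD_SELECTIVELY).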
+                    #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
+                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
+                    #else
+                        curNodeVal += (Ncv32f)rectSum * rectWeight;
+                    #endif
+                    }

-                HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
-                HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
-                Ncv32f nodeThreshold = curNode.getThreshold();
+                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
+                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
+                    Ncv32f nodeThreshold = curNode.getThreshold();

-                HaarClassifierNodeDescriptor32 nextNodeDescriptor;
-                NcvBool nextNodeIsLeaf;
+                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
+                    NcvBool nextNodeIsLeaf;

-                if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
-                {
-                    nextNodeDescriptor = nodeLeft;
-                    nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
-                }
-                else
-                {
-                    nextNodeDescriptor = nodeRight;
-                    nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
-                }
+                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
+                    {
+                        nextNodeDescriptor = nodeLeft;
+                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
+                    }
+                    else
+                    {
+                        nextNodeDescriptor = nodeRight;
+                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
+                    }

-                if (nextNodeIsLeaf)
-                {
-                    Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
-                    curStageSum += tmpLeafValue;
-                    bMoreNodesToTraverse = false;
-                }
-                else
-                {
-                    iNode = nextNodeDescriptor.getNextNodeOffset();
+                    if (nextNodeIsLeaf)
+                    {
+                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
+                        curStageSum += tmpLeafValue;
+                        bMoreNodesToTraverse = false;
+                    }
+                    else
+                    {
+                        iNode = nextNodeDescriptor.getNextNodeOffset();
+                    }
                }
            }

            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
-                break;
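                //(the early 'break' above is removed: the stage loop now runs
                // to completion on every thread, with bPass gating the work, so
                // all threads reach the block-wide compaction after the loop;
                // an inference - uniform control flow is needed for __syncthreads())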
            }
}\r
        }

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
-    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
+    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
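
// A minimal sketch (an assumption for illustration, not part of this patch) of
// the atomic compaction idiom that bDoAtomicCompaction enables, mirroring the
// numPassed/outMaskOffset usage above. The names compactDemo and d_demoOutCount
// are hypothetical; the block is assumed to run NUM_THREADS_ANCHORSPARALLEL threads.
//
// __device__ Ncv32u d_demoOutCount;
//
// __global__ void compactDemo(const Ncv32u *d_in, Ncv32u *d_out)
// {
//     __shared__ Ncv32u s_scan[2 * NUM_THREADS_ANCHORSPARALLEL];
//     __shared__ Ncv32u s_outOffset;
//
//     Ncv32u tid = blockIdx.x * blockDim.x + threadIdx.x;
//     Ncv32u passFlag = (d_in[tid] != 0) ? 1 : 0;
//     Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(passFlag, s_scan);
//     __syncthreads();
//
//     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL - 1)
//     {
//         //last thread holds the block total; reserve a contiguous output range
//         s_outOffset = atomicAdd(&d_demoOutCount, incScan);
//     }
//     __syncthreads();
//
//     if (passFlag)
//     {
//         d_out[s_outOffset + incScan - 1] = tid;  //incScan - 1 == exclusive rank
//     }
// }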