\r
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()\r
//assuming size <= WARP_SIZE and size is power of 2\r
-template <class T>\r
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)\r
+//template <class T>\r
+//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)\r
+//{\r
+// Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
+// s_Data[pos] = 0;\r
+// pos += K_WARP_SIZE;\r
+// s_Data[pos] = idata;\r
+//\r
+// s_Data[pos] += s_Data[pos - 1];\r
+// s_Data[pos] += s_Data[pos - 2];\r
+// s_Data[pos] += s_Data[pos - 4];\r
+// s_Data[pos] += s_Data[pos - 8];\r
+// s_Data[pos] += s_Data[pos - 16];\r
+//\r
+// return s_Data[pos];\r
+//}\r
+\r
+\r
+//template <class T>\r
+//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)\r
+//{\r
+// return warpScanInclusive(idata, s_Data) - idata;\r
+//}\r
+//\r
+//\r
+//template <class T, Ncv32u tiNumScanThreads>\r
+//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)\r
+//{\r
+// if (tiNumScanThreads > K_WARP_SIZE)\r
+// {\r
+// //Bottom-level inclusive warp scan\r
+// T warpResult = warpScanInclusive(idata, s_Data);\r
+//\r
+// //Save top elements of each warp for exclusive warp scan\r
+// //sync to wait for warp scans to complete (because s_Data is being overwritten)\r
+// __syncthreads();\r
+// if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
+// {\r
+// s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
+// }\r
+//\r
+// //wait for warp scans to complete\r
+// __syncthreads();\r
+//\r
+// if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
+// {\r
+// //grab top warp elements\r
+// T val = s_Data[threadIdx.x];\r
+// //calculate exclusive scan and write back to shared memory\r
+// s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
+// }\r
+//\r
+// //return updated warp scans with exclusive scan results\r
+// __syncthreads();\r
+// return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];\r
+// }\r
+// else\r
+// {\r
+// return warpScanInclusive(idata, s_Data);\r
+// }\r
+//}\r
+\r
+template <Ncv32u size>\r
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)\r
{\r
- Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));\r
+ Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));\r
s_Data[pos] = 0;\r
- pos += K_WARP_SIZE;\r
+ pos += size;\r
s_Data[pos] = idata;\r
\r
- s_Data[pos] += s_Data[pos - 1];\r
- s_Data[pos] += s_Data[pos - 2];\r
- s_Data[pos] += s_Data[pos - 4];\r
- s_Data[pos] += s_Data[pos - 8];\r
- s_Data[pos] += s_Data[pos - 16];\r
+ for(Ncv32u offset = 1; offset < size; offset <<= 1)\r
+ s_Data[pos] += s_Data[pos - offset];\r
\r
return s_Data[pos];\r
}\r
\r
-\r
-template <class T>\r
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)\r
+template <Ncv32u size>\r
+__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
{\r
- return warpScanInclusive(idata, s_Data) - idata;\r
+ return warpScanInclusive<size>(idata, s_Data) - idata;\r
}\r
\r
-\r
-template <class T, Ncv32u tiNumScanThreads>\r
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)\r
+template <Ncv32u size, Ncv32u tiNumScanThreads>\r
+__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)\r
{\r
- if (tiNumScanThreads > K_WARP_SIZE)\r
+ if(size > K_WARP_SIZE)\r
{\r
//Bottom-level inclusive warp scan\r
- T warpResult = warpScanInclusive(idata, s_Data);\r
+ Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);\r
\r
//Save top elements of each warp for exclusive warp scan\r
//sync to wait for warp scans to complete (because s_Data is being overwritten)\r
__syncthreads();\r
if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )\r
- {\r
s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;\r
- }\r
\r
//wait for warp scans to complete\r
__syncthreads();\r
-\r
if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )\r
{\r
//grab top warp elements\r
- T val = s_Data[threadIdx.x];\r
- //calculate exclusive scan and write back to shared memory\r
- s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);\r
+ Ncv32u val = s_Data[threadIdx.x];\r
+ //calculate exclusive scan and write back to shared memory\r
+ s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);\r
}\r
\r
        //return updated warp scans with exclusive scan results\r
+        __syncthreads();\r
+        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];\r
    }\r
else\r
{\r
- return warpScanInclusive(idata, s_Data);\r
+ return warpScanInclusive<size>(idata, s_Data);\r
}\r
}\r
\r
__device__ Ncv32u d_outMaskPosition;\r
\r
\r
-__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,\r
- Ncv32u threadElem,\r
- Ncv32u *vectorOut)\r
+__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)\r
{\r
#if __CUDA_ARCH__ >= 110\r
- Ncv32u passMaskElem = threadPassFlag ? 1 : 0;\r
+ \r
__shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];\r
- Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);\r
- __syncthreads();\r
- Ncv32u excScan = incScan - passMaskElem;\r
-\r
__shared__ Ncv32u numPassed;\r
__shared__ Ncv32u outMaskOffset;\r
+\r
+ Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);\r
+ __syncthreads();\r
+\r
if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)\r
{\r
numPassed = incScan;\r
outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);\r
}\r
- __syncthreads();\r
\r
if (threadPassFlag)\r
{\r
+ Ncv32u excScan = incScan - threadPassFlag;\r
shmem[excScan] = threadElem;\r
}\r
+\r
__syncthreads();\r
\r
if (threadIdx.x < numPassed)\r
\r
NcvBool bTexCacheCascade = devProp.major < 2;\r
NcvBool bTexCacheIImg = true; //this works better even on Fermi so far\r
- NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);\r
+ NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);\r
\r
NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;\r
NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;\r
std::vector<HaarClassifierNode128> &haarClassifierNodes,\r
std::vector<HaarFeature64> &haarFeatures)\r
{\r
+ size_t readCount;\r
FILE *fp = fopen(filename.c_str(), "rb");\r
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
Ncv32u fileVersion;\r
- ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
+ readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
Ncv32u fsize;\r
- ncvAssertReturn(1 == fread(&fsize, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
+ readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
fseek(fp, 0, SEEK_END);\r
Ncv32u fsizeActual = ftell(fp);\r
ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);\r
fdata.resize(fsize);\r
Ncv32u dataOffset = 0;\r
fseek(fp, 0, SEEK_SET);\r
- ncvAssertReturn(1 == fread(&fdata[0], fsize, 1, fp), NCV_FILE_ERROR);\r
+ readCount = fread(&fdata[0], fsize, 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
fclose(fp);\r
\r
//data\r
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,\r
Ncv32u &numNodes, Ncv32u &numFeatures)\r
{\r
+ size_t readCount;\r
NCVStatus ncvStat;\r
\r
std::string fext = filename.substr(filename.find_last_of(".") + 1);\r
FILE *fp = fopen(filename.c_str(), "rb");\r
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);\r
Ncv32u fileVersion;\r
- ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
+ readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);\r
fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);\r
Ncv32u tmp;\r
- ncvAssertReturn(1 == fread(&numStages, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
- ncvAssertReturn(1 == fread(&tmp, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
- ncvAssertReturn(1 == fread(&numNodes, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
- ncvAssertReturn(1 == fread(&numFeatures, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);\r
+ readCount = fread(&numStages, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
+ readCount = fread(&tmp, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
+ readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
+ readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);\r
+ ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);\r
fclose(fp);\r
}\r
else if (fext == "xml")\r