//==============================================================================\r
\r
\r
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive\r
+\r
+\r
//Almost the same as the naive scan1Inclusive, but doesn't need __syncthreads().\r
//Assumes size <= WARP_SIZE and that size is a power of 2.\r
template <class T>\r
pos += K_WARP_SIZE;\r
s_Data[pos] = idata;\r
\r
- for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
- {\r
- s_Data[pos] += s_Data[pos - offset];\r
- }\r
+ //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+ //{\r
+ // s_Data[pos] += s_Data[pos - offset];\r
+ //}\r
+\r
+ s_Data[pos] += s_Data[pos - 1];\r
+ s_Data[pos] += s_Data[pos - 2];\r
+ s_Data[pos] += s_Data[pos - 4];\r
+ s_Data[pos] += s_Data[pos - 8];\r
+ s_Data[pos] += s_Data[pos - 16];\r
\r
return s_Data[pos];\r
}\r
//==============================================================================\r
\r
\r
+NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive\r
+\r
+\r
//Almost the same as the naive scan1Inclusive, but doesn't need __syncthreads().\r
//Assumes size <= WARP_SIZE and that size is a power of 2.\r
template <class T>\r
pos += K_WARP_SIZE;\r
s_Data[pos] = idata;\r
\r
- for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
- {\r
- s_Data[pos] += s_Data[pos - offset];\r
- }\r
+ //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)\r
+ //{\r
+ // s_Data[pos] += s_Data[pos - offset];\r
+ //}\r
+\r
+ s_Data[pos] += s_Data[pos - 1];\r
+ s_Data[pos] += s_Data[pos - 2];\r
+ s_Data[pos] += s_Data[pos - 4];\r
+ s_Data[pos] += s_Data[pos - 8];\r
+ s_Data[pos] += s_Data[pos - 16];\r
\r
return s_Data[pos];\r
}\r