// Only threads whose tid is below N / 2 run this merge sequence.
if (tid < N / 2)
{
// Stride i starts at N / 2 and is halved after every pass; the loop stops once
// i /= 2 produces 0. Each pass calls merge(smem, val, tid, i, op).
// The unroll request is emitted only when __CUDA_ARCH__ >= 200; an undefined
// __CUDA_ARCH__ evaluates to 0 in the #if, so the pragma is dropped in that case.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (unsigned int i = N / 2; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
}
// Only threads whose laneId is below 16 run this merge sequence.
if (laneId < 16)
{
// Fixed stride schedule 16, 8, 4, 2, 1. Note that merge is still indexed by
// tid, not by laneId.
// The unroll request is emitted only when __CUDA_ARCH__ >= 200.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (int i = 16; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
}
// First preprocessor branch (its opening #if is outside this view):
// stride i halves from M / 2 down to 1 and each pass calls mergeShfl, which
// receives val, the stride, M and op but neither smem nor tid.
// NOTE(review): presumably a warp-shuffle based path -- confirm against mergeShfl.
for (int i = M / 2; i >= 1; i /= 2)
mergeShfl(val, i, M, op);
#else
// Alternative branch: same stride schedule, but each pass goes through
// merge(smem, val, tid, i, op). Unrolling is requested only for __CUDA_ARCH__ >= 200.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (int i = M / 2; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
#endif
// Key/value variant: only threads whose tid is below N / 2 run this merge sequence.
if (tid < N / 2)
{
// Stride i starts at N / 2 and is halved after every pass; the loop stops once
// i /= 2 produces 0. Each pass calls merge with the key storage (skeys, key),
// the value storage (svals, val) and the comparator cmp.
// The unroll request is emitted only when __CUDA_ARCH__ >= 200.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (unsigned int i = N / 2; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
}
// Key/value variant: only threads whose laneId is below 16 run this merge sequence.
if (laneId < 16)
{
// Fixed stride schedule 16, 8, 4, 2, 1; merge is indexed by tid, not laneId,
// and receives the comparator cmp.
// The unroll request is emitted only when __CUDA_ARCH__ >= 200.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (int i = 16; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
}
// First preprocessor branch (its opening #if is outside this view):
// stride i halves from M / 2 down to 1 and each pass calls mergeShfl on the
// key/value pair. The comparator passed here must be cmp, the same object the
// #else branch below hands to merge; no other comparator name exists in this code.
// NOTE(review): presumably a warp-shuffle based path -- confirm against mergeShfl.
for (unsigned int i = M / 2; i >= 1; i /= 2)
mergeShfl(key, val, cmp, i, M);
#else
// Alternative branch: same stride schedule, but each pass goes through
// merge with skeys/svals and tid. Unrolling is requested only for __CUDA_ARCH__ >= 200.
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (unsigned int i = M / 2; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
#endif
float bestx = 0, besty = 0, best_mod = 0;
+ #if __CUDA_ARCH__ >= 200
#pragma unroll
+ #endif
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;