}
};
+ template <unsigned int I, typename Pointer, typename Reference, class Op>
+ struct Unroll
+ {
+ static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+ {
+ mergeShfl(val, I, N, op);
+ Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+ }
+ static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+ {
+ merge(smem, val, tid, I, op);
+ Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+ }
+ };
+ template <typename Pointer, typename Reference, class Op>
+ struct Unroll<0, Pointer, Reference, Op>
+ {
+ static __device__ void loopShfl(Reference, Op, unsigned int)
+ {
+ }
+ static __device__ void loop(Pointer, Reference, unsigned int, Op)
+ {
+ }
+ };
+
template <unsigned int N> struct WarpOptimized
{
template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{
- #if __CUDA_ARCH >= 300
+ #if __CUDA_ARCH__ >= 300
(void) smem;
(void) tid;
- #pragma unroll
- for (unsigned int i = N / 2; i >= 1; i /= 2)
- mergeShfl(val, i, N, op);
+ Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
#else
loadToSmem(smem, val, tid);
if (tid < N / 2)
- {
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (unsigned int i = N / 2; i >= 1; i /= 2)
- merge(smem, val, tid, i, op);
- }
+ Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
};
{
const unsigned int laneId = Warp::laneId();
- #if __CUDA_ARCH >= 300
- #pragma unroll
- for (int i = 16; i >= 1; i /= 2)
- mergeShfl(val, i, warpSize, op);
+ #if __CUDA_ARCH__ >= 300
+ Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
if (laneId == 0)
loadToSmem(smem, val, tid / 32);
loadToSmem(smem, val, tid);
if (laneId < 16)
- {
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (int i = 16; i >= 1; i /= 2)
- merge(smem, val, tid, i, op);
- }
+ Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
__syncthreads();
if (tid < 32)
{
- #if __CUDA_ARCH >= 300
- #pragma unroll
- for (int i = M / 2; i >= 1; i /= 2)
- mergeShfl(val, i, M, op);
+ #if __CUDA_ARCH__ >= 300
+ Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
#else
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (int i = M / 2; i >= 1; i /= 2)
- merge(smem, val, tid, i, op);
+ Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#endif
}
}
}
};
+ template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
+ struct Unroll
+ {
+ static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
+ {
+ mergeShfl(key, val, cmp, I, N);
+ Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+ }
+ static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+ {
+ merge(skeys, key, svals, val, cmp, tid, I);
+ Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+ }
+ };
+ template <class KP, class KR, class VP, class VR, class Cmp>
+ struct Unroll<0, KP, KR, VP, VR, Cmp>
+ {
+ static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
+ {
+ }
+ static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
+ {
+ }
+ };
+
template <unsigned int N> struct WarpOptimized
{
template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{
- #if __CUDA_ARCH >= 300
+ #if __CUDA_ARCH__ >= 300
(void) skeys;
(void) svals;
(void) tid;
- #pragma unroll
- for (unsigned int i = N / 2; i >= 1; i /= 2)
- mergeShfl(key, val, cml, i, N);
+ Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
#else
loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid);
if (tid < N / 2)
- {
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (unsigned int i = N / 2; i >= 1; i /= 2)
- merge(skeys, key, svals, val, cmp, tid, i);
- }
+ Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
}
};
{
const unsigned int laneId = Warp::laneId();
- #if __CUDA_ARCH >= 300
- #pragma unroll
- for (unsigned int i = 16; i >= 1; i /= 2)
- mergeShfl(key, val, cml, i, warpSize);
+ #if __CUDA_ARCH__ >= 300
+ Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
if (laneId == 0)
{
loadToSmem(svals, val, tid);
if (laneId < 16)
- {
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (int i = 16; i >= 1; i /= 2)
- merge(skeys, key, svals, val, cmp, tid, i);
- }
+ Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
__syncthreads();
if (tid < 32)
{
- #if __CUDA_ARCH >= 300
+ #if __CUDA_ARCH__ >= 300
loadFromSmem(svals, val, tid);
- #pragma unroll
- for (unsigned int i = M / 2; i >= 1; i /= 2)
- mergeShfl(key, val, cml, i, M);
+ Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
#else
- #if __CUDA_ARCH__ >= 200
- #pragma unroll
- #endif
- for (unsigned int i = M / 2; i >= 1; i /= 2)
- merge(skeys, key, svals, val, cmp, tid, i);
+ Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#endif
}
}