#define op(A, B) (*A)+(B)
data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
if (tid < 16)
+ {
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
+#if WAVE_SIZE < 16
+ }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
+ {
+#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
+#if WAVE_SIZE < 8
+ }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 4)
+ {
+#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+#if WAVE_SIZE < 4
+ }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 2)
+ {
+#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
+#if WAVE_SIZE < 2
+ }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 1)
+ {
+#endif
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
+ }
+#undef WAVE_SIZE
#undef op
}
int tid
)
{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
// first step is to reduce from 25 to 16
- if (tid < 9) // use 9 threads
+ if (tid < 9)
{
sdata1[tid] += sdata1[tid + 16];
sdata2[tid] += sdata2[tid + 16];
sdata3[tid] += sdata3[tid + 16];
sdata4[tid] += sdata4[tid + 16];
+#if WAVE_SIZE < 16
}
-
- // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
+ barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 8)
{
+#endif
sdata1[tid] += sdata1[tid + 8];
- sdata1[tid] += sdata1[tid + 4];
- sdata1[tid] += sdata1[tid + 2];
- sdata1[tid] += sdata1[tid + 1];
sdata2[tid] += sdata2[tid + 8];
- sdata2[tid] += sdata2[tid + 4];
- sdata2[tid] += sdata2[tid + 2];
- sdata2[tid] += sdata2[tid + 1];
sdata3[tid] += sdata3[tid + 8];
- sdata3[tid] += sdata3[tid + 4];
- sdata3[tid] += sdata3[tid + 2];
- sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 8];
+#if WAVE_SIZE < 8
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 4)
+ {
+#endif
+ sdata1[tid] += sdata1[tid + 4];
+ sdata2[tid] += sdata2[tid + 4];
+ sdata3[tid] += sdata3[tid + 4];
sdata4[tid] += sdata4[tid + 4];
+#if WAVE_SIZE < 4
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 2)
+ {
+#endif
+ sdata1[tid] += sdata1[tid + 2];
+ sdata2[tid] += sdata2[tid + 2];
+ sdata3[tid] += sdata3[tid + 2];
sdata4[tid] += sdata4[tid + 2];
+#if WAVE_SIZE < 2
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 1)
+ {
+#endif
+ sdata1[tid] += sdata1[tid + 1];
+ sdata2[tid] += sdata2[tid + 1];
+ sdata3[tid] += sdata3[tid + 1];
sdata4[tid] += sdata4[tid + 1];
}
+#undef WAVE_SIZE
}
__kernel
void compute_descriptors64(
IMAGE_INT8 imgTex,
- volatile __global float * descriptors,
+ __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
int keypoints_step,
sdyabs[tid] = fabs(sdy[tid]); // |dy| array
}
barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 25)
- {
+
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
- }
+
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
- volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+ __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
// write dx, dy, |dx|, |dy|
if (tid == 0)
__kernel
void compute_descriptors128(
IMAGE_INT8 imgTex,
- __global volatile float * descriptors,
+ __global float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step,
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
- //barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
- //barrier(CLK_LOCAL_MEM_FENCE);
-
- volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+ if (tid < 25)
+ {
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
{
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
- //barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
- //barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 25)
+ {
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
{
}
}
}
+void reduce_sum128(volatile __local float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+ if (tid < 64)
+ {
+ smem[tid] += smem[tid + 64];
+#if WAVE_SIZE < 64
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 32)
+ {
+#endif
+ smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16)
+ {
+#endif
+ smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 8)
+ {
+#endif
+ smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 4)
+ {
+#endif
+ smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 2)
+ {
+#endif
+ smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 1)
+ {
+#endif
+ smem[tid] += smem[tid + 1];
+ }
+}
+void reduce_sum64(volatile __local float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+ if (tid < 32)
+ {
+ smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 16)
+ {
+#endif
+ smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 8)
+ {
+#endif
+ smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 4)
+ {
+#endif
+ smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 2)
+ {
+#endif
+ smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (tid < 1)
+ {
+#endif
+ smem[tid] += smem[tid + 1];
+ }
+}
__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE);
- if (get_local_id(0) < 64)
- sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
+ reduce_sum128(sqDesc, get_local_id(0));
barrier(CLK_LOCAL_MEM_FENCE);
- // reduction to get total
- if (get_local_id(0) < 32)
- {
- volatile __local float* smem = sqDesc;
-
- smem[get_local_id(0)] += smem[get_local_id(0) + 32];
- smem[get_local_id(0)] += smem[get_local_id(0) + 16];
- smem[get_local_id(0)] += smem[get_local_id(0) + 8];
- smem[get_local_id(0)] += smem[get_local_id(0) + 4];
- smem[get_local_id(0)] += smem[get_local_id(0) + 2];
- smem[get_local_id(0)] += smem[get_local_id(0) + 1];
- }
+
// compute length (square root)
volatile __local float len;
sqDesc[get_local_id(0)] = lookup * lookup;
barrier(CLK_LOCAL_MEM_FENCE);
- // reduction to get total
- if (get_local_id(0) < 32)
- {
- volatile __local float* smem = sqDesc;
-
- smem[get_local_id(0)] += smem[get_local_id(0) + 32];
- smem[get_local_id(0)] += smem[get_local_id(0) + 16];
- smem[get_local_id(0)] += smem[get_local_id(0) + 8];
- smem[get_local_id(0)] += smem[get_local_id(0) + 4];
- smem[get_local_id(0)] += smem[get_local_id(0) + 2];
- smem[get_local_id(0)] += smem[get_local_id(0) + 1];
- }
+
+ reduce_sum64(sqDesc, get_local_id(0));
+ barrier(CLK_LOCAL_MEM_FENCE);
// compute length (square root)
volatile __local float len;