int lid = get_local_id(0);
__global HT * hist = (__global HT *)(histptr + hist_offset);
-
+#if WGS >= BINS
+ HT res = (HT)(0);
+#else
#pragma unroll
for (int i = lid; i < BINS; i += WGS)
- hist[i] = ghist[i];
- barrier(CLK_LOCAL_MEM_FENCE);
+ hist[i] = (HT)(0);
+#endif
#pragma unroll
- for (int i = 1; i < HISTS_COUNT; ++i)
+ for (int i = 0; i < HISTS_COUNT; ++i)
{
- ghist += BINS;
#pragma unroll
for (int j = lid; j < BINS; j += WGS)
+#if WGS >= BINS
+ res += convertToHT(ghist[j]);
+#else
hist[j] += convertToHT(ghist[j]);
- barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+ ghist += BINS;
}
+
+#if WGS >= BINS
+ if (lid < BINS)
+ *(__global HT *)(histptr + mad24(lid, hist_step, hist_offset)) = res;
+#endif
}
__kernel void calcLUT(__global uchar * dst, __constant int * hist, int total)