From 437927b7bbea7f8bf0c0a674cc6754521bf5e05d Mon Sep 17 00:00:00 2001
From: Elena Gvozdeva
Date: Tue, 27 May 2014 17:27:24 +0400
Subject: [PATCH] optimized index access

---
Notes:
    Reviewer commentary placed after the three-dash separator; it is not
    part of the commit message and not part of the diff below.

    In all three kernels touched here (arithm_flip_rows,
    arithm_flip_rows_cols, arithm_flip_cols) the offsets of the two source
    pixels and the two destination pixels are now computed with mad24()
    once, before the row loop, and then advanced by src_step / dst_step on
    every iteration, instead of being recomputed with mad24() for each row.

    * arithm_flip_rows, arithm_flip_rows_cols: the "0" indices start at
      row y0 and move forward, the "1" indices start at row rows - y0 - 1
      and move backward (hence the "-=" updates).
    * arithm_flip_cols: both index pairs start at row y0 and move forward
      together (all "+="); the "1" indices address column cols - x - 1.
    * The separate counter cy is dropped; the loop runs y from y0 up to
      min(limit, y0 + PIX_PER_WI_Y), where limit is thread_rows
      (rows in arithm_flip_cols).
    * The local x1 is removed; cols - x - 1 is written directly in the
      index setup.
    * In arithm_flip_rows_cols and arithm_flip_cols the two storepix()
      calls change order, but each loaded pixel is still stored to the
      same destination position as before.

 modules/core/src/opencl/flip.cl | 68 +++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 20 deletions(-)

diff --git a/modules/core/src/opencl/flip.cl b/modules/core/src/opencl/flip.cl
index c81dd43..cf51882 100644
--- a/modules/core/src/opencl/flip.cl
+++ b/modules/core/src/opencl/flip.cl
@@ -54,18 +54,28 @@ __kernel void arithm_flip_rows(__global const uchar * srcptr, int src_step, int
                                int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;
 
     if (x < cols)
     {
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));
+
         #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+        for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
         {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
+
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
 
-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
-            storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
+            src_index0 += src_step;
+            src_index1 -= src_step;
+            dst_index0 += dst_step;
+            dst_index1 -= dst_step;
         }
     }
 }
@@ -75,19 +85,28 @@ __kernel void arithm_flip_rows_cols(__global const uchar * srcptr, int src_step,
                                     int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;
 
     if (x < cols)
     {
-        int x1 = cols - x - 1;
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
+
         #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+        for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
         {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
 
-            storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
+
+            src_index0 += src_step;
+            src_index1 -= src_step;
+            dst_index0 += dst_step;
+            dst_index1 -= dst_step;
         }
     }
 }
@@ -97,19 +116,28 @@ __kernel void arithm_flip_cols(__global const uchar * srcptr, int src_step, int
                                int rows, int cols, int thread_rows, int thread_cols)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1)*PIX_PER_WI_Y;
+    int y0 = get_global_id(1)*PIX_PER_WI_Y;
 
     if (x < thread_cols)
     {
-        int x1 = cols - x - 1;
+        int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+        int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+        int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+        int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
+
         #pragma unroll
-        for (int cy = 0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)
+        for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
         {
-            T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
-            T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
+            T src0 = loadpix(srcptr + src_index0);
+            T src1 = loadpix(srcptr + src_index1);
+
+            storepix(src1, dstptr + dst_index0);
+            storepix(src0, dstptr + dst_index1);
 
-            storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
-            storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+            src_index0 += src_step;
+            src_index1 += src_step;
+            dst_index0 += dst_step;
+            dst_index1 += dst_step;
         }
     }
 }
-- 
2.7.4