int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1)*PIX_PER_WI_Y;
+ int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < cols)
{
+ int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+ int src_index1 = mad24(rows - y0 - 1, src_step, mad24(x, TSIZE, src_offset));
+ int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+ int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(x, TSIZE, dst_offset));
+
#pragma unroll
- for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+ for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x, TSIZE, src_offset)));
+ T src0 = loadpix(srcptr + src_index0);
+ T src1 = loadpix(srcptr + src_index1);
+
+ storepix(src1, dstptr + dst_index0);
+ storepix(src0, dstptr + dst_index1);
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
- storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x, TSIZE, dst_offset)));
+ src_index0 += src_step;
+ src_index1 -= src_step;
+ dst_index0 += dst_step;
+ dst_index1 -= dst_step;
}
}
}
int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1)*PIX_PER_WI_Y;
+ int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < cols)
{
- int x1 = cols - x - 1;
+ int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+ int src_index1 = mad24(rows - y0 - 1, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+ int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+ int dst_index1 = mad24(rows - y0 - 1, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
+
#pragma unroll
- for (int cy = 0; cy < PIX_PER_WI_Y && y < thread_rows; ++cy, ++y)
+ for (int y = y0, y1 = min(thread_rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(rows - y - 1, src_step, mad24(x1, TSIZE, src_offset)));
+ T src0 = loadpix(srcptr + src_index0);
+ T src1 = loadpix(srcptr + src_index1);
- storepix(src0, dstptr + mad24(rows - y - 1, dst_step, mad24(x1, TSIZE, dst_offset)));
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ storepix(src1, dstptr + dst_index0);
+ storepix(src0, dstptr + dst_index1);
+
+ src_index0 += src_step;
+ src_index1 -= src_step;
+ dst_index0 += dst_step;
+ dst_index1 -= dst_step;
}
}
}
int rows, int cols, int thread_rows, int thread_cols)
{
int x = get_global_id(0);
- int y = get_global_id(1)*PIX_PER_WI_Y;
+ int y0 = get_global_id(1)*PIX_PER_WI_Y;
if (x < thread_cols)
{
- int x1 = cols - x - 1;
+ int src_index0 = mad24(y0, src_step, mad24(x, TSIZE, src_offset));
+ int src_index1 = mad24(y0, src_step, mad24(cols - x - 1, TSIZE, src_offset));
+ int dst_index0 = mad24(y0, dst_step, mad24(x, TSIZE, dst_offset));
+ int dst_index1 = mad24(y0, dst_step, mad24(cols - x - 1, TSIZE, dst_offset));
+
#pragma unroll
- for (int cy = 0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)
+ for (int y = y0, y1 = min(rows, y0 + PIX_PER_WI_Y); y < y1; ++y)
{
- T src0 = loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)));
- T src1 = loadpix(srcptr + mad24(y, src_step, mad24(x1, TSIZE, src_offset)));
+ T src0 = loadpix(srcptr + src_index0);
+ T src1 = loadpix(srcptr + src_index1);
+
+ storepix(src1, dstptr + dst_index0);
+ storepix(src0, dstptr + dst_index1);
- storepix(src0, dstptr + mad24(y, dst_step, mad24(x1, TSIZE, dst_offset)));
- storepix(src1, dstptr + mad24(y, dst_step, mad24(x, TSIZE, dst_offset)));
+ src_index0 += src_step;
+ src_index1 += src_step;
+ dst_index0 += dst_step;
+ dst_index1 += dst_step;
}
}
}