From: Ilya Lavrenov Date: Tue, 17 Jun 2014 15:24:25 +0000 (+0400) Subject: increased number of rows per work-item X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~290^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c41a13439424ad67f49b9407335e0710749541b7;p=profile%2Fivi%2Fopencv.git increased number of rows per work-item --- diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index c946afc..c6d6b1f 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3582,7 +3582,9 @@ private: static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2, int interpolation, int borderType, const Scalar& borderValue) { - int cn = _src.channels(), type = _src.type(), depth = _src.depth(); + const ocl::Device & dev = ocl::Device::getDefault(); + int cn = _src.channels(), type = _src.type(), depth = _src.depth(), + rowsPerWI = dev.isIntel() ? 4 : 1; if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1) @@ -3619,12 +3621,14 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" }; static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101", "BORDER_TRANSPARENT" }; - String buildOptions = format("-D %s -D %s -D T=%s", interMap[interpolation], borderMap[borderType], ocl::typeToStr(type)); + String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d", + interMap[interpolation], borderMap[borderType], + ocl::typeToStr(type), rowsPerWI); if (interpolation != INTER_NEAREST) { char cvt[3][40]; - int wdepth = std::max(CV_32F, dst.depth()); + int wdepth = std::max(CV_32F, depth); buildOptions = buildOptions + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s" " -D convertToWT2=%s -D WT2=%s", @@ -3653,7 +3657,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input else k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg); - size_t globalThreads[2] = { dst.cols, dst.rows }; + size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalThreads, NULL, false); } diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl index bd043c5..76b5c33 100644 --- a/modules/imgproc/src/opencl/remap.cl +++ b/modules/imgproc/src/opencl/remap.cl @@ -147,37 +147,43 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); - - T scalar = convertScalar(nVal); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset); - int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset); - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - - __global const float * map1 = (__global const float *)(map1ptr + map1_index); - __global const float * map2 = (__global const float *)(map2ptr + map2_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - int gx = convert_int_sat_rte(map1[0]); - int gy = convert_int_sat_rte(map2[0]); - - if (NEED_EXTRAPOLATION(gx, gy)) - { + T scalar = convertScalar(nVal); + + int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset)); + int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset)); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map1_index += map1_step, map2_index += map2_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const float * map1 = (__global const float *)(map1ptr + map1_index); + __global const float * map2 = (__global const float *)(map2ptr + map2_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + int gx = convert_int_sat_rte(map1[0]); + int gy = convert_int_sat_rte(map2[0]); + + if (NEED_EXTRAPOLATION(gx, gy)) + { #ifndef BORDER_CONSTANT - int2 gxy = (int2)(gx, gy); + int2 gxy = (int2)(gx, gy); #endif - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); - storepix(loadpix((__global const T*)(srcptr + src_index)), dst); - } + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } + else + { + int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); + storepix(loadpix((__global const T*)(srcptr + src_index)), dst); + } + } } } @@ -187,31 +193,36 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); - - T scalar = convertScalar(nVal); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset); - - __global const float2 * map = (__global const float2 *)(mapptr + map_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - int2 gxy = convert_int2_sat_rte(map[0]); - int gx = gxy.x, gy = gxy.y; - - if (NEED_EXTRAPOLATION(gx, gy)) - { - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); - storepix(loadpix((__global const T *)(srcptr + src_index)), dst); + T scalar = convertScalar(nVal); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map_index += map_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const float2 * map = (__global const float2 *)(mapptr + map_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + int2 gxy = convert_int2_sat_rte(map[0]); + int gx = gxy.x, gy = gxy.y; + + if (NEED_EXTRAPOLATION(gx, gy)) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } + else + { + int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); + } } } } @@ -222,32 +233,37 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); - - T scalar = convertScalar(nVal); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map_index = mad24(y, map_step, x * (int)sizeof(short2) + map_offset); - - __global const short2 * map = (__global const short2 *)(mapptr + map_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - int2 gxy = convert_int2(map[0]); - int gx = gxy.x, gy = gxy.y; - - if (NEED_EXTRAPOLATION(gx, gy)) - { - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); - storepix(loadpix((__global const T *)(srcptr + src_index)), dst); - } + T scalar = convertScalar(nVal); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map_index = mad24(y, map_step, mad24(x, (int)sizeof(short2), map_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map_index += map_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const short2 * map = (__global const short2 *)(mapptr + map_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + int2 gxy = convert_int2(map[0]); + int gx = gxy.x, gy = gxy.y; + + if (NEED_EXTRAPOLATION(gx, gy)) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } + else + { + int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); + } + } } } @@ -260,35 +276,40 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int int x = get_global_id(0); int y = get_global_id(1); - T scalar = convertScalar(nVal); - - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset); - int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset); - - __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index); - __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1); - int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0; - int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0; - int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy); - int gx = gxy.x, gy = gxy.y; - - if (NEED_EXTRAPOLATION(gx, gy)) - { - T v; - EXTRAPOLATE(gxy, v) - storepix(v, dst); - } - else - { - int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); - storepix(loadpix((__global const T *)(srcptr + src_index)), dst); - } + T scalar = convertScalar(nVal); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset)); + int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map1_index += map1_step, map2_index += map2_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index); + __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + int map2Value = convert_int(map2[0]) & (INTER_TAB_SIZE2 - 1); + int dx = (map2Value & (INTER_TAB_SIZE - 1)) < (INTER_TAB_SIZE >> 1) ? 1 : 0; + int dy = (map2Value >> INTER_BITS) < (INTER_TAB_SIZE >> 1) ? 1 : 0; + int2 gxy = convert_int2(map1[0]) + (int2)(dx, dy); + int gx = gxy.x, gy = gxy.y; + + if (NEED_EXTRAPOLATION(gx, gy)) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } + else + { + int src_index = mad24(gy, src_step, mad24(gx, TSIZE, src_offset)); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); + } + } } } @@ -301,54 +322,60 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset); - int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset); - - __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index); - __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - int2 map_dataA = convert_int2(map1[0]); - int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); - int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); - int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); - - ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1)); - WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE); - - WT scalar = convertToWT(convertScalar(nVal)); - WT a = scalar, b = scalar, c = scalar, d = scalar; - - if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataA, a); - - if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataB, b); - - if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataC, c); - - if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataD, d); - - WT dst_data = a * (1 - u.x) * (1 - u.y) + - b * (u.x) * (1 - u.y) + - c * (1 - u.x) * (u.y) + - d * (u.x) * (u.y); - storepix(convertToT(dst_data), dst); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(short2), map1_offset)); + int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(ushort), map2_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map1_index += map1_step, map2_index += map2_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const short2 * map1 = (__global const short2 *)(map1ptr + map1_index); + __global const ushort * map2 = (__global const ushort *)(map2ptr + map2_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + int2 map_dataA = convert_int2(map1[0]); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); + + ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1)); + WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE); + + WT scalar = convertToWT(convertScalar(nVal)); + WT a = scalar, b = scalar, c = scalar, d = scalar; + + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataA, a); + + if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataB, b); + + if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataC, c); + + if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataD, d); + + WT dst_data = a * (1 - u.x) * (1 - u.y) + + b * (u.x) * (1 - u.y) + + c * (1 - u.x) * (u.y) + + d * (u.x) * (u.y); + storepix(convertToT(dst_data), dst); + } } } @@ -359,55 +386,61 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset); - int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset); - - __global const float * map1 = (__global const float *)(map1ptr + map1_index); - __global const float * map2 = (__global const float *)(map2ptr + map2_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - float2 map_data = (float2)(map1[0], map2[0]); - - int2 map_dataA = convert_int2_sat_rtn(map_data); - int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); - int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); - int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); - - float2 _u = map_data - convert_float2(map_dataA); - WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; - WT scalar = convertToWT(convertScalar(nVal)); - WT a = scalar, b = scalar, c = scalar, d = scalar; - - if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataA, a); - - if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataB, b); - - if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataC, c); - - if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataD, d); - - WT dst_data = a * (1 - u.x) * (1 - u.y) + - b * (u.x) * (1 - u.y) + - c * (1 - u.x) * (u.y) + - d * (u.x) * (u.y); - storepix(convertToT(dst_data), dst); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map1_index = mad24(y, map1_step, mad24(x, (int)sizeof(float), map1_offset)); + int map2_index = mad24(y, map2_step, mad24(x, (int)sizeof(float), map2_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map1_index += map1_step, map2_index += map2_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const float * map1 = (__global const float *)(map1ptr + map1_index); + __global const float * map2 = (__global const float *)(map2ptr + map2_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + float2 map_data = (float2)(map1[0], map2[0]); + + int2 map_dataA = convert_int2_sat_rtn(map_data); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); + + float2 _u = map_data - convert_float2(map_dataA); + WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; + WT scalar = convertToWT(convertScalar(nVal)); + WT a = scalar, b = scalar, c = scalar, d = scalar; + + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataA, a); + + if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataB, b); + + if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataC, c); + + if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataD, d); + + WT dst_data = a * (1 - u.x) * (1 - u.y) + + b * (u.x) * (1 - u.y) + + c * (1 - u.x) * (u.y) + + d * (u.x) * (u.y); + storepix(convertToT(dst_data), dst); + } } } @@ -417,52 +450,58 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o ST nVal) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); - int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset); - - __global const float2 * map = (__global const float2 *)(mapptr + map_index); - __global T * dst = (__global T *)(dstptr + dst_index); - - float2 map_data = map[0]; - int2 map_dataA = convert_int2_sat_rtn(map_data); - int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); - int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); - int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); - - float2 _u = map_data - convert_float2(map_dataA); - WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; - WT scalar = convertToWT(convertScalar(nVal)); - WT a = scalar, b = scalar, c = scalar, d = scalar; - - if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataA, a); - - if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataB, b); - - if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataC, c); - - if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); - else - EXTRAPOLATE(map_dataD, d); - - WT dst_data = a * (1 - u.x) * (1 - u.y) + - b * (u.x) * (1 - u.y) + - c * (1 - u.x) * (u.y) + - d * (u.x) * (u.y); - storepix(convertToT(dst_data), dst); + int dst_index = mad24(y, dst_step, mad24(x, TSIZE, dst_offset)); + int map_index = mad24(y, map_step, mad24(x, (int)sizeof(float2), map_offset)); + + #pragma unroll + for (int i = 0; i < rowsPerWI; ++i, ++y, + map_index += map_step, dst_index += dst_step) + if (y < dst_rows) + { + __global const float2 * map = (__global const float2 *)(mapptr + map_index); + __global T * dst = (__global T *)(dstptr + dst_index); + + float2 map_data = map[0]; + int2 map_dataA = convert_int2_sat_rtn(map_data); + int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); + int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); + int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); + + float2 _u = map_data - convert_float2(map_dataA); + WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; + WT scalar = convertToWT(convertScalar(nVal)); + WT a = scalar, b = scalar, c = scalar, d = scalar; + + if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataA, a); + + if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataB, b); + + if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataC, c); + + if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); + else + EXTRAPOLATE(map_dataD, d); + + WT dst_data = a * (1 - u.x) * (1 - u.y) + + b * (u.x) * (1 - u.y) + + c * (1 - u.x) * (u.y) + + d * (u.x) * (u.y); + storepix(convertToT(dst_data), dst); + } } }