public ParallelLoopBody
{
public:
- IPPresizeInvoker(Mat &_src, Mat &_dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
- ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
- {
- *ok = true;
- IppiSize srcSize, dstSize;
- int type = src.type();
- int specSize = 0, initSize = 0;
- srcSize.width = src.cols;
- srcSize.height = src.rows;
- dstSize.width = dst.cols;
- dstSize.height = dst.rows;
-
- switch (type)
- {
- case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break;
- case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break;
- case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break;
- case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
- case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
- case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
- case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
- case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
- case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
- case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
- case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
- case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
- case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
- case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
- case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
- default: { *ok = false; return;} break;
- }
- }
+ IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
+ ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), inv_scale_y(_inv_scale_y), mode(_mode), ok(_ok)
+ {
+ *ok = true;
+ IppiSize srcSize, dstSize;
+ int type = src.type();
+ int specSize = 0, initSize = 0;
+ srcSize.width = src.cols;
+ srcSize.height = src.rows;
+ dstSize.width = dst.cols;
+ dstSize.height = dst.rows;
- ~IPPresizeInvoker()
- {
- }
+ switch (type)
+ {
+ case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break;
+ case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break;
+ case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break;
+ case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
+ case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
+ case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
+ case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
+ case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
+ case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
+ case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
+ case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
+ case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
+ case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
+ case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
+ case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
+ default: { *ok = false; return; } break;
+ }
+ }
- virtual void operator() (const Range& range) const
- {
- if (*ok == false) return;
+ ~IPPresizeInvoker()
+ {
+ }
- int cn = src.channels();
- int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
- int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols);
- int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
+ virtual void operator() (const Range& range) const
+ {
+ if (*ok == false)
+ return;
- IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
- IppiSize dstSize = { dstwidth, dstheight - dsty };
- int bufsize = 0, itemSize = (int)src.elemSize1();
+ int cn = src.channels();
+ int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
+ int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols);
+ int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
- CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
- CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
+ IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
+ IppiSize dstSize = { dstwidth, dstheight - dsty };
+ int bufsize = 0, itemSize = (int)src.elemSize1();
- Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
- Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+ CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
+ CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
- AutoBuffer<uchar> buf(bufsize + 64);
- uchar* bufptr = alignPtr((uchar*)buf, 32);
+ Ipp8u* pSrc = (Ipp8u*)src.data + (int)src.step[0] * srcOffset.y + srcOffset.x * cn * itemSize;
+ Ipp8u* pDst = (Ipp8u*)dst.data + (int)dst.step[0] * dstOffset.y + dstOffset.x * cn * itemSize;
+
+ AutoBuffer<uchar> buf(bufsize + 64);
+ uchar* bufptr = alignPtr((uchar*)buf, 32);
+
+ if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
+ *ok = false;
+ }
- if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
- *ok = false;
- }
private:
- Mat &src;
- Mat &dst;
+ // source image: read-only, and must be a const reference because the constructor
+ // receives it as `const Mat &` (a const reference cannot initialise `Mat &`)
+ const Mat & src;
+ // destination image written by operator()
+ Mat & dst;
double inv_scale_x;
double inv_scale_y;
void *pSpec;
bool *ok;
const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
};
+
#endif
#ifdef HAVE_OPENCL
static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
- float * const alpha_tab, int * const ofs_tab)
+ float * const alpha_tab, int * const ofs_tab)
{
int k = 0, dx = 0;
for ( ; dx < dsize; dx++)
{
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
- double inv_fx = 1. / fx, inv_fy = 1. / fy;
+ double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
+ // integer approximations of the inverse scale factors, one per axis;
+ // the y factor has to be derived from inv_fy, not inv_fx
+ int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
+ // true only when both inverse scale factors are integers up to DBL_EPSILON
+ bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
+ std::abs(inv_fy - iscale_y) < DBL_EPSILON;
+
+ // when both scale_x and scale_y are equal to 2,
+ // INTER_AREA (fast) gives the same result as INTER_LINEAR
+ if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
+ /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower
if( !(cn <= 4 &&
(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
_dst.create(dsize, type);
UMat dst = _dst.getUMat();
+ Size ssize = src.size();
ocl::Kernel k;
size_t globalsize[] = { dst.cols, dst.rows };
if (interpolation == INTER_LINEAR)
{
- int wdepth = std::max(depth, CV_32S);
- int wtype = CV_MAKETYPE(wdepth, cn);
char buf[2][32];
- k.create("resizeLN", ocl::imgproc::resize_oclsrc,
- format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
- "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
- depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
- ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
- ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
- cn));
+
+ // integer path is slower because of CPU part, so it's disabled
+ if (depth == CV_8U && ((void)0, 0))
+ {
+ AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
+ int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
+ short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
+ float fxx, fyy;
+ int sx, sy;
+
+ for (int dx = 0; dx < dsize.width; dx++)
+ {
+ fxx = (float)((dx+0.5)*inv_fx - 0.5);
+ sx = cvFloor(fxx);
+ fxx -= sx;
+
+ if (sx < 0)
+ fxx = 0, sx = 0;
+
+ if (sx >= ssize.width-1)
+ fxx = 0, sx = ssize.width-1;
+
+ xofs[dx] = sx;
+ ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
+ ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE);
+ }
+
+ for (int dy = 0; dy < dsize.height; dy++)
+ {
+ fyy = (float)((dy+0.5)*inv_fy - 0.5);
+ sy = cvFloor(fyy);
+ fyy -= sy;
+
+ yofs[dy] = sy;
+ ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
+ ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE);
+ }
+
+ int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+ UMat coeffs;
+ Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
+
+ k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+ format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
+ "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+ "-D INTER_RESIZE_COEF_BITS=%d",
+ depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+ ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+ ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+ cn, INTER_RESIZE_COEF_BITS));
+ if (k.empty())
+ return false;
+
+ k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+ ocl::KernelArg::PtrReadOnly(coeffs));
+ }
+ else
+ {
+ int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
+ k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+ format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
+ "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
+ "-D INTER_RESIZE_COEF_BITS=%d",
+ depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+ ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+ ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+ cn, INTER_RESIZE_COEF_BITS));
+ if (k.empty())
+ return false;
+
+ k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+ (float)inv_fx, (float)inv_fy);
+ }
}
else if (interpolation == INTER_NEAREST)
{
k.create("resizeNN", ocl::imgproc::resize_oclsrc,
- format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+ format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
+ if (k.empty())
+ return false;
+
+ k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+ (float)inv_fx, (float)inv_fy);
}
else if (interpolation == INTER_AREA)
{
- int iscale_x = saturate_cast<int>(inv_fx);
- int iscale_y = saturate_cast<int>(inv_fy);
- bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
- std::abs(inv_fy - iscale_y) < DBL_EPSILON;
int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
int wtype = CV_MAKE_TYPE(wdepth, cn);
char cvt[2][40];
- String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+ String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
if (is_area_fast)
{
int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
- buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+ buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
" -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
}
else
{
- buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+ buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
if (k.empty())
return false;
- Size ssize = src.size();
int xytab_size = (ssize.width + ssize.height) << 1;
int tabofs_size = dsize.height + dsize.width + 2;
return k.run(2, globalsize, NULL, false);
}
- if( k.empty() )
- return false;
- k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
- (float)inv_fx, (float)inv_fy);
-
return k.run(2, globalsize, 0, false);
}
//
//M*/
-#if defined DOUBLE_SUPPORT
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
+#endif
-#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
#define INC(x,l) min(x+1,l-1)
-
-#define noconvert(x) (x)
+#define noconvert
#if cn != 3
-#define loadpix(addr) *(__global const PIXTYPE*)(addr)
-#define storepix(val, addr) *(__global PIXTYPE*)(addr) = val
-#define PIXSIZE ((int)sizeof(PIXTYPE))
+// pixel access when a whole pixel can be read/written as one value of type T
+#define loadpix(addr) *(__global const T *)(addr)
+#define storepix(val, addr) *(__global T *)(addr) = val
+// size of one pixel in bytes; fully parenthesized so the macro is safe in any expression
+#define TSIZE ((int)sizeof(T))
#else
-#define loadpix(addr) vload3(0, (__global const PIXTYPE1*)(addr))
-#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr))
-#define PIXSIZE ((int)sizeof(PIXTYPE1)*3)
+// a 3-channel pixel occupies cn scalars of type T1, so it is accessed with vload3/vstore3
+#define loadpix(addr) vload3(0, (__global const T1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
+// size of one pixel in bytes; fully parenthesized so the macro is safe in any expression
+#define TSIZE ((int)sizeof(T1)*cn)
#endif
-#if defined INTER_LINEAR
+#ifdef INTER_LINEAR_INTEGER
-__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
- int srcrows, int srccols,
- __global uchar* dstptr, int dststep, int dstoffset,
- int dstrows, int dstcols,
- float ifx, float ify)
+// Linear resize driven by precomputed fixed-point tables (INTER_LINEAR_INTEGER build).
+// `buffer` is read as: dst_cols ints (source x offsets), dst_rows ints (source y offsets),
+// 2 * dst_cols shorts (horizontal weight pairs), then 2 * dst_rows shorts (vertical weight pairs).
+// Each work-item produces the destination pixel (dx, dy).
+__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+ __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+ __global const uchar * buffer)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
- float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
- int x = floor(sx), y = floor(sy);
-
- float u = sx - x, v = sy - y;
+ // work-items outside the destination image do nothing
+ if (dx < dst_cols && dy < dst_rows)
+ {
+ // locate the four tables inside the packed buffer
+ __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
+ __global const short * ialpha = (__global const short *)(yofs + dst_rows);
+ __global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
+ ialpha += dx << 1;
+
+ // left/right source columns; the right tap is clamped to the last column so that
+ // the load never goes past the end of the source row
+ int sx0 = xofs[dx], sx1 = min(sx0 + 1, src_cols - 1);
+ // upper/lower source rows, both clamped to the image
+ int sy0 = clamp(yofs[dy], 0, src_rows - 1),
+ sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
+ short a0 = ialpha[0], a1 = ialpha[1];
+ short b0 = ibeta[0], b1 = ibeta[1];
+
+ // byte offsets of the two source rows
+ int src_row0 = mad24(sy0, src_step, src_offset),
+ src_row1 = mad24(sy1, src_step, src_offset);
+ WT data0 = convertToWT(loadpix(srcptr + mad24(sx0, TSIZE, src_row0)));
+ WT data1 = convertToWT(loadpix(srcptr + mad24(sx1, TSIZE, src_row0)));
+ WT data2 = convertToWT(loadpix(srcptr + mad24(sx0, TSIZE, src_row1)));
+ WT data3 = convertToWT(loadpix(srcptr + mad24(sx1, TSIZE, src_row1)));
+
+ // horizontal blend per row, then vertical blend; the shifts (4 + 16 here, 2 below)
+ // remove 22 bits in total - NOTE(review): this matches 2 * INTER_RESIZE_COEF_BITS
+ // only if the host passes INTER_RESIZE_COEF_BITS=11; confirm
+ WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
+ ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
+
+ // final rounding shift and store
+ storepix(convertToDT((val + 2) >> 2),
+ dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
+ }
+}
- if ( x<0 ) x=0,u=0;
- if ( x>=srccols ) x=srccols-1,u=0;
- if ( y<0 ) y=0,v=0;
- if ( y>=srcrows ) y=srcrows-1,v=0;
+#elif defined INTER_LINEAR
- int y_ = INC(y,srcrows);
- int x_ = INC(x,srccols);
+__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+ __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+ float ifx, float ify) // ifx, ify: factors mapping destination coordinates to source coordinates
+{ // linear resize: each destination pixel is a weighted sum of 4 neighbouring source pixels
+ int dx = get_global_id(0); // destination column handled by this work-item
+ int dy = get_global_id(1); // destination row handled by this work-item
-#if depth <= 4
+ if (dx < dst_cols && dy < dst_rows) // work-items outside the destination do nothing
+ {
+ float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f); // source position of the destination pixel centre
+ int x = floor(sx), y = floor(sy); // top-left neighbour
- u = u * INTER_RESIZE_COEF_SCALE;
- v = v * INTER_RESIZE_COEF_SCALE;
+ float u = sx - x, v = sy - y; // fractional parts, used as interpolation weights
- int U = rint(u);
- int V = rint(v);
- int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
- int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
+ if ( x<0 ) x=0,u=0; // clamp to the image; the fractional weight is dropped at the border
+ if ( x>=src_cols ) x=src_cols-1,u=0;
+ if ( y<0 ) y=0,v=0;
+ if ( y>=src_rows ) y=src_rows-1,v=0;
- WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
- WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+ int y_ = INC(y, src_rows); // next row, limited to the last row
+ int x_ = INC(x, src_cols); // next column, limited to the last column
- WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
- mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
+#if depth <= 4
+ u = u * INTER_RESIZE_COEF_SCALE; // fixed-point path: weights are scaled and rounded to integers
+ v = v * INTER_RESIZE_COEF_SCALE;
- PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
+ int U = rint(u);
+ int V = rint(v);
+ int U1 = rint(INTER_RESIZE_COEF_SCALE - u); // complementary weights
+ int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
-#else
- float u1 = 1.f - u;
- float v1 = 1.f - v;
- WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
- WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+ WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)))); // pixel (x, y)
+ WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset)))); // pixel (x_, y)
+ WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset)))); // pixel (x, y_)
+ WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset)))); // pixel (x_, y_)
- PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
+ WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) + // weighted sum; weights carry two factors of INTER_RESIZE_COEF_SCALE
+ mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
+ T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS); // add half, then shift out the CAST_BITS of scaling
+#else
+ float u1 = 1.f - u; // floating-point path: complementary weights
+ float v1 = 1.f - v;
+ WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset)))); // pixel (x, y)
+ WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset)))); // pixel (x_, y)
+ WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset)))); // pixel (x, y_)
+ WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset)))); // pixel (x_, y_)
+
+ T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3; // weighted sum of the 4 neighbours
#endif
-
- if(dx < dstcols && dy < dstrows)
- {
- storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+ storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset))); // write destination pixel (dx, dy)
}
}
#elif defined INTER_NEAREST
-__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
- int srcrows, int srccols,
- __global uchar* dstptr, int dststep, int dstoffset,
- int dstrows, int dstcols,
+__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
+ __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, // each destination pixel receives a copy of one source pixel
float ifx, float ify)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
- if( dx < dstcols && dy < dstrows )
+ if (dx < dst_cols && dy < dst_rows) // work-items outside the destination do nothing
{
- float s1 = dx*ifx;
- float s2 = dy*ify;
- int sx = min(convert_int_rtz(s1), srccols-1);
- int sy = min(convert_int_rtz(s2), srcrows-1);
+ float s1 = dx * ifx; // source x position before rounding
+ float s2 = dy * ify; // source y position before rounding
+ int sx = min(convert_int_rtz(s1), src_cols - 1); // round toward zero, limit to the last column
+ int sy = min(convert_int_rtz(s2), src_rows - 1); // round toward zero, limit to the last row
- storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)),
- dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+ storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))), // read source pixel (sx, sy)
+ dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset))); // and store it at (dx, dy)
}
}
int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
#pragma unroll
for (int x = 0; x < XSCALE; ++x)
- sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE));
+ sum += convertToWTV(loadpix(src + mad24(sxmap_tab[sx + x], TSIZE, src_index)));
}
- storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE);
+ storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
}
}
for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
{
WTV alpha = (WTV)(xalpha_tab[xk]);
- buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha;
+ buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
}
sum += buf * beta;
}
- storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE);
+ storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
}
}