#else
+#ifndef dstST
+#define dstST dstT
+#endif
+
+#if cn != 3
+#define value value_
+#define storedst(val) *(__global dstT*)(dstptr + dst_index) = val
+#else
+#define value (dstT)(value_.x, value_.y, value_.z)
+#define storedst(val) vstore3(val, 0, (__global dstT1*)(dstptr + dst_index))
+#endif
+
__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
__global uchar* dstptr, int dststep, int dstoffset,
- int rows, int cols, dstT value )
+ int rows, int cols, dstST value_ )
{
int x = get_global_id(0);
int y = get_global_id(1);
int mask_index = mad24(y, maskstep, x + maskoffset);
if( mask[mask_index] )
{
- int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
- *(__global dstT*)(dstptr + dst_index) = value;
+ int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+ storedst(value);
}
}
}
__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
- int rows, int cols, dstT value )
+ int rows, int cols, dstST value_ )
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
- int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
- *(__global dstT*)(dstptr + dst_index) = value;
+ int dst_index = mad24(y, dststep, x*(int)sizeof(dstT1)*cn + dstoffset);
+ storedst(value);
}
}
{
bool haveMask = !_mask.empty();
int tp = type(), cn = CV_MAT_CN(tp);
- if( dims <= 2 && cn <= 4 && cn != 3 && ocl::useOpenCL() )
+ if( dims <= 2 && cn <= 4 && CV_MAT_DEPTH(tp) < CV_64F && ocl::useOpenCL() )
{
Mat value = _value.getMat();
CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
- double buf[4];
+ double buf[4]={0,0,0,0};
convertAndUnrollScalar(value, tp, (uchar*)buf, 1);
+ int scalarcn = cn == 3 ? 4 : cn;
char opts[1024];
- sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(tp));
+ sprintf(opts, "-D dstT=%s -D dstST=%s -D dstT1=%s -D cn=%d", ocl::memopTypeToStr(tp),
+ ocl::memopTypeToStr(CV_MAKETYPE(tp,scalarcn)),
+ ocl::memopTypeToStr(CV_MAT_DEPTH(tp)), cn);
ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
if( !setK.empty() )
{
- ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(tp));
+ ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE1(tp)*scalarcn);
UMat mask;
if( haveMask )
double inv_fx = 1. / fx, inv_fy = 1. / fy;
float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
- if( cn == 3 || !(cn <= 4 &&
+ if( !(cn <= 4 &&
(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
(interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
return false;
int wtype = CV_MAKETYPE(wdepth, cn);
char buf[2][32];
k.create("resizeLN", ocl::imgproc::resize_oclsrc,
- format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
- depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
+ format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D PIXTYPE1=%s "
+ "-D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d",
+ depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
- ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
+ ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
+ cn));
}
else if (interpolation == INTER_NEAREST)
{
k.create("resizeNN", ocl::imgproc::resize_oclsrc,
- format("-D INTER_NEAREST -D PIXTYPE=%s -D cn", ocl::memopTypeToStr(type), cn));
+ format("-D INTER_NEAREST -D PIXTYPE=%s -D PIXTYPE1=%s -D cn=%d",
+ ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth), cn));
}
else if (interpolation == INTER_AREA)
{
int wtype = CV_MAKE_TYPE(wdepth, cn);
char cvt[2][40];
- String buildOption = format("-D INTER_AREA -D T=%s -D WTV=%s -D convertToWTV=%s",
- ocl::typeToStr(type), ocl::typeToStr(wtype),
- ocl::convertTypeStr(depth, wdepth, cn, cvt[0]));
+ String buildOption = format("-D INTER_AREA -D PIXTYPE=%s -D PIXTYPE1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
+ ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
+ ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
UMat alphaOcl, tabofsOcl, mapOcl;
UMat dmap, smap;
if (is_area_fast)
{
int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
- buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
+ buildOption = buildOption + format(" -D convertToPIXTYPE=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
" -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
}
else
{
- buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
+ buildOption = buildOption + format(" -D convertToPIXTYPE=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
if (k.empty())
return false;
{
CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
- int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), wdepth = depth;
+ int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
double doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
int interpolation = flags & INTER_MAX;
if ( !(borderType == cv::BORDER_CONSTANT &&
(interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
- (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3)
+ (!doubleSupport && depth == CV_64F) || cn > 4)
return false;
const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
+ int scalarcn = cn == 3 ? 4 : cn;
+ int wdepth = interpolation == INTER_NEAREST ? depth : std::max(CV_32S, depth);
+ int sctype = CV_MAKETYPE(wdepth, scalarcn);
+
ocl::Kernel k;
+ String opts;
if (interpolation == INTER_NEAREST)
{
- k.create(kernelName, program,
- format("-D INTER_NEAREST -D T=%s%s", ocl::typeToStr(type),
- doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+ opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d", ocl::typeToStr(type),
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+ ocl::typeToStr(CV_MAT_DEPTH(type)),
+ ocl::typeToStr(sctype),
+ cn);
}
else
{
char cvt[2][50];
- wdepth = std::max(CV_32S, depth);
- k.create(kernelName, program,
- format("-D INTER_%s -D T=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s",
- interpolationMap[interpolation], ocl::typeToStr(type),
- ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
- ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
- ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
- doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+ opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d -D convertToWT=%s -D convertToT=%s%s cn=%d",
+ interpolationMap[interpolation], ocl::typeToStr(type),
+ ocl::typeToStr(CV_MAT_DEPTH(type)),
+ ocl::typeToStr(sctype),
+ ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
+ ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
+ ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn);
}
+
+ k.create(kernelName, program, opts);
if (k.empty())
return false;
+ double borderBuf[] = {0, 0, 0, 0};
+ scalarToRawData(borderValue, borderBuf, sctype);
+
UMat src = _src.getUMat(), M0;
_dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
UMat dst = _dst.getUMat();
matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
- ocl::KernelArg::Constant(Mat(1, 1, CV_MAKE_TYPE(wdepth, cn), borderValue)));
+ ocl::KernelArg(0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
size_t globalThreads[2] = { dst.cols, dst.rows };
return k.run(2, globalThreads, NULL, false);
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
#define INC(x,l) min(x+1,l-1)
-#define PIXSIZE ((int)sizeof(PIXTYPE))
+
#define noconvert(x) (x)
+#if cn != 3
+#define loadpix(addr) *(__global const PIXTYPE*)(addr)
+#define storepix(val, addr) *(__global PIXTYPE*)(addr) = val
+#define PIXSIZE ((int)sizeof(PIXTYPE))
+#else
+#define loadpix(addr) vload3(0, (__global const PIXTYPE1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global PIXTYPE1*)(addr))
+#define PIXSIZE ((int)sizeof(PIXTYPE1)*3)
+#endif
+
#if defined INTER_LINEAR
__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
- WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
- WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+ WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
+ WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
+ WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
+ WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
#else
float u1 = 1.f - u;
float v1 = 1.f - v;
- WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
- WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
- WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+ WORKTYPE data0 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
+ WORKTYPE data1 = convertToWT(loadpix(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
+ WORKTYPE data2 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
+ WORKTYPE data3 = convertToWT(loadpix(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
if(dx < dstcols && dy < dstrows)
{
- __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
- dst[0] = uval;
+ storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
}
}
int sx = min(convert_int_rtz(s1), srccols-1);
int sy = min(convert_int_rtz(s2), srcrows-1);
- __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
- __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE));
-
- dst[0] = src[0];
+ storepix(loadpix(srcptr + mad24(sy, srcstep, srcoffset + sx*PIXSIZE)),
+ dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
}
}
#elif defined INTER_AREA
-#define TSIZE ((int)(sizeof(T)))
-
#ifdef INTER_AREA_FAST
__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
int src_index = mad24(symap_tab[y + sy], src_step, src_offset);
#pragma unroll
for (int x = 0; x < XSCALE; ++x)
- sum += convertToWTV(((__global const T*)(src + src_index))[sxmap_tab[sx + x]]);
+ sum += convertToWTV(loadpix(src + src_index + sxmap_tab[sx + x]*PIXSIZE));
}
- ((__global T*)(dst + dst_index))[dx] = convertToT(convertToWT2V(sum) * (WT2V)(SCALE));
+ storepix(convertToPIXTYPE(convertToWT2V(sum) * (WT2V)(SCALE)), dst + dst_index + dx*PIXSIZE);
}
}
for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
{
WTV alpha = (WTV)(xalpha_tab[xk]);
- buf += convertToWTV(((__global const T*)(src + src_index))[sx]) * alpha;
+ buf += convertToWTV(loadpix(src + src_index + sx*PIXSIZE)) * alpha;
}
sum += buf * beta;
}
- ((__global T*)(dst + dst_index))[dx] = convertToT(sum);
+ storepix(convertToPIXTYPE(sum), dst + dst_index + dx*PIXSIZE);
}
}
#define noconvert
+#ifndef ST
+#define ST T
+#endif
+
+#if cn != 3
+#define loadpix(addr) *(__global const T*)(addr)
+#define storepix(val, addr) *(__global T*)(addr) = val
+#define scalar scalar_
+#define pixsize (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
+#ifdef INTER_NEAREST
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#else
+#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+#define pixsize ((int)sizeof(T1)*3)
+#endif
+
#ifdef INTER_NEAREST
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, T scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
short sx = convert_short_sat(X0 >> AB_BITS);
short sy = convert_short_sat(Y0 >> AB_BITS);
- int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
{
- int src_index = mad24(sy, src_step, src_offset + sx * (int)sizeof(T));
- __global const T * src = (__global const T *)(srcptr + src_index);
- dst[0] = src[0];
+ int src_index = mad24(sy, src_step, src_offset + sx * pixsize);
+ storepix(loadpix(srcptr + src_index), dstptr + dst_index);
}
else
- dst[0] = scalar;
+ storepix(scalar, dstptr + dst_index);
}
}
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, WT scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
short ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
float taby = 1.f/INTER_TAB_SIZE*ay;
float tabx = 1.f/INTER_TAB_SIZE*ax;
- int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
#if depth <= 4
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
- dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
+ storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
- dst[0] = convertToT(val);
+ storepix(convertToT(val), dstptr + dst_index);
#endif
}
}
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, WT scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
#pragma unroll
for (int x = 0; x < 4; x++)
v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
float tab1y[4], tab1x[4];
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
- int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
WT sum = (WT)(0);
#if depth <= 4
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * itab[i];
- dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
+ storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
- dst[0] = convertToT( sum );
+ storepix(convertToT( sum ), dstptr + dst_index);
#endif
}
}
#define noconvert
+#ifndef ST
+#define ST T
+#endif
+
+#if cn != 3
+#define loadpix(addr) *(__global const T*)(addr)
+#define storepix(val, addr) *(__global T*)(addr) = val
+#define scalar scalar_
+#define pixsize (int)sizeof(T)
+#else
+#define loadpix(addr) vload3(0, (__global const T1*)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
+#ifdef INTER_NEAREST
+#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
+#else
+#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
+#endif
+#define pixsize ((int)sizeof(T1)*3)
+#endif
+
#ifdef INTER_NEAREST
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, T scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
short sx = convert_short_sat_rte(X0*W);
short sy = convert_short_sat_rte(Y0*W);
- int dst_index = mad24(dy, dst_step, dx * (int)sizeof(T) + dst_offset);
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dx * pixsize + dst_offset);
if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
{
- int src_index = mad24(sy, src_step, sx * (int)sizeof(T) + src_offset);
- __global const T * src = (__global const T *)(srcptr + src_index);
- dst[0] = src[0];
+ int src_index = mad24(sy, src_step, sx * pixsize + src_offset);
+ storepix(loadpix(srcptr + src_index), dstptr + dst_index);
}
else
- dst[0] = scalar;
+ storepix(scalar, dstptr + dst_index);
}
}
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, WT scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
short ax = (short)(X & (INTER_TAB_SIZE - 1));
WT v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + sx * pixsize))) : scalar;
WT v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy, src_step, src_offset + (sx+1) * pixsize))) : scalar;
WT v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + sx * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + sx * pixsize))) : scalar;
WT v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+1, src_step, src_offset + (sx+1) * pixsize))) : scalar;
float taby = 1.f/INTER_TAB_SIZE*ay;
float tabx = 1.f/INTER_TAB_SIZE*ax;
- int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
#if depth <= 4
int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
WT val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
- dst[0] = convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS);
+ storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
WT val = v0 * tabx2 * taby2 + v1 * tabx * taby2 + v2 * tabx2 * taby + v3 * tabx * taby;
- dst[0] = convertToT(val);
+ storepix(convertToT(val), dstptr + dst_index);
#endif
}
}
__kernel void warpPerspective(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
- __constant CT * M, WT scalar)
+ __constant CT * M, ST scalar_)
{
int dx = get_global_id(0);
int dy = get_global_id(1);
#pragma unroll
for (int x = 0; x < 4; x++)
v[mad24(y, 4, x)] = (sx+x >= 0 && sx+x < src_cols && sy+y >= 0 && sy+y < src_rows) ?
- convertToWT(*(__global const T *)(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * (int)sizeof(T)))) : scalar;
+ convertToWT(loadpix(srcptr + mad24(sy+y, src_step, src_offset + (sx+x) * pixsize))) : scalar;
float tab1y[4], tab1x[4];
interpolateCubic(ayy, tab1y);
interpolateCubic(axx, tab1x);
- int dst_index = mad24(dy, dst_step, dst_offset + dx * (int)sizeof(T));
- __global T * dst = (__global T *)(dstptr + dst_index);
+ int dst_index = mad24(dy, dst_step, dst_offset + dx * pixsize);
WT sum = (WT)(0);
#if depth <= 4
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * itab[i];
- dst[0] = convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS );
+ storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
#pragma unroll
for (int i = 0; i < 16; i++)
sum += v[i] * tab1y[(i>>2)] * tab1x[(i&3)];
- dst[0] = convertToT( sum );
+ storepix(convertToT( sum ), dstptr + dst_index);
#endif
}
}