set(OPENCL_FOUND YES)
set(OPENCL_LIBRARIES "-framework OpenCL")
else()
- find_package(OpenCL QUIET)
+ #find_package(OpenCL QUIET)
if(WITH_OPENCLAMDFFT)
find_path(CLAMDFFT_INCLUDE_DIR
NAMES clAmdFft.h)
namespace ocl
{
using std::auto_ptr;
-
-#define CVCL_DEVICE_TYPE_DEFAULT (1 << 0)
-#define CVCL_DEVICE_TYPE_CPU (1 << 1)
-#define CVCL_DEVICE_TYPE_GPU (1 << 2)
-#define CVCL_DEVICE_TYPE_ACCELERATOR (1 << 3)
- //#define CVCL_DEVICE_TYPE_CUSTOM (1 << 4)
-#define CVCL_DEVICE_TYPE_ALL 0xFFFFFFFF
+ enum
+ {
+ CVCL_DEVICE_TYPE_DEFAULT = (1 << 0),
+ CVCL_DEVICE_TYPE_CPU = (1 << 1),
+ CVCL_DEVICE_TYPE_GPU = (1 << 2),
+ CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
+ //CVCL_DEVICE_TYPE_CUSTOM = (1 << 4)
+ CVCL_DEVICE_TYPE_ALL = 0xFFFFFFFF
+ };
//this class contains ocl runtime information
class CV_EXPORTS Info
{
};
-#ifdef HAVE_CLAMDFFT
///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
//! Param dft_size is the size of DFT transform.
// real to complex dft output is not the same with cpu version
// real to complex and complex to real does not support DFT_ROWS
CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
-#endif // HAVE_CLAMDFFT
-#ifdef HAVE_CLAMDBLAS
//! implements generalized matrix product algorithm GEMM from BLAS
// The functionality requires clAmdBlas library
// only support type CV_32FC1
// flag GEMM_3_T is not supported
CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
const oclMat &src3, double beta, oclMat &dst, int flags = 0);
-#endif
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
#endif
}
-
+std::string workdir;
int main(int argc, char **argv)
{
- std::vector<cv::ocl::Info> oclinfo;
TS::ptr()->init("ocl");
InitGoogleTest(&argc, argv);
+ const char *keys =
+
+ "{ h | help | false | print help message }"
+
+ "{ w | workdir | ../../../samples/c/| set working directory }"
+
+ "{ t | type | gpu | set device type:cpu or gpu}"
+
+ "{ p | platform | 0 | set platform id }"
+
+ "{ d | device | 0 | set device id }";
+
+
+
+ CommandLineParser cmd(argc, argv, keys);
+
+ if (cmd.get<bool>("help"))
+
+ {
+
+ cout << "Avaible options besides goole test option:" << endl;
+
+ cmd.printParams();
+ }
+
+ workdir = cmd.get<string>("workdir");
+
+ string type = cmd.get<string>("type");
+
+ unsigned int pid = cmd.get<unsigned int>("platform");
+
+ int device = cmd.get<int>("device");
+
print_info();
+ int flag = CVCL_DEVICE_TYPE_GPU;
+
+ if(type == "cpu")
+
+ {
+
+ flag = CVCL_DEVICE_TYPE_CPU;
+
+ }
+ std::vector<cv::ocl::Info> oclinfo;
int devnums = getDevice(oclinfo);
- if(devnums < 1)
+ if(devnums <= device || device < 0)
+
{
- std::cout << "no device found\n";
+
+ std::cout << "device invalid\n";
+
return -1;
+
}
- //if you want to use undefault device, set it here
- //setDevice(oclinfo[0]);
+
+ if(pid >= oclinfo.size())
+
+ {
+
+ std::cout << "platform invalid\n";
+
+ return -1;
+
+ }
+
+ if(pid != 0 || device != 0)
+
+ {
+
+ setDevice(oclinfo[pid], device);
+
+ }
+
+ cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
setBinpath(CLBINPATH);
return RUN_ALL_TESTS();
}
Has_roi(k);
t0 = (double)cvGetTickCount();//cpu start
- int cpures = cv::countNonZero(mat1_roi);
+ cv::countNonZero(mat1_roi);
t0 = (double)cvGetTickCount() - t0;//cpu end
t1 = (double)cvGetTickCount();//gpu start1
gmat1 = mat1_roi;
t2 = (double)cvGetTickCount(); //kernel
- int gpures = cv::ocl::countNonZero(gmat1);
+ cv::ocl::countNonZero(gmat1);
t2 = (double)cvGetTickCount() - t2;//kernel
t1 = (double)cvGetTickCount() - t1;//gpu end1
if(j == 0)
{
cout << "\nwith roi:";
};
- int gpures = cv::ocl::countNonZero(gmat1);
+ cv::ocl::countNonZero(gmat1);
};
#endif
using namespace testing;
using namespace std;
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
-
#ifndef MWC_TEST_UTILITY
#define MWC_TEST_UTILITY
////////////////////////////////////////////////////////
// Canny1
-
+extern std::string workdir;
IMPLEMENT_PARAM_CLASS(AppertureSize, int);
IMPLEMENT_PARAM_CLASS(L2gradient, bool);
TEST_P(Canny1, Performance)
{
- cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+ cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
double low_thresh = 100.0;
//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
-INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false)));
+INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1),
using namespace testing;
using namespace std;
using namespace cv;
-
+extern std::string workdir;
struct getRect
{
Rect operator ()(const CvAvgComp &e) const
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
{
cout << "ERROR: Could not load classifier cascade" << endl;
- cout << "Usage: facedetect [--cascade=<cascade_path>]\n"
- " [--scale[=<image scale>\n"
- " [filename|camera_index]\n" << endl ;
return;
}
//int devnums = getDevice(oclinfo);
TEST_F(Haar, FaceDetect)
{
- string imgName = "../../../samples/c/lena.jpg";
+ string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 );
if(img.empty())
{
- std::cout << "Couldn't read test" << index << ".jpg" << std::endl;
+ std::cout << imgName << std::endl;
return ;
}
- int i = 0;
+ //int i = 0;
double t = 0;
vector<Rect> faces, oclfaces;
using namespace cvtest;
using namespace testing;
using namespace std;
-
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
+extern std::string workdir;
#ifndef MWC_TEST_UTILITY
#define MWC_TEST_UTILITY
TEST_P(HOG, Performance)
{
- cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+ cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
// define HOG related arguments
float scale = 1.05;
- int nlevels = 13;
+ //int nlevels = 13;
float gr_threshold = 8;
float hit_threshold = 1.4;
- bool hit_threshold_auto = true;
+ //bool hit_threshold_auto = true;
int win_width = is48 ? 48 : 64;
int win_stride_width = 8;
}
int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
+ cout << borderstr[0] << endl;
#ifndef PRINT_KERNEL_RUN_TIME
double totalcputick = 0;
double totalgputick = 0;
totaluploadtick = t0 + totaluploadtick;
totaldownloadtick = t1 + totaldownloadtick;
}
- EXPECT_MAT_SIMILAR(mat, cpu_dst, 0.0);
totaltick = totaluploadtick + totaldownloadtick;
cout << "average upload time is " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
cout << "average download time is " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
{
Has_roi(j);
- cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
+ //cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
gdst1_whole = dst1;
gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));
using namespace testing;
using namespace std;
-#define FILTER_IMAGE "../../../samples/gpu/road.png"
+extern std::string workdir;
TEST(SURF, Performance)
{
- cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+ cv::Mat img = readImage(workdir+"lena.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
ocl::SURF_OCL d_surf;
{
size_t groupnum = src.clCxt->impl->maxComputeUnits;
CV_Assert(groupnum != 0);
- int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen, status;
+ int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
Context *clCxt = src.clCxt;
T *p = new T[dbsize];
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
CV_Assert(groupnum != 0);
groupnum = groupnum * 2;
int vlen = 8;
- int dbsize = groupnum * 2 * vlen * sizeof(T) , status;
+ int dbsize = groupnum * 2 * vlen * sizeof(T) ;
Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
T *p = new T[groupnum * vlen * 2];
memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
- for(int i = 0; i < vlen * groupnum; i++)
+ for(int i = 0; i < vlen * (int)groupnum; i++)
{
*minVal = *minVal < p[i] ? *minVal : p[i];
}
- for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++)
+ for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
}
size_t groupnum = src.clCxt->impl->maxComputeUnits;
CV_Assert(groupnum != 0);
int minloc = -1 , maxloc = -1;
- int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) , status;
+ int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
T *p = new T[groupnum * vlen * 4];
memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
- for(int i = 0; i < vlen * groupnum; i++)
+ for(int i = 0; i < vlen * (int)groupnum; i++)
{
*minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
- minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : p[i + 2 * vlen * groupnum];
+ minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
}
- for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++)
+ for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{
*maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
- maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : p[i + 2 * vlen * groupnum];
+ maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
}
int pre_rows = src.offset / src.step;
}
CV_Assert(groupnum != 0);
groupnum = groupnum * 2;
- int vlen = 8 , dbsize = groupnum * vlen, status;
+ int vlen = 8 , dbsize = groupnum * vlen;
//cl_ulong start, end;
Context *clCxt = src.clCxt;
string kernelName = "arithm_op_nonzero";
}
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
-void matchUnrolledCached(const oclMat query, const oclMat *trains, int n, const oclMat mask,
- const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType)
+void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
+ const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
{
}
}
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
-void match(const oclMat query, const oclMat *trains, int n, const oclMat mask,
- const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType)
+void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
+ const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
{
}
}
template <int BLOCK_SIZE>
-void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
+void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int /*distType*/)
{
cv::ocl::Context *ctx = trainIdx.clCxt;
size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1};
}
void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
- oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &maskCollection)
+ oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &/*maskCollection*/)
{
if (query.empty() || trainCollection.empty())
return;
}
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
- oclMat &nMatches, float maxDistance, const vector<oclMat> &masks)
+ oclMat &nMatches, float /*maxDistance*/, const vector<oclMat> &masks)
{
if (query.empty() || empty())
return;
using namespace cv::ocl;
using namespace std;
-#if !defined (HAVE_OPENCL)
+#if !defined HAVE_OPENCL
void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
{
throw_nogpu();
}
+#elif !defined HAVE_CLAMDFFT
+void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
+{
+ CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
+}
#else
-
#include <clAmdFft.h>
namespace cv
Context *clCxt = src.clCxt;
string kernelName;
size_t localThreads[3] = {16, 16, 1};
- size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1};
+ size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
if(src.type() == CV_8UC1)
{
kernelName = "morph_C1_D0";
- globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0];
+ globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
}
else
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_, int iters_) :
Filter2DEngine_GPU(filter2D_), iters(iters_) {}
- virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
+ virtual void apply(const oclMat &src, oclMat &dst)
{
Filter2DEngine_GPU::apply(src, dst);
//if (iters > 1)
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
{
Size src_size = src.size();
- int src_type = src.type();
+ //int src_type = src.type();
int cn = src.oclchannels();
//dst.create(src_size, src_type);
#include "clAmdBlas.h"
-#if !defined (HAVE_OPENCL)
-void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
+#if !defined HAVE_OPENCL
+void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
+ const oclMat &src3, double beta, oclMat &dst, int flags)
{
throw_nogpu();
}
+#elif !defined HAVE_CLAMDBLAS
+void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
+ const oclMat &src3, double beta, oclMat &dst, int flags)
+{
+ CV_Error(CV_StsNotImplemented, "OpenCL BLAS is not implemented");
+}
#else
using namespace cv;
#define sum_elem_ptr(sum,row,col) \
- ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
+ ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
#define sqsum_elem_ptr(sqsum,row,col) \
- ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
+ ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
#define calc_sum(rect,offset) \
- ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
+ ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
CV_IMPL void
CvHaarFeature *feature =
&_cascade->stage_classifier[i].classifier[j].haar_feature[l];
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
- double sum0 = 0, area0 = 0;
CvRect r[3];
- int base_w = -1, base_h = -1;
- int new_base_w = 0, new_base_h = 0;
- int kx, ky;
- int flagx = 0, flagy = 0;
- int x0 = 0, y0 = 0;
+
int nr;
/* align blocks */
std::vector<cv::Rect> rectList;
std::vector<int> rweights;
double factor;
- int coi;
int datasize;
int totalclassifier;
int *candidate;
cl_int status;
- bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
+ // bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
- bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
+ // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
//the Intel HD Graphics is unsupported
if (gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
gimgroi = gsum(roi);
gimgroisq = gsqsum(roi);
//scaleinfo[i].rows = gimgroi.rows;
- int ystep = 1; // factor > 2 ? 1 : 2;
int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
scaleinfo[i].width_height = (width << 16) | height;
pq.s[2] = gcascade->pq2;
pq.s[3] = gcascade->pq3;
float correction = gcascade->inv_window_area;
- int argcount = 0;
+
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
{
CvSize winsize0 = cascade->orig_window_size;
int n_factors = 0;
- int flag = 0;
oclMat gsum;
oclMat gsqsum;
cv::ocl::integral(gimg, gsum, gsqsum);
scaleinfo[i].imgoff = 0;
scaleinfo[i].factor = factor;
int startnodenum = nodenum * i;
- int argcounts = 0;
float factor2 = (float)factor;
/*
openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&nodebuffer));
args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
size_t globalThreads2[3] = {nodenum, 1, 1};
- size_t localThreads2[3] = {256, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
// } /* j */
// }
//}
-
+/*
CV_INLINE
double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier,
- double variance_norm_factor,
- size_t p_offset )
+double variance_norm_factor,
+size_t p_offset )
{
- /*
+
int idx = 0;
do
{
}
while( idx > 0 );
return classifier->alpha[-idx];
- */
+
return 0.;
}
+*/
CV_IMPL int
-gpuRunHaarClassifierCascade( const CvHaarClassifierCascade *_cascade,
- CvPoint pt, int start_stage )
+gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade,
+CvPoint pt, int start_stage */)
{
/*
int result = -1;
for( y = y1; y < y2; y += ystep )
for( x = 0; x < ssz.width; x += ystep )
{
- if( gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 ) > 0 )
+ if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
winSize.width, winSize.height));
}
}
}
- int result = gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 );
+ int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
if( result > 0 )
vec->push_back(Rect(x, y, winsize.width, winsize.height));
ixstep = result != 0 ? 1 : 2;
kernelName = "remapNNF1Constant";
}
- int channels = dst.oclchannels();
- int depth = dst.depth();
- int type = src.type();
+ //int channels = dst.oclchannels();
+ //int depth = dst.depth();
+ //int type = src.type();
size_t blkSizeX = 16, blkSizeY = 16;
size_t glbSizeX;
int cols = dst.cols;
glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
}
- else if(src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_32FC1)
+ else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
{
cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
- /*
- /////////////////////////////
- //using the image buffer
- /////////////////////////////
-
- size_t image_row_pitch = 0;
- cl_int err1, err2, err3;
- cl_mem_flags flags1 = CL_MEM_READ_ONLY;
- cl_image_format format;
- if(src.type() == CV_8UC1)
- {
- format.image_channel_order = CL_R;
- format.image_channel_data_type = CL_UNSIGNED_INT8;
- }
- else if(src.type() == CV_8UC4)
- {
- format.image_channel_order = CL_RGBA;
- format.image_channel_data_type = CL_UNSIGNED_INT8;
- }
- else if(src.type() == CV_32FC1)
- {
- format.image_channel_order = CL_R;
- format.image_channel_data_type = CL_FLOAT;
- }
- else if(src.type() == CV_32FC4)
- {
- format.image_channel_order = CL_RGBA;
- format.image_channel_data_type = CL_FLOAT;
- }
- cl_mem srcImage = clCreateImage2D(clCxt->impl->clContext, flags1, &format, src.cols, src.rows,
- image_row_pitch, NULL, &err1);
- if(err1 != CL_SUCCESS)
- {
- printf("Error creating CL image buffer, error code %d\n", err1);
- return;
- }
- const size_t src_origin[3] = {0, 0, 0};
- const size_t region[3] = {src.cols, src.rows, 1};
- cl_event BtoI_event, ItoB_event;
- err3 = clEnqueueCopyBufferToImage(clCxt->impl->clCmdQueue, (cl_mem)src.data, srcImage,
- 0, src_origin, region, 0, NULL, NULL);
- if(err3 != CL_SUCCESS)
- {
- printf("Error copying buffer to image\n");
- printf("Error code %d \n", err3);
- return;
- }
- // clWaitForEvents(1, &BtoI_event);
-
- cl_int ret;
- Mat test(src.rows, src.cols, CV_8UC1);
- memset(test.data, 0, src.rows*src.cols);
- ret = clEnqueueReadImage(clCxt->impl->clCmdQueue, srcImage, CL_TRUE,
- src_origin, region, 0, 0, test.data, NULL, NULL, &ItoB_event);
- if(ret != CL_SUCCESS)
- {
- printf("read image error, %d ", ret);
- return;
- }
- clWaitForEvents(1, &ItoB_event);
-
- cout << "src" << endl;
- cout << src << endl;
- cout<<"image:"<<endl;
- cout<< test << endl;
-
- */
vector< pair<size_t, const void *> > args;
{
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
- // args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer
args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
{
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
- // args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer
args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
- if(src.clCxt->impl->double_support == 0)
- {
- CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
- }
+ // if(src.clCxt->impl->double_support == 0)
+ // {
+ // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
+ // }
dst.create( src.size(), CV_8UC4 );
if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
- if(src.clCxt->impl->double_support == 0)
- {
- CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
- }
+ // if(src.clCxt->impl->double_support == 0)
+ // {
+ // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
+ // }
dstr.create( src.size(), CV_8UC4 );
dstsp.create( src.size(), CV_16SC2 );
int borderType )
{
int cn = src.channels();
- int i, j, k, maxk, radius;
+ int i, j, maxk, radius;
Size size = src.size();
CV_Assert( (src.channels() == 1 || src.channels() == 3) &&
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
- size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind)
+ size_t width, size_t height, int src_offset)
{
size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
}
int savetofile(const Context *clcxt, cl_program &program, const char *fileName)
{
- cl_int status;
+ //cl_int status;
size_t numDevices = 1;
cl_device_id *devices = clcxt->impl->devices;
//figure out the sizes of each of the binaries.
FILE *fp = fopen(fileName, "wb+");
if(fp == NULL)
{
- char *temp;
+ char *temp = NULL;
sprintf(temp, "Failed to load kernel file : %s\r\n", fileName);
CV_Error(CV_GpuApiCallError, temp);
}
return kernel;
}
- void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize,
- size_t *globalThreads, size_t *localThreads)
+ void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
{
size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[0],
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
- size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
- cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads);
+ //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+ cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
}
- for(int i = 0; i < args.size(); i ++)
+ for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
#ifndef PRINT_KERNEL_RUN_TIME
impl->maxComputeUnits = m.impl->maxComputeUnits;
impl->double_support = m.impl->double_support;
memcpy(impl->extra_options, m.impl->extra_options, 512);
- for(int i = 0; i < m.impl->devices.size(); i++)
+ for(size_t i = 0; i < m.impl->devices.size(); i++)
{
impl->devices.push_back(m.impl->devices[i]);
impl->devName.push_back(m.impl->devName[i]);
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
-
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- uchar4 src1_data ,src2_data;
+ uchar4 src1_data ,src2_data;
- src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
- src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
- src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
- src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
+ src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
+ src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
+ src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
+ src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
- src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
- src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
- src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
- src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
+ src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
+ src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
+ src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
+ src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
- ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
- short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3)
- int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
- int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
+ int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
+ int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
- int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
- int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
+ int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
+
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- char4 src1_data = vload4(0, src1 + src1_index);
- char4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ char4 src1_data = vload4(0, src1 + src1_index_fix);
+ char4 src2_data = vload4(0, src2 + src2_index_fix);
+
+ if(src1_index < 0)
+ {
+ char4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ char4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
- ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
+
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
- short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
+
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data;
-
+
+ /* if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ */
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
-
+
char8 data;
data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data;
-
+
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- char4 src1_data = vload4(0, src1 + src1_index);
- char4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ char4 src1_data = vload4(0, src1 + src1_index_fix);
+ char4 src2_data = vload4(0, src2 + src2_index_fix);
+
+ if(src1_index < 0)
+ {
+ char4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ char4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
- ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
+
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data ^ src2_data;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
- short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
+
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
short4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
}
}
-__kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int src1_offset,
+
+__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
x = x << 2;
- #define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ #define dst_align ((dst_offset >> 1)& 3)
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
}
-
__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
int y = get_global_id(1);
if (x < cols && y < rows)
- {
+ {
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+ int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
-
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
- int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
+ int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+ uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+ uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
}
#endif
-
+
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+ uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
- uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src1_index == -16) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -8) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src2_index == -16) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -8) ? src2_data.wxyz:tmp.xyzw;
+    }
+
+
+ uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+ uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
- uchar4 src1_data = vload4(0, src1 + src1_index);
- uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-    ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-    ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+    ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
+    ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        ushort4 tmp;
+        tmp.xyzw = (src1_index == -4) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -2) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        ushort4 tmp;
+        tmp.xyzw = (src2_index == -4) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -2) ? src2_data.wxyz:tmp.xyzw;
+    }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-    short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-    short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+    short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
+    short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        short4 tmp;
+        tmp.xyzw = (src1_index == -4) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -2) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        short4 tmp;
+        tmp.xyzw = (src2_index == -4) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -2) ? src2_data.wxyz:tmp.xyzw;
+    }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-    int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
-    int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
+    int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
+    int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        int4 tmp;
+        tmp.xyzw = (src1_index == -8) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -4) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        int4 tmp;
+        tmp.xyzw = (src2_index == -8) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -4) ? src2_data.wxyz:tmp.xyzw;
+    }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src1_index == -8) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -4) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src2_index == -8) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -4) ? src2_data.wxyz:tmp.xyzw;
+    }
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src1_index == -16) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -8) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        double4 tmp;
+        tmp.xyzw = (src2_index == -16) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -8) ? src2_data.wxyz:tmp.xyzw;
+    }
+
- double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-
- float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+    if(src1_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src1_index == -8) ? src1_data.zwxy:src1_data.yzwx;
+        src1_data.xyzw = (src1_index == -4) ? src1_data.wxyz:tmp.xyzw;
+    }
+    if(src2_index < 0)
+    {
+        float4 tmp;
+        tmp.xyzw = (src2_index == -8) ? src2_data.zwxy:src2_data.yzwx;
+        src2_data.xyzw = (src2_index == -4) ? src2_data.wxyz:tmp.xyzw;
+    }
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+
+ float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index_fix));
+
+    if(src1_index == -24)
+        src1_data.s01234567 = src1_data.s23456701;
+    if(src1_index == -16)
+        src1_data.s01234567 = src1_data.s45670123;
+    if(src1_index == -8)
+        src1_data.s01234567 = src1_data.s67012345;
+
+
- float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ;
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4 ;
-#else
+#else
typedef float4 F4;
#endif
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
- uchar val = nval.s0;
+ uchar4 val = (uchar4)(nval.s0);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
+
+ uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0));
+ uchar4 src_data = val;
- uchar4 src_data;
-
+ if (con.s0 == 0)
src_data.s0 = *(src + srcIdx.s0);
+ if (con.s1 == 0)
src_data.s1 = *(src + srcIdx.s1);
+ if (con.s2 == 0)
src_data.s2 = *(src + srcIdx.s2);
+ if (con.s3 == 0)
src_data.s3 = *(src + srcIdx.s3);
+
uchar4 dst_data;
- dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
-
+
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
+ uchar4 dVal = *d;
- int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
+ int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
+ dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
*d = dst_data;
}
}
+
__kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
+
+ uchar4 src_data = val;
+ uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
- uchar4 src_data;
-
+ if (con.s0 == 0)
src_data.s0 = *(src + srcIdx.s0);
+ if (con.s1 == 0)
src_data.s1 = *(src + srcIdx.s1);
+ if (con.s2 == 0)
src_data.s2 = *(src + srcIdx.s2);
+ if (con.s3 == 0)
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
- dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
+ // dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
+ uchar4 dVal = *d;
- int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-
- dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
+ int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
+
+ dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
*d = dst_data;
-
}
}
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
- uchar val = nval.s0;
+ uchar4 val = (uchar4)(nval.s0);
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
int8 map_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
+
+ uchar4 src_data = val;
+ uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
- uchar4 src_data;
-
+ if (con.s0 == 0)
src_data.s0 = *(src + srcIdx.s0);
+ if (con.s1 == 0)
src_data.s1 = *(src + srcIdx.s1);
+ if (con.s2 == 0)
src_data.s2 = *(src + srcIdx.s2);
+ if (con.s3 == 0)
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
- dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
+
+ // dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
+ uchar4 dVal = *d;
- dst_data = (convert_uchar4(con) != convert_uchar4((int4)(0))) ? dst_data : dVal;
+ int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
+
+ dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
*d = dst_data;
}
}
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
- uchar4 nval =convert_uchar4_sat_rte(nVal);
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 );
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
- int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
- uchar4 src_a, src_b, src_c, src_d;
- src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
- src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
- src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
- src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
-
- uchar16 dst_data;
- uchar4 dst_a, dst_b, dst_c, dst_d;
- dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a;
- dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b;
- dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c;
- dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d;
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 2) + map1_offset;
+ short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx));
+ int srcIdx = map1_data.y * src_step + (map1_data.x << 2) + src_offset;
+ uchar4 nval = convert_uchar4(nVal);
+ uchar4 src_data;
+ if(map1_data.x >= src_cols || map1_data.y >= src_rows || map1_data.x <0 || map1_data.y < 0 )
+ src_data = nval;
+ else
+ src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx));
+ *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data;
- dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
- __global uchar16* d = (__global uchar16 *)(dst + dstStart);
- uchar16 dVal = *d;
-
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
-
- *d = dst_data;
}
+
}
+
__kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
-
- uchar4 nval =convert_uchar4(nVal);
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step +(x << 1) + map1_offset - ((dst_offset&15) << 1);
- float8 map1_data;
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 3) + map1_offset;
+ float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx));
+ int2 map1_dataZ = convert_int2_sat_rte(map1_data);
+ int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset;
+ uchar4 nval = convert_uchar4(nVal);
+ uchar4 src_data;
+ if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0)
+ src_data = nval;
+ else
+ src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx));
+ *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data;
- map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
- int8 map1_dataZ = convert_int8_sat_rte(map1_data);
-
- int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset;
- uchar4 src_a, src_b, src_c, src_d;
- src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
- src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
- src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
- src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
-
- uchar16 dst_data;
- uchar4 dst_a, dst_b, dst_c, dst_d;
- dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a;
- dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b;
- dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c;
- dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d;
-
- dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
- __global uchar16* d = (__global uchar16 *)(dst + dstStart);
-
- uchar16 dVal = *d;
-
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
-
- *d = dst_data;
}
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
- {
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
-
- uchar4 nval =convert_uchar4(nVal);
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
- float4 map1_data;
- float4 map2_data;
-
- map1_data = *((__global float4 *)((__global char*)map1 + map1Start));
- map2_data = *((__global float4 *)((__global char*)map2 + map1Start));
- float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
- int8 map1_dataZ = convert_int8_sat_rte(map_data);
-
- int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<((int4)(2))) + src_offset;
- uchar4 src_a, src_b, src_c, src_d;
- src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
- src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
- src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
- src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
-
- uchar16 dst_data;
- uchar4 dst_a, dst_b, dst_c, dst_d;
- dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a;
- dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b;
- dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c;
- dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d;
-
- dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
- __global uchar16* d = (__global uchar16 *)(dst + dstStart);
-
- uchar16 dVal = *d;
-
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != ((uchar16)(0))) ? dst_data : dVal;
-
- *d = dst_data;
-
+ {
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 2) + map1_offset;
+ float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
+ float map2_data = *((__global float *)((__global char*)map2 + mapIdx));
+ int srcIdx = convert_int_sat_rte(map2_data) * src_step + (convert_int_sat_rte(map1_data) << 2) + src_offset;
+ uchar4 nval = convert_uchar4(nVal);
+ uchar4 src_data;
+ if(convert_int_sat_rte(map1_data) >= src_cols || convert_int_sat_rte(map2_data) >= src_rows || convert_int_sat_rte(map1_data) < 0 || convert_int_sat_rte(map2_data) < 0)
+ src_data = nval;
+ else
+ src_data = *((__global uchar4 *)((__global uchar *)src + srcIdx));
+ *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = src_data;
}
-
}
-
__kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
-
- int gx = x - (dst_offset&15);
- int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
-
- float4 nval =convert_float4(nVal);
- float val = nval.s0;
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
-
- int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<((int4)(2))) + src_offset;
-
- float4 src_data;
- src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
- src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
- src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
- src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
- float4 dst_data;
-
- dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
- dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
- dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
- dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
-
-
- __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
-
- float4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
-
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 2) + map1_offset;
+ short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx));
+ int srcIdx = map1_data.y * src_step + (map1_data.x << 2) + src_offset;
+ float nval = convert_float(nVal.x);
+ float src_data;
+ if(map1_data.x >= src_cols || map1_data.y >= src_rows|| map1_data.x < 0 || map1_data.y < 0)
+ src_data = nval;
+ else
+ src_data = *((__global float *)((__global uchar *)src + srcIdx));
+ *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
+
+
}
+
}
+
__kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
-
- int gx = x - (dst_offset&15);
- int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
-
- float4 nval =convert_float4(nVal);
- float val = nval.s0;
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset&15) << 1);
- float8 map1_data;
-
- map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
- int8 map1_dataZ = convert_int8_sat_rte(map1_data);
-
- int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
-
- float4 src_data;
- src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
- src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
- src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
- src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
- float4 dst_data;
-
- dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
- dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
- dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
- dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
-
-
- __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
-
- float4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
-
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 3) + map1_offset;
+ float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx));
+ int2 map1_dataZ = convert_int2_sat_rte(map1_data);
+ int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset;
+ float nval = convert_float(nVal.x);
+ float src_data;
+ if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0)
+ src_data = nval;
+ else
+ src_data = *((__global float *)((__global uchar *)src + srcIdx));
+ *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
+
+
}
}
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
-
- int gx = x - (dst_offset&15);
- int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
-
- float4 nval =convert_float4(nVal);
- float val = nval.s0;
-
- int dstStart = y * dst_step + x + dst_offset - (dst_offset&15);
-
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
- float4 map1_data;
- float4 map2_data;
-
- map1_data = *((__global float4 *)((__global char*)map1 + map1Start));
- map2_data = *((__global float4 *)((__global char*)map2 + map1Start));
- float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
- int8 map1_dataZ = convert_int8_sat_rte(map_data);
-
- int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<(int4)(2)) + src_offset;
-
- float4 src_data;
- src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
- src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
- src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
- src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
- float4 dst_data;
-
- dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0;
- dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1;
- dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2;
- dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3;
-
-
- __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
-
- float4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
-
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 2) + map1_offset;
+ float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
+ float map2_data = *((__global float *)((__global char*)map2 + mapIdx));
+ float2 map_data = (float2)(map1_data, map2_data);
+ int2 map1_dataZ = convert_int2_sat_rte(map_data);
+ int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 2) + src_offset;
+ float nval = convert_float(nVal.x);
+ float src_data;
+
+ if(map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows || map1_dataZ.x < 0 || map1_dataZ.y < 0)
+ src_data = nval;
+ else
+ src_data = *((__global float *)((__global uchar *)src + srcIdx));
+ *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
+
+
}
}
int dstIdx = y * dst_step + (x << 4) + dst_offset ;
int mapIdx = y * map1_step + (x << 2) + map1_offset ;
short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx));
-
int srcIdx = map1_data.y * src_step + (map1_data.x << 4) + src_offset;
float4 nval = convert_float4(nVal);
- float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_data.x >= src_cols || map1_data.y >= src_rows) ? nval : src_data;
+ float4 src_data;
+ if (map1_data.x <0 || map1_data.x >= src_cols || map1_data.y <0 || map1_data.y >= src_rows)
+ src_data = nval;
+ else
+ src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
+
+
}
}
+
__kernel void remapNNFConstant_C4_D5(__global float * dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
int2 map1_dataZ = convert_int2_sat_rte(map1_data);
int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
float4 nval = convert_float4(nVal);
- float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? nval : src_data;
+ float4 src_data = nval;
+ if(map1_dataZ.x >= 0 && map1_dataZ.x < src_cols && map1_dataZ.y >=0 && map1_dataZ.y < src_rows)
+ src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
}
}
int2 map1_dataZ = convert_int2_sat_rte(map_data);
int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
float4 nval = convert_float4(nVal);
- float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? nval : src_data;
+ float4 src_data = nval;
+ if(map1_dataZ.x >= 0 && map1_dataZ.x < src_cols && map1_dataZ.y >= 0 && map1_dataZ.y < src_rows)
+ src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
}
}
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 2;
+ x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
- uchar val = nval.s0;
-
+ uchar4 val = (uchar4)(nval.s0);
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int4 map1_dataDy = map1_dataD.odd;
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
-
- int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
- int4 src_StartD = src_StartU + src_step;
- /*
- //not using the vload
- int4 src_StartU1 = src_StartU + (int4)(1);
- int4 src_StartD1 = src_StartD + (int4)(1);
-
- uchar4 a, b, c, d;
- a.x = *(src_StartU.x + src);
- a.y = *(src_StartU.y + src);
- a.z = *(src_StartU.z + src);
- a.w = *(src_StartU.w + src);
-
- b.x = *(src_StartU1.x + src);
- b.y = *(src_StartU1.y + src);
- b.z = *(src_StartU1.z + src);
- b.w = *(src_StartU1.w + src);
-
- c.x = *(src_StartD.x + src);
- c.y = *(src_StartD.y + src);
- c.z = *(src_StartD.z + src);
- c.w = *(src_StartD.w + src);
-
- d.x = *(src_StartD1.x + src);
- d.y = *(src_StartD1.y + src);
- d.z = *(src_StartD1.z + src);
- d.w = *(src_StartD1.w + src);
- */
- uchar2 aU, aD, bU, bD, cU, cD, dU, dD;
-
- aU = vload2(0, src + src_StartU.s0);
- bU = vload2(0, src + src_StartU.s1);
- cU = vload2(0, src + src_StartU.s2);
- dU = vload2(0, src + src_StartU.s3);
- aD = vload2(0, src + src_StartD.s0);
- bD = vload2(0, src + src_StartD.s1);
- cD = vload2(0, src + src_StartD.s2);
- dD = vload2(0, src + src_StartD.s3);
-
- uchar4 a, b, c, d;
- a = (uchar4)(aU.x, bU.x, cU.x, dU.x);
- b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
- c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
- d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
-
- int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
- int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
- int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
- int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
- a = (convert_uchar4(ac) == (uchar4)(0))? a : val;
- b = (convert_uchar4(bc) == (uchar4)(0))? b : val;
- c = (convert_uchar4(cc) == (uchar4)(0))? c : val;
- d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
-
+ uchar4 a = val, b = val, c = val, d =val;
+
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ a.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx.s0 + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ a.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx.s1 + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ a.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx.s2 + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ a.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx.s3 + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ b.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx1.s0 + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ b.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx1.s1 + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ b.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx1.s2 + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ b.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx1.s3 + src_offset));
+
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ c.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx.s0 + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ c.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx.s1 + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ c.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx.s2 + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ c.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx.s3 + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ d.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx1.s0 + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ d.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx1.s1 + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
+
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-
+
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *D;
+ uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 2;
+ x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
- uchar val = nval.s0;
-
+ uchar4 val = (uchar4)(nval.s0);
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
- int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset;
- int4 src_StartD = src_StartU + src_step;
- /*
- //not using the vload
- int4 src_StartU1 = src_StartU + (int4)(1);
- int4 src_StartD1 = src_StartD + (int4)(1);
-
- uchar4 a, b, c, d;
- a.x = *(src_StartU.x + src);
- a.y = *(src_StartU.y + src);
- a.z = *(src_StartU.z + src);
- a.w = *(src_StartU.w + src);
-
- b.x = *(src_StartU1.x + src);
- b.y = *(src_StartU1.y + src);
- b.z = *(src_StartU1.z + src);
- b.w = *(src_StartU1.w + src);
-
- c.x = *(src_StartD.x + src);
- c.y = *(src_StartD.y + src);
- c.z = *(src_StartD.z + src);
- c.w = *(src_StartD.w + src);
-
- d.x = *(src_StartD1.x + src);
- d.y = *(src_StartD1.y + src);
- d.z = *(src_StartD1.z + src);
- d.w = *(src_StartD1.w + src);
- */
- uchar2 aU, aD, bU, bD, cU, cD, dU, dD;
-
- aU = vload2(0, src + src_StartU.s0);
- bU = vload2(0, src + src_StartU.s1);
- cU = vload2(0, src + src_StartU.s2);
- dU = vload2(0, src + src_StartU.s3);
- aD = vload2(0, src + src_StartD.s0);
- bD = vload2(0, src + src_StartD.s1);
- cD = vload2(0, src + src_StartD.s2);
- dD = vload2(0, src + src_StartD.s3);
-
- uchar4 a, b, c, d;
- a = (uchar4)(aU.x, bU.x, cU.x, dU.x);
- b = (uchar4)(aU.y, bU.y, cU.y, dU.y);
- c = (uchar4)(aD.x, bD.x, cD.x, dD.x);
- d = (uchar4)(aD.y, bD.y, cD.y, dD.y);
-
- int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
- int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
- int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
- int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
- a = (convert_uchar4(ac) == (uchar4)(0))? a : val;
- b = (convert_uchar4(bc) == (uchar4)(0))? b : val;
- c = (convert_uchar4(cc) == (uchar4)(0))? c : val;
- d = (convert_uchar4(dc) == (uchar4)(0))? d : val;
+ uchar4 a = val, b = val, c = val, d =val;
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ a.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx.s0 + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ a.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx.s1 + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ a.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx.s2 + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ a.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx.s3 + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ b.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s0 * src_step + map1_dataDx1.s0 + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ b.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s1 * src_step + map1_dataDx1.s1 + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ b.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s2 * src_step + map1_dataDx1.s2 + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ b.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy.s3 * src_step + map1_dataDx1.s3 + src_offset));
+
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ c.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx.s0 + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ c.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx.s1 + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ c.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx.s2 + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ c.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx.s3 + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ d.s0 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s0 * src_step + map1_dataDx1.s0 + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ d.s1 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s1 * src_step + map1_dataDx1.s1 + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
+
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-
+
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *D;
+ uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
}
-__kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
- __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
- int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if(x < threadCols && y < dst_rows)
- {
- x = x << 2;
- int gx = x - (dst_offset&3);
- int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
-
- uchar4 nval =convert_uchar4(nVal);
- uchar val = nval.s0;
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
-
- int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
- int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
-
- uchar4 src_data;
-
- src_data.s0 = *(src + srcIdx.s0);
- src_data.s1 = *(src + srcIdx.s1);
- src_data.s2 = *(src + srcIdx.s2);
- src_data.s3 = *(src + srcIdx.s3);
- uchar4 dst_data;
- dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data;
-
- __global uchar4* d = (__global uchar4 *)(dst + dstStart);
-
- uchar4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
-
- }
-
-}
-
-
__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 3) + map1_offset;
+ float2 map_data = *((__global float2 *)((__global char*)map1 + mapIdx));
+ int2 map_dataA = convert_int2(map_data);
+ float2 u = map_data - convert_float2(map_dataA);
+ int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
+ int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
+ int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
+ uchar4 nval = convert_uchar4(nVal);
+ uchar4 a, b, c , d;
+ if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0)
+ a = nval;
+ else
+ a = *((__global uchar4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<2) + src_offset ));
+ if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0)
+ b = nval;
+ else
+ b = *((__global uchar4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<2) + src_offset ));
+
+ if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0)
+ c = nval;
+ else
+ c = *((__global uchar4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<2) + src_offset ));
+
+ if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0)
+ d = nval;
+ else
+ d = *((__global uchar4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<2) + src_offset ));
+ float4 dst_data = convert_float4(a)*((float4)(1.0-u.x)*((float4)(1.0-u.y))) + convert_float4(b)*((float4)(u.x))*((float4)(1.0-u.y)) + convert_float4(c)*((float4)(1.0-u.x))*((float4)(u.y)) + convert_float4(d)*((float4)(u.x))*((float4)(u.y));
+ *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = convert_uchar4_sat_rte(dst_data);
- uchar4 nval =convert_uchar4(nVal);
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
- float8 map1_data;
-
- map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
- int8 map1_dataD = convert_int8(map1_data);
- float8 temp = map1_data - convert_float8(map1_dataD);
-
- float4 u = temp.even;
- float4 v = temp.odd;
- float4 ud = (float4)(1.0) - u;
- float4 vd = (float4)(1.0) - v;
-
- //float8 map1_dataU = map1_dataD + 1;
-
- int4 map1_dataDx = map1_dataD.even;
- int4 map1_dataDy = map1_dataD.odd;
- int4 map1_dataDx1 = map1_dataDx + (int4)(1);
- int4 map1_dataDy1 = map1_dataDy + (int4)(1);
-
- int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset;
- int4 src_StartD = src_StartU + src_step;
-
- uchar8 aU, bU, cU, dU, aD, bD, cD, dD;
- aU = vload8(0, src + src_StartU.s0);
- bU = vload8(0, src + src_StartU.s1);
- cU = vload8(0, src + src_StartU.s2);
- dU = vload8(0, src + src_StartU.s3);
- aD = vload8(0, src + src_StartD.s0);
- bD = vload8(0, src + src_StartD.s1);
- cD = vload8(0, src + src_StartD.s2);
- dD = vload8(0, src + src_StartD.s3);
- uchar16 a, b, c, d;
- a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123);
- b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567);
- c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123);
- d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567);
- int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
- int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
- int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
- int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
-
- int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w));
- int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
- int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
- int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
-
- uchar16 val = (uchar16)(nval, nval, nval, nval);
- a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
- b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
- c = (convert_uchar16(ccc) == (uchar16)(0))? c : val;
- d = (convert_uchar16(dcc) == (uchar16)(0))? d : val;
-
- float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w));
- float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w));
- float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w));
- float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
-
- uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
-
- __global uchar16* D = (__global uchar16 *)(dst + dstStart);
-
- uchar16 dVal = *D;
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
-
- *D = dst_data;
}
-}
-
+}
__kernel void remapLNF1Constant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
__global float * map1, __global float * map2, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
+ int dstIdx = y * dst_step + (x << 2) + dst_offset;
+ int mapIdx = y * map1_step + (x << 2) + map1_offset;
+ float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
+ float map2_data = *((__global float *)((__global char*)map2 + mapIdx));
+ float2 map_data = (float2)(map1_data, map2_data);
+ int2 map_dataA = convert_int2(map_data);
+ float2 u = map_data - convert_float2(map_dataA);
+ int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
+ int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
+ int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
+ uchar4 nval = convert_uchar4(nVal);
+ uchar4 a, b, c , d;
+ if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0)
+ a = nval;
+ else
+ a = *((__global uchar4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<2) + src_offset ));
+ if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0)
+ b = nval;
+ else
+ b = *((__global uchar4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<2) + src_offset ));
+
+ if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0)
+ c = nval;
+ else
+ c = *((__global uchar4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<2) + src_offset ));
+
+ if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0)
+ d = nval;
+ else
+ d = *((__global uchar4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<2) + src_offset ));
+ float4 dst_data = convert_float4(a)*((float4)(1.0-u.x)*((float4)(1.0-u.y))) + convert_float4(b)*((float4)(u.x))*((float4)(1.0-u.y)) + convert_float4(c)*((float4)(1.0-u.x))*((float4)(u.y)) + convert_float4(d)*((float4)(u.x))*((float4)(u.y));
+ *((__global uchar4 *)((__global uchar*)dst + dstIdx)) = convert_uchar4_sat_rte(dst_data);
- uchar4 nval =convert_uchar4(nVal);
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
- int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
- float4 map1_data;
- float4 map2_data;
-
- map1_data = *((__global float4 *)((__global char*)map1 + map1Start));
- map2_data = *((__global float4 *)((__global char*)map2 + map1Start));
- float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
- int8 map1_dataD = convert_int8(map_data);
- float8 temp = map_data - convert_float8(map1_dataD);
-
- float4 u = temp.even;
- float4 v = temp.odd;
- float4 ud = (float4)(1.0) - u;
- float4 vd = (float4)(1.0) - v;
-
- //float8 map1_dataU = map1_dataD + 1;
-
- int4 map1_dataDx = map1_dataD.even;
- int4 map1_dataDy = map1_dataD.odd;
- int4 map1_dataDx1 = map1_dataDx + (int4)(1);
- int4 map1_dataDy1 = map1_dataDy + (int4)(1);
-
- int4 src_StartU = map1_dataDy * src_step + (convert_int4(map1_dataDx) << (int4)(2)) + src_offset;
- int4 src_StartD = src_StartU + src_step;
-
- uchar8 aU, bU, cU, dU, aD, bD, cD, dD;
- aU = vload8(0, src + src_StartU.s0);
- bU = vload8(0, src + src_StartU.s1);
- cU = vload8(0, src + src_StartU.s2);
- dU = vload8(0, src + src_StartU.s3);
- aD = vload8(0, src + src_StartD.s0);
- bD = vload8(0, src + src_StartD.s1);
- cD = vload8(0, src + src_StartD.s2);
- dD = vload8(0, src + src_StartD.s3);
- uchar16 a, b, c, d;
- a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123);
- b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567);
- c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123);
- d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567);
- int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0);
- int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0);
- int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0);
- int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0);
-
- int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w));
- int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w));
- int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w));
- int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w));
-
- uchar16 val = (uchar16)(nval, nval, nval, nval);
- a = (convert_uchar16(acc) == (uchar16)(0))? a : val;
- b = (convert_uchar16(bcc) == (uchar16)(0))? b : val;
- c = (convert_uchar16(ccc) == (uchar16)(0))? c : val;
- d = (convert_uchar16(dcc) == (uchar16)(0))? d : val;
-
- float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w));
- float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w));
- float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w));
- float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w));
-
- uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V );
-
- __global uchar16* D = (__global uchar16 *)(dst + dstStart);
-
- uchar16 dVal = *D;
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
-
- *D = dst_data;
}
}
-__kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src,
- __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
- int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if(x < threadCols && y < dst_rows)
- {
- x = x << 4;
- int gx = x - (dst_offset&15);
- int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15);
- uchar4 nval =convert_uchar4_sat_rte(nVal);
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 );
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
- int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) <<(int4)(2)) + src_offset;
- uchar4 src_a, src_b, src_c, src_d;
- src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0));
- src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1));
- src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2));
- src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3));
-
- uchar16 dst_data;
- uchar4 dst_a, dst_b, dst_c, dst_d;
- dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a;
- dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b;
- dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c;
- dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d;
-
- dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d);
- __global uchar16* d = (__global uchar16 *)(dst + dstStart);
-
- uchar16 dVal = *d;
-
- int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar16(con) != (uchar16)(0)) ? dst_data : dVal;
- *d = dst_data;
-
- }
-
-}
__kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * restrict src,
__global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
+ x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
-
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
float8 map1_data;
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
- int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
- int4 src_StartD = src_StartU + src_step;
- /*
- //not using the vload
- int4 src_StartU1 = src_StartU + (int4)(1);
- int4 src_StartD1 = src_StartD + (int4)(1);
-
- float4 a, b, c, d;
- a.x = *(src_StartU.x + src);
- a.y = *(src_StartU.y + src);
- a.z = *(src_StartU.z + src);
- a.w = *(src_StartU.w + src);
-
- b.x = *(src_StartU1.x + src);
- b.y = *(src_StartU1.y + src);
- b.z = *(src_StartU1.z + src);
- b.w = *(src_StartU1.w + src);
-
- c.x = *(src_StartD.x + src);
- c.y = *(src_StartD.y + src);
- c.z = *(src_StartD.z + src);
- c.w = *(src_StartD.w + src);
-
- d.x = *(src_StartD1.x + src);
- d.y = *(src_StartD1.y + src);
- d.z = *(src_StartD1.z + src);
- d.w = *(src_StartD1.w + src);
- */
- float2 aU, aD, bU, bD, cU, cD, dU, dD;
-
- aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0));
- bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1));
- cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2));
- dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3));
- aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0));
- bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1));
- cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2));
- dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3));
-
- float4 a, b, c, d;
- a = (float4)(aU.x, bU.x, cU.x, dU.x);
- b = (float4)(aU.y, bU.y, cU.y, dU.y);
- c = (float4)(aD.x, bD.x, cD.x, dD.x);
- d = (float4)(aD.y, bD.y, cD.y, dD.y);
-
- int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
- int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
- int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
- int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0));
- a = (convert_float4(ac) == (float4)(0))? a : val;
- b = (convert_float4(bc) == (float4)(0))? b : val;
- c = (convert_float4(cc) == (float4)(0))? c : val;
- d = (convert_float4(dc) == (float4)(0))? d : val;
-
+ float4 a = val, b = val, c = val, d = val;
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ a.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ a.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ a.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ a.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ b.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ b.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ b.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ b.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
+
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ c.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ c.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ c.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ c.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ d.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ d.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
+
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-
+
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
- float4 dVal = *D;
+ float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
+ x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
-
+
int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
float4 map1_data;
int4 map1_dataDx1 = map1_dataDx + (int4)(1);
int4 map1_dataDy1 = map1_dataDy + (int4)(1);
- int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << (int4)(2)) + src_offset;
- int4 src_StartD = src_StartU + src_step;
- /*
- //not using the vload
- int4 src_StartU1 = src_StartU + (int4)(1);
- int4 src_StartD1 = src_StartD + (int4)(1);
-
- float4 a, b, c, d;
- a.x = *(src_StartU.x + src);
- a.y = *(src_StartU.y + src);
- a.z = *(src_StartU.z + src);
- a.w = *(src_StartU.w + src);
-
- b.x = *(src_StartU1.x + src);
- b.y = *(src_StartU1.y + src);
- b.z = *(src_StartU1.z + src);
- b.w = *(src_StartU1.w + src);
-
- c.x = *(src_StartD.x + src);
- c.y = *(src_StartD.y + src);
- c.z = *(src_StartD.z + src);
- c.w = *(src_StartD.w + src);
-
- d.x = *(src_StartD1.x + src);
- d.y = *(src_StartD1.y + src);
- d.z = *(src_StartD1.z + src);
- d.w = *(src_StartD1.w + src);
- */
- float2 aU, aD, bU, bD, cU, cD, dU, dD;
-
- aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0));
- bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1));
- cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2));
- dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3));
- aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0));
- bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1));
- cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2));
- dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3));
-
- float4 a, b, c, d;
- a = (float4)(aU.x, bU.x, cU.x, dU.x);
- b = (float4)(aU.y, bU.y, cU.y, dU.y);
- c = (float4)(aD.x, bD.x, cD.x, dD.x);
- d = (float4)(aD.y, bD.y, cD.y, dD.y);
-
- int4 ac =(map1_dataDx >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDy < (int4)(0) || map1_dataDy < (int4)(0));
- int4 bc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy >= (int4)(src_rows) || map1_dataDx1 < (int4)(0) || map1_dataDy < (int4)(0));
- int4 cc =(map1_dataDx >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDx < (int4)(0));
- int4 dc =(map1_dataDx1 >= (int4)(src_cols) || map1_dataDy1 >= (int4)(src_rows) || map1_dataDy1 < (int4)(0) || map1_dataDy1 < (int4)(0));
- a = (convert_float4(ac) == (float4)(0))? a : val;
- b = (convert_float4(bc) == (float4)(0))? b : val;
- c = (convert_float4(cc) == (float4)(0))? c : val;
- d = (convert_float4(dc) == (float4)(0))? d : val;
-
+ float4 a = val, b = val, c = val, d = val;
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ a.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ a.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ a.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ a.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy.s0 < src_rows && map1_dataDy.s0 >= 0)
+ b.s0 = *((__global float*)((__global uchar *)src + map1_dataDy.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy.s1 < src_rows && map1_dataDy.s1 >= 0)
+ b.s1 = *((__global float*)((__global uchar *)src + map1_dataDy.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy.s2 < src_rows && map1_dataDy.s2 >= 0)
+ b.s2 = *((__global float*)((__global uchar *)src + map1_dataDy.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy.s3 < src_rows && map1_dataDy.s3 >= 0)
+ b.s3 = *((__global float*)((__global uchar *)src + map1_dataDy.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
+
+ if (map1_dataDx.s0 < src_cols && map1_dataDx.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ c.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx.s0 << 2) + src_offset));
+ if (map1_dataDx.s1 < src_cols && map1_dataDx.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ c.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx.s1 << 2) + src_offset));
+ if (map1_dataDx.s2 < src_cols && map1_dataDx.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ c.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx.s2 << 2) + src_offset));
+ if (map1_dataDx.s3 < src_cols && map1_dataDx.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ c.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx.s3 << 2) + src_offset));
+
+ if (map1_dataDx1.s0 < src_cols && map1_dataDx1.s0 >= 0 && map1_dataDy1.s0 < src_rows && map1_dataDy1.s0 >= 0)
+ d.s0 = *((__global float*)((__global uchar *)src + map1_dataDy1.s0 * src_step + (map1_dataDx1.s0 << 2) + src_offset));
+ if (map1_dataDx1.s1 < src_cols && map1_dataDx1.s1 >= 0 && map1_dataDy1.s1 < src_rows && map1_dataDy1.s1 >= 0)
+ d.s1 = *((__global float*)((__global uchar *)src + map1_dataDy1.s1 * src_step + (map1_dataDx1.s1 << 2) + src_offset));
+ if (map1_dataDx1.s2 < src_cols && map1_dataDx1.s2 >= 0 && map1_dataDy1.s2 < src_rows && map1_dataDy1.s2 >= 0)
+ d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
+ if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
+ d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
+
+
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-
+
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
- float4 dVal = *D;
+ float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
}
}
-__kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * restrict src,
- __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
- int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if(x < threadCols && y < dst_rows)
- {
- x = x << 4;
-
- int gx = x - (dst_offset&15);
- int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
-
- float4 nval =convert_float4(nVal);
- float val = nval.s0;
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
-
- int map1Start = y * map1_step + x + map1_offset - (dst_offset&15);
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
-
- int4 srcIdx = convert_int4(map1_data.odd) * src_step + (convert_int4(map1_data.even) << (int4)(2)) + src_offset;
-
- float4 src_data;
- src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0));
- src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1));
- src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2));
- src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3));
- float4 dst_data;
-
- dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0;
- dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1;
- dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2;
- dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3;
-
-
- __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart);
-
- float4 dVal = *d;
-
- int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows);
- dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
-
- }
-
-}
__kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const * restrict src,
{
int dstIdx = y * dst_step + (x << 4) + dst_offset ;
int mapIdx = y * map1_step + (x << 3) + map1_offset ;
- float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx));
-
- int2 map1_dataZ = convert_int2(map1_data);
-
- int mX = map1_dataZ.x;
- int mY = map1_dataZ.y;
- int mX1 = map1_dataZ.x + 1;
- int mY1 = map1_dataZ.y + 1;
-
- float u = map1_data.x - convert_float(map1_dataZ.x);
- float v = map1_data.y - convert_float(map1_dataZ.y);
- float ud = 1.0 - u;
- float vd = 1.0 - v;
-
- int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
- float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx));
- float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step));
-
- float4 a = src_dataU.lo;
- float4 b = src_dataU.hi;
- float4 c = src_dataD.lo;
- float4 d = src_dataD.hi;
-
+ float2 map_data = *((__global float2 *)((__global char*)map1 + mapIdx));
+ int2 map_dataA = convert_int2(map_data);
+ float2 u = map_data - convert_float2(map_dataA);
+ int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
+ int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
+ int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
float4 nval = convert_float4(nVal);
- a = (mX >= src_cols || mY >= src_rows ) ? nval : a;
- b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b;
- c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
- d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
-
- float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
+ float4 a, b, c , d;
+ if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0)
+ a = nval;
+ else
+ a = *((__global float4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<4) + src_offset ));
+ if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0)
+ b = nval;
+ else
+ b = *((__global float4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<4) + src_offset ));
+
+ if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0)
+ c = nval;
+ else
+ c = *((__global float4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<4) + src_offset ));
+
+ if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0)
+ d = nval;
+ else
+ d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
+
+        float4 dst_data = a * ((float4)(1.0f-u.x)) * ((float4)(1.0f-u.y)) + b *((float4)(u.x)) * ((float4)(1.0f-u.y)) + c * ((float4)(1.0f-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
}
}
float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
float map2_data = *((__global float *)((__global char*)map2 + mapIdx));
float2 map_data = (float2)(map1_data, map2_data);
- int2 map1_dataZ = convert_int2(map_data);
-
- int mX = map1_dataZ.x;
- int mY = map1_dataZ.y;
- int mX1 = map1_dataZ.x + 1;
- int mY1 = map1_dataZ.y + 1;
-
- float u = map1_data - convert_float(map1_dataZ.x);
- float v = map2_data - convert_float(map1_dataZ.y);
- float ud = 1.0 - u;
- float vd = 1.0 - v;
-
- int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset;
- float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx));
- float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step));
-
- float4 a = src_dataU.lo;
- float4 b = src_dataU.hi;
- float4 c = src_dataD.lo;
- float4 d = src_dataD.hi;
-
+ int2 map_dataA = convert_int2(map_data);
+ float2 u = map_data - convert_float2(map_dataA);
+ int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
+ int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
+ int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
float4 nval = convert_float4(nVal);
- a = (mX >= src_cols || mY >= src_rows ) ? nval : a;
- b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b;
- c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c;
- d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d;
+ float4 a, b, c , d;
+ if(map_dataA.x < 0 || map_dataA.x >= src_cols || map_dataA.y >= src_rows || map_dataA.y < 0)
+ a = nval;
+ else
+ a = *((__global float4 *)((__global uchar *)src + map_dataA.y * src_step + (map_dataA.x<<4) + src_offset ));
+ if(map_dataB.x < 0 || map_dataB.x >= src_cols || map_dataB.y >= src_rows || map_dataB.y < 0)
+ b = nval;
+ else
+ b = *((__global float4 *)((__global uchar *)src + map_dataB.y * src_step + (map_dataB.x<<4) + src_offset ));
+
+ if(map_dataC.x < 0 || map_dataC.x >= src_cols || map_dataC.y >= src_rows || map_dataC.y < 0)
+ c = nval;
+ else
+ c = *((__global float4 *)((__global uchar *)src + map_dataC.y * src_step + (map_dataC.x<<4) + src_offset ));
+
+ if(map_dataD.x < 0 || map_dataD.x >= src_cols || map_dataD.y >= src_rows || map_dataD.y < 0)
+ d = nval;
+ else
+ d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
+
+        float4 dst_data = a * ((float4)(1.0f-u.x)) * ((float4)(1.0f-u.y)) + b *((float4)(u.x)) * ((float4)(1.0f-u.y)) + c * ((float4)(1.0f-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
- float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v;
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
}
}
-/*
-////////////////////////////////////////////////////////////////////////
-///////////////////using image buffer///////////////////////////////////
-////////////////////////////////////////////////////////////////////////
-
-
-__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only image2d_t src,
- __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step,
- int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- x = x << 2;
- if(x < threadCols && y < dst_rows)
- {
- int gx = x - (dst_offset&3);
- int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
-
- uchar4 nval =convert_uchar4(nVal);
- char val = nval.s0;
-
- int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
-
- int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2);
- short8 map1_data;
-
- map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
-
- const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
- CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
- int4 src_data;
- src_data.x = read_imageui(src, sampler, (int2)((int)map1_data.s0, (int)map1_data.s1)).x;
- src_data.y = read_imageui(src, sampler, (int2)((int)map1_data.s2, (int)map1_data.s3)).x;
- src_data.z = read_imageui(src, sampler, (int2)((int)map1_data.s4, (int)map1_data.s5)).x;
- src_data.w = read_imageui(src, sampler, (int2)((int)map1_data.s6, (int)map1_data.s7)).x;
-
- int4 bcon = (convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows));
- uchar4 dst_data = (convert_uchar4(bcon != 0)) ? (uchar4)(val) : convert_uchar4(src_data);
- __global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
- int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
- dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
-
- *d = dst_data;
- }
-}
-*/
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
+
+ uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
+ uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
+ uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
+ uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
- uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
- uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
- uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
- uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
-
- int total_bytes = src_offset + rows * src_step;
- uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
- uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
- uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
+ int total_bytes = src_offset + rows * src_step;
+ uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
+ uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
+ uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
}
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
- tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+ tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2;
-
+
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
}
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
-        uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
-        uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
+        uchar8 src_data_0 = vload8(0, mat_src + src1_index_fix);
+        uchar8 src_data_1 = vload8(0, mat_src + src2_index_fix);
+ if(src_idx_0 == -6)
+ src_data_0.s01234567 = src_data_0.s67012345;
+ if(src_idx_0 == -4)
+ src_data_0.s01234567 = src_data_0.s45670123;
+ if(src_idx_0 == -2)
+ src_data_0.s01234567 = src_data_0.s23456701;
+ if(src_idx_1 == -6)
+ src_data_1.s01234567 = src_data_1.s67012345;
+ if(src_idx_1 == -4)
+ src_data_1.s01234567 = src_data_1.s45670123;
+ if(src_idx_1 == -2)
+ src_data_1.s01234567 = src_data_1.s23456701;
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
}
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-
- char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
- char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
- char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
- char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
- char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
- char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
- char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
+
+ char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
+ char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
+ char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
+ char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
+ char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
+ char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
+ char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
}
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
- tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+ tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2;
-
+
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
}
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
-    char8 src_data_0 = vload8(0, mat_src + src_idx_0);
-    char8 src_data_1 = vload8(0, mat_src + src_idx_1);
+    char8 src_data_0 = vload8(0, mat_src + src1_index_fix);
+    char8 src_data_1 = vload8(0, mat_src + src2_index_fix);
-
+ if(src_idx_0 == -6)
+ src_data_0.s01234567 = src_data_0.s67012345;
+ if(src_idx_0 == -4)
+ src_data_0.s01234567 = src_data_0.s45670123;
+ if(src_idx_0 == -2)
+ src_data_0.s01234567 = src_data_0.s23456701;
+ if(src_idx_1 == -6)
+ src_data_1.s01234567 = src_data_1.s67012345;
+ if(src_idx_1 == -4)
+ src_data_1.s01234567 = src_data_1.s45670123;
+ if(src_idx_1 == -2)
+ src_data_1.s01234567 = src_data_1.s23456701;
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
}
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
+ int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
+ int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
- ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+    ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
+ if(src_idx_0 == -6)
+ src_data0.s01234567 = src_data0.s67012345;
+ if(src_idx_0 == -4)
+ src_data0.s01234567 = src_data0.s45670123;
+ if(src_idx_0 == -2)
+ src_data0.s01234567 = src_data0.s23456701;
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
}
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
}
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0));
- ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1));
-
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+ ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
+ ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
+ if(src_idx_0 < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+        src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz : tmp.xyzw;
+ }
+ if(src_idx_1 < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
+ src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
+ }
+
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
}
}
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
+ int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
+ int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
- short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0));
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+    short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src1_index_fix));
+
+ if(src_idx_0 == -6)
+ src_data0.s01234567 = src_data0.s67012345;
+ if(src_idx_0 == -4)
+ src_data0.s01234567 = src_data0.s45670123;
+ if(src_idx_0 == -2)
+ src_data0.s01234567 = src_data0.s23456701;
+
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
}
}
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
-    short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
-    short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
+    short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src1_index_fix));
+    short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src2_index_fix));
+ if(src_idx_0 < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+ src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+ }
+    if(src_idx_1 < 0)
+    {
+        short4 tmp;
+        tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
+        src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
+ }
+
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
}
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
}
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
{
- matchTemplateNaive_SQDIFF(image, templ, result, image.channels());
+ matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
return;
}
else
CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
|| ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
);
- CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.oclchannels() == 4) && result.channels() == 1);
+ CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
Context *clCxt = image.clCxt;
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
{
- matchTemplateNaive_CCORR(image, templ, result, image.channels());
+ matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
return;
}
else
image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F);
}
- CV_Assert(image.channels() == 1);
- oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels()));
+ CV_Assert(image.oclchannels() == 1);
+ oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
}
CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
|| ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
);
- CV_Assert(image.channels() == templ.channels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.channels() == 1);
+ CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
Context *clCxt = image.clCxt;
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// to be continued in the following section
- if(image.channels() == 1)
+ if(image.oclchannels() == 1)
{
buf.image_sums.resize(1);
integral(image, buf.image_sums[0]);
buf.image_sums.resize(buf.images.size());
- for(int i = 0; i < image.channels(); i ++)
+ for(int i = 0; i < image.oclchannels(); i ++)
{
integral(buf.images[i], buf.image_sums[i]);
}
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
// to be continued in the following section
- if(image.channels() == 1)
+ if(image.oclchannels() == 1)
{
buf.image_sums.resize(1);
buf.image_sqsums.resize(1);
////////////////////////////////////////////////////////////////////////
// convert_C3C4
-void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep)
+void convert_C3C4(const cl_mem &src, oclMat &dst)
{
int dstStep_in_pixel = dst.step1() / dst.oclchannels();
int pixel_end = dst.wholecols * dst.wholerows - 1;
}
////////////////////////////////////////////////////////////////////////
// convert_C4C3
-void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep)
+void convert_C4C3(const oclMat &src, cl_mem &dst)
{
int srcStep_in_pixel = src.step1() / src.oclchannels();
int pixel_end = src.wholecols * src.wholerows - 1;
openCLVerifyCall(err);
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
- convert_C3C4(temp, *this, pitch);
+ convert_C3C4(temp, *this);
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
(pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
openCLVerifyCall(err);
- convert_C4C3(*this, temp, pitch / m.elemSize1());
+ convert_C4C3(*this, temp);
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
//int* cputemp=new int[wholecols*wholerows * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
CV_DbgAssert(!this->empty());
m.create(size(), type());
openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
- data, step, cols * elemSize(), rows, offset, clMemcpyDeviceToDevice);
+ data, step, cols * elemSize(), rows, offset);
}
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
- size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
- cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads);
+ //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+ cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
}
- for(int i = 0; i < args.size(); i ++)
+ for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
#define __OPENCV_PRECOMP_H__
#if _MSC_VER >= 1200
-#pragma warning( disable: 4251 4710 4711 4514 4996 )
+#pragma warning( disable: 4244 4251 4710 4711 4514 4996 )
#endif
#ifdef HAVE_CVCONFIG_H
#if defined __APPLE__
#include <OpenCL/OpenCL.h>
#else
-#include <CL/cl.h>
+#include <CL/opencl.h>
#endif
#include "safe_call.hpp"
size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch,
- size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind);
+ size_t width, size_t height, int src_offset);
void openCLFree(void *devPtr);
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
const char **source, string kernelName);
cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const char **source, string kernelName, const char *build_options);
- void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize,
- size_t *globalThreads, size_t *localThreads);
+ void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair<size_t, const void *> > &args,
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName,
extern const char *pyrlk;
extern const char *operator_setTo;
extern const char *operator_convertTo;
+ extern const char *operator_copyToM;
extern const char *arithm_mul;
extern const char *pyr_down;
}
return src;
}
+///////////////////////////////////////////////////////////////////////////
+////////////////////////////////// CopyTo /////////////////////////////////
+///////////////////////////////////////////////////////////////////////////
+// Copies src into dst at every pixel where the 8-bit single-channel mask is
+// set, by launching the given kernel (callers pass "copy_to_with_mask") from
+// the operator_copyToM program source.
+// Preconditions (debug-asserted): src, dst and mask all share the same
+// rows/cols, and mask is CV_8UC1.
+void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
+{
+ CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
+ src.rows == dst.rows && src.cols == dst.cols
+ && mask.type() == CV_8UC1);
+
+ vector<pair<size_t , const void *> > args;
+
+ // Lookup table mapping (channel count - 1, depth) to the OpenCL vector
+ // type name injected into the kernel as GENTYPE via -D.
+ std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
+ {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
+ {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
+ {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
+ };
+ char compile_option[32];
+ sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
+ size_t localThreads[3] = {16, 16, 1};
+ size_t globalThreads[3];
+
+ // Round the global work size up to a multiple of the 16x16 local size.
+ globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
+ globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
+ globalThreads[2] = 1;
+
+ // Convert byte-based step/offset into pixel units for the kernel.
+ int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
+ int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
+
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
+ args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
+ args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
+
+ openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
+ localThreads, args, -1, -1, compile_option, CLFLUSH);
+}
+
+// Unconditional device-to-device deep copy: allocates m to src's size/type
+// (no-op if already matching) and copies the ROI with openCLCopyBuffer2D.
+void copyTo(const oclMat &src, oclMat &m )
+{
+ CV_DbgAssert(!src.empty());
+ m.create(src.size(), src.type());
+ openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset,
+ src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
+}
+
+// Masked copy: an empty mask degrades to a full buffer copy; otherwise the
+// "copy_to_with_mask" kernel copies only pixels where mask is non-zero.
+void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
+{
+ if (mask.empty())
+ {
+ copyTo(src, mat);
+ }
+ else
+ {
+ mat.create(src.size(), src.type());
+ copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
+ }
+}
+
void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar)
{
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
nextPyr_.resize(maxLevel + 1);
prevPyr_[0] = prevImg;
- nextImg.convertTo(nextPyr_[0], CV_32F);
+ //nextImg.convertTo(nextPyr_[0], CV_32F);
+ convertTo(nextImg, nextPyr_[0], CV_32F);
for (int level = 1; level <= maxLevel; ++level)
{
- pyrDown(prevPyr_[level - 1], prevPyr_[level]);
- pyrDown(nextPyr_[level - 1], nextPyr_[level]);
+ pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
+ pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
}
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
- uPyr_[1].setTo(Scalar::all(0));
- vPyr_[1].setTo(Scalar::all(0));
+ //uPyr_[1].setTo(Scalar::all(0));
+ //vPyr_[1].setTo(Scalar::all(0));
+ setTo(uPyr_[1], Scalar::all(0));
+ setTo(vPyr_[1], Scalar::all(0));
Size winSize2i(winSize.width, winSize.height);
idx = idx2;
}
- uPyr_[idx].copyTo(u);
- vPyr_[idx].copyTo(v);
+ //uPyr_[idx].copyTo(u);
+ //vPyr_[idx].copyTo(v);
+ copyTo(uPyr_[idx], u);
+ copyTo(vPyr_[idx], v);
+
+ clFinish(prevImg.clCxt->impl->clCmdQueue);
}
#endif /* !defined (HAVE_CUDA) */
#endif
}
-
+std::string workdir;
int main(int argc, char **argv)
{
TS::ptr()->init("ocl");
InitGoogleTest(&argc, argv);
+ // Command-line options layered on top of the standard gtest flags.
+ const char *keys =
+ "{ h | help | false | print help message }"
+ "{ w | workdir | ../../../samples/c/| set working directory }"
+ "{ t | type | gpu | set device type:cpu or gpu}"
+ "{ p | platform | 0 | set platform id }"
+ "{ d | device | 0 | set device id }";
- print_info();
+ CommandLineParser cmd(argc, argv, keys);
+ if (cmd.get<bool>("help"))
+ {
+ cout << "Available options besides google test options:" << endl;
+ cmd.printParams();
+ return 0;
+ }
+ workdir = cmd.get<string>("workdir");
+ string type = cmd.get<string>("type");
+ unsigned int pid = cmd.get<unsigned int>("platform");
+ int device = cmd.get<int>("device");
+ print_info();
+ int flag = CVCL_DEVICE_TYPE_GPU;
+ if(type == "cpu")
+ {
+ flag = CVCL_DEVICE_TYPE_CPU;
+ }
std::vector<cv::ocl::Info> oclinfo;
- int devnums = getDevice(oclinfo);
- if(devnums < 1)
+ int devnums = getDevice(oclinfo, flag);
+ if(devnums <= device || device < 0)
+ {
+ std::cout << "device invalid\n";
+ return -1;
+ }
+ if(pid >= oclinfo.size())
{
- std::cout << "no device found\n";
+ std::cout << "platform invalid\n";
return -1;
}
- //setDevice(oclinfo[1]);
+ // Only switch away from the default device when explicitly requested.
+ if(pid != 0 || device != 0)
+ {
+ setDevice(oclinfo[pid], device);
+ }
+ cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
return RUN_ALL_TESTS();
}
#include "interpolation.hpp"
//#include "add_test_info.h"
-#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_GPU
-
#endif
void random_roi()
{
- cv::RNG &rng = TS::ptr()->get_rng();
-
#ifdef RANDOMROI
//randomize ROI
+ cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(1, mat1.cols);
roirows = rng.uniform(1, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
- EXPECT_MAT_NEAR(dst, cpu_dst, 1, s);
+ EXPECT_MAT_NEAR(dst, cpu_dst, 2, s);
}
}
cv::Point minLoc_, maxLoc_;
cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
- double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_;
+ double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
if(depth == 0)
{
minlocVal = mat1_roi.at<unsigned char>(minLoc);
cv::Point minLoc_, maxLoc_;
cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
- double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_;
+ double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue;
if(depth == 0)
{
#include "precomp.hpp"
#ifdef HAVE_OPENCL
-#ifdef WIN32
-#define FILTER_IMAGE "C:/Users/Public/Pictures/Sample Pictures/Penguins.jpg"
-#else
-#define FILTER_IMAGE "/Users/Test/Valve_original.PNG" // user need to specify a valid image path
-#endif
#define SHOW_RESULT 0
////////////////////////////////////////////////////////
// Canny
-
+extern std::string workdir;
IMPLEMENT_PARAM_CLASS(AppertureSize, int);
IMPLEMENT_PARAM_CLASS(L2gradient, bool);
TEST_P(Canny, Accuracy)
{
- cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE);
+ cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
double low_thresh = 50.0;
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
testing::Values(AppertureSize(3), AppertureSize(5)),
testing::Values(L2gradient(false), L2gradient(true))));
-#endif
+#endif
\ No newline at end of file
using namespace testing;
using namespace std;
using namespace cv;
-
+extern string workdir;
struct getRect
{
Rect operator ()(const CvAvgComp &e) const
{
scale = 1.0;
index = 0;
- string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml";
+ string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
{
cout << "ERROR: Could not load classifier cascade" << endl;
- cout << "Usage: facedetect [--cascade=<cascade_path>]\n"
- " [--scale[=<image scale>\n"
- " [filename|camera_index]\n" << endl ;
return;
}
//int devnums = getDevice(oclinfo);
TEST_F(Haar, FaceDetect)
{
- string imgName = "../../../samples/c/lena.jpg";
+ string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 );
if(img.empty())
{
- std::cout << "Couldn't read test" << index << ".jpg" << std::endl;
+ std::cout << "Couldn't read " << imgName << std::endl;
return ;
}
- int i = 0;
- double t = 0;
+ //int i = 0;
+ //double t = 0;
vector<Rect> faces, oclfaces;
const static Scalar colors[] = { CV_RGB(0, 0, 255),
using namespace std;
#ifdef HAVE_OPENCL
-
+extern string workdir;
PARAM_TEST_CASE(HOG, cv::Size, int)
{
cv::Size winSize;
TEST_P(HOG, GetDescriptors)
{
// Load image
- cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+ cv::Mat img_rgb = readImage(workdir + "lena.jpg");
ASSERT_FALSE(img_rgb.empty());
// Convert image
TEST_P(HOG, Detect)
{
// Load image
- cv::Mat img_rgb = readImage("../../../samples/gpu/road.png");
+ cv::Mat img_rgb = readImage(workdir + "lena.jpg");
ASSERT_FALSE(img_rgb.empty());
// Convert image
int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
// for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++)
- for(int j = 0; j < 100; j++)
+ for(int j = 0; j < LOOP_TIMES; j++)
{
random_roi();
cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val);
gdst.download(cpu_dst);
char sss[1024];
- sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty);
+ sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d bordertype=%s", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty, borderstr[0]);
if(interpolation == 0)
gdst.download(cpu_gdst);
char sss[1024];
+ char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
+ strcat(sss, warning);
EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
}
gdstCoor.download(cpu_gdstCoor);
char sss[1024];
+ char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
+ strcat(sss, warning);
EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss);
}
));
INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine(
- Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1),
Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
Values((int)cv::BORDER_CONSTANT)));
ONE_TYPE(CV_32SC1) //no use
));
-INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
- Values(CV_32FC1, CV_32FC1),
- Values(false))); // Values(false) is the reserved parameter
+//INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
+// Values(CV_32FC1, CV_32FC1),
+// Values(false))); // Values(false) is the reserved parameter
#endif // HAVE_OPENCL
#include "precomp.hpp"
-#define PERF_TEST 0
+//#define PERF_TEST 0
#ifdef HAVE_OPENCL
////////////////////////////////////////////////////////////////////////////////
// MatchTemplate
#endif // PERF_TEST
}
-//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
-// testing::Combine(
-// MTEMP_SIZES,
-// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-// testing::Values(Channels(1), Channels(3), Channels(4)),
-// ALL_TEMPLATE_METHODS
-// )
-// );
-//
-//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
-// MTEMP_SIZES,
-// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
-// testing::Values(Channels(1), Channels(3), Channels(4)),
-// testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
+ testing::Combine(
+ MTEMP_SIZES,
+ testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+ testing::Values(Channels(1), Channels(3), Channels(4)),
+ ALL_TEMPLATE_METHODS
+ )
+ );
+
+INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
+ MTEMP_SIZES,
+ testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
+ testing::Values(Channels(1), Channels(3), Channels(4)),
+ testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
// BroxOpticalFlow
-
+extern string workdir;
#define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin"
#define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin"
TEST_P(Sparse, Mat)
{
- cv::Mat frame0 = readImage("../../../samples/gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+ cv::Mat frame0 = readImage(workdir + "../gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
- cv::Mat frame1 = readImage("../../../samples/gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
+ cv::Mat frame1 = readImage(workdir + "../gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;