From cf04fed3690211f59c5e671f4aaddc4a71ff0808 Mon Sep 17 00:00:00 2001 From: niko Date: Mon, 30 Jul 2012 14:34:36 +0800 Subject: [PATCH] fix the compilation bugs --- cmake/OpenCVDetectOpenCL.cmake | 2 +- modules/ocl/cl2cpp.py | 2 +- modules/ocl/src/imgproc.cpp | 97 ++- modules/ocl/src/kernels/imgproc_remap.cl | 1187 ++++++++++++++++++++---------- modules/ocl/test/precomp.hpp | 2 +- modules/ocl/test/test_imgproc.cpp | 24 +- 6 files changed, 880 insertions(+), 434 deletions(-) diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake index 3b021bc..903b55b 100644 --- a/cmake/OpenCVDetectOpenCL.cmake +++ b/cmake/OpenCVDetectOpenCL.cmake @@ -2,7 +2,7 @@ if(APPLE) set(OPENCL_FOUND YES) set(OPENCL_LIBRARIES "-framework OpenCL") else() - find_package(OpenCL) + find_package(OpenCL QUIET) # Try AMD/ATI Stream SDK if (NOT OPENCL_FOUND) diff --git a/modules/ocl/cl2cpp.py b/modules/ocl/cl2cpp.py index b6b5fa6..9c8d9cf 100644 --- a/modules/ocl/cl2cpp.py +++ b/modules/ocl/cl2cpp.py @@ -14,7 +14,7 @@ cl_list = glob.glob(os.path.join(indir, "*.cl")) kfile = open(outname, "wt") kfile.write("""// This file is auto-generated. Do not edit! -#include "precomp.hpp" +//#include "precomp.hpp" namespace cv { namespace ocl diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp index 00fba08..fd07df5 100644 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@ -260,7 +260,7 @@ namespace cv CV_Assert((!map2.data || map2.size()== map1.size())); dst.create(map1.size(), src.type()); - int depth = src.depth(), map_depth = map1.depth(); + string kernelName; @@ -279,28 +279,107 @@ namespace cv kernelName = "remapNNSConstant"; } - int type = src.type(); - size_t blkSizeX = 16, blkSizeY = 16; - size_t glbSizeX; - - if(src.type() == CV_8UC1 || src.type() == CV_8UC2 || src.type() == CV_8UC4) + int channels = dst.channels(); + int depth = dst.depth(); + int type = src.type(); + size_t blkSizeX = 16, blkSizeY = 16; + size_t glbSizeX; + int cols = dst.cols; + if(src.type() == CV_8UC1) + { + cols = (dst.cols + dst.offset%4 + 3)/4; + glbSizeX = cols %blkSizeX==0 ? cols : (cols/blkSizeX+1)*blkSizeX; + + } + else if(src.type() == CV_8UC4 || src.type() == CV_32FC1) { - size_t cols = (dst.cols + dst.offset%4 + 3)/4; + cols = (dst.cols + (dst.offset>>2)%4 + 3)/4; glbSizeX = cols %blkSizeX==0 ? cols : (cols/blkSizeX+1)*blkSizeX; } else { glbSizeX = dst.cols%blkSizeX==0 ? dst.cols : (dst.cols/blkSizeX+1)*blkSizeX; + } + size_t glbSizeY = dst.rows%blkSizeY==0 ? dst.rows : (dst.rows/blkSizeY+1)*blkSizeY; size_t globalThreads[3] = {glbSizeX,glbSizeY,1}; size_t localThreads[3] = {blkSizeX,blkSizeY,1}; + /* + ///////////////////////////// + //using the image buffer + ///////////////////////////// + + size_t image_row_pitch = 0; + cl_int err1, err2, err3; + cl_mem_flags flags1 = CL_MEM_READ_ONLY; + cl_image_format format; + if(src.type() == CV_8UC1) + { + format.image_channel_order = CL_R; + format.image_channel_data_type = CL_UNSIGNED_INT8; + } + else if(src.type() == CV_8UC4) + { + format.image_channel_order = CL_RGBA; + format.image_channel_data_type = CL_UNSIGNED_INT8; + } + else if(src.type() == CV_32FC1) + { + format.image_channel_order = CL_R; + format.image_channel_data_type = CL_FLOAT; + } + else if(src.type() == CV_32FC4) + { + format.image_channel_order = CL_RGBA; + format.image_channel_data_type = CL_FLOAT; + } + cl_mem srcImage = clCreateImage2D(clCxt->impl->clContext, flags1, &format, src.cols, src.rows, + image_row_pitch, NULL, &err1); + if(err1 != CL_SUCCESS) + { + printf("Error creating CL image buffer, error code %d\n", err1); + return; + } + const size_t src_origin[3] = {0, 0, 0}; + const size_t region[3] = {src.cols, src.rows, 1}; + cl_event BtoI_event, ItoB_event; + err3 = clEnqueueCopyBufferToImage(clCxt->impl->clCmdQueue, (cl_mem)src.data, srcImage, + 0, src_origin, region, 0, NULL, NULL); + if(err3 != CL_SUCCESS) + { + printf("Error copying buffer to image\n"); + printf("Error code %d \n", err3); + return; + } + // clWaitForEvents(1, &BtoI_event); + + cl_int ret; + Mat test(src.rows, src.cols, CV_8UC1); + memset(test.data, 0, src.rows*src.cols); + ret = clEnqueueReadImage(clCxt->impl->clCmdQueue, srcImage, CL_TRUE, + src_origin, region, 0, 0, test.data, NULL, NULL, &ItoB_event); + if(ret != CL_SUCCESS) + { + printf("read image error, %d ", ret); + return; + } + clWaitForEvents(1, &ItoB_event); + + cout << "src" << endl; + cout << src << endl; + cout<<"image:"< > args; if(map1.channels() == 2) { args.push_back( make_pair(sizeof(cl_mem),(void*)&dst.data)); args.push_back( make_pair(sizeof(cl_mem),(void*)&src.data)); + // args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer args.push_back( make_pair(sizeof(cl_mem),(void*)&map1.data)); args.push_back( make_pair(sizeof(cl_int),(void*)&dst.offset)); args.push_back( make_pair(sizeof(cl_int),(void*)&src.offset)); @@ -314,12 +393,10 @@ namespace cv args.push_back( make_pair(sizeof(cl_int),(void*)&dst.rows)); args.push_back( make_pair(sizeof(cl_int),(void*)&map1.cols)); args.push_back( make_pair(sizeof(cl_int),(void*)&map1.rows)); + args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); args.push_back( make_pair(sizeof(cl_double4),(void*)&borderValue)); } openCLExecuteKernel(clCxt,&imgproc_remap,kernelName,globalThreads,localThreads,args,src.channels(),src.depth()); - - - } //////////////////////////////////////////////////////////////////////////////////////////// diff --git a/modules/ocl/src/kernels/imgproc_remap.cl b/modules/ocl/src/kernels/imgproc_remap.cl index 81ea56d..38f30b8 100644 --- a/modules/ocl/src/kernels/imgproc_remap.cl +++ b/modules/ocl/src/kernels/imgproc_remap.cl @@ -43,513 +43,884 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#pragma OPENCL EXTENSION cl_amd_printf : enable +//#pragma OPENCL EXTENSION cl_amd_printf : enable #if defined DOUBLE_SUPPORT #pragma OPENCL EXTENSION cl_khr_fp64:enable +typedef double4 F4 ; +#else +typedef float4 F4; #endif + +///////////////////////////////////////////////////////// +///////////////////////using buffer////////////////////// +///////////////////////////////////////////////////////// __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - /* - if(x < dst_cols && y < dst_rows) + + if(x < threadCols && y < dst_rows) { - int dstIdx = y * dst_step + x + dst_offset; - int map1Idx = y * (map1_step>>2) + x + (map1_offset>>2) - (map1_offset & 1); - short2 map1_data = *(map1 + map1Idx); - int srcIdx = map1_data.y*src_step+map1_data.x + src_offset; - uchar src_data = *(src +srcIdx); - uchar dst_data = src_data; - *(dst +dstIdx)=(map1_data.x >= map1_cols || map1_data.y >= map1_rows) ? val : dst_data; - } - */ + x = x << 2; + int gx = x - (dst_offset&3); + int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + + uchar4 nval =convert_uchar4(nVal); + uchar val = nval.s0; + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + + int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); + short8 map1_data; + + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset; - int gx = (x << 2) - (dst_offset&3); - int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + uchar4 src_data; - uchar4 nval =convert_uchar4(nVal); - char val = nval.s0; + src_data.s0 = *(src + srcIdx.s0); + src_data.s1 = *(src + srcIdx.s1); + src_data.s2 = *(src + srcIdx.s2); + src_data.s3 = *(src + srcIdx.s3); + uchar4 dst_data; + dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data; + + __global uchar4* d = (__global uchar4 *)(dst + dstStart); - x = x << 2; + uchar4 dVal = *d; - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; - int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); - short8 map1_data; + *d = dst_data; - map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start)); - map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4)); - map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8)); - map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12)); - - int4 srcIdx ; - srcIdx.s0 = map1_data.s1 * src_step + map1_data.s0 + src_offset; - srcIdx.s1 = map1_data.s3 * src_step + map1_data.s2 + src_offset; - srcIdx.s2 = map1_data.s5 * src_step + map1_data.s4 + src_offset; - srcIdx.s3 = map1_data.s7 * src_step + map1_data.s6 + src_offset; - - //uchar4 src_data = *(src + srcIdx); - uchar4 src_data; - src_data.s0 = *(src + srcIdx.s0); - src_data.s1 = *(src + srcIdx.s1); - src_data.s2 = *(src + srcIdx.s2); - src_data.s3 = *(src + srcIdx.s3); - - uchar4 dst_data; - dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; + } + +} +__kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + x = x << 2; + int gx = x - (dst_offset&3); + int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + + uchar4 nval =convert_uchar4(nVal); + uchar val = nval.s0; + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + + int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); + float8 map1_data; + + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataZ = convert_int8_sat_rte(map1_data); + int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset; - __global uchar4* d = (__global uchar4 *)(dst + dstStart); + uchar4 src_data; + + src_data.s0 = *(src + srcIdx.s0); + src_data.s1 = *(src + srcIdx.s1); + src_data.s2 = *(src + srcIdx.s2); + src_data.s3 = *(src + srcIdx.s3); + uchar4 dst_data; + dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; + __global uchar4* d = (__global uchar4 *)(dst + dstStart); + + uchar4 dVal = *d; - uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal; + int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + + *d = dst_data; + + } - *d = dst_data; } -__kernel void remapNNSConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + +__kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - int gx = (x << 3) - (dst_offset&7); - int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7); - uchar4 nval =convert_uchar4(nVal); - uchar2 val = nval.s01;//testing + if(x < threadCols && y < dst_rows) + { + x = x << 4; + int gx = x - (dst_offset&15); + int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + uchar4 nval =convert_uchar4_sat_rte(nVal); + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 ); + short8 map1_data; + + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + uchar4 src_a, src_b, src_c, src_d; + src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); + src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); + src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); + src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); + + uchar16 dst_data; + uchar4 dst_a, dst_b, dst_c, dst_d; + dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a; + dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b; + dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c; + dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d; + + dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); + __global uchar16* d = (__global uchar16 *)(dst + dstStart); + + uchar16 dVal = *d; + + int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + + *d = dst_data; + } - x = x << 3; +} +__kernel void remapNNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7); + if(x < threadCols && y < dst_rows) + { + x = x << 4; + int gx = x - (dst_offset&15); + int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); - int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>1) & 3) << 2); - short8 map1_data; + uchar4 nval =convert_uchar4(nVal); + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + + int map1Start = y * map1_step +(x << 1) + map1_offset - ((dst_offset&15) << 1); + float8 map1_data; + + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataZ = convert_int8_sat_rte(map1_data); + + int4 srcIdx = map1_dataZ.odd * src_step + (map1_dataZ.even <<2) + src_offset; + uchar4 src_a, src_b, src_c, src_d; + src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); + src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); + src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); + src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); + + uchar16 dst_data; + uchar4 dst_a, dst_b, dst_c, dst_d; + dst_a = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? nval : src_a; + dst_b = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? nval : src_b; + dst_c = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? nval : src_c; + dst_d = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? nval : src_d; + + dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); + __global uchar16* d = (__global uchar16 *)(dst + dstStart); + + uchar16 dVal = *d; + + int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + + *d = dst_data; + + } - map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start)); - map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4)); - map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8)); - map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12)); - - int4 srcIdx ; - srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 1) + src_offset; - srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 1) + src_offset; - srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 1) + src_offset; - srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 1) + src_offset; - - //uchar4 src_data = *(src + srcIdx); - uchar8 src_data; - src_data.s01 = *((__global uchar2 *)((__global char*)src + srcIdx.s0)); - src_data.s23 = *((__global uchar2 *)((__global char*)src + srcIdx.s1)); - src_data.s45 = *((__global uchar2 *)((__global char*)src + srcIdx.s2)); - src_data.s67 = *((__global uchar2 *)((__global char*)src + srcIdx.s3)); - - uchar8 dst_data; - dst_data.s01 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? val : (convert_uchar2(src_data.s01)); - dst_data.s23 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? val : (convert_uchar2(src_data.s23)); - dst_data.s45 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? val : (convert_uchar2(src_data.s45)); - dst_data.s67 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? val : (convert_uchar2(src_data.s67)); - __global uchar8* d = (__global uchar8 *)(dst + dstStart); - - uchar8 dVal = *d; - int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal; - *d = dst_data; } -__kernel void remapNNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + + +__kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * restrict src, __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - int gx = (x << 4) - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + + if(x < threadCols && y < dst_rows) + { + x = x << 4; - uchar4 nval =convert_uchar4(nVal); + int gx = x - (dst_offset&15); + int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - x = x << 4; + float4 nval =convert_float4(nVal); + float val = nval.s0; - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - int map1Start = y * map1_step + x + map1_offset - (((dst_offset>>2) & 3) << 2); - short8 map1_data; + int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); + short8 map1_data; - map1_data.s01 = *((__global short2 *)((__global char*)map1 + map1Start)); - map1_data.s23 = *((__global short2 *)((__global char*)map1 + map1Start + 4)); - map1_data.s45 = *((__global short2 *)((__global char*)map1 + map1Start + 8)); - map1_data.s67 = *((__global short2 *)((__global char*)map1 + map1Start + 12)); + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); - int4 srcIdx ; - srcIdx.s0 = map1_data.s1 * src_step + (map1_data.s0 << 2) + src_offset; - srcIdx.s1 = map1_data.s3 * src_step + (map1_data.s2 << 2) + src_offset; - srcIdx.s2 = map1_data.s5 * src_step + (map1_data.s4 << 2) + src_offset; - srcIdx.s3 = map1_data.s7 * src_step + (map1_data.s6 << 2) + src_offset; + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; - // uchar16 src_data; - uchar4 src_a, src_b, src_c, src_d; - src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); - src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); - src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); - src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); - // src_data = (uchar16)(src_a, src_b, src_c, src_d); - uchar16 dst_data; - uchar4 dst_a, dst_b, dst_c, dst_d; - dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows) ? nval : src_a; - dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows) ? nval : src_b; - dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows) ? nval : src_c; - dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows) ? nval : src_d; - dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); - __global uchar16* d = (__global uchar16 *)(dst + dstStart); - - uchar16 dVal = *d; - int16 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar16(con) != 0) ? dst_data : dVal; - - *d = dst_data; -} + float4 src_data; + src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); + src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); + src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); + src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); + float4 dst_data; + + dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; + dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1; + dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; + dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; + + + __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); -__kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + float4 dVal = *d; + + int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + + *d = dst_data; + + } + +} +__kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) { int x = get_global_id(0); int y = get_global_id(1); - int gx = (x << 2) - (dst_offset&3); - int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + if(x < threadCols && y < dst_rows) + { + x = x << 4; - uchar4 nval =convert_uchar4_sat_rte(nVal); - char val = nval.s0; + int gx = x - (dst_offset&15); + int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - x = x << 2; + float4 nval =convert_float4(nVal); + float val = nval.s0; - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); - float8 map1_data; + int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset&15) << 1); + float8 map1_data; - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - /* map1_data.s01 = *((__global float2 *)((__global char*)map1 + map1Start)); - map1_data.s23 = *((__global float2 *)((__global char*)map1 + map1Start + 8)); - map1_data.s45 = *((__global float2 *)((__global char*)map1 + map1Start + 16)); - map1_data.s67 = *((__global float2 *)((__global char*)map1 + map1Start + 24)); -*/ - int8 map1_dataZ; + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataZ = convert_int8_sat_rte(map1_data); - map1_dataZ = convert_int8_sat_rte(map1_data); - - int4 srcIdx ; - srcIdx.s0 = map1_dataZ.s1 * src_step + map1_dataZ.s0 + src_offset; - srcIdx.s1 = map1_dataZ.s3 * src_step + map1_dataZ.s2 + src_offset; - srcIdx.s2 = map1_dataZ.s5 * src_step + map1_dataZ.s4 + src_offset; - srcIdx.s3 = map1_dataZ.s7 * src_step + map1_dataZ.s6 + src_offset; - - //uchar4 src_data = *(src + srcIdx); - uchar4 src_data; - src_data.s0 = *(src + srcIdx.s0); - src_data.s1 = *(src + srcIdx.s1); - src_data.s2 = *(src + srcIdx.s2); - src_data.s3 = *(src + srcIdx.s3); - - uchar4 dst_data; - dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0; - dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1; - dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2; - dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3; + int4 srcIdx = convert_int4(map1_dataZ.odd) * src_step + convert_int4(map1_dataZ.even <<2) + src_offset; - __global uchar4* d = (__global uchar4 *)(dst + dstStart); + float4 src_data; + src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); + src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); + src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); + src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); + float4 dst_data; + + dst_data.s0 = (map1_dataZ.s0 >= src_cols || map1_dataZ.s1 >= src_rows)? val : src_data.s0; + dst_data.s1 = (map1_dataZ.s2 >= src_cols || map1_dataZ.s3 >= src_rows)? val : src_data.s1; + dst_data.s2 = (map1_dataZ.s4 >= src_cols || map1_dataZ.s5 >= src_rows)? val : src_data.s2; + dst_data.s3 = (map1_dataZ.s6 >= src_cols || map1_dataZ.s7 >= src_rows)? val : src_data.s3; + + + __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); + + float4 dVal = *d; + + int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + + *d = dst_data; - uchar4 dVal = *d; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal; + } - *d = dst_data; +} + + +__kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const * restrict src, + __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + int dstIdx = y * dst_step + (x << 4) + dst_offset ; + int mapIdx = y * map1_step + (x << 2) + map1_offset ; + short2 map1_data = *((__global short2 *)((__global char*)map1 + mapIdx)); + + int srcIdx = map1_data.y * src_step + (map1_data.x << 4) + src_offset; + float4 nval = convert_float4(nVal); + float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_data.x >= src_cols || map1_data.y >= src_rows) ? nval : src_data; + } +} +__kernel void remapNNFConstant_C4_D5(__global float * dst, __global float const * restrict src, + __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + int dstIdx = y * dst_step + (x << 4) + dst_offset ; + int mapIdx = y * map1_step + (x << 3) + map1_offset ; + float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + int2 map1_dataZ = convert_int2_sat_rte(map1_data); + int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; + float4 nval = convert_float4(nVal); + float4 src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); + *((__global float4 *)((__global uchar*)dst + dstIdx)) = (map1_dataZ.x >= src_cols || map1_dataZ.y >= src_rows) ? nval : src_data; + } } __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) { + int x = get_global_id(0); int y = get_global_id(1); - - int gx = (x << 2) - (dst_offset&3); - int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + if(x < threadCols && y < dst_rows) + { + x = x << 2; + int gx = x - (dst_offset&3); + int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); - uchar4 nval =convert_uchar4(nVal); - uchar val = nval.s0; + uchar4 nval =convert_uchar4(nVal); + uchar val = nval.s0; - x = x << 2; - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + + int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); + float8 map1_data; + + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataD = convert_int8(map1_data); + float8 temp = map1_data - convert_float8(map1_dataD); + + float4 u = temp.even; + float4 v = temp.odd; + float4 ud = 1.f - u; + float4 vd = 1.f - v; + //float8 map1_dataU = map1_dataD + 1; + + int4 map1_dataDx = map1_dataD.even; + int4 map1_dataDy = map1_dataD.odd; + int4 map1_dataDx1 = map1_dataDx + 1; + int4 map1_dataDy1 = map1_dataDy + 1; + + int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; + int4 src_StartD = src_StartU + src_step; + /* + //not using the vload + int4 src_StartU1 = src_StartU + 1; + int4 src_StartD1 = src_StartD + 1; + + uchar4 a, b, c, d; + a.x = *(src_StartU.x + src); + a.y = *(src_StartU.y + src); + a.z = *(src_StartU.z + src); + a.w = *(src_StartU.w + src); + + b.x = *(src_StartU1.x + src); + b.y = *(src_StartU1.y + src); + b.z = *(src_StartU1.z + src); + b.w = *(src_StartU1.w + src); + + c.x = *(src_StartD.x + src); + c.y = *(src_StartD.y + src); + c.z = *(src_StartD.z + src); + c.w = *(src_StartD.w + src); + + d.x = *(src_StartD1.x + src); + d.y = *(src_StartD1.y + src); + d.z = *(src_StartD1.z + src); + d.w = *(src_StartD1.w + src); + */ + uchar2 aU, aD, bU, bD, cU, cD, dU, dD; + + aU = vload2(0, src + src_StartU.s0); + bU = vload2(0, src + src_StartU.s1); + cU = vload2(0, src + src_StartU.s2); + dU = vload2(0, src + src_StartU.s3); + aD = vload2(0, src + src_StartD.s0); + bD = vload2(0, src + src_StartD.s1); + cD = vload2(0, src + src_StartD.s2); + dD = vload2(0, src + src_StartD.s3); + + uchar4 a, b, c, d; + a = (uchar4)(aU.x, bU.x, cU.x, dU.x); + b = (uchar4)(aU.y, bU.y, cU.y, dU.y); + c = (uchar4)(aD.x, bD.x, cD.x, dD.x); + d = (uchar4)(aD.y, bD.y, cD.y, dD.y); + + int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); + int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); + int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); + int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); + a = (convert_uchar4(ac) == (uchar4)0)? a : val; + b = (convert_uchar4(bc) == (uchar4)0)? b : val; + c = (convert_uchar4(cc) == (uchar4)0)? c : val; + d = (convert_uchar4(dc) == (uchar4)0)? d : val; + + uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); + + __global uchar4* D = (__global uchar4 *)(dst + dstStart); + + uchar4 dVal = *D; + int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; - int map1Start = y * map1_step + (x << 3) + map1_offset - ((dst_offset & 3) << 3); - float8 map1_data; + *D = dst_data; + } +} - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataD = convert_int8(map1_data); - float8 temp = map1_data - convert_float8(map1_dataD); +__kernel void remapLNSConstant_C1_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + x = x << 2; + int gx = x - (dst_offset&3); + int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); - float4 u = temp.even; - float4 v = temp.odd; - float4 ud = 1.0 - u; - float4 vd = 1.0 - v; - //float8 map1_dataU = map1_dataD + 1; + uchar4 nval =convert_uchar4(nVal); + uchar val = nval.s0; - int4 map1_dataDx = map1_dataD.even; - int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); - int4 src_StartU = map1_dataDy * src_step + map1_dataDx + src_offset; - int4 src_StartD = src_StartU + src_step; - int4 src_StartU1 = src_StartU + 1; - int4 src_StartD1 = src_StartD + 1; + int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); + short8 map1_data; - uchar4 a, b, c, d; - a.x = *(src_StartU.x + src); - a.y = *(src_StartU.y + src); - a.z = *(src_StartU.z + src); - a.w = *(src_StartU.w + src); + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset; - b.x = *(src_StartU1.x + src); - b.y = *(src_StartU1.y + src); - b.z = *(src_StartU1.z + src); - b.w = *(src_StartU1.w + src); - - c.x = *(src_StartD.x + src); - c.y = *(src_StartD.y + src); - c.z = *(src_StartD.z + src); - c.w = *(src_StartD.w + src); - - d.x = *(src_StartD1.x + src); - d.y = *(src_StartD1.y + src); - d.z = *(src_StartD1.z + src); - d.w = *(src_StartD1.w + src); - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - a = (convert_uchar4(ac) == 0)? a : val; - b = (convert_uchar4(bc) == 0)? b : val; - c = (convert_uchar4(cc) == 0)? c : val; - d = (convert_uchar4(dc) == 0)? d : val; - - uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); - - __global uchar4* D = (__global uchar4 *)(dst + dstStart); + uchar4 src_data; + + src_data.s0 = *(src + srcIdx.s0); + src_data.s1 = *(src + srcIdx.s1); + src_data.s2 = *(src + srcIdx.s2); + src_data.s3 = *(src + srcIdx.s3); + uchar4 dst_data; + dst_data = convert_uchar4((convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)))? (uchar4)(val) : src_data; + + __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *D; - int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - dst_data = (convert_uchar4(con) != 0) ? dst_data : dVal; + uchar4 dVal = *d; + + int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + + *d = dst_data; + + } - *D = dst_data; } -__kernel void remapLNFConstant_C2_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + +__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) { + int x = get_global_id(0); int y = get_global_id(1); - - int gx = (x << 3) - (dst_offset&7); - int8 Gx = (int8)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7); - - uchar4 nval =convert_uchar4(nVal); - uchar8 val = (uchar8)(nval.s01, nval.s01, nval.s01, nval.s01); - - x = x << 3; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&7); - - int map1Start = y * map1_step + (x << 2) + map1_offset - (((dst_offset>>1) & 3) << 3); - float8 map1_data; - - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataD = convert_int8(map1_data); - float8 temp = map1_data - convert_float8(map1_dataD); - - float4 U = temp.even; - float4 V = temp.odd; - float4 UD = 1.0 - U; - float4 VD = 1.0 - V; - - float8 u, v, ud, vd; - u = (float8)(U.x, U.x, U.y, U.y, U.z, U.z, U.w, U.w); - v = (float8)(V.x, V.x, V.y, V.y, V.z, V.z, V.w, V.w); - ud = (float8)(UD.x, UD.x, UD.y, UD.y, UD.z, UD.z, UD.w, UD.w); - vd = (float8)(VD.x, VD.x, VD.y, VD.y, VD.z, VD.z, VD.w, VD.w); - - //float8 map1_dataU = map1_dataD + 1; - - int4 map1_dataDx = map1_dataD.even; - int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; - - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset; - int4 src_StartD = src_StartU + src_step; - int4 src_StartU1 = src_StartU + 2; - int4 src_StartD1 = src_StartD + 2; - - uchar8 a, b, c, d; - a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x)); - a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y)); - a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z)); - a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w)); - - b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x)); - b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y)); - b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z)); - b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w)); - - c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x)); - c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y)); - c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z)); - c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w)); - - d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x)); - d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y)); - d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z)); - d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w)); - - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - - /* a.even = (convert_uchar4(ac) == 0)? a.even : val.even; - a.odd = (convert_uchar4(ac) == 0)? a.odd : val.odd; - b.even = (convert_uchar4(bc) == 0)? b.even : val.even; - b.odd = (convert_uchar4(bc) == 0)? b.odd : val.odd; - c.even = (convert_uchar4(cc) == 0)? c.even : val.even; - c.odd = (convert_uchar4(cc) == 0)? c.odd : val.odd; - d.even = (convert_uchar4(dc) == 0)? d.even : val.even; - d.odd = (convert_uchar4(dc) == 0)? d.odd : val.odd; -*/ - int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w); - int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w); - int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w); - int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w); - - a = (convert_uchar8(aC) == 0)? a : val; - b = (convert_uchar8(bC) == 0)? b : val; - c = (convert_uchar8(cC) == 0)? c : val; - d = (convert_uchar8(dC) == 0)? d : val; - uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v ); + if(x < threadCols && y < dst_rows) + { + x = x << 4; + int gx = x - (dst_offset&15); + int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + + uchar4 nval =convert_uchar4(nVal); + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + + int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1); + float8 map1_data; + + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataD = convert_int8(map1_data); + float8 temp = map1_data - convert_float8(map1_dataD); + + float4 u = temp.even; + float4 v = temp.odd; + float4 ud = 1.f - u; + float4 vd = 1.f - v; + + //float8 map1_dataU = map1_dataD + 1; + + int4 map1_dataDx = map1_dataD.even; + int4 map1_dataDy = map1_dataD.odd; + int4 map1_dataDx1 = map1_dataDx + 1; + int4 map1_dataDy1 = map1_dataDy + 1; + + int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset; + int4 src_StartD = src_StartU + src_step; + + uchar8 aU, bU, cU, dU, aD, bD, cD, dD; + aU = vload8(0, src + src_StartU.s0); + bU = vload8(0, src + src_StartU.s1); + cU = vload8(0, src + src_StartU.s2); + dU = vload8(0, src + src_StartU.s3); + aD = vload8(0, src + src_StartD.s0); + bD = vload8(0, src + src_StartD.s1); + cD = vload8(0, src + src_StartD.s2); + dD = vload8(0, src + src_StartD.s3); + uchar16 a, b, c, d; + a = (uchar16)(aU.s0123, bU.s0123, cU.s0123, dU.s0123); + b = (uchar16)(aU.s4567, bU.s4567, cU.s4567, dU.s4567); + c = (uchar16)(aD.s0123, bD.s0123, cD.s0123, dD.s0123); + d = (uchar16)(aD.s4567, bD.s4567, cD.s4567, dD.s4567); + int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); + int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); + int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); + int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); + + int16 acc = (int16)((int4)(ac.x), (int4)(ac.y), (int4)(ac.z), (int4)(ac.w)); + int16 bcc = (int16)((int4)(bc.x), (int4)(bc.y), (int4)(bc.z), (int4)(bc.w)); + int16 ccc = (int16)((int4)(cc.x), (int4)(cc.y), (int4)(cc.z), (int4)(cc.w)); + int16 dcc = (int16)((int4)(dc.x), (int4)(dc.y), (int4)(dc.z), (int4)(dc.w)); + + uchar16 val = (uchar16)(nval, nval, nval, nval); + a = (convert_uchar16(acc) == (uchar16)0)? a : val; + b = (convert_uchar16(bcc) == (uchar16)0)? b : val; + c = (convert_uchar16(ccc) == (uchar16)0)? c : val; + d = (convert_uchar16(dcc) == (uchar16)0)? d : val; + + float16 U = (float16)((float4)(u.x), (float4)(u.y), (float4)(u.z), (float4)(u.w)); + float16 V = (float16)((float4)(v.x), (float4)(v.y), (float4)(v.z), (float4)(v.w)); + float16 Ud = (float16)((float4)(ud.x), (float4)(ud.y), (float4)(ud.z), (float4)(ud.w)); + float16 Vd = (float16)((float4)(vd.x), (float4)(vd.y), (float4)(vd.z), (float4)(vd.w)); + + uchar16 dst_data = convert_uchar16_sat_rte((convert_float16(a))* Ud * Vd +(convert_float16(b))* U * Vd + (convert_float16(c))* Ud * V + (convert_float16(d)) * U * V ); - __global uchar8* D = (__global uchar8 *)(dst + dstStart); + __global uchar16* D = (__global uchar16 *)(dst + dstStart); - uchar8 dVal = *D; - int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal; + uchar16 dVal = *D; + int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; - *D = dst_data; + *D = dst_data; + } } +__kernel void remapLNSConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows, int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); -/* -__kernel void remapLNFConstant_C4_D0(__global unsigned char* dst, __global unsigned char const * restrict src, + if(x < threadCols && y < dst_rows) + { + x = x << 4; + int gx = x - (dst_offset&15); + int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + uchar4 nval =convert_uchar4_sat_rte(nVal); + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + + int map1Start = y * map1_step + x + map1_offset - (dst_offset&15 ); + short8 map1_data; + + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + uchar4 src_a, src_b, src_c, src_d; + src_a = *((__global uchar4 *)((__global char*)src + srcIdx.s0)); + src_b = *((__global uchar4 *)((__global char*)src + srcIdx.s1)); + src_c = *((__global uchar4 *)((__global char*)src + srcIdx.s2)); + src_d = *((__global uchar4 *)((__global char*)src + srcIdx.s3)); + + uchar16 dst_data; + uchar4 dst_a, dst_b, dst_c, dst_d; + dst_a = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? nval : src_a; + dst_b = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? nval : src_b; + dst_c = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? nval : src_c; + dst_d = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? nval : src_d; + + dst_data = (uchar16)(dst_a, dst_b, dst_c, dst_d); + __global uchar16* d = (__global uchar16 *)(dst + dstStart); + + uchar16 dVal = *d; + + int16 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_uchar16(con) != (uchar16)0) ? dst_data : dVal; + + *d = dst_data; + + } + +} + +__kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * restrict src, __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, - int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , double4 nVal) + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) { + int x = get_global_id(0); int y = get_global_id(1); - - int gx = (x << 4) - (dst_offset&15); - int16 Gx = (int16)(gx, gx+1, gx+2, gx+3, gx+4, gx+5, gx+6, gx+7, gx+8, gx+9, gx+10, gx+11, gx+12, gx+13, gx+14, gx+15); + if(x < threadCols && y < dst_rows) + { + x = x << 4; + int gx = x - (dst_offset&15); + int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - uchar4 nval =convert_uchar4(nVal); - uchar16 val = (uchar16)(nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01, nval.s01); + float4 nval =convert_float4(nVal); + float4 val = (float4)(nval.s0); - x = x << 4; - - int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); - - int map1Start = y * map1_step + (x << 1) + map1_offset - (((dst_offset>>2) & 3) << 3); - float8 map1_data; - - map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); - int8 map1_dataD = convert_int8(map1_data); - float8 temp = map1_data - convert_float8(map1_dataD); - - float4 U = temp.even; - float4 V = temp.odd; - float4 UD = 1.0 - U; - float4 VD = 1.0 - V; - - float16 u, v, ud, vd; - u = (float16)(U.x, U.x, U.x, U.x, U.y, U.y, U.y, U.y, U.z, U.z, U.z, U.z, U.w, U.w, U.w, U.w); - v = (float16)(V.x, V.x, V.x, V.x, V.y, V.y, V.y, V.y, V.z, V.z, V.z, V.z, V.w, V.w, V.w, V.w); - ud = (float16)(UD.x, UD.x, UD.x, UD.x, UD.y, UD.y, UD.y, UD.y, UD.z, UD.z, UD.z, UD.z, UD.w, UD.w, UD.w, UD.w); - vd = (float16)(VD.x, VD.x, VD.y, VD.y, VD.z, VD.z, VD.w, VD.w); - - //float8 map1_dataU = map1_dataD + 1; - - int4 map1_dataDx = map1_dataD.even; - int4 map1_dataDy = map1_dataD.odd; - int4 map1_dataDx1 = map1_dataDx + 1; - int4 map1_dataDy1 = map1_dataDy + 1; - - int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 1) + src_offset; - int4 src_StartD = src_StartU + src_step; - int4 src_StartU1 = src_StartU + 2; - int4 src_StartD1 = src_StartD + 2; - - uchar8 a, b, c, d; - a.s01 = *((__global uchar2 *)((__global char*)src + src_StartU.x)); - a.s23 = *((__global uchar2 *)((__global char*)src + src_StartU.y)); - a.s45 = *((__global uchar2 *)((__global char*)src + src_StartU.z)); - a.s67 = *((__global uchar2 *)((__global char*)src + src_StartU.w)); - - b.s01 = *((__global uchar2 *)((__global char*)src + src_StartU1.x)); - b.s23 = *((__global uchar2 *)((__global char*)src + src_StartU1.y)); - b.s45 = *((__global uchar2 *)((__global char*)src + src_StartU1.z)); - b.s67 = *((__global uchar2 *)((__global char*)src + src_StartU1.w)); - - c.s01 = *((__global uchar2 *)((__global char*)src + src_StartD.x)); - c.s23 = *((__global uchar2 *)((__global char*)src + src_StartD.y)); - c.s45 = *((__global uchar2 *)((__global char*)src + src_StartD.z)); - c.s67 = *((__global uchar2 *)((__global char*)src + src_StartD.w)); - - d.s01 = *((__global uchar2 *)((__global char*)src + src_StartD1.x)); - d.s23 = *((__global uchar2 *)((__global char*)src + src_StartD1.y)); - d.s45 = *((__global uchar2 *)((__global char*)src + src_StartD1.z)); - d.s67 = *((__global uchar2 *)((__global char*)src + src_StartD1.w)); - - int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); - int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); - int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); - int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); - - int8 aC = (int8)(ac.x, ac.x, ac.y, ac.y, ac.z, ac.z, ac.w, ac.w); - int8 bC = (int8)(bc.x, bc.x, bc.y, bc.y, bc.z, bc.z, bc.w, bc.w); - int8 cC = (int8)(cc.x, cc.x, cc.y, cc.y, cc.z, cc.z, cc.w, cc.w); - int8 dC = (int8)(dc.x, dc.x, dc.y, dc.y, dc.z, dc.z, dc.w, dc.w); - - a = (convert_uchar8(aC) == 0)? a : val; - b = (convert_uchar8(bC) == 0)? b : val; - c = (convert_uchar8(cC) == 0)? c : val; - d = (convert_uchar8(dC) == 0)? d : val; - uchar8 dst_data = convert_uchar8_sat_rte((convert_float8(a))* ud * vd +(convert_float8(b))* u * vd + (convert_float8(c))* ud * v + (convert_float8(d)) * u * v ); + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1); + float8 map1_data; + + map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); + int8 map1_dataD = convert_int8(map1_data); + float8 temp = map1_data - convert_float8(map1_dataD); + + float4 u = temp.even; + float4 v = temp.odd; + float4 ud = 1.f - u; + float4 vd = 1.f - v; + //float8 map1_dataU = map1_dataD + 1; + + int4 map1_dataDx = map1_dataD.even; + int4 map1_dataDy = map1_dataD.odd; + int4 map1_dataDx1 = map1_dataDx + 1; + int4 map1_dataDy1 = map1_dataDy + 1; + + int4 src_StartU = map1_dataDy * src_step + (map1_dataDx << 2) + src_offset; + int4 src_StartD = src_StartU + src_step; + /* + //not using the vload + int4 src_StartU1 = src_StartU + 1; + int4 src_StartD1 = src_StartD + 1; + + float4 a, b, c, d; + a.x = *(src_StartU.x + src); + a.y = *(src_StartU.y + src); + a.z = *(src_StartU.z + src); + a.w = *(src_StartU.w + src); + + b.x = *(src_StartU1.x + src); + b.y = *(src_StartU1.y + src); + b.z = *(src_StartU1.z + src); + b.w = *(src_StartU1.w + src); + + c.x = *(src_StartD.x + src); + c.y = *(src_StartD.y + src); + c.z = *(src_StartD.z + src); + c.w = *(src_StartD.w + src); + + d.x = *(src_StartD1.x + src); + d.y = *(src_StartD1.y + src); + d.z = *(src_StartD1.z + src); + d.w = *(src_StartD1.w + src); + */ + float2 aU, aD, bU, bD, cU, cD, dU, dD; + + aU = vload2(0, (__global float *)((__global char*)src + src_StartU.s0)); + bU = vload2(0, (__global float *)((__global char*)src + src_StartU.s1)); + cU = vload2(0, (__global float *)((__global char*)src + src_StartU.s2)); + dU = vload2(0, (__global float *)((__global char*)src + src_StartU.s3)); + aD = vload2(0, (__global float *)((__global char*)src + src_StartD.s0)); + bD = vload2(0, (__global float *)((__global char*)src + src_StartD.s1)); + cD = vload2(0, (__global float *)((__global char*)src + src_StartD.s2)); + dD = vload2(0, (__global float *)((__global char*)src + src_StartD.s3)); + + float4 a, b, c, d; + a = (float4)(aU.x, bU.x, cU.x, dU.x); + b = (float4)(aU.y, bU.y, cU.y, dU.y); + c = (float4)(aD.x, bD.x, cD.x, dD.x); + d = (float4)(aD.y, bD.y, cD.y, dD.y); + + int4 ac =(map1_dataDx >= src_cols || map1_dataDy >= src_rows || map1_dataDy< 0 || map1_dataDy < 0); + int4 bc =(map1_dataDx1 >= src_cols || map1_dataDy >= src_rows || map1_dataDx1 < 0 || map1_dataDy < 0); + int4 cc =(map1_dataDx >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDx < 0); + int4 dc =(map1_dataDx1 >= src_cols || map1_dataDy1 >= src_rows || map1_dataDy1 < 0 || map1_dataDy1 < 0); + a = (convert_float4(ac) == 0)? a : val; + b = (convert_float4(bc) == 0)? b : val; + c = (convert_float4(cc) == 0)? c : val; + d = (convert_float4(dc) == 0)? d : val; + + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; - __global uchar8* D = (__global uchar8 *)(dst + dstStart); + __global float4* D = (__global float4 *)((__global char*)dst + dstStart); + + float4 dVal = *D; + int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); + dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + + *D = dst_data; + } +} +__kernel void remapLNSConstant_C1_D5(__global float* dst, __global float const * restrict src, + __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows ,int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + x = x << 4; + + int gx = x - (dst_offset&15); + int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); - uchar8 dVal = *D; - int8 con = (Gx >= 0 && Gx < (dst_cols << 1) && y >= 0 && y < dst_rows); - dst_data = (convert_uchar8(con) != 0) ? dst_data : dVal; + float4 nval =convert_float4(nVal); + float val = nval.s0; - *D = dst_data; + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); + + int map1Start = y * map1_step + x + map1_offset - (dst_offset&15); + short8 map1_data; + + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even <<(short4)2) + src_offset; + + float4 src_data; + src_data.s0 = *((__global float *)((__global char*)src + srcIdx.s0)); + src_data.s1 = *((__global float *)((__global char*)src + srcIdx.s1)); + src_data.s2 = *((__global float *)((__global char*)src + srcIdx.s2)); + src_data.s3 = *((__global float *)((__global char*)src + srcIdx.s3)); + float4 dst_data; + + dst_data.s0 = (map1_data.s0 >= src_cols || map1_data.s1 >= src_rows)? val : src_data.s0; + dst_data.s1 = (map1_data.s2 >= src_cols || map1_data.s3 >= src_rows)? val : src_data.s1; + dst_data.s2 = (map1_data.s4 >= src_cols || map1_data.s5 >= src_rows)? val : src_data.s2; + dst_data.s3 = (map1_data.s6 >= src_cols || map1_data.s7 >= src_rows)? val : src_data.s3; + + + __global float4* d = (__global float4 *)((__global uchar*)dst + dstStart); + + float4 dVal = *d; + + int4 con = (Gx >= 0 && Gx < (dst_cols<<2) && y >= 0 && y < dst_rows); + dst_data = (convert_float4(con) != 0) ? dst_data : dVal; + + *d = dst_data; + + } + } -*/ + +__kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const * restrict src, + __global float * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < threadCols && y < dst_rows) + { + int dstIdx = y * dst_step + (x << 4) + dst_offset ; + int mapIdx = y * map1_step + (x << 3) + map1_offset ; + float2 map1_data = *((__global float2 *)((__global char*)map1 + mapIdx)); + + int2 map1_dataZ = convert_int2(map1_data); + + int mX = map1_dataZ.x; + int mY = map1_dataZ.y; + int mX1 = map1_dataZ.x + 1; + int mY1 = map1_dataZ.y + 1; + + float u = map1_data.x - convert_float(map1_dataZ.x); + float v = map1_data.y - convert_float(map1_dataZ.y); + float ud = 1.0 - u; + float vd = 1.0 - v; + + int srcIdx = map1_dataZ.y * src_step + (map1_dataZ.x << 4) + src_offset; + float8 src_dataU = vload8(0,(__global float *)((__global char*)src + srcIdx)); + float8 src_dataD = vload8(0,(__global float *)((__global char*)src + srcIdx + src_step)); + + float4 a = src_dataU.lo; + float4 b = src_dataU.hi; + float4 c = src_dataD.lo; + float4 d = src_dataD.hi; + + float4 nval = convert_float4(nVal); + a = (mX >= src_cols || mY >= src_rows ) ? nval : a; + b = (mX1 >= src_cols || mY >= src_rows ) ? nval : b; + c = (mX >= src_cols || mY1 >= src_rows ) ? nval : c; + d = (mX1 >= src_cols || mY1 >= src_rows ) ? nval : d; + + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v; + *((__global float4 *)((__global uchar*)dst + dstIdx)) = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; + + } +} + + +/* +//////////////////////////////////////////////////////////////////////// +///////////////////using image buffer/////////////////////////////////// +//////////////////////////////////////////////////////////////////////// + + +__kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __read_only image2d_t src, + __global short * map1, int dst_offset, int src_offset, int map1_offset, int dst_step, int src_step, + int map1_step, int src_cols, int src_rows, int dst_cols, int dst_rows, int map1_cols, int map1_rows , int threadCols, F4 nVal) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + x = x << 2; + if(x < threadCols && y < dst_rows) + { + int gx = x - (dst_offset&3); + int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); + + uchar4 nval =convert_uchar4(nVal); + char val = nval.s0; + + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); + + int map1Start = y * map1_step + (x << 2) + map1_offset - ((dst_offset & 3) << 2); + short8 map1_data; + + map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | + CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + + int4 src_data; + src_data.x = read_imageui(src, sampler, (int2)((int)map1_data.s0, (int)map1_data.s1)).x; + src_data.y = read_imageui(src, sampler, (int2)((int)map1_data.s2, (int)map1_data.s3)).x; + src_data.z = read_imageui(src, sampler, (int2)((int)map1_data.s4, (int)map1_data.s5)).x; + src_data.w = read_imageui(src, sampler, (int2)((int)map1_data.s6, (int)map1_data.s7)).x; + + int4 bcon = (convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows)); + uchar4 dst_data = (convert_uchar4(bcon != 0)) ? (uchar4)(val) : convert_uchar4(src_data); + + __global uchar4* d = (__global uchar4 *)(dst + dstStart); + uchar4 dVal = *d; + int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); + dst_data = (convert_uchar4(con) != (uchar4)0) ? dst_data : dVal; + + *d = dst_data; + } +} +*/ diff --git a/modules/ocl/test/precomp.hpp b/modules/ocl/test/precomp.hpp index d3ab3a2..5d57a30 100644 --- a/modules/ocl/test/precomp.hpp +++ b/modules/ocl/test/precomp.hpp @@ -67,7 +67,7 @@ #include "interpolation.hpp" //#include "add_test_info.h" -#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_ALL +#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_GPU #endif diff --git a/modules/ocl/test/test_imgproc.cpp b/modules/ocl/test/test_imgproc.cpp index 2b8a29c..ff2f441 100644 --- a/modules/ocl/test/test_imgproc.cpp +++ b/modules/ocl/test/test_imgproc.cpp @@ -858,10 +858,10 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) cv::RNG& rng = TS::ptr()->get_rng(); //cv::Size size = cv::Size(20, 20); - cv::Size srcSize = cv::Size(15, 20); - cv::Size dstSize = cv::Size(20, 20); - cv::Size map1Size = cv::Size(20, 20); - double min = 1, max = 20; + cv::Size srcSize = cv::Size(100, 100); + cv::Size dstSize = cv::Size(100, 100); + cv::Size map1Size = cv::Size(100, 100); + double min = 5, max = 16; if(srcType != nulltype) { @@ -898,14 +898,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) src_roicols = rng.uniform(1, src.cols); src_roirows = rng.uniform(1, src.rows); - cout << "dst_roicols: " << dst_roicols << "dst_roirows: "<< dst_roirows << endl; - cout << "src_roicols: " << src_roicols << "dst_roirows: "<< src_roirows << endl; - + srcx = rng.uniform(0, src.cols - src_roicols); srcy = rng.uniform(0, src.rows - src_roirows); dstx = rng.uniform(0, dst.cols - dst_roicols); dsty = rng.uniform(0, dst.rows - dst_roirows); - cout << "srcx: " << srcx << "srcy: " << srcy << "dstx: " << dstx << "dsty: " << dsty << endl; map1_roicols = dst_roicols; map1_roirows = dst_roirows; map2_roicols = dst_roicols; @@ -940,10 +937,6 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) { map1_roi = map1(Rect(map1x,map1y,map1_roicols,map1_roirows)); gmap1_roi = map1_roi; - // cv::Mat maptest(gmap1_roi); - // cout << "maptest " << endl; - //cout << maptest << endl; - //gmap1_roi = gmap1(Rect(map1x,map1y,map1_roicols,map1_roirows)); } else if (map1Type == CV_32FC1 && map2Type == CV_32FC1) @@ -962,6 +955,11 @@ PARAM_TEST_CASE(Remap, MatType, MatType, MatType, int, int) TEST_P(Remap, Mat) { + if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1)) + { + cout << "LINEAR don't support the map1Type and map2Type" << endl; + return; + } int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++) @@ -1448,7 +1446,7 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine( )); INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( - Values(CV_8UC1, CV_8UC2, CV_8UC4), + Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(CV_16SC2, CV_32FC2), NULL_TYPE, Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), Values((int)cv::BORDER_CONSTANT))); -- 2.7.4