From: Vadim Pisarevsky Date: Mon, 18 Nov 2013 17:02:10 +0000 (-0500) Subject: removed unnecessary opencl kernels X-Git-Tag: submit/tizen_ivi/20141117.190038~2^2~852^2~6 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8762ee3f6f719843cef3fda7b3e4ccde31c0733e;p=profile%2Fivi%2Fopencv.git removed unnecessary opencl kernels --- diff --git a/modules/core/src/opencl/mulspectrums.cl b/modules/core/src/opencl/mulspectrums.cl deleted file mode 100644 index 86d4e5d..0000000 --- a/modules/core/src/opencl/mulspectrums.cl +++ /dev/null @@ -1,96 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Peng Xiao, pengxiao@multicorewareinc.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other oclMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the uintel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business uinterruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -typedef float2 cfloat; -inline cfloat cmulf(cfloat a, cfloat b) -{ - return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); -} - -inline cfloat conjf(cfloat a) -{ - return (cfloat)( a.x, - a.y ); -} - -__kernel void -mulAndScaleSpectrumsKernel( - __global const cfloat* a, - __global const cfloat* b, - float scale, - __global cfloat* dst, - uint cols, - uint rows, - uint mstep -) -{ - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint idx = mad24(y, mstep / sizeof(cfloat), x); - if (x < cols && y < rows) - { - cfloat v = cmulf(a[idx], b[idx]); - dst[idx] = (cfloat)( v.x * scale, v.y * scale ); - } -} -__kernel void -mulAndScaleSpectrumsKernel_CONJ( - __global const cfloat* a, - __global const cfloat* b, - float scale, - __global cfloat* dst, - uint cols, - uint rows, - uint mstep -) -{ - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint idx = mad24(y, mstep / sizeof(cfloat), x); - if (x < cols && y < rows) - { - cfloat v = cmulf(a[idx], conjf(b[idx])); - dst[idx] = (cfloat)( v.x * scale, v.y * scale ); - } -} diff --git a/modules/core/src/opencl/polarcart.cl b/modules/core/src/opencl/polarcart.cl deleted file mode 100644 index 1883df7..0000000 --- a/modules/core/src/opencl/polarcart.cl +++ /dev/null @@ -1,73 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the copyright holders or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset, - __global uchar* dstptr, int dststep, int dstoffset, - int rows, int cols, dstT value ) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int mask_index = mad24(y, maskstep, x + maskoffset); - if( mask[mask_index] ) - { - int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); - *(dstT*)(dstptr + dst_index) = value; - } - } -} - -__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset, - int rows, int cols, dstT value ) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset); - *(dstT*)(dstptr + dst_index) = value; - } -} diff --git a/modules/core/src/opencl/reductions.cl b/modules/core/src/opencl/reductions.cl deleted file mode 100644 index 6eb6e48..0000000 --- a/modules/core/src/opencl/reductions.cl +++ /dev/null @@ -1,104 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -#if FUNC_SUM -#define FUNC(a, b) b += a; -#elif FUNC_ABS_SUM -#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a; -#elif FUNC_SQR_SUM -#define FUNC(a, b) b += a * a; -#else -#error No sum function -#endif - -/**************************************Array buffer SUM**************************************/ - -__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum, - __global srcT *src, __global dstT *dst) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - unsigned int id = get_global_id(0); - unsigned int idx = offset + id + (id / cols) * invalid_cols; - - __local dstT localmem_sum[128]; - dstT sum = (dstT)(0), temp; - - for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) - { - idx = offset + id + (id / cols) * invalid_cols; - temp = convertToDstT(src[idx]); - FUNC(temp, sum); - } - - if (lid > 127) - localmem_sum[lid - 128] = sum; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lid < 128) - localmem_sum[lid] = sum + localmem_sum[lid]; - barrier(CLK_LOCAL_MEM_FENCE); - - for (int lsize = 64; lsize > 0; lsize >>= 1) - { - if (lid < lsize) - { - int lid2 = lsize + lid; - localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (lid == 0) - dst[gid] = localmem_sum[0]; -} diff --git a/modules/imgproc/src/opencl/bilateral.cl b/modules/imgproc/src/opencl/bilateral.cl deleted file mode 100644 index cb317a0..0000000 --- a/modules/imgproc/src/opencl/bilateral.cl +++ /dev/null @@ -1,145 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Rock Li, Rock.li@amd.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. - -__kernel void bilateral_C1_D0(__global uchar *dst, - __global const uchar *src, - const int dst_rows, - const int dst_cols, - const int maxk, - const int radius, - const int dst_step, - const int dst_offset, - const int src_step, - const int src_rows, - const int src_cols, - __constant float *color_weight, - __constant float *space_weight, - __constant int *space_ofs) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (y < dst_rows && x < dst_cols) - { - int src_index = mad24(y + radius, src_step, x + radius); - int dst_index = mad24(y, dst_step, x + dst_offset); - float sum = 0.f, wsum = 0.f; - - int val0 = (int)src[src_index]; - for(int k = 0; k < maxk; k++ ) - { - int val = (int)src[src_index + space_ofs[k]]; - float w = space_weight[k] * color_weight[abs(val - val0)]; - sum += (float)(val) * w; - wsum += w; - } - dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f); - } -} - -__kernel void bilateral2_C1_D0(__global uchar *dst, - __global const uchar *src, - const int dst_rows, - const int dst_cols, - const int maxk, - const int radius, - const int dst_step, - const int dst_offset, - const int src_step, - const int src_rows, - const int src_cols, - __constant float *color_weight, - __constant float *space_weight, - __constant int *space_ofs) -{ - int x = get_global_id(0) << 2; - int y = get_global_id(1); - - if (y < dst_rows && x < dst_cols) - { - int src_index = mad24(y + radius, src_step, x + radius); - int dst_index = mad24(y, dst_step, x + dst_offset); - float4 sum = (float4)(0.f), wsum = (float4)(0.f); - - int4 val0 = convert_int4(vload4(0,src + src_index)); - for(int k = 0; k < maxk; k++ ) - { - int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k])); - float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)], - color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]); - sum += convert_float4(val) * w; - wsum += w; - } - *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f); - } -} - -__kernel void bilateral_C4_D0(__global uchar4 *dst, - __global const uchar4 *src, - const int dst_rows, - const int dst_cols, - const int maxk, - const int radius, - const int dst_step, - const int dst_offset, - const int src_step, - const int src_rows, - const int src_cols, - __constant float *color_weight, - __constant float *space_weight, - __constant int *space_ofs) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (y < dst_rows && x < dst_cols) - { - int src_index = mad24(y + radius, src_step, x + radius); - int dst_index = mad24(y, dst_step, x + dst_offset); - float4 sum = (float4)0.f; - float wsum = 0.f; - - int4 val0 = convert_int4(src[src_index]); - for(int k = 0; k < maxk; k++ ) - { - int4 val = convert_int4(src[src_index + space_ofs[k]]); - float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)]; - sum += convert_float4(val) * (float4)w; - wsum += w; - } - - wsum = 1.f / wsum; - dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f); - } -} diff --git a/modules/imgproc/src/opencl/boxfilter.cl b/modules/imgproc/src/opencl/boxfilter.cl deleted file mode 100644 index 030c13c..0000000 --- a/modules/imgproc/src/opencl/boxfilter.cl +++ /dev/null @@ -1,478 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Zhang Ying, zhangying913@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////Macro for border type//////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif - -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif - -//blur function does not support BORDER_WRAP -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) -#endif - -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) - -inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp, - int dst_rows, int dst_cols, - int dst_startX, int dst_x_off, - float alpha) -{ - if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) - { - return; - } - - uint4 tmp_sum = 0; - int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; - int posY = (get_group_id(1) << 1); - - for(int i=-anX; i<=anX; i++) - { - tmp_sum += vload4(get_local_id(0), temp+i); - } - - if(posY < dst_rows && posX < dst_cols) - { - tmp_sum /= (uint4) alpha; - if(posX >= 0 && posX < dst_cols) - *(dst) = tmp_sum.x; - if(posX+1 >= 0 && posX+1 < dst_cols) - *(dst + 1) = tmp_sum.y; - if(posX+2 >= 0 && posX+2 < dst_cols) - *(dst + 2) = tmp_sum.z; - if(posX+3 >= 0 && posX+3 < dst_cols) - *(dst + 3) = tmp_sum.w; - } -} - - -inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp, - int dst_rows, int dst_cols, - int dst_startX, int dst_x_off, - float alpha) -{ - if(get_local_id(0) >= (THREADS-ksX+1)) - { - return; - } - - int posX = dst_startX - dst_x_off + get_local_id(0); - int posY = (get_group_id(1) << 1); - - uint4 temp_sum = 0; - for(int i=-anX; i<=anX; i++) - { - temp_sum += temp[get_local_id(0) + anX + i]; - } - - if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows) - *dst = convert_uchar4(convert_float4(temp_sum)/alpha); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - int src_x_off = src_offset % src_step; - int src_y_off = src_offset / src_step; - int dst_x_off = dst_offset % dst_step; - int dst_y_off = dst_offset / dst_step; - - int head_off = dst_x_off%4; - int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - - uint4 data[ksY+1]; - __local uint4 temp[2][THREADS]; - -#ifdef BORDER_CONSTANT - - for(int i=0; i < ksY+1; i++) - { - if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3src_whole_cols-1) - | (startY+i<0) | (startY+i>src_whole_rows-1); - if(not_all_in_range) - { - int selected_row; - int4 selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols); - selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x); - - selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols); - selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y); - - selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols); - selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z); - - selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols); - selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w); - - data[i].x = *(src + selected_row * src_step + selected_col.x); - data[i].y = *(src + selected_row * src_step + selected_col.y); - data[i].z = *(src + selected_row * src_step + selected_col.z); - data[i].w = *(src + selected_row * src_step + selected_col.w); - } - else - { - data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX))); - } - } -#endif - uint4 tmp_sum = 0; - for(int i=1; i < ksY; i++) - { - tmp_sum += (data[i]); - } - - int index = dst_startY * dst_step + dst_startX + (col-anX)*4; - - temp[0][col] = tmp_sum + (data[0]); - temp[1][col] = tmp_sum + (data[ksY]); - barrier(CLK_LOCAL_MEM_FENCE); - update_dst_C1_D0(dst+index, (__local uint *)(temp[0]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC4//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 2; - int src_y_off = src_offset / src_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - - uint4 data[ksY+1]; - __local uint4 temp[2][THREADS]; - -#ifdef BORDER_CONSTANT - bool con; - for(int i=0; i < ksY+1; i++) - { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - int cur_col = clamp(startX + col, 0, src_whole_cols); - - data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0; - data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0; - data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0; - data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0; - } -#else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - - - data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]); - } - -#endif - uint4 tmp_sum = 0; - for(int i=1; i < ksY; i++) - { - tmp_sum += (data[i]); - } - - int index = dst_startY * (dst_step>>2)+ dst_startX + col; - - temp[0][col] = tmp_sum + (data[0]); - temp[1][col] = tmp_sum + (data[ksY]); - barrier(CLK_LOCAL_MEM_FENCE); - update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]), - dst_rows, dst_cols, dst_startX, dst_x_off, alpha); - -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32fC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 2; - int src_y_off = src_offset / src_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - float data[ksY+1]; - __local float temp[2][THREADS]; -#ifdef BORDER_CONSTANT - bool con; - float ss; - for(int i=0; i < ksY+1; i++) - { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - int cur_col = clamp(startX + col, 0, src_whole_cols); - ss = (startY+i)=0&&cur_col>=0&&cur_col>2) + cur_col]:(float)0; - - data[i] = con ? ss : 0.f; - } -#else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - - data[i] = src[selected_row * (src_step>>2) + selected_col]; - } - -#endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[i]); - } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - temp[1][col] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); - - float tmp_sum[2]= {0.0, 0.0}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha; - } - - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////32fC4//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha, - int src_offset, int src_whole_rows, int src_whole_cols, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step - ) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - - int src_x_off = (src_offset % src_step) >> 4; - int src_y_off = src_offset / src_step; - int dst_x_off = (dst_offset % dst_step) >> 4; - int dst_y_off = dst_offset / dst_step; - - int startX = gX * (THREADS-ksX+1) - anX + src_x_off; - int startY = (gY << 1) - anY + src_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - float4 data[ksY+1]; - __local float4 temp[2][THREADS]; -#ifdef BORDER_CONSTANT - bool con; - float4 ss; - for(int i=0; i < ksY+1; i++) - { - con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows; - - int cur_col = clamp(startX + col, 0, src_whole_cols); - ss = (startY+i)=0&&cur_col>=0&&cur_col>4) + cur_col]:(float4)0; - - data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0); - } -#else - for(int i=0; i < ksY+1; i++) - { - int selected_row; - int selected_col; - selected_row = ADDR_H(startY+i, 0, src_whole_rows); - selected_row = ADDR_B(startY+i, src_whole_rows, selected_row); - - selected_col = ADDR_L(startX+col, 0, src_whole_cols); - selected_col = ADDR_R(startX+col, src_whole_cols, selected_col); - - data[i] = src[selected_row * (src_step>>4) + selected_col]; - } - -#endif - float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[i]); - } - sum1 = sum0 + (data[0]); - sum2 = sum0 + (data[ksY]); - temp[0][col] = sum1; - temp[1][col] = sum2; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gY << 1); - - float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)}; - for(int k=0; k<2; k++) - for(int i=-anX; i<=anX; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - for(int i=0; i<2; i++) - { - if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) - dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha; - } - - } -} diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl deleted file mode 100644 index ca670b6..0000000 --- a/modules/imgproc/src/opencl/canny.cl +++ /dev/null @@ -1,636 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Peng Xiao, pengxiao@multicorewareinc.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - -#ifdef L2GRAD -inline float calc(int x, int y) -{ - return sqrt((float)(x * x + y * y)); -} -#else -inline float calc(int x, int y) -{ - return (float)abs(x) + abs(y); -} -#endif // - -// Smoothing perpendicular to the derivative direction with a triangle filter -// only support 3x3 Sobel kernel -// h (-1) = 1, h (0) = 2, h (1) = 1 -// h'(-1) = -1, h'(0) = 0, h'(1) = 1 -// thus sobel 2D operator can be calculated as: -// h'(x, y) = h'(x)h(y) for x direction -// -// src input 8bit single channel image data -// dx_buf output dx buffer -// dy_buf output dy buffer -__kernel -void -__attribute__((reqd_work_group_size(16,16,1))) -calcSobelRowPass -( - __global const uchar * src, - __global int * dx_buf, - __global int * dy_buf, - int rows, - int cols, - int src_step, - int src_offset, - int dx_buf_step, - int dx_buf_offset, - int dy_buf_step, - int dy_buf_offset -) -{ - dx_buf_step /= sizeof(*dx_buf); - dx_buf_offset /= sizeof(*dx_buf); - dy_buf_step /= sizeof(*dy_buf); - dy_buf_offset /= sizeof(*dy_buf); - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int lidx = get_local_id(0); - int lidy = get_local_id(1); - - __local int smem[16][18]; - - smem[lidy][lidx + 1] = - src[gidx + min(gidy, rows - 1) * src_step + src_offset]; - if(lidx == 0) - { - smem[lidy][0] = - src[max(gidx - 1, 0) + min(gidy, rows - 1) * src_step + src_offset]; - smem[lidy][17] = - src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(gidy < rows && gidx < cols) - { - dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] = - -smem[lidy][lidx] + smem[lidy][lidx + 2]; - dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] = - smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2]; - } -} - -// calculate the magnitude of the filter pass combining both x and y directions -// This is the buffered version(3x3 sobel) -// -// dx_buf dx buffer, calculated from calcSobelRowPass -// dy_buf dy buffer, calculated from calcSobelRowPass -// dx direvitive in x direction output -// dy direvitive in y direction output -// mag magnitude direvitive of xy output -__kernel -void -__attribute__((reqd_work_group_size(16,16,1))) -calcMagnitude_buf -( - __global const int * dx_buf, - __global const int * dy_buf, - __global int * dx, - __global int * dy, - __global float * mag, - int rows, - int cols, - int dx_buf_step, - int dx_buf_offset, - int dy_buf_step, - int dy_buf_offset, - int dx_step, - int dx_offset, - int dy_step, - int dy_offset, - int mag_step, - int mag_offset -) -{ - dx_buf_step /= sizeof(*dx_buf); - dx_buf_offset /= sizeof(*dx_buf); - dy_buf_step /= sizeof(*dy_buf); - dy_buf_offset /= sizeof(*dy_buf); - dx_step /= sizeof(*dx); - dx_offset /= sizeof(*dx); - dy_step /= sizeof(*dy); - dy_offset /= sizeof(*dy); - mag_step /= sizeof(*mag); - mag_offset /= sizeof(*mag); - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int lidx = get_local_id(0); - int lidy = get_local_id(1); - - __local int sdx[18][16]; - __local int sdy[18][16]; - - sdx[lidy + 1][lidx] = - dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset]; - sdy[lidy + 1][lidx] = - dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset]; - if(lidy == 0) - { - sdx[0][lidx] = - dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset]; - sdx[17][lidx] = - dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset]; - - sdy[0][lidx] = - dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset]; - sdy[17][lidx] = - dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(gidx < cols && gidy < rows) - { - int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx]; - int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx]; - - dx[gidx + gidy * dx_step + dx_offset] = x; - dy[gidx + gidy * dy_step + dy_offset] = y; - - mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y); - } -} - -// calculate the magnitude of the filter pass combining both x and y directions -// This is the non-buffered version(non-3x3 sobel) -// -// dx_buf dx buffer, calculated from calcSobelRowPass -// dy_buf dy buffer, calculated from calcSobelRowPass -// dx direvitive in x direction output -// dy direvitive in y direction output -// mag magnitude direvitive of xy output -__kernel -void calcMagnitude -( - __global const int * dx, - __global const int * dy, - __global float * mag, - int rows, - int cols, - int dx_step, - int dx_offset, - int dy_step, - int dy_offset, - int mag_step, - int mag_offset -) -{ - dx_step /= sizeof(*dx); - dx_offset /= sizeof(*dx); - dy_step /= sizeof(*dy); - dy_offset /= sizeof(*dy); - mag_step /= sizeof(*mag); - mag_offset /= sizeof(*mag); - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - if(gidy < rows && gidx < cols) - { - mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = - calc( - dx[gidx + gidy * dx_step + dx_offset], - dy[gidx + gidy * dy_step + dy_offset] - ); - } -} - -////////////////////////////////////////////////////////////////////////////////////////// -// 0.4142135623730950488016887242097 is tan(22.5) -#define CANNY_SHIFT 15 -#define TG22 (int)(0.4142135623730950488016887242097*(1< low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + (x << (1 + CANNY_SHIFT)); - y <<= CANNY_SHIFT; - if(y < tg22x) - { - if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2]) - { - edge_type = 1 + (int)(m > high_thresh); - } - } - else if (y > tg67x) - { - if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1]) - { - edge_type = 1 + (int)(m > high_thresh); - } - } - else - { - if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s]) - { - edge_type = 1 + (int)(m > high_thresh); - } - } - } - map[gidx + 1 + (gidy + 1) * map_step] = edge_type; - } -} - -#undef CANNY_SHIFT -#undef TG22 - -////////////////////////////////////////////////////////////////////////////////////////// -// do Hysteresis for pixel whose edge type is 1 -// -// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and -// marked as edge. Each thread will iterate for 16 times to connect local edges. -// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will -// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel. -// -// map raw edge type results calculated from calcMap. -// st the potiential edge points found in this kernel call -// counter the number of potiential edge points -__kernel -void -__attribute__((reqd_work_group_size(16,16,1))) -edgesHysteresisLocal -( - __global int * map, - __global ushort2 * st, - __global unsigned int * counter, - int rows, - int cols, - int map_step, - int map_offset -) -{ - map_step /= sizeof(*map); - map_offset /= sizeof(*map); - - map += map_offset; - - __local int smem[18][18]; - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int lidx = get_local_id(0); - int lidy = get_local_id(1); - - int grp_idx = get_global_id(0) & 0xFFFFF0; - int grp_idy = get_global_id(1) & 0xFFFFF0; - - int tid = lidx + lidy * 16; - int lx = tid % 18; - int ly = tid / 18; - if(ly < 14) - { - smem[ly][lx] = - map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step]; - } - if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) - { - smem[ly + 14][lx] = - map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if(gidy < rows && gidx < cols) - { - int n; - - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; - - if (smem[lidy + 1][lidx + 1] == 1) - { - n += smem[lidy ][lidx ] == 2; - n += smem[lidy ][lidx + 1] == 2; - n += smem[lidy ][lidx + 2] == 2; - - n += smem[lidy + 1][lidx ] == 2; - n += smem[lidy + 1][lidx + 2] == 2; - - n += smem[lidy + 2][lidx ] == 2; - n += smem[lidy + 2][lidx + 1] == 2; - n += smem[lidy + 2][lidx + 2] == 2; - } - - if (n > 0) - smem[lidy + 1][lidx + 1] = 2; - } - - const int e = smem[lidy + 1][lidx + 1]; - map[gidx + 1 + (gidy + 1) * map_step] = e; - - n = 0; - if(e == 2) - { - n += smem[lidy ][lidx ] == 1; - n += smem[lidy ][lidx + 1] == 1; - n += smem[lidy ][lidx + 2] == 1; - - n += smem[lidy + 1][lidx ] == 1; - n += smem[lidy + 1][lidx + 2] == 1; - - n += smem[lidy + 2][lidx ] == 1; - n += smem[lidy + 2][lidx + 1] == 1; - n += smem[lidy + 2][lidx + 2] == 1; - } - - if(n > 0) - { - unsigned int ind = atomic_inc(counter); - st[ind] = (ushort2)(gidx + 1, gidy + 1); - } - } -} - -__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; -__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; - - -#define stack_size 512 -__kernel -void -__attribute__((reqd_work_group_size(128,1,1))) -edgesHysteresisGlobal -( - __global int * map, - __global ushort2 * st1, - __global ushort2 * st2, - __global int * counter, - int rows, - int cols, - int count, - int map_step, - int map_offset -) -{ - - map_step /= sizeof(*map); - map_offset /= sizeof(*map); - - map += map_offset; - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int lidx = get_local_id(0); - int lidy = get_local_id(1); - - int grp_idx = get_group_id(0); - int grp_idy = get_group_id(1); - - __local unsigned int s_counter; - __local unsigned int s_ind; - - __local ushort2 s_st[stack_size]; - - if(lidx == 0) - { - s_counter = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx); - - if(ind < count) - { - ushort2 pos = st1[ind]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - if (lidx < 8) - { - pos.x += c_dx[lidx]; - pos.y += c_dy[lidx]; - - if (map[pos.x + pos.y * map_step] == 1) - { - map[pos.x + pos.y * map_step] = 2; - - ind = atomic_inc(&s_counter); - - s_st[ind] = pos; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) - { - const int subTaskIdx = lidx >> 3; - const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - s_counter -= portion; - barrier(CLK_LOCAL_MEM_FENCE); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[lidx & 7]; - pos.y += c_dy[lidx & 7]; - - if (map[pos.x + pos.y * map_step] == 1) - { - map[pos.x + pos.y * map_step] = 2; - - ind = atomic_inc(&s_counter); - - s_st[ind] = pos; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (s_counter > 0) - { - if (lidx == 0) - { - ind = atomic_add(counter, s_counter); - s_ind = ind - s_counter; - } - barrier(CLK_LOCAL_MEM_FENCE); - - ind = s_ind; - - for (int i = lidx; i < s_counter; i += get_local_size(0)) - { - st2[ind + i] = s_st[i]; - } - } - } - } -} -#undef stack_size - -//Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0. -// map edge type mappings -// dst edge output -__kernel -void getEdges -( - __global const int * map, - __global uchar * dst, - int rows, - int cols, - int map_step, - int map_offset, - int dst_step, - int dst_offset -) -{ - map_step /= sizeof(*map); - map_offset /= sizeof(*map); - - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - if(gidy < rows && gidx < cols) - { - dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1)); - } -} diff --git a/modules/imgproc/src/opencl/clahe.cl b/modules/imgproc/src/opencl/clahe.cl deleted file mode 100644 index 16c68fd..0000000 --- a/modules/imgproc/src/opencl/clahe.cl +++ /dev/null @@ -1,255 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Sen Liu, swjtuls1987@126.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef WAVE_SIZE -#define WAVE_SIZE 1 -#endif - -int calc_lut(__local int* smem, int val, int tid) -{ - smem[tid] = val; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid == 0) - for (int i = 1; i < 256; ++i) - smem[i] += smem[i - 1]; - barrier(CLK_LOCAL_MEM_FENCE); - - return smem[tid]; -} - -#ifdef CPU -void reduce(volatile __local int* smem, int val, int tid) -{ - smem[tid] = val; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 128) - smem[tid] = val += smem[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 64) - smem[tid] = val += smem[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 32) - smem[tid] += smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) - smem[tid] += smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 8) - smem[tid] += smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 4) - smem[tid] += smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 2) - smem[tid] += smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 1) - smem[256] = smem[tid] + smem[tid + 1]; - barrier(CLK_LOCAL_MEM_FENCE); -} - -#else - -void reduce(__local volatile int* smem, int val, int tid) -{ - smem[tid] = val; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 128) - smem[tid] = val += smem[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 64) - smem[tid] = val += smem[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 32) - { - smem[tid] += smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 16) - { -#endif - smem[tid] += smem[tid + 16]; -#if WAVE_SIZE < 16 - } - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 8) - { -#endif - smem[tid] += smem[tid + 8]; - smem[tid] += smem[tid + 4]; - smem[tid] += smem[tid + 2]; - smem[tid] += smem[tid + 1]; - } -} -#endif - -__kernel void calcLut(__global __const uchar * src, __global uchar * lut, - const int srcStep, const int dstStep, - const int2 tileSize, const int tilesX, - const int clipLimit, const float lutScale, - const int src_offset, const int dst_offset) -{ - __local int smem[512]; - - const int tx = get_group_id(0); - const int ty = get_group_id(1); - const unsigned int tid = get_local_id(1) * get_local_size(0) - + get_local_id(0); - - smem[tid] = 0; - barrier(CLK_LOCAL_MEM_FENCE); - - for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) - { - __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); - for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) - { - const int data = srcPtr[j]; - atomic_inc(&smem[data]); - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - int tHistVal = smem[tid]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (clipLimit > 0) - { - // clip histogram bar - int clipped = 0; - if (tHistVal > clipLimit) - { - clipped = tHistVal - clipLimit; - tHistVal = clipLimit; - } - - // find number of overall clipped samples - reduce(smem, clipped, tid); - barrier(CLK_LOCAL_MEM_FENCE); -#ifdef CPU - clipped = smem[256]; -#else - clipped = smem[0]; -#endif - - // broadcast evaluated value - - __local int totalClipped; - - if (tid == 0) - totalClipped = clipped; - barrier(CLK_LOCAL_MEM_FENCE); - - // redistribute clipped samples evenly - - int redistBatch = totalClipped / 256; - tHistVal += redistBatch; - - int residual = totalClipped - redistBatch * 256; - if (tid < residual) - ++tHistVal; - } - - const int lutVal = calc_lut(smem, tHistVal, tid); - uint ires = (uint)convert_int_rte(lutScale * lutVal); - lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = - convert_uchar(clamp(ires, (uint)0, (uint)255)); -} - -__kernel void transform(__global __const uchar * src, - __global uchar * dst, - __global uchar * lut, - const int srcStep, const int dstStep, const int lutStep, - const int cols, const int rows, - const int2 tileSize, - const int tilesX, const int tilesY, - const int src_offset, const int dst_offset, int lut_offset) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); - - if (x >= cols || y >= rows) - return; - - const float tyf = (convert_float(y) / tileSize.y) - 0.5f; - int ty1 = convert_int_rtn(tyf); - int ty2 = ty1 + 1; - const float ya = tyf - ty1; - ty1 = max(ty1, 0); - ty2 = min(ty2, tilesY - 1); - - const float txf = (convert_float(x) / tileSize.x) - 0.5f; - int tx1 = convert_int_rtn(txf); - int tx2 = tx1 + 1; - const float xa = txf - tx1; - tx1 = max(tx1, 0); - tx2 = min(tx2, tilesX - 1); - - const int srcVal = src[mad24(y, srcStep, x + src_offset)]; - - float res = 0; - - res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); - res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); - res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); - res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); - - uint ires = (uint)convert_int_rte(res); - dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); -} diff --git a/modules/imgproc/src/opencl/convolve.cl b/modules/imgproc/src/opencl/convolve.cl deleted file mode 100644 index fb9596e..0000000 --- a/modules/imgproc/src/opencl/convolve.cl +++ /dev/null @@ -1,109 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Jiang Liyuan, jlyuan001.good@163.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -/************************************** convolve **************************************/ - -__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst, - int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight, - int src_offset, int dst_offset, int koffset) -{ - __local float smem[16 + 2 * 8][16 + 2 * 8]; - - int x = get_local_id(0); - int y = get_local_id(1); - int gx = get_global_id(0); - int gy = get_global_id(1); - - // x | x 0 | 0 - // ----------- - // x | x 0 | 0 - // 0 | 0 0 | 0 - // ----------- - // 0 | 0 0 | 0 - smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; - - // 0 | 0 x | x - // ----------- - // 0 | 0 x | x - // 0 | 0 0 | 0 - // ----------- - // 0 | 0 0 | 0 - smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; - - // 0 | 0 0 | 0 - // ----------- - // 0 | 0 0 | 0 - // x | x 0 | 0 - // ----------- - // x | x 0 | 0 - smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; - - // 0 | 0 0 | 0 - // ----------- - // 0 | 0 0 | 0 - // 0 | 0 x | x - // ----------- - // 0 | 0 x | x - smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (gx < cols && gy < rows) - { - float res = 0; - - for (int i = 0; i < kHeight; ++i) - for (int j = 0; j < kWidth; ++j) - res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset]; - - dst[gy * dst_step + gx + dst_offset] = res; - } -} diff --git a/modules/imgproc/src/opencl/copymakeborder.cl b/modules/imgproc/src/opencl/copymakeborder.cl deleted file mode 100644 index d97f660..0000000 --- a/modules/imgproc/src/opencl/copymakeborder.cl +++ /dev/null @@ -1,134 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Zero Lin zero.lin@amd.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_amd_fp64 -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (cl_khr_fp64) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -#endif - -#ifdef BORDER_CONSTANT -#define EXTRAPOLATE(x, y, v) v = scalar; -#elif defined BORDER_REPLICATE -#define EXTRAPOLATE(x, y, v) \ - { \ - x = max(min(x, src_cols - 1), 0); \ - y = max(min(y, src_rows - 1), 0); \ - v = src[mad24(y, src_step, x + src_offset)]; \ - } -#elif defined BORDER_WRAP -#define EXTRAPOLATE(x, y, v) \ - { \ - if (x < 0) \ - x -= ((x - src_cols + 1) / src_cols) * src_cols; \ - if (x >= src_cols) \ - x %= src_cols; \ - \ - if (y < 0) \ - y -= ((y - src_rows + 1) / src_rows) * src_rows; \ - if( y >= src_rows ) \ - y %= src_rows; \ - v = src[mad24(y, src_step, x + src_offset)]; \ - } -#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) -#ifdef BORDER_REFLECT -#define DELTA int delta = 0 -#else -#define DELTA int delta = 1 -#endif -#define EXTRAPOLATE(x, y, v) \ - { \ - DELTA; \ - if (src_cols == 1) \ - x = 0; \ - else \ - do \ - { \ - if( x < 0 ) \ - x = -x - 1 + delta; \ - else \ - x = src_cols - 1 - (x - src_cols) - delta; \ - } \ - while (x >= src_cols || x < 0); \ - \ - if (src_rows == 1) \ - y = 0; \ - else \ - do \ - { \ - if( y < 0 ) \ - y = -y - 1 + delta; \ - else \ - y = src_rows - 1 - (y - src_rows) - delta; \ - } \ - while (y >= src_rows || y < 0); \ - v = src[mad24(y, src_step, x + src_offset)]; \ - } -#else -#error No extrapolation method -#endif - -#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) - -__kernel void copymakeborder - (__global const GENTYPE *src, - __global GENTYPE *dst, - int dst_cols, int dst_rows, - int src_cols, int src_rows, - int src_step, int src_offset, - int dst_step, int dst_offset, - int top, int left, GENTYPE scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int src_x = x - left; - int src_y = y - top; - int dst_index = mad24(y, dst_step, x + dst_offset); - - if (NEED_EXTRAPOLATION(src_x, src_y)) - EXTRAPOLATE(src_x, src_y, dst[dst_index]) - else - { - int src_index = mad24(src_y, src_step, src_x + src_offset); - dst[dst_index] = src[src_index]; - } - } -} diff --git a/modules/imgproc/src/opencl/gftt.cl b/modules/imgproc/src/opencl/gftt.cl deleted file mode 100644 index 80bdec0..0000000 --- a/modules/imgproc/src/opencl/gftt.cl +++ /dev/null @@ -1,275 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Peng Xiao, pengxiao@outlook.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef WITH_MASK -#define WITH_MASK 0 -#endif - -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; - -inline float ELEM_INT2(image2d_t _eig, int _x, int _y) -{ - return read_imagef(_eig, sampler, (int2)(_x, _y)).x; -} - -inline float ELEM_FLT2(image2d_t _eig, float2 pt) -{ - return read_imagef(_eig, sampler, pt).x; -} - -__kernel - void findCorners - ( - image2d_t eig, - __global const char * mask, - __global float2 * corners, - const int mask_strip,// in pixels - const float threshold, - const int rows, - const int cols, - const int max_count, - __global int * g_counter - ) -{ - const int j = get_global_id(0); - const int i = get_global_id(1); - - if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 -#if WITH_MASK - && mask[i * mask_strip + j] != 0 -#endif - ) - { - const float val = ELEM_INT2(eig, j, i); - - if (val > threshold) - { - float maxVal = val; - - maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); - - if (val == maxVal) - { - const int ind = atomic_inc(g_counter); - - if (ind < max_count) - corners[ind] = (float2)(j, i); - } - } - } -} - -//bitonic sort -__kernel - void sortCorners_bitonicSort - ( - image2d_t eig, - __global float2 * corners, - const int count, - const int stage, - const int passOfStage - ) -{ - const int threadId = get_global_id(0); - if(threadId >= count / 2) - { - return; - } - - const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent - - const int pairDistance = 1 << (stage - passOfStage); - const int blockWidth = 2 * pairDistance; - - const int leftId = min( (threadId % pairDistance) - + (threadId / pairDistance) * blockWidth, count ); - - const int rightId = min( leftId + pairDistance, count ); - - const float2 leftPt = corners[leftId]; - const float2 rightPt = corners[rightId]; - - const float leftVal = ELEM_FLT2(eig, leftPt); - const float rightVal = ELEM_FLT2(eig, rightPt); - - const bool compareResult = leftVal > rightVal; - - float2 greater = compareResult ? leftPt:rightPt; - float2 lesser = compareResult ? rightPt:leftPt; - - corners[leftId] = sortOrder ? lesser : greater; - corners[rightId] = sortOrder ? greater : lesser; -} - -//selection sort for gfft -//kernel is ported from Bolt library: -//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl -// Local sort will firstly sort elements of each workgroup using selection sort -// its performance is O(n) -__kernel - void sortCorners_selectionSortLocal - ( - image2d_t eig, - __global float2 * corners, - const int count, - __local float2 * scratch - ) -{ - int i = get_local_id(0); // index in workgroup - int numOfGroups = get_num_groups(0); // index in workgroup - int groupID = get_group_id(0); - int wg = get_local_size(0); // workgroup size = block size - int n; // number of elements to be processed for this work group - - int offset = groupID * wg; - int same = 0; - corners += offset; - n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; - float2 pt1, pt2; - - pt1 = corners[min(i, n)]; - scratch[i] = pt1; - barrier(CLK_LOCAL_MEM_FENCE); - - if(i >= n) - { - return; - } - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - int pos = 0; - for (int j=0;j val1) - pos++;//calculate the rank of this element in this work group - else - { - if(val1 > val2) - continue; - else - { - // val1 and val2 are same - same++; - } - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} -__kernel - void sortCorners_selectionSortFinal - ( - image2d_t eig, - __global float2 * corners, - const int count - ) -{ - const int i = get_local_id(0); // index in workgroup - const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); - const int wg = get_local_size(0); // workgroup size = block size - int pos = 0, same = 0; - const int offset = get_group_id(0) * wg; - const int remainder = count - wg*(numOfGroups-1); - - if((offset + i ) >= count) - return; - float2 pt1, pt2; - pt1 = corners[groupID*wg + i]; - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - for(int j=0; j val2) - break; - else - { - //Increment only if the value is not the same. - if( val2 > val1 ) - pos++; - else - same++; - } - } - } - - for(int k=0; k val2) - break; - else - { - //Don't increment if the value is the same. - //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false - if(val2 > val1) - pos++; - else - same++; - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} diff --git a/modules/imgproc/src/opencl/harris.cl b/modules/imgproc/src/opencl/harris.cl deleted file mode 100644 index cac0b2c..0000000 --- a/modules/imgproc/src/opencl/harris.cl +++ /dev/null @@ -1,202 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////Macro for border type//////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif - -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) -#endif - -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////calcHarris//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst, - int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, - int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float k) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - const int glx = get_global_id(0); - const int gly = get_global_id(1); - - int dx_x_off = (dx_offset % dx_step) >> 2; - int dx_y_off = dx_offset / dx_step; - int dy_x_off = (dy_offset % dy_step) >> 2; - int dy_y_off = dy_offset / dy_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; - int dx_startY = (gY << 1) - anY + dx_y_off; - int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; - int dy_startY = (gY << 1) - anY + dy_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - - float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; - __local float temp[6][THREADS]; -#ifdef BORDER_CONSTANT - bool dx_con,dy_con; - float dx_s,dy_s; - for(int i=0; i < ksY+1; i++) - { - dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? dx_s : 0.0; - dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? dy_s : 0.0; - data[0][i] = dx_data[i] * dx_data[i]; - data[1][i] = dx_data[i] * dy_data[i]; - data[2][i] = dy_data[i] * dy_data[i]; - } -#else - int clamped_col = min(dst_cols, col); - for(int i=0; i < ksY+1; i++) - { - int dx_selected_row; - int dx_selected_col; - dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); - dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); - dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); - dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); - dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; - - int dy_selected_row; - int dy_selected_col; - dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); - dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); - dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); - dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); - dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; - - data[0][i] = dx_data[i] * dx_data[i]; - data[1][i] = dx_data[i] * dy_data[i]; - data[2][i] = dy_data[i] * dy_data[i]; - } -#endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[0][i]); - sum1 += (data[1][i]); - sum2 += (data[2][i]); - } - float sum01,sum02,sum11,sum12,sum21,sum22; - sum01 = sum0 + (data[0][0]); - sum02 = sum0 + (data[0][ksY]); - temp[0][col] = sum01; - temp[1][col] = sum02; - sum11 = sum1 + (data[1][0]); - sum12 = sum1 + (data[1][ksY]); - temp[2][col] = sum11; - temp[3][col] = sum12; - sum21 = sum2 + (data[2][0]); - sum22 = sum2 + (data[2][ksY]); - temp[4][col] = sum21; - temp[5][col] = sum22; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gly << 1); - int till = (ksX + 1)%2; - float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; - for(int k=0; k<6; k++) - for(int i=-anX; i<=anX - till; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - - if(posX < dst_cols && (posY) < dst_rows) - { - dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = - tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]); - } - if(posX < dst_cols && (posY + 1) < dst_rows) - { - dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = - tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]); - } - } -} diff --git a/modules/imgproc/src/opencl/histogram.cl b/modules/imgproc/src/opencl/histogram.cl deleted file mode 100644 index bac9a6b..0000000 --- a/modules/imgproc/src/opencl/histogram.cl +++ /dev/null @@ -1,279 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Jia Haipeng, jiahaipeng95@gmail.com -// Xu Pang, pangxu010@163.com -// Wenju He, wenju@multicorewareinc.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// -#define PARTIAL_HISTOGRAM256_COUNT (256) -#define HISTOGRAM256_BIN_COUNT (256) - -#define HISTOGRAM256_WORK_GROUP_SIZE (256) -#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT) - -#define NBANKS (16) -#define NBANKS_BIT (4) - - -__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0( - __global const uint4* src, - int src_step, int src_offset, - __global int* globalHist, - int dataCount, int cols, - int inc_x, int inc_y, - int hist_step) -{ - __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS - int gid = get_global_id(0); - int lid = get_local_id(0); - int gx = get_group_id(0); - int gsize = get_global_size(0); - int lsize = get_local_size(0); - const int shift = 8; - const int mask = HISTOGRAM256_BIN_COUNT-1; - int offset = (lid & (NBANKS-1));// lid % NBANKS - uint4 data, temp1, temp2, temp3, temp4; - src += src_offset; - - //clear LDS - for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize) - { - subhist[idx] = 0; - subhist[idx+=lsize] = 0; - subhist[idx+=lsize] = 0; - subhist[idx+=lsize] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - //read and scatter - int y = gid/cols; - int x = gid - mul24(y, cols); - for(int idx=gid; idx>= shift; - temp2 = ((data & mask) << NBANKS_BIT) + offset; - data >>= shift; - temp3 = ((data & mask) << NBANKS_BIT) + offset; - data >>= shift; - temp4 = ((data & mask) << NBANKS_BIT) + offset; - - atomic_inc(subhist + temp1.x); - atomic_inc(subhist + temp1.y); - atomic_inc(subhist + temp1.z); - atomic_inc(subhist + temp1.w); - - atomic_inc(subhist + temp2.x); - atomic_inc(subhist + temp2.y); - atomic_inc(subhist + temp2.z); - atomic_inc(subhist + temp2.w); - - atomic_inc(subhist + temp3.x); - atomic_inc(subhist + temp3.y); - atomic_inc(subhist + temp3.z); - atomic_inc(subhist + temp3.w); - - atomic_inc(subhist + temp4.x); - atomic_inc(subhist + temp4.y); - atomic_inc(subhist + temp4.z); - atomic_inc(subhist + temp4.w); - - x += inc_x; - int off = ((x>=cols) ? -1 : 0); - x = mad24(off, cols, x); - y += inc_y - off; - } - barrier(CLK_LOCAL_MEM_FENCE); - - //reduce local banks to single histogram per workgroup - int bin1=0, bin2=0, bin3=0, bin4=0; - for(int i=0; i=left_col) ? (gidx+cols) : gidx); - if(gidy= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p; - atomic_inc(subhist + p); - } - barrier(CLK_LOCAL_MEM_FENCE); - - globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy]; -} - -__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf, - __global int* hist, - int src_step) -{ - int lx = get_local_id(0); - int gx = get_group_id(0); - - int sum = 0; - - for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE) - sum += buf[ mad24(i, src_step, gx)]; - - __local int data[HISTOGRAM256_WORK_GROUP_SIZE]; - data[lx] = sum; - - for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if(lx < stride) - data[lx] += data[lx + stride]; - } - - if(lx == 0) - hist[gx] = data[0]; -} - -__kernel __attribute__((reqd_work_group_size(256,1,1))) -void calLUT(__global uchar * dst, __constant int * hist, int total) -{ - int lid = get_local_id(0); - __local int sumhist[HISTOGRAM256_BIN_COUNT]; - __local float scale; - - sumhist[lid] = hist[lid]; - barrier(CLK_LOCAL_MEM_FENCE); - if (lid == 0) - { - int sum = 0, i = 0; - while (!sumhist[i]) - ++i; - - if (total == sumhist[i]) - { - scale = 1; - for (int j = 0; j < HISTOGRAM256_BIN_COUNT; ++j) - sumhist[i] = i; - } - else - { - scale = 255.f/(total - sumhist[i]); - - for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++) - { - sum += sumhist[i]; - sumhist[i] = sum; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale); -} - -/* -///////////////////////////////equalizeHist////////////////////////////////////////////////// -__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist( - __global uchar * src, - __global uchar * dst, - __constant int * hist, - int srcstep, - int srcoffset, - int dststep, - int dstoffset, - int width, - int height, - float scale, - int inc_x, - int inc_y) -{ - int gidx = get_global_id(0); - int lid = get_local_id(0); - int glb_size = get_global_size(0); - src+=srcoffset; - dst+=dstoffset; - __local int sumhist[HISTOGRAM256_BIN_COUNT]; - __local uchar lut[HISTOGRAM256_BIN_COUNT+1]; - - sumhist[lid]=hist[lid]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lid==0) - { - int sum = 0; - for(int i=0;i= width ? -1 : 0); - pos_x = mad24(off,width,pos_x); - pos_y += inc_y - off; - } -} -*/ diff --git a/modules/imgproc/src/opencl/hough.cl b/modules/imgproc/src/opencl/hough.cl deleted file mode 100644 index fd1c5b9..0000000 --- a/modules/imgproc/src/opencl/hough.cl +++ /dev/null @@ -1,280 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or bpied warranties, including, but not limited to, the bpied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable -#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable - -//////////////////////////////////////////////////////////////////////// -// buildPointList - -#define PIXELS_PER_THREAD 16 - -// TODO: add offset to support ROI -__kernel void buildPointList(__global const uchar* src, - int cols, - int rows, - int step, - __global unsigned int* list, - __global int* counter) -{ - __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; - __local int s_qsize[4]; - __local int s_globStart[4]; - - const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0); - const int y = get_global_id(1); - - if (get_local_id(0) == 0) - s_qsize[get_local_id(1)] = 0; - barrier(CLK_LOCAL_MEM_FENCE); - - if (y < rows) - { - // fill the queue - __global const uchar* srcRow = &src[y * step]; - for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0)) - { - if (srcRow[xx]) - { - const unsigned int val = (y << 16) | xx; - const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1); - s_queues[get_local_id(1)][qidx] = val; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // let one work-item reserve the space required in the global list - if (get_local_id(0) == 0 && get_local_id(1) == 0) - { - // find how many items are stored in each list - int totalSize = 0; - for (int i = 0; i < get_local_size(1); ++i) - { - s_globStart[i] = totalSize; - totalSize += s_qsize[i]; - } - - // calculate the offset in the global list - const int globalOffset = atomic_add(counter, totalSize); - for (int i = 0; i < get_local_size(1); ++i) - s_globStart[i] += globalOffset; - } - - barrier(CLK_GLOBAL_MEM_FENCE); - - // copy local queues to global queue - const int qsize = s_qsize[get_local_id(1)]; - int gidx = s_globStart[get_local_id(1)] + get_local_id(0); - for(int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0)) - list[gidx] = s_queues[get_local_id(1)][i]; -} - -//////////////////////////////////////////////////////////////////////// -// circlesAccumCenters - -// TODO: add offset to support ROI -__kernel void circlesAccumCenters(__global const unsigned int* list, - const int count, - __global const int* dx, - const int dxStep, - __global const int* dy, - const int dyStep, - __global int* accum, - const int accumStep, - const int width, - const int height, - const int minRadius, - const int maxRadius, - const float idp) -{ - const int dxStepInPixel = dxStep / sizeof(int); - const int dyStepInPixel = dyStep / sizeof(int); - const int accumStepInPixel = accumStep / sizeof(int); - - const int SHIFT = 10; - const int ONE = 1 << SHIFT; - - // const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int wid = get_global_id(0); - - if (wid >= count) - return; - - const unsigned int val = list[wid]; - - const int x = (val & 0xFFFF); - const int y = (val >> 16) & 0xFFFF; - - const int vx = dx[mad24(y, dxStepInPixel, x)]; - const int vy = dy[mad24(y, dyStepInPixel, x)]; - - if (vx == 0 && vy == 0) - return; - - const float mag = sqrt(convert_float(vx * vx + vy * vy)); - - const int x0 = convert_int_rte((x * idp) * ONE); - const int y0 = convert_int_rte((y * idp) * ONE); - - int sx = convert_int_rte((vx * idp) * ONE / mag); - int sy = convert_int_rte((vy * idp) * ONE / mag); - - // Step from minRadius to maxRadius in both directions of the gradient - for (int k1 = 0; k1 < 2; ++k1) - { - int x1 = x0 + minRadius * sx; - int y1 = y0 + minRadius * sy; - - for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r) - { - const int x2 = x1 >> SHIFT; - const int y2 = y1 >> SHIFT; - - if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height) - break; - - atomic_add(&accum[mad24(y2+1, accumStepInPixel, x2+1)], 1); - } - - sx = -sx; - sy = -sy; - } -} - -// //////////////////////////////////////////////////////////////////////// -// // buildCentersList - -// TODO: add offset to support ROI -__kernel void buildCentersList(__global const int* accum, - const int accumCols, - const int accumRows, - const int accumStep, - __global unsigned int* centers, - const int threshold, - __global int* counter) -{ - const int accumStepInPixel = accumStep/sizeof(int); - - const int x = get_global_id(0); - const int y = get_global_id(1); - - if (x < accumCols - 2 && y < accumRows - 2) - { - const int top = accum[mad24(y, accumStepInPixel, x + 1)]; - - const int left = accum[mad24(y + 1, accumStepInPixel, x)]; - const int cur = accum[mad24(y + 1, accumStepInPixel, x + 1)]; - const int right = accum[mad24(y + 1, accumStepInPixel, x + 2)]; - - const int bottom = accum[mad24(y + 2, accumStepInPixel, x + 1)];; - - if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right) - { - const unsigned int val = (y << 16) | x; - const int idx = atomic_add(counter, 1); - centers[idx] = val; - } - } -} - - -// //////////////////////////////////////////////////////////////////////// -// // circlesAccumRadius - -// TODO: add offset to support ROI -__kernel void circlesAccumRadius(__global const unsigned int* centers, - __global const unsigned int* list, const int count, - __global float4* circles, const int maxCircles, - const float dp, - const int minRadius, const int maxRadius, - const int histSize, - const int threshold, - __local int* smem, - __global int* counter) -{ - for (int i = get_local_id(0); i < histSize + 2; i += get_local_size(0)) - smem[i] = 0; - barrier(CLK_LOCAL_MEM_FENCE); - - unsigned int val = centers[get_group_id(0)]; - - float cx = convert_float(val & 0xFFFF); - float cy = convert_float((val >> 16) & 0xFFFF); - - cx = (cx + 0.5f) * dp; - cy = (cy + 0.5f) * dp; - - for (int i = get_local_id(0); i < count; i += get_local_size(0)) - { - val = list[i]; - - const int x = (val & 0xFFFF); - const int y = (val >> 16) & 0xFFFF; - - const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y)); - if (rad >= minRadius && rad <= maxRadius) - { - const int r = convert_int_rte(rad - minRadius); - - atomic_add(&smem[r + 1], 1); - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (int i = get_local_id(0); i < histSize; i += get_local_size(0)) - { - const int curVotes = smem[i + 1]; - - if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2]) - - { - const int ind = atomic_add(counter, 1); - if (ind < maxCircles) - { - circles[ind] = (float4)(cx, cy, convert_float(i + minRadius), 0.0f); - } - } - } -} diff --git a/modules/imgproc/src/opencl/integral.cl b/modules/imgproc/src/opencl/integral.cl deleted file mode 100644 index f10b184..0000000 --- a/modules/imgproc/src/opencl/integral.cl +++ /dev/null @@ -1,493 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif -#define LSIZE 256 -#define LSIZE_1 255 -#define LSIZE_2 254 -#define HF_LSIZE 128 -#define LOG_LSIZE 8 -#define LOG_NUM_BANKS 5 -#define NUM_BANKS 32 -#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) - - -kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, - int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - int4 src_t[2], sum_t[2]; - float4 sqsum_t[2]; - __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; - __local int* sum_p; - __local float* sqsum_p; - src_step = src_step >> 2; - gid = gid << 1; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); - src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); - - sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); - - lm_sum[1][bf_loc] = src_t[1]; - lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - if(lid > 0 && (i+lid) <= rows) - { - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - lm_sqsum[0][bf_loc] += sqsum_t[0]; - lm_sqsum[1][bf_loc] += sqsum_t[1]; - sum_p = (__local int*)(&(lm_sum[0][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; - sum[loc_s0 + k * dst_step / 4] = sum_p[k]; - sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; - } - sum_p = (__local int*)(&(lm_sum[1][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k + 4 >= cols + pre_invalid) break; - sum[loc_s1 + k * dst_step / 4] = sum_p[k]; - sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - - -kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum , - __global float *sqsum,int rows,int cols,int src_step,int sum_step, - int sqsum_step,int sum_offset,int sqsum_offset) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - int4 src_t[2], sum_t[2]; - float4 sqsrc_t[2],sqsum_t[2]; - __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; - __local int *sum_p; - __local float *sqsum_p; - src_step = src_step >> 4; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; - sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; - sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; - - sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - lm_sqsum[0][bf_loc] = sqsrc_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - lm_sqsum[1][bf_loc] = sqsrc_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(gid == 0 && (i + lid) <= rows) - { - sum[sum_offset + i + lid] = 0; - sqsum[sqsum_offset + i + lid] = 0; - } - if(i + lid == 0) - { - int loc0 = gid * 2 * sum_step; - int loc1 = gid * 2 * sqsum_step; - for(int k = 1; k <= 8; k++) - { - if(gid * 8 + k > cols) break; - sum[sum_offset + loc0 + k * sum_step / 4] = 0; - sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; - } - } - int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; - int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; - if(lid > 0 && (i+lid) <= rows) - { - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - lm_sqsum[0][bf_loc] += sqsum_t[0]; - lm_sqsum[1][bf_loc] += sqsum_t[1]; - sum_p = (__local int*)(&(lm_sum[0][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + k >= cols) break; - sum[loc_s0 + k * sum_step / 4] = sum_p[k]; - sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; - } - sum_p = (__local int*)(&(lm_sum[1][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + 4 + k >= cols) break; - sum[loc_s1 + k * sum_step / 4] = sum_p[k]; - sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - -kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, - int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - float4 src_t[2], sum_t[2]; - float4 sqsum_t[2]; - __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; - __local float* sum_p; - __local float* sqsum_p; - src_step = src_step >> 2; - gid = gid << 1; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); - src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); - - sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); - - lm_sum[1][bf_loc] = src_t[1]; - lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - if(lid > 0 && (i+lid) <= rows) - { - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - lm_sqsum[0][bf_loc] += sqsum_t[0]; - lm_sqsum[1][bf_loc] += sqsum_t[1]; - sum_p = (__local float*)(&(lm_sum[0][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; - sum[loc_s0 + k * dst_step / 4] = sum_p[k]; - sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; - } - sum_p = (__local float*)(&(lm_sum[1][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k + 4 >= cols + pre_invalid) break; - sum[loc_s1 + k * dst_step / 4] = sum_p[k]; - sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - - -kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum , - __global float *sqsum,int rows,int cols,int src_step,int sum_step, - int sqsum_step,int sum_offset,int sqsum_offset) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - float4 src_t[2], sum_t[2]; - float4 sqsrc_t[2],sqsum_t[2]; - __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; - __local float *sum_p; - __local float *sqsum_p; - src_step = src_step >> 4; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; - sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; - sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; - - sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - lm_sqsum[0][bf_loc] = sqsrc_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - lm_sqsum[1][bf_loc] = sqsrc_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - - lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; - lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(gid == 0 && (i + lid) <= rows) - { - sum[sum_offset + i + lid] = 0; - sqsum[sqsum_offset + i + lid] = 0; - } - if(i + lid == 0) - { - int loc0 = gid * 2 * sum_step; - int loc1 = gid * 2 * sqsum_step; - for(int k = 1; k <= 8; k++) - { - if(gid * 8 + k > cols) break; - sum[sum_offset + loc0 + k * sum_step / 4] = 0; - sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; - } - } - int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; - int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; - if(lid > 0 && (i+lid) <= rows) - { - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - lm_sqsum[0][bf_loc] += sqsum_t[0]; - lm_sqsum[1][bf_loc] += sqsum_t[1]; - sum_p = (__local float*)(&(lm_sum[0][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + k >= cols) break; - sum[loc_s0 + k * sum_step / 4] = sum_p[k]; - sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; - } - sum_p = (__local float*)(&(lm_sum[1][bf_loc])); - sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + 4 + k >= cols) break; - sum[loc_s1 + k * sum_step / 4] = sum_p[k]; - sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} diff --git a/modules/imgproc/src/opencl/integral_sum.cl b/modules/imgproc/src/opencl/integral_sum.cl deleted file mode 100644 index ee063a5..0000000 --- a/modules/imgproc/src/opencl/integral_sum.cl +++ /dev/null @@ -1,412 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Shengen Yan,yanshengen@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -#define LSIZE 256 -#define LSIZE_1 255 -#define LSIZE_2 254 -#define HF_LSIZE 128 -#define LOG_LSIZE 8 -#define LOG_NUM_BANKS 5 -#define NUM_BANKS 32 -#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) - - -kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , - int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - int4 src_t[2], sum_t[2]; - __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local int* sum_p; - src_step = src_step >> 2; - gid = gid << 1; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); - src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); - - sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid > 0 && (i+lid) <= rows) - { - int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - sum_p = (__local int*)(&(lm_sum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; - sum[loc_s0 + k * dst_step / 4] = sum_p[k]; - } - sum_p = (__local int*)(&(lm_sum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k + 4 >= cols + pre_invalid) break; - sum[loc_s1 + k * dst_step / 4] = sum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - - -kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , - int rows,int cols,int src_step,int sum_step, - int sum_offset) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - int4 src_t[2], sum_t[2]; - __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local int *sum_p; - src_step = src_step >> 4; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; - - sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(gid == 0 && (i + lid) <= rows) - { - sum[sum_offset + i + lid] = 0; - } - if(i + lid == 0) - { - int loc0 = gid * 2 * sum_step; - for(int k = 1; k <= 8; k++) - { - if(gid * 8 + k > cols) break; - sum[sum_offset + loc0 + k * sum_step / 4] = 0; - } - } - - if(lid > 0 && (i+lid) <= rows) - { - int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - sum_p = (__local int*)(&(lm_sum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + k >= cols) break; - sum[loc_s0 + k * sum_step / 4] = sum_p[k]; - } - sum_p = (__local int*)(&(lm_sum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + 4 + k >= cols) break; - sum[loc_s1 + k * sum_step / 4] = sum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - -kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , - int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - float4 src_t[2], sum_t[2]; - __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float* sum_p; - src_step = src_step >> 2; - gid = gid << 1; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0); - src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0); - - sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid > 0 && (i+lid) <= rows) - { - int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - sum_p = (__local float*)(&(lm_sum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; - sum[loc_s0 + k * dst_step / 4] = sum_p[k]; - } - sum_p = (__local float*)(&(lm_sum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 4 + k + 4 >= cols + pre_invalid) break; - sum[loc_s1 + k * dst_step / 4] = sum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} - - -kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum , - int rows,int cols,int src_step,int sum_step, - int sum_offset) -{ - unsigned int lid = get_local_id(0); - unsigned int gid = get_group_id(0); - float4 src_t[2], sum_t[2]; - __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; - __local float *sum_p; - src_step = src_step >> 4; - for(int i = 0; i < rows; i =i + LSIZE_1) - { - src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; - src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; - - sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); - sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); - barrier(CLK_LOCAL_MEM_FENCE); - - int bf_loc = lid + GET_CONFLICT_OFFSET(lid); - lm_sum[0][bf_loc] = src_t[0]; - - lm_sum[1][bf_loc] = src_t[1]; - - int offset = 1; - for(int d = LSIZE >> 1 ; d > 0; d>>=1) - { - barrier(CLK_LOCAL_MEM_FENCE); - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - } - offset <<= 1; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lid < 2) - { - lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; - } - for(int d = 1; d < LSIZE; d <<= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - offset >>= 1; - int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; - ai += GET_CONFLICT_OFFSET(ai); - bi += GET_CONFLICT_OFFSET(bi); - - if((lid & 127) < d) - { - lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; - lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - if(gid == 0 && (i + lid) <= rows) - { - sum[sum_offset + i + lid] = 0; - } - if(i + lid == 0) - { - int loc0 = gid * 2 * sum_step; - for(int k = 1; k <= 8; k++) - { - if(gid * 8 + k > cols) break; - sum[sum_offset + loc0 + k * sum_step / 4] = 0; - } - } - - if(lid > 0 && (i+lid) <= rows) - { - int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; - lm_sum[0][bf_loc] += sum_t[0]; - lm_sum[1][bf_loc] += sum_t[1]; - sum_p = (__local float*)(&(lm_sum[0][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + k >= cols) break; - sum[loc_s0 + k * sum_step / 4] = sum_p[k]; - } - sum_p = (__local float*)(&(lm_sum[1][bf_loc])); - for(int k = 0; k < 4; k++) - { - if(gid * 8 + 4 + k >= cols) break; - sum[loc_s1 + k * sum_step / 4] = sum_p[k]; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } -} diff --git a/modules/imgproc/src/opencl/laplacian.cl b/modules/imgproc/src/opencl/laplacian.cl deleted file mode 100644 index ea22967..0000000 --- a/modules/imgproc/src/opencl/laplacian.cl +++ /dev/null @@ -1,381 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Pang Erping, erping@multicorewareinc.com -// Jia Haipeng, jiahaipeng95@gmail.com -// Peng Xiao, pengxiao@outlook.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////Macro for border type//////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////////////////////////// -#ifdef BORDER_REPLICATE - -//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif - -#ifdef BORDER_REFLECT -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT_101 -//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif - -#ifdef IMG_C_1_0 -#define T_IMG uchar -#define T_IMGx4 uchar4 -#define T_IMG_C1 uchar -#define CONVERT_TYPE convert_uchar_sat -#define CONVERT_TYPEx4 convert_uchar4_sat -#endif -#ifdef IMG_C_4_0 -#define T_IMG uchar4 -#define T_IMGx4 uchar16 -#define T_IMG_C1 uchar -#define CONVERT_TYPE convert_uchar4_sat -#define CONVERT_TYPEx4 convert_uchar16_sat -#endif -#ifdef IMG_C_1_5 -#define T_IMG float -#define T_IMGx4 float4 -#define T_IMG_C1 float -#define CONVERT_TYPE convert_float -#define CONVERT_TYPEx4 convert_float4 -#endif -#ifdef IMG_C_4_5 -#define T_IMG float4 -#define T_IMGx4 float16 -#define T_IMG_C1 float -#define CONVERT_TYPE convert_float4 -#define CONVERT_TYPEx4 convert_float16 -#endif - -#ifndef CN -#define CN 1 -#endif - -#if CN == 1 -#define T_SUM float -#define T_SUMx4 float4 -#define CONVERT_TYPE_SUM convert_float -#define CONVERT_TYPE_SUMx4 convert_float4 -#define SUM_ZERO (0.0f) -#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f) -#define VLOAD4 vload4 -#define SX x -#define SY y -#define SZ z -#define SW w -#elif CN == 4 -#define T_SUM float4 -#define T_SUMx4 float16 -#define CONVERT_TYPE_SUM convert_float4 -#define CONVERT_TYPE_SUMx4 convert_float16 -#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f) -#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) -#define VLOAD4 vload16 -#define SX s0123 -#define SY s4567 -#define SZ s89ab -#define SW scdef -#endif - -#ifndef FILTER_SIZE -#define FILTER_SIZE 3 -#endif - -#define LOCAL_GROUP_SIZE 16 - -#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) -#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) - -#define FILTER_RADIUS (FILTER_SIZE >> 1) - -__kernel void filter2D( - __global T_IMG *src, - __global T_IMG *dst, - int src_step, - int dst_step, - __constant float *mat_kernel, - __local T_IMG *local_data, - int wholerows, - int wholecols, - int src_offset_x, - int src_offset_y, - int dst_offset_x, - int dst_offset_y, - int cols, - int rows, - int operate_cols -) -{ - int groupStartCol = get_group_id(0) * get_local_size(0); - int groupStartRow = get_group_id(1) * get_local_size(1); - - int localCol = get_local_id(0); - int localRow = get_local_id(1); - int globalCol = groupStartCol + localCol; - int globalRow = groupStartRow + localRow; - const int src_offset = mad24(src_offset_y, src_step, src_offset_x); - const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); - -#ifdef BORDER_CONSTANT - for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) - { - int curRow = groupStartRow + i; - for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) - { - int curCol = groupStartCol + j; - if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| - curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) - { - local_data[(i) * LOCAL_WIDTH + j] = 0; - } - else - { - local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; - } - } - } -#else - for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) - { - int curRow = groupStartRow + i; - - curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); - - curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); - - for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) - { - int curCol = groupStartCol + j; - curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); - curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); - if(curRow < wholerows && curCol < wholecols) - { - local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; - } - } - } -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - if(globalRow < rows && globalCol < cols) - { - T_SUM sum = (T_SUM)(SUM_ZERO); - int filterIdx = 0; - for(int i = 0; i < FILTER_SIZE; i++) - { - int offset = (i + localRow) * LOCAL_WIDTH; - - for(int j = 0; j < FILTER_SIZE; j++) - { - sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; - } - } - dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); - } -} - -/// following is specific for 3x3 kernels - -////////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////Macro for define elements number per thread///////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define ANX 1 -#define ANY 1 - -#define ROWS_PER_GROUP 4 -#define ROWS_PER_GROUP_BITS 2 -#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) - -#define THREADS_PER_ROW 64 -#define THREADS_PER_ROW_BIT 6 - -#define ELEMENTS_PER_THREAD 4 -#define ELEMENTS_PER_THREAD_BIT 2 - -#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -__kernel void filter2D_3x3( - __global T_IMG *src, - __global T_IMG *dst, - int src_step, - int dst_step, - __constant float *mat_kernel, - __local T_IMG *local_data, - int wholerows, - int wholecols, - int src_offset_x, - int src_offset_y, - int dst_offset_x, - int dst_offset_y, - int cols, - int rows, - int operate_cols -) -{ - int gX = get_global_id(0); - int gY = get_global_id(1); - - int lX = get_local_id(0); - - int groupX_size = get_local_size(0); - int groupX_id = get_group_id(0); - -#define dst_align (dst_offset_x & 3) - int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; - int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; - - if((gY << 2) < rows) - { - for(int i = 0; i < ROWS_FETCH; ++i) - { - if((rows_start_index - src_offset_y) + i < rows + ANY) - { -#ifdef BORDER_CONSTANT - int selected_row = rows_start_index + i; - int selected_cols = cols_start_index_group + lX; - - T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; - int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; - data = con ? data : (T_IMG)(0); - local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - - data = src[mad24(selected_row, src_step, selected_cols)]; - con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; - data = con ? data : (T_IMG)(0); - local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; - } -#else - int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); - selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); - - int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); - selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); - - T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; - - local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; - - if(lX < (ANX << 1)) - { - selected_cols = cols_start_index_group + lX + groupX_size; - selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); - - data = src[mad24(selected_row, src_step, selected_cols)]; - local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; - } -#endif - } - } - } - barrier(CLK_LOCAL_MEM_FENCE); - - int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); - if(((gY << 2) < rows) && (process_col < operate_cols)) - { - int dst_cols_start = dst_offset_x; - int dst_cols_end = dst_offset_x + cols; - int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; - - int dst_rows_end = dst_offset_y + rows; - int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); - dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index); - - T_IMGx4 dst_data = *(__global T_IMGx4 *)dst; - - T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4; - T_IMGx4 data; - - for(int i = 0; i < FILTER_SIZE; i++) - { -#pragma unroll - for(int j = 0; j < FILTER_SIZE; j++) - { - if(dst_rows_index < dst_rows_end) - { - int local_row = (lX >> THREADS_PER_ROW_BIT) + i; - int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; - - data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols)); - sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data)); - } - } - } - - if(dst_rows_index < dst_rows_end) - { - T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); - tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? - tmp_dst.SX : dst_data.SX; - tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? - tmp_dst.SY : dst_data.SY; - tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? - tmp_dst.SZ : dst_data.SZ; - tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? - tmp_dst.SW : dst_data.SW; - *(__global T_IMGx4 *)dst = tmp_dst; - } - } -} diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl deleted file mode 100644 index 6fc4c74..0000000 --- a/modules/imgproc/src/opencl/match_template.cl +++ /dev/null @@ -1,857 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Peng Xiao, pengxiao@multicorewareinc.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#pragma OPENCL EXTENSION cl_amd_printf : enable - -#if defined (DOUBLE_SUPPORT) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif - -#define TYPE_IMAGE_SQSUM double -#else -#define TYPE_IMAGE_SQSUM float -#endif - -#ifndef CN4 -#define CN4 1 -#else -#define CN4 4 -#endif - -////////////////////////////////////////////////// -// utilities -#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4) -#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) -// normAcc* are accurate normalization routines which make GPU matchTemplate -// consistent with CPU one -float normAcc(float num, float denum) -{ - if(fabs(num) < denum) - { - return num / denum; - } - if(fabs(num) < denum * 1.125f) - { - return num > 0 ? 1 : -1; - } - return 0; -} - -float normAcc_SQDIFF(float num, float denum) -{ - if(fabs(num) < denum) - { - return num / denum; - } - if(fabs(num) < denum * 1.125f) - { - return num > 0 ? 1 : -1; - } - return 1; -} -////////////////////////////////////////////////////////////////////// -// normalize - -__kernel -void normalizeKernel_C1_D0 -( - __global const float * img_sqsums, - __global float * res, - ulong tpl_sqsum, - int res_rows, - int res_cols, - int tpl_rows, - int tpl_cols, - int img_sqsums_offset, - int img_sqsums_step, - int res_offset, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - img_sqsums_step /= sizeof(*img_sqsums); - img_sqsums_offset /= sizeof(*img_sqsums); - int res_idx = mad24(gidy, res_step, res_offset + gidx); - if(gidx < res_cols && gidy < res_rows) - { - float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); - res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); - } -} - -__kernel -void matchTemplate_Prepared_SQDIFF_C1_D0 -( - __global const TYPE_IMAGE_SQSUM * img_sqsums, - __global float * res, - ulong tpl_sqsum, - int res_rows, - int res_cols, - int tpl_rows, - int tpl_cols, - int img_sqsums_offset, - int img_sqsums_step, - int res_offset, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - img_sqsums_step /= sizeof(*img_sqsums); - img_sqsums_offset /= sizeof(*img_sqsums); - int res_idx = mad24(gidy, res_step, res_offset + gidx); - if(gidx < res_cols && gidy < res_rows) - { - float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); - res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; - } -} - -__kernel -void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 -( - __global const float * img_sqsums, - __global float * res, - ulong tpl_sqsum, - int res_rows, - int res_cols, - int tpl_rows, - int tpl_cols, - int img_sqsums_offset, - int img_sqsums_step, - int res_offset, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - img_sqsums_step /= sizeof(*img_sqsums); - img_sqsums_offset /= sizeof(*img_sqsums); - int res_idx = mad24(gidy, res_step, res_offset + gidx); - if(gidx < res_cols && gidy < res_rows) - { - float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); - res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, - sqrt(image_sqsum_ * tpl_sqsum)); - } -} - -////////////////////////////////////////////////// -// SQDIFF -__kernel -void matchTemplate_Naive_SQDIFF_C1_D0 -( - __global const uchar * img, - __global const uchar * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - int delta; - int sum = 0; - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - delta = img_ptr[j] - tpl_ptr[j]; - sum = mad24(delta, delta, sum); - } - } - res[res_idx] = sum; - } -} - -__kernel -void matchTemplate_Naive_SQDIFF_C1_D5 -( - __global const float * img, - __global const float * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - float delta; - float sum = 0; - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - delta = img_ptr[j] - tpl_ptr[j]; - sum = mad(delta, delta, sum); - } - } - res[res_idx] = sum; - } -} - -__kernel -void matchTemplate_Naive_SQDIFF_C4_D0 -( - __global const uchar4 * img, - __global const uchar4 * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - int4 delta; - int4 sum = (int4)(0, 0, 0, 0); - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect - delta.x = img_ptr[j].x - tpl_ptr[j].x; - delta.y = img_ptr[j].y - tpl_ptr[j].y; - delta.z = img_ptr[j].z - tpl_ptr[j].z; - delta.w = img_ptr[j].w - tpl_ptr[j].w; - sum = mad24(delta, delta, sum); - } - } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; - } -} - -__kernel -void matchTemplate_Naive_SQDIFF_C4_D5 -( - __global const float4 * img, - __global const float4 * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - float4 delta; - float4 sum = (float4)(0, 0, 0, 0); - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - //delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect - delta.x = img_ptr[j].x - tpl_ptr[j].x; - delta.y = img_ptr[j].y - tpl_ptr[j].y; - delta.z = img_ptr[j].z - tpl_ptr[j].z; - delta.w = img_ptr[j].w - tpl_ptr[j].w; - sum = mad(delta, delta, sum); - } - } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; - } -} - -////////////////////////////////////////////////// -// CCORR -__kernel -void matchTemplate_Naive_CCORR_C1_D0 -( - __global const uchar * img, - __global const uchar * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - int sum = 0; - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); - } - } - res[res_idx] = (float)sum; - } -} - -__kernel -void matchTemplate_Naive_CCORR_C1_D5 -( - __global const float * img, - __global const float * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - float sum = 0; - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - sum = mad(img_ptr[j], tpl_ptr[j], sum); - } - } - res[res_idx] = sum; - } -} - -__kernel -void matchTemplate_Naive_CCORR_C4_D0 -( - __global const uchar4 * img, - __global const uchar4 * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - int4 sum = (int4)(0, 0, 0, 0); - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); - } - } - res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); - } -} - -__kernel -void matchTemplate_Naive_CCORR_C4_D5 -( - __global const float4 * img, - __global const float4 * tpl, - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int img_offset, - int tpl_offset, - int res_offset, - int img_step, - int tpl_step, - int res_step -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int i,j; - float4 sum = (float4)(0, 0, 0, 0); - img_step /= sizeof(*img); - img_offset /= sizeof(*img); - tpl_step /= sizeof(*tpl); - tpl_offset /= sizeof(*tpl); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - for(i = 0; i < tpl_rows; i ++) - { - // get specific rows of img data - __global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); - __global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); - for(j = 0; j < tpl_cols; j ++) - { - sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum); - } - } - res[res_idx] = sum.x + sum.y + sum.z + sum.w; - } -} - -////////////////////////////////////////////////// -// CCOFF -__kernel -void matchTemplate_Prepared_CCOFF_C1_D0 -( - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int res_offset, - int res_step, - __global const uint * img_sums, - int img_sums_offset, - int img_sums_step, - float tpl_sum -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - img_sums_offset /= sizeof(*img_sums); - img_sums_step /= sizeof(*img_sums); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - -(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); - res[res_idx] -= sum * tpl_sum; - } -} -__kernel -void matchTemplate_Prepared_CCOFF_C4_D0 -( - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int res_offset, - int res_step, - __global const uint * img_sums_c0, - __global const uint * img_sums_c1, - __global const uint * img_sums_c2, - __global const uint * img_sums_c3, - int img_sums_offset, - int img_sums_step, - float tpl_sum_c0, - float tpl_sum_c1, - float tpl_sum_c2, - float tpl_sum_c3 -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - img_sums_offset /= sizeof(*img_sums_c0); - img_sums_step /= sizeof(*img_sums_c0); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - float ccorr = res[res_idx]; - ccorr -= tpl_sum_c0*(float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); - ccorr -= tpl_sum_c1*(float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); - ccorr -= tpl_sum_c2*(float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); - ccorr -= tpl_sum_c3*(float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); - res[res_idx] = ccorr; - } -} - -__kernel -void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 -( - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int res_offset, - int res_step, - float weight, - __global const uint * img_sums, - int img_sums_offset, - int img_sums_step, - __global const float * img_sqsums, - int img_sqsums_offset, - int img_sqsums_step, - float tpl_sum, - float tpl_sqsum -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - img_sqsums_step /= sizeof(*img_sqsums); - img_sqsums_offset /= sizeof(*img_sqsums); - img_sums_offset /= sizeof(*img_sums); - img_sums_step /= sizeof(*img_sums); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - float image_sum_ = (float)( - (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) - - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); - - float image_sqsum_ = (float)( - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); - res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, - sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); - } -} -__kernel -void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 -( - __global float * res, - int img_rows, - int img_cols, - int tpl_rows, - int tpl_cols, - int res_rows, - int res_cols, - int res_offset, - int res_step, - float weight, - __global const uint * img_sums_c0, - __global const uint * img_sums_c1, - __global const uint * img_sums_c2, - __global const uint * img_sums_c3, - int img_sums_offset, - int img_sums_step, - __global const float * img_sqsums_c0, - __global const float * img_sqsums_c1, - __global const float * img_sqsums_c2, - __global const float * img_sqsums_c3, - int img_sqsums_offset, - int img_sqsums_step, - float tpl_sum_c0, - float tpl_sum_c1, - float tpl_sum_c2, - float tpl_sum_c3, - float tpl_sqsum -) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - img_sqsums_step /= sizeof(*img_sqsums_c0); - img_sqsums_offset /= sizeof(*img_sqsums_c0); - img_sums_offset /= sizeof(*img_sums_c0); - img_sums_step /= sizeof(*img_sums_c0); - res_step /= sizeof(*res); - res_offset /= sizeof(*res); - - int res_idx = mad24(gidy, res_step, res_offset + gidx); - - if(gidx < res_cols && gidy < res_rows) - { - float image_sum_c0 = (float)( - (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); - float image_sum_c1 = (float)( - (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); - float image_sum_c2 = (float)( - (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); - float image_sum_c3 = (float)( - (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) - - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); - - float image_sqsum_c0 = (float)( - (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); - float image_sqsum_c1 = (float)( - (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); - float image_sqsum_c2 = (float)( - (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); - float image_sqsum_c3 = (float)( - (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - - (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); - - float num = res[res_idx] - - image_sum_c0 * tpl_sum_c0 - - image_sum_c1 * tpl_sum_c1 - - image_sum_c2 * tpl_sum_c2 - - image_sum_c3 * tpl_sum_c3; - float denum = sqrt( tpl_sqsum * ( - image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + - image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + - image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + - image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) - ); - res[res_idx] = normAcc(num, denum); - } -} - -////////////////////////////////////////////////////////////////////// -// extractFirstChannel -__kernel -void extractFirstChannel -( - const __global float4* img, - __global float* res, - int rows, - int cols, - int img_offset, - int res_offset, - int img_step, - int res_step -) -{ - img_step /= sizeof(float4); - res_step /= sizeof(float); - img_offset /= sizeof(float4); - res_offset /= sizeof(float); - img += img_offset; - res += res_offset; - int gidx = get_global_id(0); - int gidy = get_global_id(1); - if(gidx < cols && gidy < rows) - { - res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x; - } -} diff --git a/modules/imgproc/src/opencl/median.cl b/modules/imgproc/src/opencl/median.cl deleted file mode 100644 index ccb5299..0000000 --- a/modules/imgproc/src/opencl/median.cl +++ /dev/null @@ -1,486 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Zero Lin, zero.lin@amd.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// - - -/* -__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, - int rows, int srcStep, int dstStep, int m) -{ - int dx = get_global_id(0)-(m>>1); - int dy = get_global_id(1)-(m>>1); - - short histom[256]; - for(int i=0;i<256;++i) - histom[i]=0; - - - for(int i=0;i>1; - int v; - for(int i=0;i<256;++i) - { - v=(now= (r_edge) ? (r_edge)-1 : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) -#endif - -#ifdef BORDER_REFLECT -//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_REFLECT101 -//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) -#endif - -#ifdef BORDER_WRAP -//BORDER_WRAP: cdefgh|abcdefgh|abcdefg -#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) -#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) -#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) -#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) -#endif - -#define THREADS 256 -#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2) -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////calcHarris//////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst, - int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, - int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float k) -{ - int col = get_local_id(0); - const int gX = get_group_id(0); - const int gY = get_group_id(1); - const int glx = get_global_id(0); - const int gly = get_global_id(1); - - int dx_x_off = (dx_offset % dx_step) >> 2; - int dx_y_off = dx_offset / dx_step; - int dy_x_off = (dy_offset % dy_step) >> 2; - int dy_y_off = dy_offset / dy_step; - int dst_x_off = (dst_offset % dst_step) >> 2; - int dst_y_off = dst_offset / dst_step; - - int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; - int dx_startY = (gY << 1) - anY + dx_y_off; - int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; - int dy_startY = (gY << 1) - anY + dy_y_off; - int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; - int dst_startY = (gY << 1) + dst_y_off; - - float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; - __local float temp[6][THREADS]; -#ifdef BORDER_CONSTANT - bool dx_con,dy_con; - float dx_s,dy_s; - for(int i=0; i < ksY+1; i++) - { - dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; - dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; - dx_data[i] = dx_con ? dx_s : 0.0; - dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; - dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; - dy_data[i] = dy_con ? dy_s : 0.0; - data[0][i] = dx_data[i] * dx_data[i]; - data[1][i] = dx_data[i] * dy_data[i]; - data[2][i] = dy_data[i] * dy_data[i]; - } -#else - int clamped_col = min(dst_cols, col); - - for(int i=0; i < ksY+1; i++) - { - int dx_selected_row; - int dx_selected_col; - dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); - dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); - dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); - dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); - dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; - - int dy_selected_row; - int dy_selected_col; - dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); - dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); - dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); - dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); - dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; - - data[0][i] = dx_data[i] * dx_data[i]; - data[1][i] = dx_data[i] * dy_data[i]; - data[2][i] = dy_data[i] * dy_data[i]; - } -#endif - float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; - for(int i=1; i < ksY; i++) - { - sum0 += (data[0][i]); - sum1 += (data[1][i]); - sum2 += (data[2][i]); - } - float sum01,sum02,sum11,sum12,sum21,sum22; - sum01 = sum0 + (data[0][0]); - sum02 = sum0 + (data[0][ksY]); - temp[0][col] = sum01; - temp[1][col] = sum02; - sum11 = sum1 + (data[1][0]); - sum12 = sum1 + (data[1][ksY]); - temp[2][col] = sum11; - temp[3][col] = sum12; - sum21 = sum2 + (data[2][0]); - sum22 = sum2 + (data[2][ksY]); - temp[4][col] = sum21; - temp[5][col] = sum22; - barrier(CLK_LOCAL_MEM_FENCE); - if(col < (THREADS-(ksX-1))) - { - col += anX; - int posX = dst_startX - dst_x_off + col - anX; - int posY = (gly << 1); - int till = (ksX + 1)%2; - float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; - for(int k=0; k<6; k++) - for(int i=-anX; i<=anX - till; i++) - { - tmp_sum[k] += temp[k][col+i]; - } - - if(posX < dst_cols && (posY) < dst_rows) - { - float a = tmp_sum[0] * 0.5f; - float b = tmp_sum[2]; - float c = tmp_sum[4] * 0.5f; - dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); - } - if(posX < dst_cols && (posY + 1) < dst_rows) - { - float a = tmp_sum[1] * 0.5f; - float b = tmp_sum[3]; - float c = tmp_sum[5] * 0.5f; - dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); - } - } -} diff --git a/modules/imgproc/src/opencl/moments.cl b/modules/imgproc/src/opencl/moments.cl deleted file mode 100644 index d61b8d5..0000000 --- a/modules/imgproc/src/opencl/moments.cl +++ /dev/null @@ -1,980 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Sen Liu, swjtuls1987@126.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -typedef double T; -typedef double F; -typedef double4 F4; -#define convert_F4 convert_double4 - -#else -typedef float F; -typedef float4 F4; -typedef long T; -#define convert_F4 convert_float4 -#endif - -#define DST_ROW_00 0 -#define DST_ROW_10 1 -#define DST_ROW_01 2 -#define DST_ROW_20 3 -#define DST_ROW_11 4 -#define DST_ROW_02 5 -#define DST_ROW_30 6 -#define DST_ROW_21 7 -#define DST_ROW_12 8 -#define DST_ROW_03 9 - -__kernel void icvContourMoments(int contour_total, - __global float* reader_oclmat_data, - __global T* dst_a, - int dst_step) -{ - T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1; - int idx = get_global_id(0); - - if (idx < 0 || idx >= contour_total) - return; - - xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1))); - yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1)); - xi_12 = xi_1 * xi_1; - yi_12 = yi_1 * yi_1; - - if(idx == contour_total - 1) - { - xi = (T)(*(reader_oclmat_data)); - yi = (T)(*(reader_oclmat_data + 1)); - } - else - { - xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); - yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); - } - - xi2 = xi * xi; - yi2 = yi * yi; - dxy = xi_1 * yi - xi * yi_1; - xii_1 = xi_1 + xi; - yii_1 = yi_1 + yi; - - dst_step /= sizeof(T); - *( dst_a + DST_ROW_00 * dst_step + idx) = dxy; - *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; - *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; - *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); - *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); - *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); - *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); - *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); - *( dst_a + DST_ROW_21 * dst_step + idx) = - dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + - xi2 * (yi_1 + 3 * yi)); - *( dst_a + DST_ROW_12 * dst_step + idx) = - dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + - yi2 * (xi_1 + 3 * xi)); -} - -__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, - __global F* sum, __global F* dst_m, int dst_step) -{ - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int block_y = src_rows/tile_height; - int block_x = src_cols/tile_width; - int block_num; - - if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) - block_y ++; - if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) - block_x ++; - block_num = block_y * block_x; - __local F dst_sum[10][128]; - if(gidy<128-block_num) - for(int i=0; i<10; i++) - dst_sum[i][gidy+block_num]=0; - barrier(CLK_LOCAL_MEM_FENCE); - - dst_step /= sizeof(F); - if(gidy0; lsize>>=1) - { - if(gidy 0 ) //channel of interest - for(int i = 0; i < tileSize_width; i += VLEN_C) - { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((int)py) * sy; // m03 - m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((int)py) * sy; // m03 - lm[8] = ((int)x1.s0) * sy; // m12 - lm[7] = ((int)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = bheight; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - ushort tmp_coi[8]; // get the coi data - ushort8 tmp[32]; - int VLEN_US = 8; // vector length of ushort - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) - { - if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) - for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_US) - { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)x1.s0) * sy; // m12 - lm[7] = ((long)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidy == 0&&lidx == 0) - { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x *mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - short tmp_coi[8]; // get the coi data - short8 tmp[32]; - int VLEN_S =8; // vector length of short - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/2, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_S) - { - for(int j=0; j= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((long)py) * sy; // m03 - m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((long)py) * sy; // m03 - lm[8] = ((long)(x1.s0)) * sy; // m12 - lm[7] = ((long)(x2.s0)) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >=1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy ==0 &&lidx ==0) - { - for(int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y*mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - float tmp_coi[4]; // get the coi data - float4 tmp[64] ; - int VLEN_F = 4; // vector length of float - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/4, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols -x); - int maxIdx = mul24(src_rows, src_cols); - int yOff = (y+lidy)*src_step; - int index; - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_F) - { - for(int j=0; j<4; j++) - tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); - tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); - } - else - for(int i=0; i < tileSize_width; i+=VLEN_F) - tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); - } - - float4 zero = (float4)(0); - float4 full = (float4)(255); - if( binary ) - for(int i=0; i < tileSize_width; i+=4) - tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; - F mom[10]; - __local F m[10][128]; - if(lidy < 128) - for(int i = 0; i < 10; i ++) - m[i][lidy] = 0; - barrier(CLK_LOCAL_MEM_FENCE); - F lm[10] = {0}; - F4 x0 = (F4)(0); - F4 x1 = (F4)(0); - F4 x2 = (F4)(0); - F4 x3 = (F4)(0); - for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) - { - F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); - F4 p = convert_F4(tmp[xt/VLEN_F]); - F4 xp = v_xt * p, xxp = xp * v_xt; - x0 += p; - x1 += xp; - x2 += xxp; - x3 += xxp * v_xt; - } - x0.s0 += x0.s1 + x0.s2 + x0.s3; - x1.s0 += x1.s1 + x1.s2 + x1.s3; - x2.s0 += x2.s1 + x2.s2 + x2.s3; - x3.s0 += x3.s1 + x3.s2 + x3.s3; - - F py = lidy * x0.s0, sy = lidy*lidy; - int bheight = min(tileSize_height, TILE_SIZE/2); - if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} - -__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, - __global F* dst_m, - int dst_cols, int dst_step, int blocky, - int depth, int cn, int coi, int binary, const int TILE_SIZE) -{ - F tmp_coi[4]; // get the coi data - F4 tmp[64]; - int VLEN_D = 4; // length of vetor - int gidy = get_global_id(0); - int gidx = get_global_id(1); - int wgidy = get_group_id(0); - int wgidx = get_group_id(1); - int lidy = get_local_id(0); - int lidx = get_local_id(1); - int y = wgidy*TILE_SIZE; // real Y index of pixel - int x = wgidx*TILE_SIZE; // real X index of pixel - int kcn = (cn==2)?2:4; - int rstep = min(src_step/8, TILE_SIZE); - int tileSize_height = min(TILE_SIZE, src_rows - y); - int tileSize_width = min(TILE_SIZE, src_cols - x); - - if ( y+lidy < src_rows ) - { - if(tileSize_width < TILE_SIZE) - for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) - *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; - if( coi > 0 ) - for(int i=0; i < tileSize_width; i+=VLEN_D) - { - for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) - { - m[9][lidy-bheight] = ((F)py) * sy; // m03 - m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 - m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 - m[6][lidy-bheight] = x3.s0; // m30 - m[5][lidy-bheight] = x0.s0 * sy; // m02 - m[4][lidy-bheight] = x1.s0 * lidy; // m11 - m[3][lidy-bheight] = x2.s0; // m20 - m[2][lidy-bheight] = py; // m01 - m[1][lidy-bheight] = x1.s0; // m10 - m[0][lidy-bheight] = x0.s0; // m00 - } - else if(lidy < bheight) - { - lm[9] = ((F)py) * sy; // m03 - lm[8] = ((F)x1.s0) * sy; // m12 - lm[7] = ((F)x2.s0) * lidy; // m21 - lm[6] = x3.s0; // m30 - lm[5] = x0.s0 * sy; // m02 - lm[4] = x1.s0 * lidy; // m11 - lm[3] = x2.s0; // m20 - lm[2] = py; // m01 - lm[1] = x1.s0; // m10 - lm[0] = x0.s0; // m00 - } - barrier(CLK_LOCAL_MEM_FENCE); - - for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) - { - if(lidy < j) - for( int i = 0; i < 10; i++ ) - lm[i] = lm[i] + m[i][lidy]; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidy >= j/2&&lidy < j) - for( int i = 0; i < 10; i++ ) - m[i][lidy-j/2] = lm[i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lidy == 0&&lidx == 0) - { - for( int mt = 0; mt < 10; mt++ ) - mom[mt] = (F)lm[mt]; - if(binary) - { - F s = 1./255; - for( int mt = 0; mt < 10; mt++ ) - mom[mt] *= s; - } - - F xm = x * mom[0], ym = y * mom[0]; - - // accumulate moments computed in each tile - dst_step /= sizeof(F); - - // + m00 ( = m00' ) - *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; - - // + m10 ( = m10' + x*m00' ) - *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; - - // + m01 ( = m01' + y*m00' ) - *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; - - // + m20 ( = m20' + 2*x*m10' + x*x*m00' ) - *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); - - // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) - *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; - - // + m02 ( = m02' + 2*y*m01' + y*y*m00' ) - *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); - - // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) - *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); - - // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') - *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; - - // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') - *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; - - // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) - *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); - } -} diff --git a/modules/imgproc/src/opencl/morph.cl b/modules/imgproc/src/opencl/morph.cl deleted file mode 100644 index c402ff7..0000000 --- a/modules/imgproc/src/opencl/morph.cl +++ /dev/null @@ -1,228 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Zero Lin, zero.lin@amd.com -// Yao Wang, bitwangyaoyao@gmail.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// - - -#ifdef ERODE -#define MORPH_OP(A,B) min((A),(B)) -#endif -#ifdef DILATE -#define MORPH_OP(A,B) max((A),(B)) -#endif -//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii -#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) -#ifndef GENTYPE - -__kernel void morph_C1_D0(__global const uchar * restrict src, - __global uchar *dst, - int src_offset_x, int src_offset_y, - int cols, int rows, - int src_step_in_pixel, int dst_step_in_pixel, - __constant uchar * mat_kernel, - int src_whole_cols, int src_whole_rows, - int dst_offset_in_pixel) -{ - int l_x = get_local_id(0); - int l_y = get_local_id(1); - int x = get_group_id(0)*4*LSIZE0; - int y = get_group_id(1)*LSIZE1; - int start_x = x+src_offset_x-RADIUSX & 0xfffffffc; - int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc; - int width = (end_x -start_x+4)>>2; - int offset = src_offset_x-RADIUSX & 3; - int start_y = y+src_offset_y-RADIUSY; - int point1 = mad24(l_y,LSIZE0,l_x); - int point2 = point1 + LSIZE0*LSIZE1; - int tl_x = (point1 % width)<<2; - int tl_y = point1 / width; - int tl_x2 = (point2 % width)<<2; - int tl_y2 = point2 / width; - int cur_x = start_x + tl_x; - int cur_y = start_y + tl_y; - int cur_x2 = start_x + tl_x2; - int cur_y2 = start_y + tl_y2; - int start_addr = mad24(cur_y,src_step_in_pixel,cur_x); - int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2); - uchar4 temp0,temp1; - __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0]; - - int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); - //read pixels from src - start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0; - start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; - temp0 = *(__global uchar4*)&src[start_addr]; - temp1 = *(__global uchar4*)&src[start_addr2]; - //judge if read out of boundary - temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x); - temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y); - temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z); - temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w); - temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0); - - temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x); - temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y); - temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z); - temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w); - temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1); - - LDS_DAT[point1] = temp0; - LDS_DAT[point2] = temp1; - barrier(CLK_LOCAL_MEM_FENCE); - uchar4 res = (uchar4)VAL; - - for(int i=0; i<2*RADIUSY+1; i++) - for(int j=0; j<2*RADIUSX+1; j++) - { - res = -#ifndef RECTKERNEL - mat_kernel[i*(2*RADIUSX+1)+j] ? -#endif - MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)) -#ifndef RECTKERNEL - :res -#endif - ; - } - - int gidx = get_global_id(0)<<2; - int gidy = get_global_id(1); - int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); - - if(gidx+3 0)) ? start_addr : 0; - start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; - temp0 = src[start_addr]; - temp1 = src[start_addr2]; - //judge if read out of boundary - temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0); - temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0); - - temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1); - temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1); - - LDS_DAT[point1] = temp0; - LDS_DAT[point2] = temp1; - barrier(CLK_LOCAL_MEM_FENCE); - GENTYPE res = (GENTYPE)VAL; - for(int i=0; i<2*RADIUSY+1; i++) - for(int j=0; j<2*RADIUSX+1; j++) - { - res = -#ifndef RECTKERNEL - mat_kernel[i*(2*RADIUSX+1)+j] ? -#endif - MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]) -#ifndef RECTKERNEL - :res -#endif - ; - } - int gidx = get_global_id(0); - int gidy = get_global_id(1); - int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); - if(gidx= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[x]); - sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[x]); - sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[x]); - sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[x]); - sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[x]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[left_x]); - sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[left_x]); - sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[left_x]); - sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[left_x]); - sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[left_x]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[right_x]); - sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[right_x]); - sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[right_x]); - sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[right_x]); - sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[right_x]); - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); - sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); - sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); - sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); - sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]); - sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]); - sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]); - sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]); - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = 0.0625f * smem[2 + tid2 - 2]; - sum = sum + 0.25f * smem[2 + tid2 - 1]; - sum = sum + 0.375f * smem[2 + tid2 ]; - sum = sum + 0.25f * smem[2 + tid2 + 1]; - sum = sum + 0.0625f * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_8UC4 /////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float4 smem[256 + 4]; - - float4 sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - float4 co1 = 0.375f; - float4 co2 = 0.25f; - float4 co3 = 0.0625f; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x])); - sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x])); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x])); - sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x])); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x])); - sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x])); - sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x])); - sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x])); - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); - sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); - sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col])); - sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col])); - sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col])); - sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col])); - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; - sum = sum + co3 * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16UC1 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float smem[256 + 4]; - - float sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = 0.0625f * smem[2 + tid2 - 2]; - sum = sum + 0.25f * smem[2 + tid2 - 1]; - sum = sum + 0.375f * smem[2 + tid2 ]; - sum = sum + 0.25f * smem[2 + tid2 + 1]; - sum = sum + 0.0625f * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16UC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float4 smem[256 + 4]; - - float4 sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - float4 co1 = 0.375f; - float4 co2 = 0.25f; - float4 co3 = 0.0625f; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; - sum = sum + co3 * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16SC1 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float smem[256 + 4]; - - float sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = 0.0625f * smem[2 + tid2 - 2]; - sum = sum + 0.25f * smem[2 + tid2 - 1]; - sum = sum + 0.375f * smem[2 + tid2 ]; - sum = sum + 0.25f * smem[2 + tid2 + 1]; - sum = sum + 0.0625f * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_16SC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float4 smem[256 + 4]; - - float4 sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - float4 co1 = 0.375f; - float4 co2 = 0.25f; - float4 co3 = 0.0625f; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]); - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]); - sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]); - sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]); - sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]); - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; - sum = sum + co3 * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum); - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_32FC1 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float smem[256 + 4]; - - float sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[x]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[x]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[x]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[left_x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[right_x]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x]; - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col]; - sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col]; - sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col]; - sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col]; - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = 0.0625f * smem[2 + tid2 - 2]; - sum = sum + 0.25f * smem[2 + tid2 - 1]; - sum = sum + 0.375f * smem[2 + tid2 ]; - sum = sum + 0.25f * smem[2 + tid2 + 1]; - sum = sum + 0.0625f * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 4 + dst_x] = sum; - } -} - -/////////////////////////////////////////////////////////////////////// -////////////////////////// CV_32FC4 ////////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols) -{ - const int x = get_global_id(0); - const int y = get_group_id(1); - - __local float4 smem[256 + 4]; - - float4 sum; - - const int src_y = 2*y; - const int last_row = srcRows - 1; - const int last_col = srcCols - 1; - - float4 co1 = 0.375f; - float4 co2 = 0.25f; - float4 co3 = 0.0625f; - - if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2) - { - sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]; - - smem[4 + get_local_id(0)] = sum; - } - } - else - { - int col = idx_col(x, last_col); - - sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; - - smem[2 + get_local_id(0)] = sum; - - if (get_local_id(0) < 2) - { - const int left_x = x - 2; - - col = idx_col(left_x, last_col); - - sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; - - smem[get_local_id(0)] = sum; - } - - if (get_local_id(0) > 253) - { - const int right_x = x + 2; - - col = idx_col(right_x, last_col); - - sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]; - sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]; - sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]; - sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]; - - smem[4 + get_local_id(0)] = sum; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (get_local_id(0) < 128) - { - const int tid2 = get_local_id(0) * 2; - - sum = co3 * smem[2 + tid2 - 2]; - sum = sum + co2 * smem[2 + tid2 - 1]; - sum = sum + co1 * smem[2 + tid2 ]; - sum = sum + co2 * smem[2 + tid2 + 1]; - sum = sum + co3 * smem[2 + tid2 + 2]; - - const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2; - - if (dst_x < dstCols) - dst[y * dstStep / 16 + dst_x] = sum; - } -} diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl deleted file mode 100644 index d545497..0000000 --- a/modules/imgproc/src/opencl/remap.cl +++ /dev/null @@ -1,323 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Wu Zailong, bullet@yeah.net -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -#endif - -#ifdef INTER_NEAREST -#define convertToWT -#endif - -#ifdef BORDER_CONSTANT -#define EXTRAPOLATE(v2, v) v = scalar; -#elif defined BORDER_REPLICATE -#define EXTRAPOLATE(v2, v) \ - { \ - v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \ - v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ - } -#elif defined BORDER_WRAP -#define EXTRAPOLATE(v2, v) \ - { \ - if (v2.x < 0) \ - v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \ - if (v2.x >= src_cols) \ - v2.x %= src_cols; \ - \ - if (v2.y < 0) \ - v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \ - if( v2.y >= src_rows ) \ - v2.y %= src_rows; \ - v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ - } -#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) -#ifdef BORDER_REFLECT -#define DELTA int delta = 0 -#else -#define DELTA int delta = 1 -#endif -#define EXTRAPOLATE(v2, v) \ - { \ - DELTA; \ - if (src_cols == 1) \ - v2.x = 0; \ - else \ - do \ - { \ - if( v2.x < 0 ) \ - v2.x = -v2.x - 1 + delta; \ - else \ - v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \ - } \ - while (v2.x >= src_cols || v2.x < 0); \ - \ - if (src_rows == 1) \ - v2.y = 0; \ - else \ - do \ - { \ - if( v2.y < 0 ) \ - v2.y = -v2.y - 1 + delta; \ - else \ - v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \ - } \ - while (v2.y >= src_rows || v2.y < 0); \ - v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ - } -#else -#error No extrapolation method -#endif - -#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) - -#ifdef INTER_NEAREST - -__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst, - __global float * map1, __global float * map2, - int src_offset, int dst_offset, int map1_offset, int map2_offset, - int src_step, int dst_step, int map1_step, int map2_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int dstIdx = mad24(y, dst_step, x + dst_offset); - int map1Idx = mad24(y, map1_step, x + map1_offset); - int map2Idx = mad24(y, map2_step, x + map2_offset); - - int gx = convert_int_sat_rte(map1[map1Idx]); - int gy = convert_int_sat_rte(map2[map2Idx]); - - if (NEED_EXTRAPOLATION(gx, gy)) - { - int2 gxy = (int2)(gx, gy), zero = (int2)(0); - EXTRAPOLATE(gxy, dst[dstIdx]); - } - else - { - int srcIdx = mad24(gy, src_step, gx + src_offset); - dst[dstIdx] = src[srcIdx]; - } - } -} - -__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1, - int src_offset, int dst_offset, int map1_offset, - int src_step, int dst_step, int map1_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int dstIdx = mad24(y, dst_step, x + dst_offset); - int map1Idx = mad24(y, map1_step, x + map1_offset); - - int2 gxy = convert_int2_sat_rte(map1[map1Idx]); - int gx = gxy.x, gy = gxy.y; - - if (NEED_EXTRAPOLATION(gx, gy)) - { - int2 zero = (int2)(0); - EXTRAPOLATE(gxy, dst[dstIdx]); - } - else - { - int srcIdx = mad24(gy, src_step, gx + src_offset); - dst[dstIdx] = src[srcIdx]; - } - } -} - -__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1, - int src_offset, int dst_offset, int map1_offset, - int src_step, int dst_step, int map1_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int dstIdx = mad24(y, dst_step, x + dst_offset); - int map1Idx = mad24(y, map1_step, x + map1_offset); - - int2 gxy = convert_int2(map1[map1Idx]); - int gx = gxy.x, gy = gxy.y; - - if (NEED_EXTRAPOLATION(gx, gy)) - { - int2 zero = (int2)(0); - EXTRAPOLATE(gxy, dst[dstIdx]); - } - else - { - int srcIdx = mad24(gy, src_step, gx + src_offset); - dst[dstIdx] = src[srcIdx]; - } - } -} - -#elif INTER_LINEAR - -__kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst, - __global float * map1, __global float * map2, - int src_offset, int dst_offset, int map1_offset, int map2_offset, - int src_step, int dst_step, int map1_step, int map2_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int dstIdx = mad24(y, dst_step, x + dst_offset); - int map1Idx = mad24(y, map1_step, x + map1_offset); - int map2Idx = mad24(y, map2_step, x + map2_offset); - - float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]); - - int2 map_dataA = convert_int2_sat_rtn(map_data); - int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); - int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); - int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); - int2 zero = (int2)(0); - - float2 _u = map_data - convert_float2(map_dataA); - WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; - WT scalar = convertToWT(nVal); - WT a = scalar, b = scalar, c = scalar, d = scalar; - - if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); - else - EXTRAPOLATE(map_dataA, a); - - if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); - else - EXTRAPOLATE(map_dataB, b); - - if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); - else - EXTRAPOLATE(map_dataC, c); - - if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]); - else - EXTRAPOLATE(map_dataD, d); - - WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + - b * (WT)(u.x) * (WT)(1 - u.y) + - c * (WT)(1 - u.x) * (WT)(u.y) + - d * (WT)(u.x) * (WT)(u.y); - dst[dstIdx] = convertToT(dst_data); - } -} - -__kernel void remap_32FC2(__global T const * restrict src, __global T * dst, - __global float2 * map1, - int src_offset, int dst_offset, int map1_offset, - int src_step, int dst_step, int map1_step, - int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < dst_cols && y < dst_rows) - { - int dstIdx = mad24(y, dst_step, x + dst_offset); - int map1Idx = mad24(y, map1_step, x + map1_offset); - - float2 map_data = map1[map1Idx]; - int2 map_dataA = convert_int2_sat_rtn(map_data); - int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); - int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); - int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); - int2 zero = (int2)(0); - - float2 _u = map_data - convert_float2(map_dataA); - WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; - WT scalar = convertToWT(nVal); - WT a = scalar, b = scalar, c = scalar, d = scalar; - - if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); - else - EXTRAPOLATE(map_dataA, a); - - if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); - else - EXTRAPOLATE(map_dataB, b); - - if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); - else - EXTRAPOLATE(map_dataC, c); - - if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]); - else - EXTRAPOLATE(map_dataD, d); - - WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + - b * (WT)(u.x) * (WT)(1 - u.y) + - c * (WT)(1 - u.x) * (WT)(u.y) + - d * (WT)(u.x) * (WT)(u.y); - dst[dstIdx] = convertToT(dst_data); - } -} - -#endif diff --git a/modules/imgproc/src/opencl/threshold.cl b/modules/imgproc/src/opencl/threshold.cl deleted file mode 100644 index 8d7c77e..0000000 --- a/modules/imgproc/src/opencl/threshold.cl +++ /dev/null @@ -1,152 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Zhang Ying, zhangying913@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#if defined (DOUBLE_SUPPORT) -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#endif - -// threshold type: -// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3, -// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 }; - -__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst, - int src_offset, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - uchar thresh, uchar max_val, int thresh_type - ) -{ - int gx = get_global_id(0); - const int gy = get_global_id(1); - - int offset = (dst_offset & 15); - src_offset -= offset; - - int dstart = (gx << 4) - offset; - if(dstart < dst_cols && gy < dst_rows) - { - uchar16 sdata = vload16(gx, src+src_offset+gy*src_step); - uchar16 ddata; - uchar16 zero = 0; - switch (thresh_type) - { - case 0: - ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0); - break; - case 1: - ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val); - break; - case 2: - ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata; - break; - case 3: - ddata = ((sdata > thresh)) ? sdata : zero; - break; - case 4: - ddata = ((sdata > thresh)) ? zero : sdata; - break; - default: - ddata = sdata; - } - int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8, - dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15); - uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart); - int16 con = dpos >= 0 && dpos < dst_cols; - ddata = convert_uchar16(con != 0) ? ddata : dVal; - if(dstart < dst_cols) - { - *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata; - } - } -} - - -__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst, - int src_offset, int src_step, - int dst_offset, int dst_rows, int dst_cols, int dst_step, - float thresh, float max_val, int thresh_type - ) -{ - const int gx = get_global_id(0); - const int gy = get_global_id(1); - - int offset = (dst_offset & 3); - src_offset -= offset; - - int dstart = (gx << 2) - offset; - if(dstart < dst_cols && gy < dst_rows) - { - float4 sdata = vload4(gx, src+src_offset+gy*src_step); - float4 ddata; - float4 zero = 0; - switch (thresh_type) - { - case 0: - ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f); - break; - case 1: - ddata = sdata > thresh ? zero : (float4)max_val; - break; - case 2: - ddata = sdata > thresh ? (float4)thresh : sdata; - break; - case 3: - ddata = sdata > thresh ? sdata : (float4)(0.f); - break; - case 4: - ddata = sdata > thresh ? (float4)(0.f) : sdata; - break; - default: - ddata = sdata; - } - int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3); - float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart); - int4 con = dpos >= 0 && dpos < dst_cols; - ddata = convert_float4(con) != (float4)(0) ? ddata : dVal; - if(dstart < dst_cols) - { - *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata; - } - } -} diff --git a/modules/imgproc/src/opencl/warpaffine.cl b/modules/imgproc/src/opencl/warpaffine.cl deleted file mode 100644 index caafdfb..0000000 --- a/modules/imgproc/src/opencl/warpaffine.cl +++ /dev/null @@ -1,761 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Zhang Ying, zhangying913@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - - -//warpAffine kernel -//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -typedef double F; -typedef double4 F4; -#define convert_F4 convert_double4 -#else -typedef float F; -typedef float4 F4; -#define convert_F4 convert_float4 -#endif - -#define INTER_BITS 5 -#define INTER_TAB_SIZE (1 << INTER_BITS) -#define INTER_SCALE 1.f/INTER_TAB_SIZE -#define AB_BITS max(10, (int)INTER_BITS) -#define AB_SCALE (1 << AB_BITS) -#define INTER_REMAP_COEF_BITS 15 -#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) - -inline void interpolateCubic( float x, float* coeffs ) -{ - const float A = -0.75f; - - coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; - coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; - coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - - -/**********************************************8UC1********************************************* -***********************************************************************************************/ -__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - dx = (dx<<2) - (dst_offset&3); - - int round_delta = (AB_SCALE>>1); - - int4 X, Y; - int4 sx, sy; - int4 DX = (int4)(dx, dx+1, dx+2, dx+3); - DX = (DX << AB_BITS); - F4 M0DX, M3DX; - M0DX = M[0] * convert_F4(DX); - M3DX = M[3] * convert_F4(DX); - X = convert_int4(rint(M0DX)); - Y = convert_int4(rint(M3DX)); - int tmp1, tmp2; - tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); - tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); - - X += tmp1 + round_delta; - Y += tmp2 + round_delta; - - sx = convert_int4(convert_short4(X >> AB_BITS)); - sy = convert_int4(convert_short4(Y >> AB_BITS)); - - __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); - uchar4 dval = *d; - DX = (int4)(dx, dx+1, dx+2, dx+3); - int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; - int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; - int4 spos = src_offset + sy * srcStep + sx; - uchar4 sval; - sval.s0 = scon.s0 ? src[spos.s0] : 0; - sval.s1 = scon.s1 ? src[spos.s1] : 0; - sval.s2 = scon.s2 ? src[spos.s2] : 0; - sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; - *d = dval; - } -} - -__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - - if( dx < threadCols && dy < dst_rows) - { - dx = (dx<<2) - (dst_offset&3); - - int round_delta = ((AB_SCALE >> INTER_BITS) >> 1); - - int4 X, Y; - short4 ax, ay; - int4 sx, sy; - int4 DX = (int4)(dx, dx+1, dx+2, dx+3); - DX = (DX << AB_BITS); - F4 M0DX, M3DX; - M0DX = M[0] * convert_F4(DX); - M3DX = M[3] * convert_F4(DX); - X = convert_int4(rint(M0DX)); - Y = convert_int4(rint(M3DX)); - - int tmp1, tmp2; - tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); - tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); - - X += tmp1 + round_delta; - Y += tmp2 + round_delta; - - X = X >> (AB_BITS - INTER_BITS); - Y = Y >> (AB_BITS - INTER_BITS); - - sx = convert_int4(convert_short4(X >> INTER_BITS)); - sy = convert_int4(convert_short4(Y >> INTER_BITS)); - ax = convert_short4(X & (INTER_TAB_SIZE-1)); - ay = convert_short4(Y & (INTER_TAB_SIZE-1)); - - uchar4 v0, v1, v2,v3; - int4 scon0, scon1, scon2, scon3; - int4 spos0, spos1, spos2, spos3; - - scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows); - scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows); - scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows); - scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows); - spos0 = src_offset + sy * srcStep + sx; - spos1 = src_offset + sy * srcStep + sx + 1; - spos2 = src_offset + (sy+1) * srcStep + sx; - spos3 = src_offset + (sy+1) * srcStep + sx + 1; - - v0.s0 = scon0.s0 ? src[spos0.s0] : 0; - v1.s0 = scon1.s0 ? src[spos1.s0] : 0; - v2.s0 = scon2.s0 ? src[spos2.s0] : 0; - v3.s0 = scon3.s0 ? src[spos3.s0] : 0; - - v0.s1 = scon0.s1 ? src[spos0.s1] : 0; - v1.s1 = scon1.s1 ? src[spos1.s1] : 0; - v2.s1 = scon2.s1 ? src[spos2.s1] : 0; - v3.s1 = scon3.s1 ? src[spos3.s1] : 0; - - v0.s2 = scon0.s2 ? src[spos0.s2] : 0; - v1.s2 = scon1.s2 ? src[spos1.s2] : 0; - v2.s2 = scon2.s2 ? src[spos2.s2] : 0; - v3.s2 = scon3.s2 ? src[spos3.s2] : 0; - - v0.s3 = scon0.s3 ? src[spos0.s3] : 0; - v1.s3 = scon1.s3 ? src[spos1.s3] : 0; - v2.s3 = scon2.s3 ? src[spos2.s3] : 0; - v3.s3 = scon3.s3 ? src[spos3.s3] : 0; - - short4 itab0, itab1, itab2, itab3; - float4 taby, tabx; - taby = INTER_SCALE * convert_float4(ay); - tabx = INTER_SCALE * convert_float4(ax); - - itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); - itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); - itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); - itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); - - - int4 val; - uchar4 tval; - val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) - + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); - tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - - __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); - uchar4 dval = *d; - DX = (int4)(dx, dx+1, dx+2, dx+3); - int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; - dval = convert_uchar4(dcon != 0) ? tval : dval; - *d = dval; - } -} - -__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = ((AB_SCALE>>INTER_BITS)>>1); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - int X = X0 >> (AB_BITS - INTER_BITS); - int Y = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[16]; - int i, j; - -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; - } - - short itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - -#pragma unroll 16 - for( i=0; i<16; i++ ) - { - F v = tab1y[(i>>2)] * tab1x[(i&3)]; - isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); - } - - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int sum=0; - for ( i =0; i<16; i++ ) - { - sum += v[i] * itab[i] ; - } - dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -/**********************************************8UC4********************************************* -***********************************************************************************************/ - -__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = (AB_SCALE >> 1); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - int sx0 = (short)(X0 >> AB_BITS); - int sy0 = (short)(Y0 >> AB_BITS); - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0; - } -} - -__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - - int tmp = (dx << AB_BITS); - int X0 = rint(M[0] * tmp); - int Y0 = rint(M[3] * tmp); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx0 = (short)(X0 >> INTER_BITS); - short sy0 = (short)(Y0 >> INTER_BITS); - short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); - short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); - - int4 v0, v1, v2, v3; - - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0; - - int itab0, itab1, itab2, itab3; - float taby, tabx; - taby = 1.f/INTER_TAB_SIZE*ay0; - tabx = 1.f/INTER_TAB_SIZE*ax0; - - itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); - - int4 val; - val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } -} - -__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = ((AB_SCALE>>INTER_BITS)>>1); - - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - dst_offset = (dst_offset>>2); - dstStep = (dstStep>>2); - - int tmp = (dx << AB_BITS); - int X0 = rint(M[0] * tmp); - int Y0 = rint(M[3] * tmp); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - int sx = (short)(X0 >> INTER_BITS) - 1; - int sy = (short)(Y0 >> INTER_BITS) - 1; - int ay = (short)(Y0 & (INTER_TAB_SIZE-1)); - int ax = (short)(X0 & (INTER_TAB_SIZE-1)); - - uchar4 v[16]; - int i,j; -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } - int itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = INTER_SCALE * ay; - axx = INTER_SCALE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - -#pragma unroll 16 - for( i=0; i<16; i++ ) - { - float tmp; - tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; - itab[i] = rint(tmp); - isum += itab[i]; - } - - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int4 sum=0; - for ( i =0; i<16; i++ ) - { - sum += convert_int4(v[i]) * itab[i]; - } - dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - - -/**********************************************32FC1******************************************** -***********************************************************************************************/ - -__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/2; - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx0 = (short)(X0 >> AB_BITS); - short sy0 = (short)(Y0 >> AB_BITS); - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0=0 && sy0>2)+sy0*srcStep+sx0] : 0; - } -} - -__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - src_offset = (src_offset>>2); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx0 = (short)(X0 >> INTER_BITS); - short sy0 = (short)(Y0 >> INTER_BITS); - short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); - short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); - - float v0, v1, v2, v3; - - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; - taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; - tabx[1] = 1.f/INTER_TAB_SIZE*ax0; - - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx] = sum; - } -} - -__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - src_offset = (src_offset>>2); - dst_offset = (dst_offset>>2); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X0 >> INTER_BITS) - 1; - short sy = (short)(Y0 >> INTER_BITS) - 1; - short ay = (short)(Y0 & (INTER_TAB_SIZE-1)); - short ax = (short)(X0 & (INTER_TAB_SIZE-1)); - - float v[16]; - int i; - - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - -#pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) - { - sum += v[i] * tab[i]; - } - dst[dst_offset+dy*dstStep+dx] = sum; - - } - } -} - - -/**********************************************32FC4******************************************** -***********************************************************************************************/ - -__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/2; - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - - short sx0 = (short)(X0 >> AB_BITS); - short sy0 = (short)(Y0 >> AB_BITS); - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0=0 && sy0>4)+sy0*(srcStep>>2)+sx0] : (float4)0; - } -} - -__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx0 = (short)(X0 >> INTER_BITS); - short sy0 = (short)(Y0 >> INTER_BITS); - short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); - short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); - - float4 v0, v1, v2, v3; - - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; - taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; - tabx[1] = 1.f/INTER_TAB_SIZE*ax0; - - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float4 sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[dst_offset+dy*dstStep+dx] = sum; - } -} - -__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - int round_delta = AB_SCALE/INTER_TAB_SIZE/2; - - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); - - int X0 = rint(M[0] * dx * AB_SCALE); - int Y0 = rint(M[3] * dx * AB_SCALE); - X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; - Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; - X0 = X0 >> (AB_BITS - INTER_BITS); - Y0 = Y0 >> (AB_BITS - INTER_BITS); - - short sx = (short)(X0 >> INTER_BITS) - 1; - short sy = (short)(Y0 >> INTER_BITS) - 1; - short ay = (short)(Y0 & (INTER_TAB_SIZE-1)); - short ax = (short)(X0 & (INTER_TAB_SIZE-1)); - - float4 v[16]; - int i; - - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - -#pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float4 sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) - { - sum += v[i] * tab[i]; - } - dst[dst_offset+dy*dstStep+dx] = sum; - - } - } -} diff --git a/modules/imgproc/src/opencl/warpperspective.cl b/modules/imgproc/src/opencl/warpperspective.cl deleted file mode 100644 index 43863c1..0000000 --- a/modules/imgproc/src/opencl/warpperspective.cl +++ /dev/null @@ -1,688 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Zhang Ying, zhangying913@gmail.com -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - - -//wrapPerspective kernel -//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic. - -#if defined (DOUBLE_SUPPORT) -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64:enable -#elif defined (cl_amd_fp64) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#endif -typedef double F; -typedef double4 F4; -#define convert_F4 convert_double4 -#else -typedef float F; -typedef float4 F4; -#define convert_F4 convert_float4 -#endif - - -#define INTER_BITS 5 -#define INTER_TAB_SIZE (1 << INTER_BITS) -#define INTER_SCALE 1.f/INTER_TAB_SIZE -#define AB_BITS max(10, (int)INTER_BITS) -#define AB_SCALE (1 << AB_BITS) -#define INTER_REMAP_COEF_BITS 15 -#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) - -inline void interpolateCubic( float x, float* coeffs ) -{ - const float A = -0.75f; - - coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; - coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; - coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; - coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; -} - - -/**********************************************8UC1********************************************* -***********************************************************************************************/ -__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - dx = (dx<<2) - (dst_offset&3); - - F4 DX = (F4)(dx, dx+1, dx+2, dx+3); - F4 X0 = M[0]*DX + M[1]*dy + M[2]; - F4 Y0 = M[3]*DX + M[4]*dy + M[5]; - F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; - W = (W!=zero) ? one/W : zero; - short4 X = convert_short4(rint(X0*W)); - short4 Y = convert_short4(rint(Y0*W)); - int4 sx = convert_int4(X); - int4 sy = convert_int4(Y); - - int4 DXD = (int4)(dx, dx+1, dx+2, dx+3); - __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); - uchar4 dval = *d; - int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; - int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; - int4 spos = src_offset + sy * srcStep + sx; - uchar4 sval; - sval.s0 = scon.s0 ? src[spos.s0] : 0; - sval.s1 = scon.s1 ? src[spos.s1] : 0; - sval.s2 = scon.s2 ? src[spos.s2] : 0; - sval.s3 = scon.s3 ? src[spos.s3] : 0; - dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; - *d = dval; - } -} - -__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - int sx = (short)(X >> INTER_BITS); - int sy = (short)(Y >> INTER_BITS); - int ay = (short)(Y & (INTER_TAB_SIZE-1)); - int ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[4]; - int i; -#pragma unroll 4 - for(i=0; i<4; i++) - v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; - - short itab[4]; - float tab1y[2], tab1x[2]; - tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - tab1y[1] = 1.f/INTER_TAB_SIZE*ay; - tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tab1x[1] = 1.f/INTER_TAB_SIZE*ax; - -#pragma unroll 4 - for(i=0; i<4; i++) - { - float v = tab1y[(i>>1)] * tab1x[(i&1)]; - itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); - } - if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int sum = 0; - for ( i =0; i<4; i++ ) - { - sum += v[i] * itab[i] ; - } - dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar v[16]; - int i, j; - -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; - } - - short itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - - int isum = 0; -#pragma unroll 16 - for( i=0; i<16; i++ ) - { - F v = tab1y[(i>>2)] * tab1x[(i&3)]; - isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); - } - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } - - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int sum=0; - for ( i =0; i<16; i++ ) - { - sum += v[i] * itab[i] ; - } - dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - -/**********************************************8UC4********************************************* -***********************************************************************************************/ - -__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>2)+sy*(srcStep>>2)+sx] : (uchar4)0; - } -} - -__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - - int4 v0, v1, v2, v3; - - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; - - int itab0, itab1, itab2, itab3; - float taby, tabx; - taby = 1.f/INTER_TAB_SIZE*ay; - tabx = 1.f/INTER_TAB_SIZE*ax; - - itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); - itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); - itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); - - int4 val; - val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } -} - -__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - src_offset = (src_offset>>2); - srcStep = (srcStep>>2); - dst_offset = (dst_offset>>2); - dstStep = (dstStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - uchar4 v[16]; - int i,j; -#pragma unroll 4 - for(i=0; i<4; i++) - for(j=0; j<4; j++) - { - v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; - } - int itab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = INTER_SCALE * ay; - axx = INTER_SCALE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - int isum = 0; - -#pragma unroll 16 - for( i=0; i<16; i++ ) - { - float tmp; - tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; - itab[i] = rint(tmp); - isum += itab[i]; - } - - if( isum != INTER_REMAP_COEF_SCALE ) - { - int k1, k2; - int diff = isum - INTER_REMAP_COEF_SCALE; - int Mk1=2, Mk2=2, mk1=2, mk2=2; - - for( k1 = 2; k1 < 4; k1++ ) - for( k2 = 2; k2 < 4; k2++ ) - { - - if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) - mk1 = k1, mk2 = k2; - else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) - Mk1 = k1, Mk2 = k2; - } - - diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - int4 sum=0; - for ( i =0; i<16; i++ ) - { - sum += convert_int4(v[i]) * itab[i]; - } - dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; - } - } -} - - -/**********************************************32FC1******************************************** -***********************************************************************************************/ - -__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx=0 && sy>2)+sy*srcStep+sx] : 0; - } -} - -__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - src_offset = (src_offset>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS); - short sy = (short)(Y >> INTER_BITS); - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v0, v1, v2, v3; - - v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0; - v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0; - v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0; - v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; - taby[1] = 1.f/INTER_TAB_SIZE*ay; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; - tabx[1] = 1.f/INTER_TAB_SIZE*ax; - - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>2)+dy*dstStep+dx] = sum; - } -} - -__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - src_offset = (src_offset>>2); - dst_offset = (dst_offset>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS) - 1; - short sy = (short)(Y >> INTER_BITS) - 1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - float v[16]; - int i; - - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - -#pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) - { - sum += v[i] * tab[i]; - } - dst[dst_offset+dy*dstStep+dx] = sum; - - } - } -} - - -/**********************************************32FC4******************************************** -***********************************************************************************************/ - -__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W =(W != 0.0)? 1./W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - short sx = (short)X; - short sy = (short)Y; - - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx=0 && sy>4)+sy*(srcStep>>2)+sx] : (float)0; - } -} - -__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, - int dst_cols, int dst_rows, int srcStep, int dstStep, - int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows) - { - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx0 = (short)(X >> INTER_BITS); - short sy0 = (short)(Y >> INTER_BITS); - short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); - short ax0 = (short)(X & (INTER_TAB_SIZE-1)); - - - float4 v0, v1, v2, v3; - - v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; - v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; - v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; - v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; - - float tab[4]; - float taby[2], tabx[2]; - taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; - taby[1] = 1.f/INTER_TAB_SIZE*ay0; - tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; - tabx[1] = 1.f/INTER_TAB_SIZE*ax0; - - tab[0] = taby[0] * tabx[0]; - tab[1] = taby[0] * tabx[1]; - tab[2] = taby[1] * tabx[0]; - tab[3] = taby[1] * tabx[1]; - - float4 sum = 0; - sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; - if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - dst[dst_offset+dy*dstStep+dx] = sum; - } -} - -__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, - int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, - int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) -{ - int dx = get_global_id(0); - int dy = get_global_id(1); - - if( dx < threadCols && dy < dst_rows ) - { - src_offset = (src_offset>>4); - dst_offset = (dst_offset>>4); - srcStep = (srcStep>>2); - dstStep = (dstStep>>2); - - F X0 = M[0]*dx + M[1]*dy + M[2]; - F Y0 = M[3]*dx + M[4]*dy + M[5]; - F W = M[6]*dx + M[7]*dy + M[8]; - W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; - int X = rint(X0*W); - int Y = rint(Y0*W); - - short sx = (short)(X >> INTER_BITS)-1; - short sy = (short)(Y >> INTER_BITS)-1; - short ay = (short)(Y & (INTER_TAB_SIZE-1)); - short ax = (short)(X & (INTER_TAB_SIZE-1)); - - - float4 v[16]; - int i; - - for(i=0; i<16; i++) - v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; - - float tab[16]; - float tab1y[4], tab1x[4]; - float axx, ayy; - - ayy = 1.f/INTER_TAB_SIZE * ay; - axx = 1.f/INTER_TAB_SIZE * ax; - interpolateCubic(ayy, tab1y); - interpolateCubic(axx, tab1x); - -#pragma unroll 4 - for( i=0; i<16; i++ ) - { - tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; - } - - if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) - { - float4 sum = 0; -#pragma unroll 4 - for ( i =0; i<16; i++ ) - { - sum += v[i] * tab[i]; - } - dst[dst_offset+dy*dstStep+dx] = sum; - - } - } -} diff --git a/modules/imgproc/test/test_imgproc_umat.cpp b/modules/imgproc/test/test_imgproc_umat.cpp index ca72d76..5237038 100644 --- a/modules/imgproc/test/test_imgproc_umat.cpp +++ b/modules/imgproc/test/test_imgproc_umat.cpp @@ -66,6 +66,7 @@ protected: resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR); equalizeHist(usmallimg, uresult); +#if 0 imshow("orig", uimg); imshow("small", usmallimg); imshow("equalized gray", uresult); @@ -73,7 +74,7 @@ protected: destroyWindow("orig"); destroyWindow("small"); destroyWindow("equalized gray"); - +#endif ts->set_failed_test_info(cvtest::TS::OK); } }; diff --git a/modules/objdetect/src/opencl/haarobjectdetect.cl b/modules/objdetect/src/opencl/haarobjectdetect.cl deleted file mode 100644 index 5fa3533..0000000 --- a/modules/objdetect/src/opencl/haarobjectdetect.cl +++ /dev/null @@ -1,423 +0,0 @@ -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. -// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// @Authors -// Niko Li, newlife20080214@gmail.com -// Wang Weiyan, wangweiyanster@gmail.com -// Jia Haipeng, jiahaipeng95@gmail.com -// Nathan, liujun@multicorewareinc.com -// Peng Xiao, pengxiao@outlook.com -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors as is and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -// - -#pragma OPENCL EXTENSION cl_amd_printf : enable -#define CV_HAAR_FEATURE_MAX 3 - -#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) -#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset]) - -typedef int sumtype; -typedef float sqsumtype; - -#ifndef STUMP_BASED -#define STUMP_BASED 1 -#endif - -typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode -{ - int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64))); - float weight[CV_HAAR_FEATURE_MAX]; - float threshold; - float alpha[3] __attribute__((aligned (16))); - int left __attribute__((aligned (4))); - int right __attribute__((aligned (4))); -} -GpuHidHaarTreeNode; - - -typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier -{ - int count __attribute__((aligned (4))); - GpuHidHaarTreeNode* node __attribute__((aligned (8))); - float* alpha __attribute__((aligned (8))); -} -GpuHidHaarClassifier; - - -typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier -{ - int count __attribute__((aligned (4))); - float threshold __attribute__((aligned (4))); - int two_rects __attribute__((aligned (4))); - int reserved0 __attribute__((aligned (8))); - int reserved1 __attribute__((aligned (8))); - int reserved2 __attribute__((aligned (8))); - int reserved3 __attribute__((aligned (8))); -} -GpuHidHaarStageClassifier; - - -typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade -{ - int count __attribute__((aligned (4))); - int is_stump_based __attribute__((aligned (4))); - int has_tilted_features __attribute__((aligned (4))); - int is_tree __attribute__((aligned (4))); - int pq0 __attribute__((aligned (4))); - int pq1 __attribute__((aligned (4))); - int pq2 __attribute__((aligned (4))); - int pq3 __attribute__((aligned (4))); - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float inv_window_area __attribute__((aligned (4))); -} GpuHidHaarClassifierCascade; - -__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade( - global GpuHidHaarStageClassifier * stagecascadeptr, - global int4 * info, - global GpuHidHaarTreeNode * nodeptr, - global const int * restrict sum1, - global const float * restrict sqsum1, - global int4 * candidate, - const int pixelstep, - const int loopcount, - const int start_stage, - const int split_stage, - const int end_stage, - const int startnode, - const int splitnode, - const int4 p, - const int4 pq, - const float correction) -{ - int grpszx = get_local_size(0); - int grpszy = get_local_size(1); - int grpnumx = get_num_groups(0); - int grpidx = get_group_id(0); - int lclidx = get_local_id(0); - int lclidy = get_local_id(1); - - int lcl_sz = mul24(grpszx,grpszy); - int lcl_id = mad24(lclidy,grpszx,lclidx); - - __local int lclshare[1024]; - __local int* lcldata = lclshare;//for save win data - __local int* glboutindex = lcldata + 28*28;//for save global out index - __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel - __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel - __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1)); - glboutindex[0]=0; - int outputoff = mul24(grpidx,256); - - //assume window size is 20X20 -#define WINDOWSIZE 20+1 - //make sure readwidth is the multiple of 4 - //ystep =1, from host code - int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2; - int readheight = grpszy-1+WINDOWSIZE; - int read_horiz_cnt = readwidth >> 2;//each read int4 - int total_read = mul24(read_horiz_cnt,readheight); - int read_loop = (total_read + lcl_sz - 1) >> 6; - candidate[outputoff+(lcl_id<<2)] = (int4)0; - candidate[outputoff+(lcl_id<<2)+1] = (int4)0; - candidate[outputoff+(lcl_id<<2)+2] = (int4)0; - candidate[outputoff+(lcl_id<<2)+3] = (int4)0; - for(int scalei = 0; scalei > 16; - int height = scaleinfo1.x & 0xffff; - int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16; - int totalgrp = scaleinfo1.y & 0xffff; - int imgoff = scaleinfo1.z; - float factor = as_float(scaleinfo1.w); - - __global const int * sum = sum1 + imgoff; - __global const float * sqsum = sqsum1 + imgoff; - for(int grploop=grpidx; grploop=0.f ? sqrt(variance_norm_factor) : 1.f; - - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) - { - float stage_sum = 0.f; - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - for(int nodeloop = 0; nodeloop < stageinfo.x; ) - { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); - - int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); - - float nodethreshold = w.w * variance_norm_factor; - - info1.x +=lcl_off; - info1.z +=lcl_off; - info2.x +=lcl_off; - info2.z +=lcl_off; - - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; - - info3.x +=lcl_off; - info3.z +=lcl_off; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - - bool passThres = classsum >= nodethreshold; -#if STUMP_BASED - stage_sum += passThres ? alpha3.y : alpha3.x; - nodecounter++; - nodeloop++; -#else - bool isRootNode = (nodecounter & 1) == 0; - if(isRootNode) - { - if( (passThres && currentnodeptr->right) || - (!passThres && currentnodeptr->left)) - { - nodecounter ++; - } - else - { - stage_sum += alpha3.x; - nodecounter += 2; - nodeloop ++; - } - } - else - { - stage_sum += passThres ? alpha3.z : alpha3.y; - nodecounter ++; - nodeloop ++; - } -#endif - } - - result = (stage_sum >= stagethreshold); - } - - if(result && (x < width) && (y < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - barrier(CLK_LOCAL_MEM_FENCE); - nodecounter = splitnode; - for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) - { - lclcount[0]=0; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - - int perfscale = queuecount > 4 ? 3 : 2; - int queuecount_loop = (queuecount + (1<> perfscale; - int lcl_compute_win = lcl_sz >> perfscale; - int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); - int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); - for(int queueloop=0; queueloop>16),readwidth,temp_coord & 0xffff); - - if(lcl_compute_win_id < queuecount) - { - int tempnodecounter = lcl_compute_id; - float part_sum = 0.f; - const int stump_factor = STUMP_BASED ? 1 : 2; - int root_offset = 0; - for(int lcl_loop=0; lcl_loopp[0][0])); - int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - - info1.x +=queue_pixel; - info1.z +=queue_pixel; - info2.x +=queue_pixel; - info2.z +=queue_pixel; - - float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - - lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; - - - classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - - lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; - - info3.x +=queue_pixel; - info3.z +=queue_pixel; - classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - - lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; - - bool passThres = classsum >= nodethreshold; -#if STUMP_BASED - part_sum += passThres ? alpha3.y : alpha3.x; - tempnodecounter += lcl_compute_win; - lcl_loop++; -#else - if(root_offset == 0) - { - if( (passThres && currentnodeptr->right) || - (!passThres && currentnodeptr->left)) - { - root_offset = 1; - } - else - { - part_sum += alpha3.x; - tempnodecounter += lcl_compute_win; - lcl_loop++; - } - } - else - { - part_sum += passThres ? alpha3.z : alpha3.y; - tempnodecounter += lcl_compute_win; - lcl_loop++; - root_offset = 0; - } -#endif - }//end for(int lcl_loop=0;lcl_loop= stagethreshold && (lcl_compute_id==0)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex<<1] = temp_coord; - lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); - } - lcl_compute_win_id +=(1<0;stageloop++) - - if(lcl_id> 16)); - temp = glboutindex[0]; - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor*20.f); - candidate_result.x = convert_int_rtn(x*factor); - candidate_result.y = convert_int_rtn(y*factor); - atomic_inc(glboutindex); - candidate[outputoff+temp+lcl_id] = candidate_result; - } - barrier(CLK_LOCAL_MEM_FENCE); - }//end for(int grploop=grpidx;grploop> 16; - int height = scaleinfo1.x & 0xffff; - int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16; - int totalgrp = scaleinfo1.y & 0xffff; - float factor = as_float(scaleinfo1.w); - float correction_t = correction[scalei]; - int ystep = (int)(max(2.0f, factor) + 0.5f); - - for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) - { - int4 cascadeinfo = p[scalei]; - int grpidy = grploop / grpnumperline; - int grpidx = grploop - mul24(grpidy, grpnumperline); - int ix = mad24(grpidx, grpszx, lclidx); - int iy = mad24(grpidy, grpszy, lclidy); - int x = ix * ystep; - int y = iy * ystep; - lcloutindex[lcl_id] = 0; - lclcount[0] = 0; - int nodecounter; - float mean, variance_norm_factor; - //if((ix < width) && (iy < height)) - { - const int p_offset = mad24(y, step, x); - cascadeinfo.x += p_offset; - cascadeinfo.z += p_offset; - mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - - sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] - + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]) - * correction_t; - variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] - - sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] - + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)]; - variance_norm_factor = variance_norm_factor * correction_t - mean * mean; - variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; - bool result = true; - nodecounter = startnode + nodecount * scalei; - for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) - { - float stage_sum = 0.f; - int stagecount = stagecascadeptr[stageloop].count; - for (int nodeloop = 0; nodeloop < stagecount;) - { - __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); - int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); - int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); - int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); - float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0])); - float nodethreshold = w.w * variance_norm_factor; - - info1.x += p_offset; - info1.z += p_offset; - info2.x += p_offset; - info2.z += p_offset; - info3.x += p_offset; - info3.z += p_offset; - float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] - - sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] - + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x; - classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] - - sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] - + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y; - classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] - - sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] - + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; - - bool passThres = classsum >= nodethreshold; - -#if STUMP_BASED - stage_sum += passThres ? alpha3.y : alpha3.x; - nodecounter++; - nodeloop++; -#else - bool isRootNode = (nodecounter & 1) == 0; - if(isRootNode) - { - if( (passThres && currentnodeptr->right) || - (!passThres && currentnodeptr->left)) - { - nodecounter ++; - } - else - { - stage_sum += alpha3.x; - nodecounter += 2; - nodeloop ++; - } - } - else - { - stage_sum += (passThres ? alpha3.z : alpha3.y); - nodecounter ++; - nodeloop ++; - } -#endif - } - result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (result && (ix < width) && (iy < height)) - { - int queueindex = atomic_inc(lclcount); - lcloutindex[queueindex] = (y << 16) | x; - } - barrier(CLK_LOCAL_MEM_FENCE); - int queuecount = lclcount[0]; - - if (lcl_id < queuecount) - { - int temp = lcloutindex[lcl_id]; - int x = temp & 0xffff; - int y = (temp & (int)0xffff0000) >> 16; - temp = atomic_inc(glboutindex); - int4 candidate_result; - candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); - candidate_result.x = x; - candidate_result.y = y; - candidate[outputoff + temp + lcl_id] = candidate_result; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - } - } -} -__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) -{ - int counter = get_global_id(0); - int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(orinode + counter); -#pragma unroll - - for (i = 0; i < 3; i++) - { - tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); - tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f); - tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f); - tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f); - } - - t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); - counter += nodenum; -#pragma unroll - - for (i = 0; i < 3; i++) - { - newnode[counter].p[i][0] = tr_x[i]; - newnode[counter].p[i][1] = tr_y[i]; - newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; - newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; - newnode[counter].weight[i] = t1.weight[i] * weight_scale; - } - - newnode[counter].left = t1.left; - newnode[counter].right = t1.right; - newnode[counter].threshold = t1.threshold; - newnode[counter].alpha[0] = t1.alpha[0]; - newnode[counter].alpha[1] = t1.alpha[1]; - newnode[counter].alpha[2] = t1.alpha[2]; -}