+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-typedef float2 cfloat;
-
-// Complex product a*b; .x carries the real part and .y the imaginary part.
-inline cfloat cmulf(cfloat a, cfloat b)
-{
-    cfloat product;
-    product.x = a.x*b.x - a.y*b.y;
-    product.y = a.x*b.y + a.y*b.x;
-    return product;
-}
-
-// Complex conjugate: .x is kept, .y is negated.
-inline cfloat conjf(cfloat a)
-{
-    a.y = -a.y;
-    return a;
-}
-
-// Element-wise complex product of two spectra followed by a real scale:
-// dst[i] = cmulf(a[i], b[i]) * scale for every element with x < cols and y < rows.
-// mstep is divided by sizeof(cfloat) to get the row pitch in cfloat elements.
-__kernel void
-mulAndScaleSpectrumsKernel(
-    __global const cfloat* a,
-    __global const cfloat* b,
-    float scale,
-    __global cfloat* dst,
-    uint cols,
-    uint rows,
-    uint mstep
-)
-{
-    const uint col = get_global_id(0);
-    const uint row = get_global_id(1);
-
-    // Work-items that fall outside the matrix have nothing to write.
-    if (col >= cols || row >= rows)
-        return;
-
-    const uint pos = mad24(row, mstep / sizeof(cfloat), col);
-    const cfloat prod = cmulf(a[pos], b[pos]);
-    dst[pos] = (cfloat)( prod.x * scale, prod.y * scale );
-}
-// Same as mulAndScaleSpectrumsKernel, but the second operand is conjugated:
-// dst[i] = cmulf(a[i], conjf(b[i])) * scale for every element with x < cols and y < rows.
-__kernel void
-mulAndScaleSpectrumsKernel_CONJ(
-    __global const cfloat* a,
-    __global const cfloat* b,
-    float scale,
-    __global cfloat* dst,
-    uint cols,
-    uint rows,
-    uint mstep
-)
-{
-    const uint col = get_global_id(0);
-    const uint row = get_global_id(1);
-
-    // Work-items that fall outside the matrix have nothing to write.
-    if (col >= cols || row >= rows)
-        return;
-
-    const uint pos = mad24(row, mstep / sizeof(cfloat), col);
-    const cfloat prod = cmulf(a[pos], conjf(b[pos]));
-    dst[pos] = (cfloat)( prod.x * scale, prod.y * scale );
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the copyright holders or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// Masked fill: stores `value` into every destination element whose mask byte is
-// non-zero, for x < cols and y < rows.
-//   mask, maskstep, maskoffset : mask bytes, indexed as y*maskstep + x + maskoffset
-//   dstptr, dststep, dstoffset : destination bytes, indexed as
-//                                y*dststep + x*sizeof(dstT) + dstoffset
-//   value                      : element written to the selected positions
-// dstT is not defined in this file; it is presumably supplied as a build option.
-__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
-                          __global uchar* dstptr, int dststep, int dstoffset,
-                          int rows, int cols, dstT value )
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int mask_index = mad24(y, maskstep, x + maskoffset);
-        if( mask[mask_index] )
-        {
-            // sizeof yields size_t; cast it so all three mad24 arguments are int.
-            int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
-            // dstptr points into __global memory, so the cast must keep that address space.
-            *(__global dstT*)(dstptr + dst_index) = value;
-        }
-    }
-}
-
-// Unconditional fill: stores `value` into every destination element with
-// x < cols and y < rows.
-//   dstptr, dststep, dstoffset : destination bytes, indexed as
-//                                y*dststep + x*sizeof(dstT) + dstoffset
-//   value                      : element written to each position
-// dstT is not defined in this file; it is presumably supplied as a build option.
-__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
-                          int rows, int cols, dstT value )
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        // sizeof yields size_t; cast it so all three mad24 arguments are int.
-        int dst_index = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
-        // dstptr points into __global memory, so the cast must keep that address space.
-        *(__global dstT*)(dstptr + dst_index) = value;
-    }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT) // enable whichever fp64 extension macro is defined
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-/* FUNC(a, b) folds the value a into the accumulator b. Exactly one of
- * FUNC_SUM, FUNC_ABS_SUM or FUNC_SQR_SUM has to evaluate to non-zero,
- * otherwise the #error below fires. Each expansion already ends in ';'
- * and the arguments are not parenthesized, so pass plain identifiers. */
-#if FUNC_SUM
-#define FUNC(a, b) b += a; // plain sum
-#elif FUNC_ABS_SUM
-#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a; // sum of absolute values
-#elif FUNC_SQR_SUM
-#define FUNC(a, b) b += a * a; // sum of squares
-#else
-#error No sum function
-#endif
-
-/**************************************Array buffer SUM**************************************/
-
-// Per-work-group partial reduction: dst[get_group_id(0)] receives the FUNC-fold of
-// the elements visited by that group.
-// Each work-item walks elements id, id + (groupnum << 8), ... below elemnum. The
-// source index is offset + id + (id / cols) * invalid_cols, i.e. invalid_cols extra
-// entries are skipped after every `cols` elements.
-// The group results are combined through a 128-entry __local array. The two-step
-// hand-over (items above 127 store, items below 128 add) together with the
-// groupnum << 8 stride suggests 256 work-items per group; confirm against the host
-// launch code.
-// srcT, dstT, convertToDstT and FUNC come from outside this kernel.
-__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
-                            __global srcT *src, __global dstT *dst)
-{
-    const unsigned int local_idx = get_local_id(0);
-    const unsigned int group_idx = get_group_id(0);
-
-    __local dstT partial[128];
-    dstT acc = (dstT)(0);
-
-    // Strided accumulation over this work-item's share of the input.
-    const int stride = groupnum << 8;
-    for (unsigned int elem = get_global_id(0); elem < elemnum; elem += stride)
-    {
-        const unsigned int src_idx = offset + elem + (elem / cols) * invalid_cols;
-        dstT cur = convertToDstT(src[src_idx]);
-        FUNC(cur, acc);
-    }
-
-    // Work-items 128 and up park their result in local memory ...
-    if (local_idx > 127)
-        partial[local_idx - 128] = acc;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // ... and work-items 0..127 add their own result on top of it.
-    if (local_idx < 128)
-        partial[local_idx] = acc + partial[local_idx];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Pairwise reduction of the 128 partial results down to partial[0].
-    for (int span = 64; span > 0; span >>= 1)
-    {
-        if (local_idx < span)
-        {
-            const int partner = span + local_idx;
-            partial[local_idx] = partial[local_idx] + partial[partner];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (local_idx == 0)
-        dst[group_idx] = partial[0];
-}
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Rock Li, Rock.li@amd.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-
-// Bilateral filter for single-channel 8-bit data, one output pixel per work-item.
-// The centre pixel is read from src at (y + radius, x + radius). Each of the maxk
-// neighbours is read at centre + space_ofs[k] and weighted by
-// space_weight[k] * color_weight[|neighbour - centre|]. The output is the weighted
-// mean plus 0.5, converted with convert_uchar_rtz.
-// src_rows and src_cols are accepted but not used by this kernel.
-__kernel void bilateral_C1_D0(__global uchar *dst,
-                              __global const uchar *src,
-                              const int dst_rows,
-                              const int dst_cols,
-                              const int maxk,
-                              const int radius,
-                              const int dst_step,
-                              const int dst_offset,
-                              const int src_step,
-                              const int src_rows,
-                              const int src_cols,
-                              __constant float *color_weight,
-                              __constant float *space_weight,
-                              __constant int *space_ofs)
-{
-    const int gx = get_global_id(0);
-    const int gy = get_global_id(1);
-
-    // Nothing to do outside the destination rectangle.
-    if (gy >= dst_rows || gx >= dst_cols)
-        return;
-
-    const int centre_idx = mad24(gy + radius, src_step, gx + radius);
-    const int out_idx = mad24(gy, dst_step, gx + dst_offset);
-    const int centre = (int)src[centre_idx];
-
-    float weighted = 0.f;
-    float weight_total = 0.f;
-    for (int k = 0; k < maxk; ++k)
-    {
-        const int neighbour = (int)src[centre_idx + space_ofs[k]];
-        const float w = space_weight[k] * color_weight[abs(neighbour - centre)];
-        weighted += (float)(neighbour) * w;
-        weight_total += w;
-    }
-
-    dst[out_idx] = convert_uchar_rtz(weighted / weight_total + 0.5f);
-}
-
-// Bilateral filter for single-channel 8-bit data, four horizontally adjacent output
-// pixels per work-item (x = get_global_id(0) << 2).
-// The four centre pixels and each of the maxk neighbour quads are fetched with
-// vload4. Every lane gets its own weight
-// space_weight[k] * color_weight[|neighbour - centre|]. The four results are stored
-// with a single uchar4 write at dst + dst_index.
-// src_rows and src_cols are accepted but not used by this kernel.
-__kernel void bilateral2_C1_D0(__global uchar *dst,
-                               __global const uchar *src,
-                               const int dst_rows,
-                               const int dst_cols,
-                               const int maxk,
-                               const int radius,
-                               const int dst_step,
-                               const int dst_offset,
-                               const int src_step,
-                               const int src_rows,
-                               const int src_cols,
-                               __constant float *color_weight,
-                               __constant float *space_weight,
-                               __constant int *space_ofs)
-{
-    const int gx = get_global_id(0) << 2;
-    const int gy = get_global_id(1);
-
-    // Nothing to do when the first pixel of the quad is outside the destination.
-    if (gy >= dst_rows || gx >= dst_cols)
-        return;
-
-    const int centre_idx = mad24(gy + radius, src_step, gx + radius);
-    const int out_idx = mad24(gy, dst_step, gx + dst_offset);
-    const int4 centre = convert_int4(vload4(0,src + centre_idx));
-
-    float4 weighted = (float4)(0.f);
-    float4 weight_total = (float4)(0.f);
-    for (int k = 0; k < maxk; ++k)
-    {
-        const int4 neighbour = convert_int4(vload4(0,src+centre_idx + space_ofs[k]));
-        const float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(neighbour.x - centre.x)],
-                                                              color_weight[abs(neighbour.y - centre.y)],
-                                                              color_weight[abs(neighbour.z - centre.z)],
-                                                              color_weight[abs(neighbour.w - centre.w)]);
-        weighted += convert_float4(neighbour) * w;
-        weight_total += w;
-    }
-
-    *(__global uchar4*)(dst+out_idx) = convert_uchar4_rtz(weighted/weight_total+0.5f);
-}
-
-// Bilateral filter for 4-channel 8-bit data, one uchar4 output pixel per work-item.
-// src and dst are indexed in uchar4 elements. The colour weight of a neighbour is
-// looked up with the sum of the absolute differences of the .x, .y and .z channels;
-// the .w channel does not take part in the lookup. All four channels share the same
-// scalar weight space_weight[k] * color_weight[...].
-// src_rows and src_cols are accepted but not used by this kernel.
-__kernel void bilateral_C4_D0(__global uchar4 *dst,
-                              __global const uchar4 *src,
-                              const int dst_rows,
-                              const int dst_cols,
-                              const int maxk,
-                              const int radius,
-                              const int dst_step,
-                              const int dst_offset,
-                              const int src_step,
-                              const int src_rows,
-                              const int src_cols,
-                              __constant float *color_weight,
-                              __constant float *space_weight,
-                              __constant int *space_ofs)
-{
-    const int gx = get_global_id(0);
-    const int gy = get_global_id(1);
-
-    // Nothing to do outside the destination rectangle.
-    if (gy >= dst_rows || gx >= dst_cols)
-        return;
-
-    const int centre_idx = mad24(gy + radius, src_step, gx + radius);
-    const int out_idx = mad24(gy, dst_step, gx + dst_offset);
-    const int4 centre = convert_int4(src[centre_idx]);
-
-    float4 weighted = (float4)0.f;
-    float weight_total = 0.f;
-    for (int k = 0; k < maxk; ++k)
-    {
-        const int4 neighbour = convert_int4(src[centre_idx + space_ofs[k]]);
-        const float w = space_weight[k] * color_weight[abs(neighbour.x - centre.x) + abs(neighbour.y - centre.y) + abs(neighbour.z - centre.z)];
-        weighted += convert_float4(neighbour) * (float4)w;
-        weight_total += w;
-    }
-
-    // Normalise with one reciprocal instead of a per-channel division.
-    weight_total = 1.f / weight_total;
-    dst[out_idx] = convert_uchar4_rtz(weighted * (float4)weight_total + (float4)0.5f);
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-/* BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
- * ADDR_L / ADDR_H raise an index below the low edge to that edge.
- * ADDR_R / ADDR_B map an index at or past the high edge to edge-1; otherwise they
- * yield `addr`, which the kernels pass as the result of the preceding ADDR_L / ADDR_H.
- * The r_edge / b_edge arguments of ADDR_L / ADDR_H are unused in every border mode
- * except BORDER_WRAP. */
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-/* BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
- * Low side: i -> -i-1. The low edge is used only in the comparison, so the
- * mirrored index is computed as if that edge were 0.
- * High side: i -> 2*edge - 1 - i. */
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-/* BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
- * Low side: i -> -i (the edge sample itself is not repeated).
- * High side: i -> 2*edge - 2 - i. */
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-// The blur function does not support BORDER_WRAP; the mapping is still defined here.
-#ifdef BORDER_WRAP
-/* BORDER_WRAP: cdefgh|abcdefgh|abcdefg
- * Low side: i -> i + high edge. High side: i -> i - high edge. */
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-/* THREADS sizes the per-row __local buffers of the box-filter kernels, which index
- * them with get_local_id(0). Presumably it equals the work-group size in dimension 0;
- * confirm against the host code.
- * ELEM yields elem1 when l_edge <= i < r_edge and elem2 otherwise. Its expansion is
- * not wrapped in parentheses, so parenthesize it at the call site. */
-#define THREADS 256
-#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
-/* update_dst_C1_D0: horizontal pass of the single-channel 8-bit box filter.
- *
- * dst        - address of the first of up to four output bytes for this work-item
- * temp       - one row of per-column vertical sums in __local memory, viewed as uint
- * dst_rows,
- * dst_cols   - bounds used to decide which outputs are written
- * dst_startX,
- * dst_x_off  - used only to derive the output column posX for the bounds checks
- * alpha      - divisor; it is converted to uint4 and applied as an integer division
- *
- * anX and ksX are not defined in this file; presumably build-time macros
- * (confirm against the host code).
- * NOTE(review): posY is always get_group_id(1) << 1, including when the caller
- * passes the second row of the pair, so the row test does not check that second row.
- */
-inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
- int dst_rows, int dst_cols,
- int dst_startX, int dst_x_off,
- float alpha)
-{
- if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1)) // work-items with these local ids produce no output
- {
- return;
- }
-
- uint4 tmp_sum = 0;
- int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4; // column of the first of the four outputs
- int posY = (get_group_id(1) << 1);
-// Add 2*anX+1 four-element loads, each starting at temp[4*local_id + i].
- for(int i=-anX; i<=anX; i++)
- {
- tmp_sum += vload4(get_local_id(0), temp+i);
- }
-// Each of the four outputs is stored only if its own column lies in [0, dst_cols).
- if(posY < dst_rows && posX < dst_cols)
- {
- tmp_sum /= (uint4) alpha;
- if(posX >= 0 && posX < dst_cols)
- *(dst) = tmp_sum.x;
- if(posX+1 >= 0 && posX+1 < dst_cols)
- *(dst + 1) = tmp_sum.y;
- if(posX+2 >= 0 && posX+2 < dst_cols)
- *(dst + 2) = tmp_sum.z;
- if(posX+3 >= 0 && posX+3 < dst_cols)
- *(dst + 3) = tmp_sum.w;
- }
-}
-
-
-// Horizontal pass of the 4-channel 8-bit box filter.
-// Adds the 2*anX+1 column sums temp[lid .. lid + 2*anX], divides the total by alpha
-// in floating point and stores it to *dst as uchar4. The store happens only when the
-// derived column lies in [0, dst_cols) and the row get_group_id(1) << 1 lies in
-// [0, dst_rows). Work-items whose local id is >= THREADS-ksX+1 produce no output.
-// anX and ksX are not defined in this file; presumably build-time macros.
-inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
-                             int dst_rows, int dst_cols,
-                             int dst_startX, int dst_x_off,
-                             float alpha)
-{
-    const size_t lid = get_local_id(0);
-    if (lid >= (THREADS-ksX+1))
-        return;
-
-    uint4 window_sum = 0;
-    for (int tap = -anX; tap <= anX; ++tap)
-        window_sum += temp[lid + anX + tap];
-
-    const int out_col = dst_startX - dst_x_off + lid;
-    const int out_row = (get_group_id(1) << 1);
-    const bool col_ok = out_col >= 0 && out_col < dst_cols;
-    const bool row_ok = out_row >= 0 && out_row < dst_rows;
-
-    if (col_ok && row_ok)
-        *dst = convert_uchar4(convert_float4(window_sum)/alpha);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Box filter for single-channel 8-bit images.
-//
-// Each work-item loads ksY+1 rows of four consecutive source bytes, forms the
-// vertical sums for two adjacent output rows (window rows 0..ksY-1 and 1..ksY),
-// publishes them in __local memory and lets update_dst_C1_D0 do the horizontal pass
-// for each of the two rows.
-//
-//   src, src_offset, src_step        : source bytes, byte offset of the ROI, row pitch
-//   src_whole_rows, src_whole_cols   : bounds used for the border handling
-//   dst, dst_offset, dst_step        : destination bytes, byte offset of the ROI, row pitch
-//   dst_rows, dst_cols               : size of the destination ROI
-//   alpha                            : divisor applied by update_dst_C1_D0
-//
-// anX, anY, ksX and ksY are not defined in this file; presumably build-time macros
-// (confirm against the host code).
-__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
-                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step
-                             )
-{
-
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-    // Split the byte offsets into a row part and a column part.
-    int src_x_off = src_offset % src_step;
-    int src_y_off = src_offset / src_step;
-    int dst_x_off = dst_offset % dst_step;
-    int dst_y_off = dst_offset / dst_step;
-
-    // head_off moves the start back so that dst_startX is a multiple of 4 bytes into the row.
-    int head_off = dst_x_off%4;
-    int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
-    int startY = (gY << 1) - anY + src_y_off;
-    int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
-    int dst_startY = (gY << 1) + dst_y_off;
-
-    uint4 data[ksY+1];
-    __local uint4 temp[2][THREADS];
-
-#ifdef BORDER_CONSTANT
-
-    for(int i=0; i < ksY+1; i++)
-    {
-        if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
-        {
-            // All four bytes are inside the image.
-            data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
-            data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
-            data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
-            data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
-        }
-        else
-        {
-            // Partially or fully outside: start from 0 and load only the bytes that are inside.
-            data[i]=0;
-            int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
-            if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
-            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
-            if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
-            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
-            if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
-            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
-            if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
-        }
-    }
-
-#else
-    int not_all_in_range;
-    for(int i=0; i < ksY+1; i++)
-    {
-        not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
-                           | (startY+i<0) | (startY+i>src_whole_rows-1);
-        if(not_all_in_range)
-        {
-            // Remap the row and each of the four columns through the border macros.
-            int selected_row;
-            int4 selected_col;
-            selected_row = ADDR_H(startY+i, 0, src_whole_rows);
-            selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
-
-            selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
-            selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
-
-            selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
-            selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
-
-            selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
-            selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
-
-            selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
-            selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
-
-            data[i].x = *(src + selected_row * src_step + selected_col.x);
-            data[i].y = *(src + selected_row * src_step + selected_col.y);
-            data[i].z = *(src + selected_row * src_step + selected_col.z);
-            data[i].w = *(src + selected_row * src_step + selected_col.w);
-        }
-        else
-        {
-            data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
-        }
-    }
-#endif
-    // Rows 1..ksY-1 are shared by both output rows.
-    uint4 tmp_sum = 0;
-    for(int i=1; i < ksY; i++)
-    {
-        tmp_sum += (data[i]);
-    }
-
-    int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
-
-    temp[0][col] = tmp_sum + (data[0]);
-    temp[1][col] = tmp_sum + (data[ksY]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
-                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
-    // update_dst_C1_D0 tests only the even row (gY << 1) against dst_rows, so the odd
-    // row of the pair is bounds-checked here. Without this an odd dst_rows lets the
-    // last group write one row past the destination. The helper contains no barrier,
-    // so calling it conditionally is safe.
-    if(((gY << 1) + 1) < dst_rows)
-        update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
-                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
-
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Box filter for 4-channel 8-bit images.
-//
-// Each work-item loads ksY+1 rows of one uchar4 source pixel, forms the vertical sums
-// for two adjacent output rows (window rows 0..ksY-1 and 1..ksY), publishes them in
-// __local memory and lets update_dst_C4_D0 do the horizontal pass for each row.
-//
-//   src, src_offset, src_step        : source pixels, byte offset of the ROI, row pitch in bytes
-//   src_whole_rows, src_whole_cols   : bounds used for the border handling
-//   dst, dst_offset, dst_step        : destination pixels, byte offset of the ROI, row pitch in bytes
-//   dst_rows, dst_cols               : size of the destination ROI
-//   alpha                            : divisor applied by update_dst_C4_D0
-//
-// Byte quantities are shifted right by 2 to index in uchar4 elements.
-// anX, anY, ksX and ksY are not defined in this file; presumably build-time macros
-// (confirm against the host code).
-__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
-                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step
-                             )
-{
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-
-    int src_x_off = (src_offset % src_step) >> 2;
-    int src_y_off = src_offset / src_step;
-    int dst_x_off = (dst_offset % dst_step) >> 2;
-    int dst_y_off = dst_offset / dst_step;
-
-    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
-    int startY = (gY << 1) - anY + src_y_off;
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY << 1) + dst_y_off;
-
-    uint4 data[ksY+1];
-    __local uint4 temp[2][THREADS];
-
-#ifdef BORDER_CONSTANT
-    bool con;
-    for(int i=0; i < ksY+1; i++)
-    {
-        // Positions outside the image contribute 0; src is read only when con holds.
-        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
-        int cur_col = clamp(startX + col, 0, src_whole_cols);
-
-        data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
-        data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
-        data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
-        data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
-    }
-#else
-    for(int i=0; i < ksY+1; i++)
-    {
-        // Remap row and column through the border macros before reading.
-        int selected_row;
-        int selected_col;
-        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
-        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
-
-        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
-        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
-
-
-        data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
-    }
-
-#endif
-    // Rows 1..ksY-1 are shared by both output rows.
-    uint4 tmp_sum = 0;
-    for(int i=1; i < ksY; i++)
-    {
-        tmp_sum += (data[i]);
-    }
-
-    int index = dst_startY * (dst_step>>2)+ dst_startX + col;
-
-    temp[0][col] = tmp_sum + (data[0]);
-    temp[1][col] = tmp_sum + (data[ksY]);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
-                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
-    // update_dst_C4_D0 tests only the even row (gY << 1) against dst_rows, so the odd
-    // row of the pair is bounds-checked here. Without this an odd dst_rows lets the
-    // last group write one row past the destination. The helper contains no barrier,
-    // so calling it conditionally is safe.
-    if(((gY << 1) + 1) < dst_rows)
-        update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
-                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
-
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
-/* boxFilter_C1_D5: box filter for single-channel float images.
- *
- * Each work-item reads ksY+1 vertically adjacent source values for one column and
- * forms the column sums for two consecutive output rows (window rows 0..ksY-1 and
- * 1..ksY). These are stored in __local memory. After the barrier, work-items with
- * col < THREADS-(ksX-1) add 2*anX+1 neighbouring column sums per row and write the
- * total divided by alpha.
- *
- * Under BORDER_CONSTANT, source positions outside the image contribute 0; otherwise
- * the ADDR_* macros remap the coordinates. Byte offsets and steps are shifted right
- * by 2 to index in float elements.
- *
- * anX, anY, ksX and ksY are not defined in this file; presumably build-time macros
- * (confirm against the host code).
- */
-__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
- int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step
- )
-{
- int col = get_local_id(0);
- const int gX = get_group_id(0);
- const int gY = get_group_id(1);
-// Split the byte offsets into a row part and a column part (columns in float elements).
- int src_x_off = (src_offset % src_step) >> 2;
- int src_y_off = src_offset / src_step;
- int dst_x_off = (dst_offset % dst_step) >> 2;
- int dst_y_off = dst_offset / dst_step;
-// Each group advances THREADS-ksX+1 columns and two rows.
- int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
- int startY = (gY << 1) - anY + src_y_off;
- int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
- int dst_startY = (gY << 1) + dst_y_off;
- float data[ksY+1];
- __local float temp[2][THREADS];
-#ifdef BORDER_CONSTANT
- bool con;
- float ss;
- for(int i=0; i < ksY+1; i++)
- {
- con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
-// When con is false data[i] becomes 0 below, whatever cur_col and ss evaluate to.
- int cur_col = clamp(startX + col, 0, src_whole_cols);
- ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
-
- data[i] = con ? ss : 0.f;
- }
-#else
- for(int i=0; i < ksY+1; i++)
- {
- int selected_row;
- int selected_col;
- selected_row = ADDR_H(startY+i, 0, src_whole_rows);
- selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
-
- selected_col = ADDR_L(startX+col, 0, src_whole_cols);
- selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
-
- data[i] = src[selected_row * (src_step>>2) + selected_col];
- }
-
-#endif
- float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
- for(int i=1; i < ksY; i++) // rows 1..ksY-1 are shared by both output rows
- {
- sum0 += (data[i]);
- }
- sum1 = sum0 + (data[0]); // column sum for the first output row
- sum2 = sum0 + (data[ksY]); // column sum for the second output row
- temp[0][col] = sum1;
- temp[1][col] = sum2;
- barrier(CLK_LOCAL_MEM_FENCE);
- if(col < (THREADS-(ksX-1)))
- {
- col += anX; // index of the window centre in temp
- int posX = dst_startX - dst_x_off + col - anX;
- int posY = (gY << 1);
-
- float tmp_sum[2]= {0.0, 0.0};
- for(int k=0; k<2; k++)
- for(int i=-anX; i<=anX; i++)
- {
- tmp_sum[k] += temp[k][col+i];
- }
- for(int i=0; i<2; i++)
- {
- if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) // each of the two rows is bounds-checked separately
- dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
- }
-
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
-/* boxFilter_C4_D5: box filter for 4-channel float images.
- *
- * Each work-item reads ksY+1 vertically adjacent float4 source pixels for one column
- * and forms the column sums for two consecutive output rows (window rows 0..ksY-1
- * and 1..ksY). These are stored in __local memory. After the barrier, work-items
- * with col < THREADS-(ksX-1) add 2*anX+1 neighbouring column sums per row and write
- * the total divided by alpha.
- *
- * Under BORDER_CONSTANT, source positions outside the image contribute 0; otherwise
- * the ADDR_* macros remap the coordinates. Byte offsets and steps are shifted right
- * by 4 to index in float4 elements.
- *
- * anX, anY, ksX and ksY are not defined in this file; presumably build-time macros
- * (confirm against the host code).
- */
-__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
- int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step
- )
-{
- int col = get_local_id(0);
- const int gX = get_group_id(0);
- const int gY = get_group_id(1);
-// Split the byte offsets into a row part and a column part (columns in float4 elements).
- int src_x_off = (src_offset % src_step) >> 4;
- int src_y_off = src_offset / src_step;
- int dst_x_off = (dst_offset % dst_step) >> 4;
- int dst_y_off = dst_offset / dst_step;
-// Each group advances THREADS-ksX+1 columns and two rows.
- int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
- int startY = (gY << 1) - anY + src_y_off;
- int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
- int dst_startY = (gY << 1) + dst_y_off;
- float4 data[ksY+1];
- __local float4 temp[2][THREADS];
-#ifdef BORDER_CONSTANT
- bool con;
- float4 ss;
- for(int i=0; i < ksY+1; i++)
- {
- con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
-// When con is false data[i] becomes 0 below, whatever cur_col and ss evaluate to.
- int cur_col = clamp(startX + col, 0, src_whole_cols);
- ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
-
- data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
- }
-#else
- for(int i=0; i < ksY+1; i++)
- {
- int selected_row;
- int selected_col;
- selected_row = ADDR_H(startY+i, 0, src_whole_rows);
- selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
-
- selected_col = ADDR_L(startX+col, 0, src_whole_cols);
- selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
-
- data[i] = src[selected_row * (src_step>>4) + selected_col];
- }
-
-#endif
- float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
- for(int i=1; i < ksY; i++) // rows 1..ksY-1 are shared by both output rows
- {
- sum0 += (data[i]);
- }
- sum1 = sum0 + (data[0]); // column sum for the first output row
- sum2 = sum0 + (data[ksY]); // column sum for the second output row
- temp[0][col] = sum1;
- temp[1][col] = sum2;
- barrier(CLK_LOCAL_MEM_FENCE);
- if(col < (THREADS-(ksX-1)))
- {
- col += anX; // index of the window centre in temp
- int posX = dst_startX - dst_x_off + col - anX;
- int posY = (gY << 1);
-
- float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
- for(int k=0; k<2; k++)
- for(int i=-anX; i<=anX; i++)
- {
- tmp_sum[k] += temp[k][col+i];
- }
- for(int i=0; i<2; i++)
- {
- if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows) // each of the two rows is bounds-checked separately
- dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
- }
-
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-// Gradient magnitude of the derivative pair (x, y):
-// the Euclidean (L2) norm when L2GRAD is defined, the L1 norm |x| + |y| otherwise.
-#ifdef L2GRAD
-inline float calc(int x, int y)
-{
-    const int squared_norm = x * x + y * y;
-    return sqrt((float)squared_norm);
-}
-#else
-inline float calc(int x, int y)
-{
-    const float abs_x = (float)abs(x);
-    return abs_x + abs(y);
-}
-#endif // L2GRAD
-
-// Horizontal (row) pass of the separable 3x3 Sobel operator.
-//
-// Only the 3x3 Sobel kernel is supported. It separates into
-//   h (-1) =  1, h (0) = 2, h (1) = 1   (triangle smoothing filter)
-//   h'(-1) = -1, h'(0) = 0, h'(1) = 1   (derivative filter)
-// so the 2D operator for the x direction is h'(x)h(y). This pass filters along x only:
-//   dx_buf(x, y) = -src(x - 1, y) + src(x + 1, y)
-//   dy_buf(x, y) =  src(x - 1, y) + 2 * src(x, y) + src(x + 1, y)
-// calcMagnitude_buf applies the complementary filters along y.
-//
-// Must be launched with 16x16 work-groups (see reqd_work_group_size).
-//
-// src            input 8bit single channel image data
-// dx_buf         output dx buffer
-// dy_buf         output dy buffer
-// rows, cols     image size
-// src_step       row pitch of src, used directly as an element index
-// src_offset     offset of the first src element, used directly as an element index
-// dx_buf_step, dx_buf_offset, dy_buf_step, dy_buf_offset
-//                pitch/offset of the outputs; divided by sizeof(int) before use
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-calcSobelRowPass
-(
-    __global const uchar * src,
-    __global int * dx_buf,
-    __global int * dy_buf,
-    int rows,
-    int cols,
-    int src_step,
-    int src_offset,
-    int dx_buf_step,
-    int dx_buf_offset,
-    int dy_buf_step,
-    int dy_buf_offset
-)
-{
-    dx_buf_step   /= sizeof(*dx_buf);
-    dx_buf_offset /= sizeof(*dx_buf);
-    dy_buf_step   /= sizeof(*dy_buf);
-    dy_buf_offset /= sizeof(*dy_buf);
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    // One row of 16 pixels per lidy, plus one halo column on each side.
-    __local int smem[16][18];
-
-    // Rows below the image replicate the last row.
-    const int row_base = min(gidy, rows - 1) * src_step + src_offset;
-
-    // Columns are clamped as well: when cols is not a multiple of 16 the work-items with
-    // gidx >= cols provide the right-hand neighbour of the last valid column, so they must
-    // replicate the border pixel instead of reading past the end of the row.
-    smem[lidy][lidx + 1] = src[min(gidx, cols - 1) + row_base];
-    if(lidx == 0)
-    {
-        // The first work-item of each row also loads the two halo columns.
-        smem[lidy][0]  = src[min(max(gidx - 1, 0), cols - 1) + row_base];
-        smem[lidy][17] = src[min(gidx + 16, cols - 1) + row_base];
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(gidy < rows && gidx < cols)
-    {
-        // smem[lidy][lidx], [lidx + 1], [lidx + 2] are the left, centre and right pixels.
-        dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
-            -smem[lidy][lidx] + smem[lidy][lidx + 2];
-        dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
-            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
-    }
-}
-
-// Vertical (column) pass of the separable 3x3 Sobel operator plus gradient magnitude (buffered version).
-// Applies [1 2 1] along y to dx_buf and [-1 0 1] along y to dy_buf, then stores calc(dx, dy) into mag.
-//
-// dx_buf input, row-pass x buffer calculated by calcSobelRowPass
-// dy_buf input, row-pass y buffer calculated by calcSobelRowPass
-// dx output, derivative in x direction
-// dy output, derivative in y direction
-// mag output, gradient magnitude, written at (x + 1, y + 1), i.e. shifted by one row and one column
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-calcMagnitude_buf
-(
- __global const int * dx_buf,
- __global const int * dy_buf,
- __global int * dx,
- __global int * dy,
- __global float * mag,
- int rows,
- int cols,
- int dx_buf_step,
- int dx_buf_offset,
- int dy_buf_step,
- int dy_buf_offset,
- int dx_step,
- int dx_offset,
- int dy_step,
- int dy_offset,
- int mag_step,
- int mag_offset
-)
-{
- dx_buf_step /= sizeof(*dx_buf); // every step/offset is divided by its element size before being used as an index
- dx_buf_offset /= sizeof(*dx_buf);
- dy_buf_step /= sizeof(*dy_buf);
- dy_buf_offset /= sizeof(*dy_buf);
- dx_step /= sizeof(*dx);
- dx_offset /= sizeof(*dx);
- dy_step /= sizeof(*dy);
- dy_offset /= sizeof(*dy);
- mag_step /= sizeof(*mag);
- mag_offset /= sizeof(*mag);
-
- int gidx = get_global_id(0);
- int gidy = get_global_id(1);
-
- int lidx = get_local_id(0);
- int lidy = get_local_id(1);
-
- __local int sdx[18][16]; // 16 rows of the tile plus one halo row above and below
- __local int sdy[18][16];
-
- sdx[lidy + 1][lidx] =
- dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset]; // rows below the image replicate the last row; NOTE(review): gidx is not clamped to cols - 1 - confirm the buffers are padded
- sdy[lidy + 1][lidx] =
- dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
- if(lidy == 0) // the first row of work-items also loads both halo rows
- {
- sdx[0][lidx] =
- dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset]; // row above the tile, clamped to the image
- sdx[17][lidx] =
- dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset]; // row below the tile, clamped to the image
-
- sdy[0][lidx] =
- dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
- sdy[17][lidx] =
- dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
- }
- barrier(CLK_LOCAL_MEM_FENCE); // the whole tile must be loaded before neighbours are read
-
- if(gidx < cols && gidy < rows)
- {
- int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx]; // [1 2 1] along y
- int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx]; // [-1 0 1] along y
-
- dx[gidx + gidy * dx_step + dx_offset] = x;
- dy[gidx + gidy * dy_step + dy_offset] = y;
-
- mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y); // stored one row and one column in from the origin of mag
- }
-}
-
-// Gradient magnitude computed from already available derivatives.
-// This is the non-buffered version (non-3x3 Sobel).
-//
-// dx   input, derivative in x direction
-// dy   input, derivative in y direction
-// mag  output, calc(dx, dy) written at (x + 1, y + 1), i.e. shifted by one row and one column
-//
-// All steps and offsets are divided by the element size before being used as indices.
-__kernel
-void calcMagnitude
-(
-    __global const int * dx,
-    __global const int * dy,
-    __global float * mag,
-    int rows,
-    int cols,
-    int dx_step,
-    int dx_offset,
-    int dy_step,
-    int dy_offset,
-    int mag_step,
-    int mag_offset
-)
-{
-    const int px = get_global_id(0);
-    const int py = get_global_id(1);
-
-    // Work-items that fall outside the image have nothing to do.
-    if (px >= cols || py >= rows)
-        return;
-
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-
-    const int grad_x = dx[px + py * dx_step + dx_offset];
-    const int grad_y = dy[px + py * dy_step + dy_offset];
-
-    mag[(px + 1) + (py + 1) * mag_step + mag_offset] = calc(grad_x, grad_y);
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// 0.4142135623730950488016887242097 is tan(22.5)
-// TG22 is that value in fixed point with CANNY_SHIFT fractional bits.
-#define CANNY_SHIFT 15
-#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-
-// First pass of edge detection and non-maximum suppression.
-// For each pixel the edge type is set to:
-// 0 - magnitude not above the low threshold, or not a local maximum: not an edge
-// 1 - local maximum with magnitude above the low threshold only: maybe an edge
-// 2 - local maximum with magnitude above the high threshold: is an edge
-//
-// Given estimates of the image gradients, a search is carried out to determine whether the
-// gradient magnitude assumes a local maximum in the gradient direction:
-// - if the rounded gradient angle is 0 degrees (edge runs north-south), the magnitude is
-//   compared with the west and east neighbours;
-// - if the rounded gradient angle is 90 degrees (edge runs east-west), the magnitude is
-//   compared with the north and south neighbours;
-// - if the rounded gradient angle is 135 degrees (edge runs north east-south west), the
-//   magnitude is compared with the north west and south east neighbours;
-// - if the rounded gradient angle is 45 degrees (edge runs north west-south east), the
-//   magnitude is compared with the north east and south west neighbours.
-//
-// Must be launched with 16x16 work-groups (see reqd_work_group_size).
-//
-// dx, dy       derivatives in x and y direction
-// mag          magnitudes calculated by calcMagnitude / calcMagnitude_buf; pixel (x, y) is
-//              read from (x + 1, y + 1)
-// map          output containing raw edge types; pixel (x, y) is written at (x + 1, y + 1)
-// rows, cols   image size
-// low_thresh, high_thresh   hysteresis thresholds compared against the magnitude
-// *_step, *_offset          divided by the element size before being used as indices
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-calcMap
-(
-    __global const int * dx,
-    __global const int * dy,
-    __global const float * mag,
-    __global int * map,
-    int rows,
-    int cols,
-    float low_thresh,
-    float high_thresh,
-    int dx_step,
-    int dx_offset,
-    int dy_step,
-    int dy_offset,
-    int mag_step,
-    int mag_offset,
-    int map_step,
-    int map_offset
-)
-{
-    dx_step    /= sizeof(*dx);
-    dx_offset  /= sizeof(*dx);
-    dy_step    /= sizeof(*dy);
-    dy_offset  /= sizeof(*dy);
-    mag_step   /= sizeof(*mag);
-    mag_offset /= sizeof(*mag);
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
-
-    // Apply every offset once, up front. The dx/dy offsets used to be converted but never
-    // applied, so derivative buffers with a non-zero offset were read from the wrong place.
-    dx  += dx_offset;
-    dy  += dy_offset;
-    mag += mag_offset;
-    map += map_offset;
-
-    // 16x16 tile of magnitudes plus a one-element halo on every side.
-    __local float smem[18][18];
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    // Origin of this work-group's tile (global id rounded down to a multiple of 16).
-    int grp_idx = get_global_id(0) & 0xFFFFF0;
-    int grp_idy = get_global_id(1) & 0xFFFFF0;
-
-    // The 256 work-items are re-indexed as rows of 18 so that they can fill the 18x18 tile
-    // in two steps: rows 0..13 first, then rows 14..17.
-    int tid = lidx + lidy * 16;
-    int lx = tid % 18;
-    int ly = tid / 18;
-    if(ly < 14)
-    {
-        // NOTE(review): the row index is clamped to rows - 1 although pixel (x, y) lives in
-        // mag row y + 1; confirm this clamp against the host-side size of mag.
-        smem[ly][lx] =
-            mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
-    }
-    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-    {
-        smem[ly + 14][lx] =
-            mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(gidy < rows && gidx < cols)
-    {
-        int x = dx[gidx + gidy * dx_step];
-        int y = dy[gidx + gidy * dy_step];
-        // s is -1 when dx and dy have opposite signs, +1 otherwise; it selects the diagonal.
-        const int s = (x ^ y) < 0 ? -1 : 1;
-        const float m = smem[lidy + 1][lidx + 1];
-        x = abs(x);
-        y = abs(y);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-        if(m > low_thresh)
-        {
-            // Fixed-point |dx| * tan(22.5) and |dx| * tan(67.5); tan(67.5) = tan(22.5) + 2.
-            const int tg22x = x * TG22;
-            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-            y <<= CANNY_SHIFT;
-            if(y < tg22x)
-            {
-                // Gradient is closest to horizontal: compare with the left and right neighbours.
-                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-            else if (y > tg67x)
-            {
-                // Gradient is closest to vertical: compare with the upper and lower neighbours.
-                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-            else
-            {
-                // Diagonal gradient: compare with the two neighbours on the diagonal chosen by s.
-                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
-                {
-                    edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
-        }
-        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
-    }
-}
-
-#undef CANNY_SHIFT
-#undef TG22
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// Local hysteresis for pixels whose edge type is 1, done per 16x16 tile in local memory.
-//
-// If a candidate pixel (edge type 1) has a neighbour pixel (in its 3x3 area) with type 2, it is considered part of an edge and
-// marked as an edge. Each work-item repeats this test 16 times to connect local edges.
-// A pixel that ends up as an edge is then tested for nearby potential edge points (type 1). If there are any, counter is
-// incremented by 1 and the pixel location is stored in st. These candidates are processed further by the next kernel.
-//
-// map raw edge type results calculated by calcMap; updated in place at (x + 1, y + 1)
-// st output, the potential edge points found in this kernel call, stored as (x + 1, y + 1)
-// counter the number of potential edge points, incremented atomically
-__kernel
-void
-__attribute__((reqd_work_group_size(16,16,1)))
-edgesHysteresisLocal
-(
- __global int * map,
- __global ushort2 * st,
- __global unsigned int * counter,
- int rows,
- int cols,
- int map_step,
- int map_offset
-)
-{
- map_step /= sizeof(*map); // step and offset are divided by the element size before being used as indices
- map_offset /= sizeof(*map);
-
- map += map_offset;
-
- __local int smem[18][18]; // 16x16 tile of edge types plus a one-element halo on every side
-
- int gidx = get_global_id(0);
- int gidy = get_global_id(1);
-
- int lidx = get_local_id(0);
- int lidy = get_local_id(1);
-
- int grp_idx = get_global_id(0) & 0xFFFFF0; // tile origin: global id rounded down to a multiple of 16
- int grp_idy = get_global_id(1) & 0xFFFFF0;
-
- int tid = lidx + lidy * 16; // the 256 work-items are re-indexed as rows of 18 to fill the 18x18 tile
- int lx = tid % 18;
- int ly = tid / 18;
- if(ly < 14) // first step: tile rows 0..13
- {
- smem[ly][lx] =
- map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
- }
- if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) // second step: tile rows 14..17, only where they exist in map
- {
- smem[ly + 14][lx] =
- map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(gidy < rows && gidx < cols)
- {
- int n;
-
- #pragma unroll
- for (int k = 0; k < 16; ++k) // NOTE(review): no barrier between iterations, so how far a promotion spreads per call presumably depends on execution order - confirm
- {
- n = 0;
-
- if (smem[lidy + 1][lidx + 1] == 1) // candidate: count the 8 neighbours that are definite edges
- {
- n += smem[lidy ][lidx ] == 2;
- n += smem[lidy ][lidx + 1] == 2;
- n += smem[lidy ][lidx + 2] == 2;
-
- n += smem[lidy + 1][lidx ] == 2;
- n += smem[lidy + 1][lidx + 2] == 2;
-
- n += smem[lidy + 2][lidx ] == 2;
- n += smem[lidy + 2][lidx + 1] == 2;
- n += smem[lidy + 2][lidx + 2] == 2;
- }
-
- if (n > 0)
- smem[lidy + 1][lidx + 1] = 2; // promote the candidate to a definite edge
- }
-
- const int e = smem[lidy + 1][lidx + 1];
- map[gidx + 1 + (gidy + 1) * map_step] = e; // write the (possibly promoted) type back
-
- n = 0;
- if(e == 2) // definite edge: count the 8 neighbours that are still candidates
- {
- n += smem[lidy ][lidx ] == 1;
- n += smem[lidy ][lidx + 1] == 1;
- n += smem[lidy ][lidx + 2] == 1;
-
- n += smem[lidy + 1][lidx ] == 1;
- n += smem[lidy + 1][lidx + 2] == 1;
-
- n += smem[lidy + 2][lidx ] == 1;
- n += smem[lidy + 2][lidx + 1] == 1;
- n += smem[lidy + 2][lidx + 2] == 1;
- }
-
- if(n > 0)
- {
- unsigned int ind = atomic_inc(counter); // reserve one slot in st
- st[ind] = (ushort2)(gidx + 1, gidy + 1); // stored in map coordinates (shifted by one)
- }
- }
-}
-
-// Offsets of the 8 neighbours of a pixel, indexed by (local id & 7).
-__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
-__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
-
-
-// Global hysteresis pass: grows edges starting from the seed points collected earlier.
-//
-// Each work-group takes one seed position from st1. Its work-items visit the 8 neighbours
-// of the seed; every neighbour whose edge type is 1 is promoted to 2 and pushed on a
-// work-group local stack. The stack is then drained, 16 positions at a time (8 work-items
-// per position), until it is empty or too full to take another round. Positions still on
-// the stack at that point are appended to st2, and counter is advanced by their number.
-//
-// Must be launched with 128x1 work-groups (see reqd_work_group_size).
-//
-// map          edge type map, updated in place
-// st1          input seed positions (map coordinates, i.e. pixel + 1)
-// st2          output positions that still need processing
-// counter      number of entries written to st2
-// rows, cols   image size
-// count        number of valid entries in st1
-// map_step, map_offset   divided by sizeof(int) before being used as indices
-#define stack_size 512
-__kernel
-void
-__attribute__((reqd_work_group_size(128,1,1)))
-edgesHysteresisGlobal
-(
-    __global int * map,
-    __global ushort2 * st1,
-    __global ushort2 * st2,
-    __global int * counter,
-    int rows,
-    int cols,
-    int count,
-    int map_step,
-    int map_offset
-)
-{
-
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
-
-    map += map_offset;
-
-    int lidx = get_local_id(0);
-
-    int grp_idx = get_group_id(0);
-    int grp_idy = get_group_id(1);
-
-    __local unsigned int s_counter;     // number of positions currently on the local stack
-    __local unsigned int s_ind;         // base index in st2 reserved for this work-group
-
-    __local ushort2 s_st[stack_size];   // work-group local stack of positions
-
-    if(lidx == 0)
-    {
-        s_counter = 0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Index of the seed handled by this work-group.
-    // NOTE(review): the group row is scaled by get_local_size(0); confirm against the host
-    // launch configuration that this is the intended pitch of the group grid.
-    int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
-
-    if(ind < count)
-    {
-        ushort2 pos = st1[ind];
-        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-        {
-            if (lidx < 8)
-            {
-                // The first 8 work-items each inspect one neighbour of the seed.
-                pos.x += c_dx[lidx];
-                pos.y += c_dy[lidx];
-
-                if (map[pos.x + pos.y * map_step] == 1)
-                {
-                    map[pos.x + pos.y * map_step] = 2;
-
-                    ind = atomic_inc(&s_counter);
-
-                    s_st[ind] = pos;
-                }
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            // Keep draining while there is work and the stack still has room for the worst
-            // case of one push per work-item.
-            while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
-            {
-                const int subTaskIdx = lidx >> 3;
-                const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
-
-                pos.x = pos.y = 0;
-
-                // Groups of 8 work-items share one stack entry, taken from the top.
-                if (subTaskIdx < portion)
-                    pos = s_st[s_counter - 1 - subTaskIdx];
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                if (lidx == 0)
-                    s_counter -= portion;
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-                {
-                    pos.x += c_dx[lidx & 7];
-                    pos.y += c_dy[lidx & 7];
-
-                    if (map[pos.x + pos.y * map_step] == 1)
-                    {
-                        map[pos.x + pos.y * map_step] = 2;
-
-                        ind = atomic_inc(&s_counter);
-
-                        s_st[ind] = pos;
-                    }
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-            }
-
-            if (s_counter > 0)
-            {
-                if (lidx == 0)
-                {
-                    // atomic_add returns the value stored *before* the addition, which is
-                    // exactly the first st2 slot reserved for this work-group. Subtracting
-                    // s_counter from it (as was done before) produced an index below the
-                    // reserved range.
-                    s_ind = atomic_add(counter, s_counter);
-                }
-                barrier(CLK_LOCAL_MEM_FENCE);
-
-                ind = s_ind;
-
-                // Flush the remaining stack entries to the global list.
-                for (int i = lidx; i < s_counter; i += get_local_size(0))
-                {
-                    st2[ind + i] = s_st[i];
-                }
-            }
-        }
-    }
-}
-#undef stack_size
-
-// Get the edge result. Pixels whose edge type is 2 are marked as edge points and set to 255,
-// all others are set to 0.
-//
-// map          edge type map; pixel (x, y) is read from (x + 1, y + 1)
-// dst          edge output, 8bit single channel
-// rows, cols   image size
-// map_step, map_offset   divided by sizeof(int) before being used as indices
-// dst_step, dst_offset   pitch and offset of dst, used directly as element indices
-__kernel
-void getEdges
-(
-    __global const int * map,
-    __global uchar * dst,
-    int rows,
-    int cols,
-    int map_step,
-    int map_offset,
-    int dst_step,
-    int dst_offset
-)
-{
-    map_step   /= sizeof(*map);
-    map_offset /= sizeof(*map);
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    if(gidy < rows && gidx < cols)
-    {
-        // type >> 1 is 1 only for type 2; negating and narrowing to uchar turns that into 255.
-        // dst_offset is now applied; it used to be ignored, so a destination with a non-zero
-        // offset was written at the wrong location.
-        dst[gidx + gidy * dst_step + dst_offset] =
-            (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
-    }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// Fallback when the build does not define WAVE_SIZE.
-#ifndef WAVE_SIZE
-#define WAVE_SIZE 1
-#endif
-
-// Turns per-work-item values into an inclusive running sum over 256 bins.
-// Every work-item stores val in smem[tid]; work-item 0 then accumulates the 256 entries
-// serially, and each work-item returns the accumulated value of its own bin.
-int calc_lut(__local int* smem, int val, int tid)
-{
-    smem[tid] = val;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (tid == 0)
-    {
-        int bin = 1;
-        while (bin < 256)
-        {
-            smem[bin] += smem[bin - 1];
-            ++bin;
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    return smem[tid];
-}
-/* reduce: sums one value per work-item through local memory.
- *
- * Both variants index smem[tid + 128], so they expect 256 participating work-items
- * (presumably one per histogram bin - confirm against the launch configuration).
- *
- * CPU variant:     a barrier follows every halving step; the total is left in smem[256].
- * default variant: barriers are used down to the tid < 32 step. The remaining steps are
- *                  spliced together by the preprocessor depending on WAVE_SIZE: a barrier
- *                  and a narrower "if" are inserted before the 16 step when WAVE_SIZE < 32
- *                  and before the 8 step when WAVE_SIZE < 16; the 8/4/2/1 steps always run
- *                  without barriers in between. The total is left in smem[0].
- *                  NOTE(review): the barrier-free steps presumably rely on the work-items
- *                  involved executing in lockstep - confirm for the targeted devices.
- */
-#ifdef CPU
-void reduce(volatile __local int* smem, int val, int tid)
-{
- smem[tid] = val;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 128)
- smem[tid] = val += smem[tid + 128]; // fold the upper half onto the lower half
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 64)
- smem[tid] = val += smem[tid + 64];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 32)
- smem[tid] += smem[tid + 32];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 16)
- smem[tid] += smem[tid + 16];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 8)
- smem[tid] += smem[tid + 8];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 4)
- smem[tid] += smem[tid + 4];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 2)
- smem[tid] += smem[tid + 2];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 1)
- smem[256] = smem[tid] + smem[tid + 1]; // the total goes to slot 256, not slot 0
- barrier(CLK_LOCAL_MEM_FENCE);
-}
-
-#else
-
-void reduce(__local volatile int* smem, int val, int tid)
-{
- smem[tid] = val;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 128)
- smem[tid] = val += smem[tid + 128]; // fold the upper half onto the lower half
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 64)
- smem[tid] = val += smem[tid + 64];
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 32)
- {
- smem[tid] += smem[tid + 32];
-#if WAVE_SIZE < 32
- } barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 16)
- {
-#endif
- smem[tid] += smem[tid + 16];
-#if WAVE_SIZE < 16
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (tid < 8)
- {
-#endif
- smem[tid] += smem[tid + 8]; // from here on there are no barriers between the steps
- smem[tid] += smem[tid + 4];
- smem[tid] += smem[tid + 2];
- smem[tid] += smem[tid + 1]; // work-item 0 ends up with the total in smem[0]
- }
-}
-#endif
-
-// Builds one 256-entry lookup table per tile from the tile's clipped, cumulative histogram.
-//
-// One work-group handles the tile (get_group_id(0), get_group_id(1)). The work-items first
-// build the tile histogram in local memory, optionally clip every bin at clipLimit and
-// spread the clipped samples evenly over all 256 bins, and finally scale the running sum
-// of the histogram by lutScale to obtain the table entry of bin tid.
-// reduce() and calc_lut() index 256 bins, so the kernel presumably expects 256 work-items
-// per work-group - confirm against the host launch configuration.
-//
-// src          input 8bit single channel image
-// lut          output tables, one row of dstStep entries per tile
-// srcStep      row pitch of src
-// dstStep      row pitch of lut
-// tileSize     tile width (x) and height (y) in pixels
-// tilesX       number of tiles per row
-// clipLimit    maximum bin count; clipping is skipped when it is not positive
-// lutScale     factor applied to the cumulative histogram
-// src_offset, dst_offset   offsets of the first element of src / lut
-__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
-                      const int srcStep, const int dstStep,
-                      const int2 tileSize, const int tilesX,
-                      const int clipLimit, const float lutScale,
-                      const int src_offset, const int dst_offset)
-{
-    __local int smem[512];
-    // OpenCL C requires __local variables of a kernel to be declared at kernel function
-    // scope, so this declaration lives here rather than inside the clipping branch below.
-    __local int totalClipped;
-
-    const int tx = get_group_id(0);
-    const int ty = get_group_id(1);
-    const unsigned int tid = get_local_id(1) * get_local_size(0)
-                             + get_local_id(0);
-
-    smem[tid] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Histogram of the tile: the work-items stride over its rows and columns.
-    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
-    {
-        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
-        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
-        {
-            const int data = srcPtr[j];
-            atomic_inc(&smem[data]);
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int tHistVal = smem[tid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (clipLimit > 0)
-    {
-        // clip histogram bar
-        int clipped = 0;
-        if (tHistVal > clipLimit)
-        {
-            clipped = tHistVal - clipLimit;
-            tHistVal = clipLimit;
-        }
-
-        // find number of overall clipped samples
-        reduce(smem, clipped, tid);
-        barrier(CLK_LOCAL_MEM_FENCE);
-#ifdef CPU
-        clipped = smem[256];
-#else
-        clipped = smem[0];
-#endif
-
-        // broadcast evaluated value
-        if (tid == 0)
-            totalClipped = clipped;
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // redistribute clipped samples evenly
-        int redistBatch = totalClipped / 256;
-        tHistVal += redistBatch;
-
-        // the first `residual` bins take one extra sample each
-        int residual = totalClipped - redistBatch * 256;
-        if (tid < residual)
-            ++tHistVal;
-    }
-
-    const int lutVal = calc_lut(smem, tHistVal, tid);
-    uint ires = (uint)convert_int_rte(lutScale * lutVal);
-    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
-        convert_uchar(clamp(ires, (uint)0, (uint)255));
-}
-
-// Maps every source pixel through the lookup tables of the four surrounding tiles and
-// blends the four results bilinearly according to the pixel's position between the tile
-// centres. Tile indices are clamped to the tile grid at the image border.
-//
-// src, dst     input and output 8bit images
-// lut          per-tile lookup tables, one row of lutStep entries per tile
-// srcStep, dstStep, lutStep   row pitches
-// cols, rows   image size
-// tileSize     tile width (x) and height (y) in pixels
-// tilesX, tilesY              number of tiles per row / column
-// src_offset, dst_offset, lut_offset   offsets of the first element of each buffer
-__kernel void transform(__global __const uchar * src,
-                        __global uchar * dst,
-                        __global uchar * lut,
-                        const int srcStep, const int dstStep, const int lutStep,
-                        const int cols, const int rows,
-                        const int2 tileSize,
-                        const int tilesX, const int tilesY,
-                        const int src_offset, const int dst_offset, int lut_offset)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x >= cols || y >= rows)
-        return;
-
-    // Vertical position measured in tiles, relative to the tile centres.
-    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
-    const int tyFloor = convert_int_rtn(tyf);
-    const float ya = tyf - tyFloor;
-    const int ty1 = max(tyFloor, 0);
-    const int ty2 = min(tyFloor + 1, tilesY - 1);
-
-    // Horizontal position measured in tiles, relative to the tile centres.
-    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
-    const int txFloor = convert_int_rtn(txf);
-    const float xa = txf - txFloor;
-    const int tx1 = max(txFloor, 0);
-    const int tx2 = min(txFloor + 1, tilesX - 1);
-
-    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
-    const int lutCol = srcVal + lut_offset;
-
-    // Table entries of the four neighbouring tiles for this pixel value.
-    const int idxTopLeft     = mad24(ty1 * tilesX + tx1, lutStep, lutCol);
-    const int idxTopRight    = mad24(ty1 * tilesX + tx2, lutStep, lutCol);
-    const int idxBottomLeft  = mad24(ty2 * tilesX + tx1, lutStep, lutCol);
-    const int idxBottomRight = mad24(ty2 * tilesX + tx2, lutStep, lutCol);
-
-    float res = 0;
-
-    res += lut[idxTopLeft]     * ((1.0f - xa) * (1.0f - ya));
-    res += lut[idxTopRight]    * ((xa) * (1.0f - ya));
-    res += lut[idxBottomLeft]  * ((1.0f - xa) * (ya));
-    res += lut[idxBottomRight] * ((xa) * (ya));
-
-    uint ires = (uint)convert_int_rte(res);
-    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Jiang Liyuan, jlyuan001.good@163.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (__ATI__)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (__NVIDIA__)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
-/************************************** convolve **************************************/
-
-// Correlates a float image with a kWidth x kHeight float kernel.
-//
-// Each 16x16 work-group stages a 32x32 patch of the source in local memory: its own tile
-// plus 8 extra pixels on every side, with coordinates clamped to the image. Every work-item
-// loads the four patch elements that lie 16 apart in x and/or y.
-//
-// src          input image
-// temp1        kernel coefficients, row pitch k_step, first element at koffset
-// dst          output image
-// rows, cols   image size
-// src_step, dst_step, k_step      row pitches, used directly as element indices
-// src_offset, dst_offset, koffset offsets of the first element of each buffer
-__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
-                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
-                          int src_offset, int dst_offset, int koffset)
-{
-    __local float smem[16 + 2 * 8][16 + 2 * 8];
-
-    const int x = get_local_id(0);
-    const int y = get_local_id(1);
-    const int gx = get_global_id(0);
-    const int gy = get_global_id(1);
-
-    // Clamped source rows (already multiplied by the pitch) and columns of the four loads.
-    const int rowUpper = min(max(gy - 8, 0), rows - 1) * src_step;
-    const int rowLower = min(gy + 8, rows - 1) * src_step;
-    const int colLeft  = min(max(gx - 8, 0), cols - 1);
-    const int colRight = min(gx + 8, cols - 1);
-
-    // upper-left, upper-right, lower-left and lower-right quadrant of the patch
-    smem[y][x]           = src[rowUpper + colLeft + src_offset];
-    smem[y][x + 16]      = src[rowUpper + colRight + src_offset];
-    smem[y + 16][x]      = src[rowLower + colLeft + src_offset];
-    smem[y + 16][x + 16] = src[rowLower + colRight + src_offset];
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Work-items outside the image only helped with the loads.
-    if (gx >= cols || gy >= rows)
-        return;
-
-    // Top-left corner of the kernel window inside the patch.
-    const int winY = y + 8 - kHeight / 2;
-    const int winX = x + 8 - kWidth / 2;
-
-    float res = 0;
-
-    for (int i = 0; i < kHeight; ++i)
-    {
-        for (int j = 0; j < kWidth; ++j)
-        {
-            res += smem[winY + i][winX + j] * temp1[i * k_step + j + koffset];
-        }
-    }
-
-    dst[gy * dst_step + gx + dst_offset] = res;
-}
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Niko Li, newlife20080214@gmail.com
-// Zero Lin zero.lin@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_amd_fp64
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#elif defined (cl_khr_fp64)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-#endif
-/* EXTRAPOLATE(x, y, v): stores into v the value for a source coordinate (x, y) that lies
- * outside the image. Exactly one border mode must be selected with a BORDER_* macro,
- * otherwise compilation stops with #error.
- *
- * The expansion refers to scalar, src, src_cols, src_rows, src_step and src_offset, which
- * must be visible where the macro is used. Except for BORDER_CONSTANT, x and y are
- * modified in place, so they have to be assignable variables.
- *
- *   BORDER_CONSTANT     v = scalar
- *   BORDER_REPLICATE    x and y are clamped to the image
- *   BORDER_WRAP         x and y are wrapped around modulo the image size
- *   BORDER_REFLECT      x and y are mirrored at the border, repeatedly until inside (delta = 0)
- *   BORDER_REFLECT_101  same, using delta = 1
- */
-#ifdef BORDER_CONSTANT
-#define EXTRAPOLATE(x, y, v) v = scalar;
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(x, y, v) \
- { \
- x = max(min(x, src_cols - 1), 0); \
- y = max(min(y, src_rows - 1), 0); \
- v = src[mad24(y, src_step, x + src_offset)]; \
- }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(x, y, v) \
- { \
- if (x < 0) \
- x -= ((x - src_cols + 1) / src_cols) * src_cols; \
- if (x >= src_cols) \
- x %= src_cols; \
- \
- if (y < 0) \
- y -= ((y - src_rows + 1) / src_rows) * src_rows; \
- if( y >= src_rows ) \
- y %= src_rows; \
- v = src[mad24(y, src_step, x + src_offset)]; \
- }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#ifdef BORDER_REFLECT
-#define DELTA int delta = 0
-#else
-#define DELTA int delta = 1
-#endif
-#define EXTRAPOLATE(x, y, v) \
- { \
- DELTA; \
- if (src_cols == 1) \
- x = 0; \
- else \
- do \
- { \
- if( x < 0 ) \
- x = -x - 1 + delta; \
- else \
- x = src_cols - 1 - (x - src_cols) - delta; \
- } \
- while (x >= src_cols || x < 0); \
- \
- if (src_rows == 1) \
- y = 0; \
- else \
- do \
- { \
- if( y < 0 ) \
- y = -y - 1 + delta; \
- else \
- y = src_rows - 1 - (y - src_rows) - delta; \
- } \
- while (y >= src_rows || y < 0); \
- v = src[mad24(y, src_step, x + src_offset)]; \
- }
-#else
-#error No extrapolation method
-#endif
-
-#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) // true when (gx, gy) lies outside the source image
-
-// Writes one destination element per work-item: dst(x, y) receives
-// src(x - left, y - top) when that coordinate lies inside the source, and the
-// value produced by EXTRAPOLATE (selected by the BORDER_* build macro) otherwise.
-// Work-items whose (x, y) falls outside dst_cols x dst_rows do nothing.
-__kernel void copymakeborder
-    (__global const GENTYPE *src,
-     __global GENTYPE *dst,
-     int dst_cols, int dst_rows,
-     int src_cols, int src_rows,
-     int src_step, int src_offset,
-     int dst_step, int dst_offset,
-     int top, int left, GENTYPE scalar)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    // Nothing to write for work-items beyond the destination extent.
-    if (x >= dst_cols || y >= dst_rows)
-        return;
-
-    const int dst_index = mad24(y, dst_step, x + dst_offset);
-
-    // Not const: EXTRAPOLATE rewrites these in place for most border modes.
-    int src_x = x - left;
-    int src_y = y - top;
-
-    if (!NEED_EXTRAPOLATION(src_x, src_y))
-    {
-        dst[dst_index] = src[mad24(src_y, src_step, src_x + src_offset)];
-    }
-    else
-        EXTRAPOLATE(src_x, src_y, dst[dst_index])
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// WITH_MASK defaults to 0 (mask test compiled out of findCorners) unless the
-// build options define it.
-#ifndef WITH_MASK
-#define WITH_MASK 0
-#endif
-
-// Sampler shared by the ELEM_* helpers: unnormalized pixel coordinates,
-// out-of-range coordinates clamped to the edge, nearest-neighbour filtering.
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-// Returns the first channel of _eig at integer pixel (_x, _y), read through
-// the file-level `sampler`.
-inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
-{
-    const int2 coord = (int2)(_x, _y);
-    const float4 texel = read_imagef(_eig, sampler, coord);
-    return texel.x;
-}
-
-// Returns the first channel of _eig at floating-point position pt, read
-// through the file-level `sampler`.
-inline float ELEM_FLT2(image2d_t _eig, float2 pt)
-{
-    const float4 texel = read_imagef(_eig, sampler, pt);
-    return texel.x;
-}
-
-// Collects corner candidates from the response image `eig`.
-// A pixel strictly inside the image border is reported when its value exceeds
-// `threshold` and is not smaller than any of its 8 neighbours. When WITH_MASK
-// is non-zero the pixel must additionally have a non-zero mask entry.
-// Each accepted pixel reserves a slot with atomic_inc(g_counter); its (x, y)
-// is stored in `corners` only while the slot index is below max_count, so the
-// counter may end up larger than the number of stored corners.
-__kernel
-    void findCorners
-    (
-        image2d_t eig,
-        __global const char * mask,
-        __global float2 * corners,
-        const int mask_strip,// in pixels
-        const float threshold,
-        const int rows,
-        const int cols,
-        const int max_count,
-        __global int * g_counter
-    )
-{
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Skip the one-pixel image border and anything outside the image.
-    if (row <= 0 || row >= rows - 1 || col <= 0 || col >= cols - 1)
-        return;
-#if WITH_MASK
-    if (mask[row * mask_strip + col] == 0)
-        return;
-#endif
-
-    const float centre = ELEM_INT2(eig, col, row);
-    if (!(centre > threshold))
-        return;
-
-    // Maximum over the centre and its 8 neighbours.
-    float best = centre;
-    for (int dy = -1; dy <= 1; ++dy)
-    {
-        for (int dx = -1; dx <= 1; ++dx)
-        {
-            if (dx == 0 && dy == 0)
-                continue;
-            best = fmax(ELEM_INT2(eig, col + dx, row + dy), best);
-        }
-    }
-
-    if (!(centre == best))
-        return;
-
-    const int slot = atomic_inc(g_counter);
-    if (slot < max_count)
-        corners[slot] = (float2)(col, row);
-}
-
-//bitonic sort
-// Performs one compare-exchange pass (`passOfStage` within `stage`) over
-// `corners`, ordering points by the value of `eig` sampled at each point.
-// Work-items with id >= count / 2 do nothing; every other work-item handles
-// one pair (leftId, rightId) that is 1 << (stage - passOfStage) apart.
-// sortOrder 0 places the larger value on the left (descending), 1 the smaller.
-// NOTE(review): a bitonic network normally requires `count` to be a power of
-// two; confirm that the host guarantees this.
-__kernel
-    void sortCorners_bitonicSort
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count,
-        const int stage,
-        const int passOfStage
-    )
-{
-    const int threadId = get_global_id(0);
-    if(threadId >= count / 2)
-    {
-        return;
-    }
-
-    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
-
-    const int pairDistance = 1 << (stage - passOfStage);
-    const int blockWidth   = 2 * pairDistance;
-
-    // Clamp to the last valid element. The previous clamp used `count`, which
-    // is one past the end of the array and therefore did not prevent an
-    // out-of-bounds access. count >= 2 here because of the early return above.
-    const int lastId = count - 1;
-
-    const int leftId = min( (threadId % pairDistance)
-                   + (threadId / pairDistance) * blockWidth, lastId );
-
-    const int rightId = min( leftId + pairDistance, lastId );
-
-    const float2 leftPt  = corners[leftId];
-    const float2 rightPt = corners[rightId];
-
-    const float leftVal  = ELEM_FLT2(eig, leftPt);
-    const float rightVal = ELEM_FLT2(eig, rightPt);
-
-    const bool compareResult = leftVal > rightVal;
-
-    float2 greater = compareResult ? leftPt:rightPt;
-    float2 lesser  = compareResult ? rightPt:leftPt;
-
-    corners[leftId]  = sortOrder ? lesser : greater;
-    corners[rightId] = sortOrder ? greater : lesser;
-}
-
-//selection sort for gfft
-//kernel is ported from Bolt library:
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
-// Local sort: every work-group sorts its own slice of `corners` in place, in
-// descending order of the `eig` value sampled at each point.
-// Each work-item compares its element against all n elements of the slice
-// (O(n) work per work-item): `pos` counts elements with a strictly larger
-// value, `same` counts elements with an equal value (itself included). The
-// element is then written to slots pos .. pos + same - 1 of the slice.
-// `scratch` must provide one float2 per work-item of the group.
-__kernel
-    void sortCorners_selectionSortLocal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count,
-        __local float2 * scratch
-    )
-{
-    int i = get_local_id(0); // index in workgroup
-    int numOfGroups = get_num_groups(0); // number of workgroups
-    int groupID = get_group_id(0);
-    int wg = get_local_size(0); // workgroup size = block size
-    int n; // number of elements to be processed for this work group
-
-    int offset = groupID * wg;
-    int same = 0;
-    corners += offset;
-    // The last group only owns the remainder of the array.
-    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
-    float2 pt1, pt2;
-
-    // Work-items past the end of the slice still have to reach the barrier, so
-    // they load a clamped element. The clamp must stop at n - 1: index n is
-    // outside this group's slice (and outside the buffer for the last group).
-    const int lastLocal = max(n - 1, 0);
-    pt1 = corners[min(i, lastLocal)];
-    scratch[i] = pt1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(i >= n)
-    {
-        return;
-    }
-
-    float val1 = ELEM_FLT2(eig, pt1);
-    float val2;
-
-    int pos = 0;
-    for (int j=0;j<n;++j)
-    {
-        pt2  = scratch[j];
-        val2 = ELEM_FLT2(eig, pt2);
-        if(val2 > val1)
-            pos++;//calculate the rank of this element in this work group
-        else
-        {
-            if(val1 > val2)
-                continue;
-            else
-            {
-                // val1 and val2 are same
-                same++;
-            }
-        }
-    }
-    // Elements with equal values all compute the same pos; each of them fills
-    // the whole run of `same` slots so that no slot is left unwritten.
-    for (int j=0; j< same; j++)
-        corners[pos + j] = pt1;
-}
-// Final pass of the selection sort: every work-item ranks its own element
-// against the whole `corners` array and writes it to its global position.
-// The array is scanned group by group (all groups have get_local_size(0)
-// elements except the last, which has the remainder). The scan of a group
-// stops at the first element whose value is smaller than the work-item's own,
-// which relies on each group already being sorted in descending order.
-// NOTE(review): the kernel reads and writes `corners` in place and no
-// synchronization between work-groups is visible here.
-__kernel
-    void sortCorners_selectionSortFinal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count
-    )
-{
-    const int lid    = get_local_id(0);    // index in workgroup
-    const int groups = get_num_groups(0);  // number of workgroups
-    const int gid    = get_group_id(0);
-    const int wg     = get_local_size(0);  // workgroup size = block size
-    const int tailLen = count - wg*(groups-1); // elements owned by the last group
-
-    if (gid * wg + lid >= count)
-        return;
-
-    const float2 mine   = corners[gid*wg + lid];
-    const float mineVal = ELEM_FLT2(eig, mine);
-
-    int rank = 0; // elements with a strictly larger value
-    int ties = 0; // elements with an equal value
-
-    for (int g = 0; g < groups; ++g)
-    {
-        const int len = (g == groups - 1) ? tailLen : wg;
-        for (int k = 0; k < len; ++k)
-        {
-            const float2 otherPt = corners[g*wg + k];
-            const float other = ELEM_FLT2(eig, otherPt);
-
-            // The rest of this group cannot be larger than our element.
-            if (mineVal > other)
-                break;
-
-            // Two elements are the same when neither compares greater.
-            if (other > mineVal)
-                ++rank;
-            else
-                ++ties;
-        }
-    }
-
-    for (int t = 0; t < ties; ++t)
-        corners[rank + t] = mine;
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Border-index helpers used by calcHarris for every mode except
-// BORDER_CONSTANT. ADDR_L / ADDR_H correct an index that is below the
-// left / top edge; ADDR_R / ADDR_B then take that result as `addr` and replace
-// it when the original index i is at or beyond the right / bottom edge.
-// Each macro applies a single correction step.
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-// NOTE(review): the low-side mirror -(i)-1 does not use l_edge/t_edge, so it
-// presumably expects those edges to be 0 -- confirm against callers.
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_WRAP
-//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-// Number of columns handled per work-group row in calcHarris; it sizes the
-// kernel's local buffer and its horizontal tiling.
-#define THREADS 256
-// Selects elem1 when i is inside [l_edge, r_edge) and elem2 otherwise.
-// The expansion is not wrapped in parentheses, so callers must parenthesize
-// it themselves when combining it with other operators.
-#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////calcHarris////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Computes the Harris response  a*c - b*b - k*(a + c)^2  where a, b and c are
-// the sums of Dx*Dx, Dx*Dy and Dy*Dy over a ksX x ksY window.
-//
-// ksX/ksY and anX/anY are not defined in this block; they are expected to be
-// supplied as compile-time macros. The *_offset and *_step arguments are
-// shifted right by 2 before indexing the float buffers, i.e. they are treated
-// as byte quantities.
-//
-// Each work-item loads ksY+1 rows of one column, forms the vertical sums for
-// two overlapping windows (rows 0..ksY-1 and rows 1..ksY) and stores the six
-// partial sums in local memory. After the barrier, the first THREADS-(ksX-1)
-// work-items add the partial sums horizontally and write two output rows.
-__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
-                         int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
-                         int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
-                         int dst_offset, int dst_rows, int dst_cols, int dst_step,
-                         float k)
-{
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-    const int gly = get_global_id(1);
-
-    // Split the byte offsets into an element column and a row.
-    int dx_x_off = (dx_offset % dx_step) >> 2;
-    int dx_y_off = dx_offset / dx_step;
-    int dy_x_off = (dy_offset % dy_step) >> 2;
-    int dy_y_off = dy_offset / dy_step;
-    int dst_x_off = (dst_offset % dst_step) >> 2;
-    int dst_y_off = dst_offset / dst_step;
-
-    // Each group produces THREADS-ksX+1 output columns and two output rows.
-    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
-    int dx_startY = (gY << 1) - anY + dx_y_off;
-    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
-    int dy_startY = (gY << 1) - anY + dy_y_off;
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY << 1) + dst_y_off;
-
-    float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
-    __local float temp[6][THREADS];
-#ifdef BORDER_CONSTANT
-    for(int i=0; i < ksY+1; i++)
-    {
-        // Only touch memory when the coordinate is inside the whole image.
-        // The load used to be issued before the bounds test, which read
-        // outside the buffer for border pixels.
-        const bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
-        dx_data[i] = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f;
-        const bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
-        dy_data[i] = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f;
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#else
-    int clamped_col = min(dst_cols, col);
-    for(int i=0; i < ksY+1; i++)
-    {
-        // Remap out-of-image rows/columns with the ADDR_* border macros.
-        int dx_selected_row;
-        int dx_selected_col;
-        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
-        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
-        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
-
-        int dy_selected_row;
-        int dy_selected_col;
-        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
-        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
-        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
-
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#endif
-    // Rows 1..ksY-1 are shared by both output rows; row 0 belongs only to the
-    // first window and row ksY only to the second.
-    float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
-    for(int i=1; i < ksY; i++)
-    {
-        sum0 += (data[0][i]);
-        sum1 += (data[1][i]);
-        sum2 += (data[2][i]);
-    }
-    float sum01,sum02,sum11,sum12,sum21,sum22;
-    sum01 = sum0 + (data[0][0]);
-    sum02 = sum0 + (data[0][ksY]);
-    temp[0][col] = sum01;
-    temp[1][col] = sum02;
-    sum11 = sum1 + (data[1][0]);
-    sum12 = sum1 + (data[1][ksY]);
-    temp[2][col] = sum11;
-    temp[3][col] = sum12;
-    sum21 = sum2 + (data[2][0]);
-    sum22 = sum2 + (data[2][ksY]);
-    temp[4][col] = sum21;
-    temp[5][col] = sum22;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if(col < (THREADS-(ksX-1)))
-    {
-        col += anX;
-        int posX = dst_startX - dst_x_off + col - anX;
-        int posY = (gly << 1);
-        // For an even ksX the window extends one column less to the right.
-        int till = (ksX + 1)%2;
-        float tmp_sum[6]= { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
-        for(int k=0; k<6; k++)
-            for(int i=-anX; i<=anX - till; i++)
-            {
-                tmp_sum[k] += temp[k][col+i];
-            }
-
-        if(posX < dst_cols && (posY) < dst_rows)
-        {
-            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
-                tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
-        }
-        if(posX < dst_cols && (posY + 1) < dst_rows)
-        {
-            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
-                tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
-        }
-    }
-}
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Niko Li, newlife20080214@gmail.com
-// Jia Haipeng, jiahaipeng95@gmail.com
-// Xu Pang, pangxu010@163.com
-// Wenju He, wenju@multicorewareinc.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-// Number of partial histograms that merge_hist adds up per bin.
-#define PARTIAL_HISTOGRAM256_COUNT (256)
-// Number of bins in every histogram (one per 8-bit value).
-#define HISTOGRAM256_BIN_COUNT (256)
-
-// Work-group size used by merge_hist for its local reduction buffer.
-#define HISTOGRAM256_WORK_GROUP_SIZE (256)
-// Size of the local histogram in calc_sub_hist_border_D0.
-#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT)
-
-// calc_sub_hist_D0 keeps NBANKS interleaved counters per bin in local memory;
-// NBANKS_BIT is log2(NBANKS) and is used as the shift for the bin index.
-#define NBANKS (16)
-#define NBANKS_BIT (4)
-
-
-// Builds one partial 256-bin histogram per work-group from data read as uint4
-// (16 packed 8-bit values per load).
-// Local memory holds NBANKS counters per bin; a work-item always increments
-// the counter selected by its local id modulo NBANKS. After all loads the
-// banks of bin `lid` are summed and written to row get_group_id(0) of
-// globalHist (row pitch hist_step, in ints).
-// Work-items walk the source with a stride of get_global_size(0) elements,
-// tracking the position as (x, y) advanced by (inc_x, inc_y) with a carry into
-// y when x passes `cols`.
-__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
-    __global const uint4* src,
-    int src_step, int src_offset,
-    __global int* globalHist,
-    int dataCount, int cols,
-    int inc_x, int inc_y,
-    int hist_step)
-{
-    __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
-    const int gid   = get_global_id(0);
-    const int lid   = get_local_id(0);
-    const int group = get_group_id(0);
-    const int gsize = get_global_size(0);
-    const int lsize = get_local_size(0);
-    const int shift = 8;
-    const int mask  = HISTOGRAM256_BIN_COUNT-1;
-    const int bank  = (lid & (NBANKS-1)); // lid % NBANKS
-    src += src_offset;
-
-    //clear LDS: each work-item zeroes NBANKS entries, lsize apart
-    for (int m = 0; m < NBANKS; ++m)
-        subhist[lid + m * lsize] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //read and scatter
-    int y = gid/cols;
-    int x = gid - mul24(y, cols);
-    for (int idx = gid; idx < dataCount; idx += gsize)
-    {
-        uint4 packed = src[mad24(y, src_step, x)];
-
-        // Peel off the four bytes of every component, lowest byte first.
-        for (int byte = 0; byte < 4; ++byte)
-        {
-            const uint4 slot = ((packed & mask) << NBANKS_BIT) + bank;
-
-            atomic_inc(subhist + slot.x);
-            atomic_inc(subhist + slot.y);
-            atomic_inc(subhist + slot.z);
-            atomic_inc(subhist + slot.w);
-
-            packed >>= shift;
-        }
-
-        // Advance the position, carrying into the next row when needed.
-        x += inc_x;
-        const int carry = ((x>=cols) ? -1 : 0);
-        x = mad24(carry, cols, x);
-        y += inc_y - carry;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    //reduce local banks to single histogram per workgroup
-    const int base = lid << NBANKS_BIT;
-    int total = 0;
-    for (int b = 0; b < NBANKS; ++b)
-        total += subhist[base + b];
-
-    globalHist[mad24(group, hist_step, lid)] = total;
-}
-
-// Accumulates the histogram of the columns that the uint4 kernel does not
-// cover. Global x ids at or beyond left_col are shifted by `cols`, so they
-// address the columns that follow the skipped middle part of the row.
-// Every work-item with a row index below `rows` counts one source byte into a
-// local 256-bin histogram; bin `lidy` is then ADDED to row
-// (group_y * num_groups_x + group_x) of globalHist.
-__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))
-calc_sub_hist_border_D0(__global const uchar* src, int src_step,  int src_offset,
-                        __global int* globalHist, int left_col, int cols,
-                        int rows, int hist_step)
-{
-    const int gidy = get_global_id(1);
-    const int lidy = get_local_id(1);
-    const int gx = get_group_id(0);
-    const int gy = get_group_id(1);
-    const int gn = get_num_groups(0);
-    const int rowIndex = mad24(gy, gn, gx);
-
-    __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];
-    subhist[lidy] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Skip over the middle block of columns.
-    int gidx = get_global_id(0);
-    if (gidx >= left_col)
-        gidx += cols;
-
-    if (gidy < rows)
-    {
-        const int value = (int)src[src_offset + mad24(gidy, src_step, gidx)];
-        atomic_inc(subhist + value);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy];
-}
-
-// Merges the partial histograms in `buf` into `hist`.
-// Work-group gx produces bin gx: its work-items read column gx of
-// PARTIAL_HISTOGRAM256_COUNT rows of `buf` (row pitch src_step, in ints), the
-// values are added up with a tree reduction in local memory and work-item 0
-// stores the total in hist[gx].
-__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
-        __global int* hist,
-        int src_step)
-{
-    const int lx = get_local_id(0);
-    const int gx = get_group_id(0);
-
-    __local int data[HISTOGRAM256_WORK_GROUP_SIZE];
-
-    // Per-work-item partial sum over the rows assigned to it.
-    int sum = 0;
-    int row = lx;
-    while (row < PARTIAL_HISTOGRAM256_COUNT)
-    {
-        sum += buf[ mad24(row, src_step, gx)];
-        row += HISTOGRAM256_WORK_GROUP_SIZE;
-    }
-    data[lx] = sum;
-
-    // Tree reduction; the barrier at the top of each step makes the values
-    // written in the previous step visible.
-    int stride = HISTOGRAM256_WORK_GROUP_SIZE / 2;
-    while (stride > 0)
-    {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (lx < stride)
-            data[lx] += data[lx + stride];
-        stride >>= 1;
-    }
-
-    if (lx == 0)
-        hist[gx] = data[0];
-}
-
-// Builds the 256-entry lookup table `dst` from the histogram `hist` of an
-// image with `total` pixels.
-// Work-item 0 finds the first non-empty bin i. If that bin holds every pixel,
-// entry i is set to i and the scale to 1. Otherwise bin i is zeroed, the
-// following bins are replaced by their running sum and the scale becomes
-// 255 / (total - hist[i]). Every work-item then writes
-// dst[lid] = saturate(round(sumhist[lid] * scale)).
-__kernel __attribute__((reqd_work_group_size(256,1,1)))
-void calLUT(__global uchar * dst, __constant int * hist, int total)
-{
-    int lid = get_local_id(0);
-    __local int sumhist[HISTOGRAM256_BIN_COUNT];
-    __local float scale;
-
-    sumhist[lid] = hist[lid];
-    barrier(CLK_LOCAL_MEM_FENCE);
-    if (lid == 0)
-    {
-        int sum = 0, i = 0;
-        // Stop at the last bin so that an all-zero histogram cannot push the
-        // scan past the end of sumhist.
-        while (i < HISTOGRAM256_BIN_COUNT - 1 && !sumhist[i])
-            ++i;
-
-        if (total == sumhist[i])
-        {
-            scale = 1;
-            // A single store is enough: this used to sit in a 256-iteration
-            // loop that wrote the same value to the same entry every time.
-            sumhist[i] = i;
-        }
-        else
-        {
-            scale = 255.f/(total - sumhist[i]);
-
-            for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++)
-            {
-                sum += sumhist[i];
-                sumhist[i] = sum;
-            }
-        }
-    }
-
-    // Publishes sumhist and scale, written by work-item 0, to the whole group.
-    barrier(CLK_LOCAL_MEM_FENCE);
-    dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale);
-}
-
-/*
-///////////////////////////////equalizeHist//////////////////////////////////////////////////
-__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
- __global uchar * src,
- __global uchar * dst,
- __constant int * hist,
- int srcstep,
- int srcoffset,
- int dststep,
- int dstoffset,
- int width,
- int height,
- float scale,
- int inc_x,
- int inc_y)
-{
- int gidx = get_global_id(0);
- int lid = get_local_id(0);
- int glb_size = get_global_size(0);
- src+=srcoffset;
- dst+=dstoffset;
- __local int sumhist[HISTOGRAM256_BIN_COUNT];
- __local uchar lut[HISTOGRAM256_BIN_COUNT+1];
-
- sumhist[lid]=hist[lid];
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid==0)
- {
- int sum = 0;
- for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
- {
- sum+=sumhist[i];
- sumhist[i]=sum;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
- lut[0]=0;
- int pos_y = gidx / width;
- int pos_x = gidx - mul24(pos_y, width);
-
- for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
- {
- int inaddr = mad24(pos_y,srcstep,pos_x);
- int outaddr = mad24(pos_y,dststep,pos_x);
- dst[outaddr] = lut[src[inaddr]];
- pos_x +=inc_x;
- int off = (pos_x >= width ? -1 : 0);
- pos_x = mad24(off,width,pos_x);
- pos_y += inc_y - off;
- }
-}
-*/
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-
-////////////////////////////////////////////////////////////////////////
-// buildPointList
-
-// Number of pixels of one row that each work-item inspects.
-#define PIXELS_PER_THREAD 16
-
-// TODO: add offset to support ROI
-//
-// Builds a compact list of the non-zero pixels of `src`.
-// Each pixel is encoded as (y << 16) | x. The work-items of one local row
-// (same get_local_id(1)) append to a shared local queue; one work-item then
-// reserves space for the whole group in `list` with a single atomic_add on
-// `counter`, and all work-items copy the queues to the reserved range.
-// NOTE(review): the local arrays are sized for at most 4 local rows of
-// 32 work-items each -- confirm the host launches with such a local size.
-__kernel void buildPointList(__global const uchar* src,
-                             int cols,
-                             int rows,
-                             int step,
-                             __global unsigned int* list,
-                             __global int* counter)
-{
-    __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
-    __local int s_qsize[4];
-    __local int s_globStart[4];
-
-    const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0);
-    const int y = get_global_id(1);
-
-    if (get_local_id(0) == 0)
-        s_qsize[get_local_id(1)] = 0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (y < rows)
-    {
-        // fill the queue
-        __global const uchar* srcRow = &src[y * step];
-        for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0))
-        {
-            if (srcRow[xx])
-            {
-                const unsigned int val = (y << 16) | xx;
-                const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1);
-                s_queues[get_local_id(1)][qidx] = val;
-            }
-        }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // let one work-item reserve the space required in the global list
-    if (get_local_id(0) == 0 && get_local_id(1) == 0)
-    {
-        // find how many items are stored in each list
-        int totalSize = 0;
-        for (int i = 0; i < get_local_size(1); ++i)
-        {
-            s_globStart[i] = totalSize;
-            totalSize += s_qsize[i];
-        }
-
-        // calculate the offset in the global list
-        const int globalOffset = atomic_add(counter, totalSize);
-        for (int i = 0; i < get_local_size(1); ++i)
-            s_globStart[i] += globalOffset;
-    }
-
-    // s_globStart lives in local memory, so the barrier that publishes it to
-    // the other work-items needs a local fence (it used a global-only fence).
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // copy local queues to global queue
-    const int qsize = s_qsize[get_local_id(1)];
-    int gidx = s_globStart[get_local_id(1)] + get_local_id(0);
-    for(int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0))
-        list[gidx] = s_queues[get_local_id(1)][i];
-}
-
-////////////////////////////////////////////////////////////////////////
-// circlesAccumCenters
-
-// TODO: add offset to support ROI
-//
-// Votes for circle centres. One work-item handles one entry of `list`
-// (a pixel packed as (y << 16) | x). Starting from the pixel, it walks along
-// the gradient (dx, dy) and then along the opposite direction, from minRadius
-// to maxRadius steps, and increments the accumulator cell under every
-// position. Positions are kept in fixed point with SHIFT fractional bits and
-// scaled by idp. A walk stops as soon as it leaves [0, width) x [0, height).
-// Votes are stored with a +1 offset in both coordinates of `accum`.
-// The *Step arguments are divided by sizeof(int) to obtain row pitches.
-__kernel void circlesAccumCenters(__global const unsigned int* list,
-                                  const int count,
-                                  __global const int* dx,
-                                  const int dxStep,
-                                  __global const int* dy,
-                                  const int dyStep,
-                                  __global int* accum,
-                                  const int accumStep,
-                                  const int width,
-                                  const int height,
-                                  const int minRadius,
-                                  const int maxRadius,
-                                  const float idp)
-{
-    const int dxPitch    = dxStep / sizeof(int);
-    const int dyPitch    = dyStep / sizeof(int);
-    const int accumPitch = accumStep / sizeof(int);
-
-    const int SHIFT = 10;
-    const int ONE = 1 << SHIFT;
-
-    const int item = get_global_id(0);
-    if (item >= count)
-        return;
-
-    const unsigned int packed = list[item];
-    const int x = (packed & 0xFFFF);
-    const int y = (packed >> 16) & 0xFFFF;
-
-    const int gradX = dx[mad24(y, dxPitch, x)];
-    const int gradY = dy[mad24(y, dyPitch, x)];
-
-    // No gradient, no direction to vote along.
-    if (gradX == 0 && gradY == 0)
-        return;
-
-    const float mag = sqrt(convert_float(gradX * gradX + gradY * gradY));
-
-    const int x0 = convert_int_rte((x * idp) * ONE);
-    const int y0 = convert_int_rte((y * idp) * ONE);
-
-    int stepX = convert_int_rte((gradX * idp) * ONE / mag);
-    int stepY = convert_int_rte((gradY * idp) * ONE / mag);
-
-    // Step from minRadius to maxRadius in both directions of the gradient
-    for (int pass = 0; pass < 2; ++pass, stepX = -stepX, stepY = -stepY)
-    {
-        int px = x0 + minRadius * stepX;
-        int py = y0 + minRadius * stepY;
-        int r = minRadius;
-
-        while (r <= maxRadius)
-        {
-            const int cellX = px >> SHIFT;
-            const int cellY = py >> SHIFT;
-
-            if (cellX < 0 || cellX >= width || cellY < 0 || cellY >= height)
-                break;
-
-            atomic_add(&accum[mad24(cellY+1, accumPitch, cellX+1)], 1);
-
-            px += stepX;
-            py += stepY;
-            ++r;
-        }
-    }
-}
-
-// ////////////////////////////////////////////////////////////////////////
-// // buildCentersList
-
-// TODO: add offset to support ROI
-//
-// Extracts local maxima of the accumulator. Work-item (x, y) examines cell
-// (x + 1, y + 1) of `accum` and its four direct neighbours. The cell is
-// reported when its value is above `threshold`, strictly greater than the
-// top and left neighbours and not smaller than the bottom and right ones.
-// Reported cells are appended to `centers` as (y << 16) | x at the index
-// returned by atomic_add(counter, 1).
-// NOTE(review): nothing here limits the index written to `centers`; the
-// buffer presumably has to be large enough for every candidate -- confirm.
-__kernel void buildCentersList(__global const int* accum,
-                               const int accumCols,
-                               const int accumRows,
-                               const int accumStep,
-                               __global unsigned int* centers,
-                               const int threshold,
-                               __global int* counter)
-{
-    const int pitch = accumStep/sizeof(int);
-
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    if (x >= accumCols - 2 || y >= accumRows - 2)
-        return;
-
-    const int above  = accum[mad24(y, pitch, x + 1)];
-    const int before = accum[mad24(y + 1, pitch, x)];
-    const int centre = accum[mad24(y + 1, pitch, x + 1)];
-    const int after  = accum[mad24(y + 1, pitch, x + 2)];
-    const int below  = accum[mad24(y + 2, pitch, x + 1)];
-
-    const bool isPeak = centre > threshold
-                        && centre > above && centre >= below
-                        && centre > before && centre >= after;
-    if (!isPeak)
-        return;
-
-    const unsigned int packed = (y << 16) | x;
-    const int slot = atomic_add(counter, 1);
-    centers[slot] = packed;
-}
-
-
-// ////////////////////////////////////////////////////////////////////////
-// // circlesAccumRadius
-
-// TODO: add offset to support ROI
-__kernel void circlesAccumRadius(__global const unsigned int* centers,
- __global const unsigned int* list, const int count,
- __global float4* circles, const int maxCircles,
- const float dp,
- const int minRadius, const int maxRadius,
- const int histSize,
- const int threshold,
- __local int* smem,
- __global int* counter)
-{
- for (int i = get_local_id(0); i < histSize + 2; i += get_local_size(0))
- smem[i] = 0;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- unsigned int val = centers[get_group_id(0)];
-
- float cx = convert_float(val & 0xFFFF);
- float cy = convert_float((val >> 16) & 0xFFFF);
-
- cx = (cx + 0.5f) * dp;
- cy = (cy + 0.5f) * dp;
-
- for (int i = get_local_id(0); i < count; i += get_local_size(0))
- {
- val = list[i];
-
- const int x = (val & 0xFFFF);
- const int y = (val >> 16) & 0xFFFF;
-
- const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y));
- if (rad >= minRadius && rad <= maxRadius)
- {
- const int r = convert_int_rte(rad - minRadius);
-
- atomic_add(&smem[r + 1], 1);
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for (int i = get_local_id(0); i < histSize; i += get_local_size(0))
- {
- const int curVotes = smem[i + 1];
-
- if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
-
- {
- const int ind = atomic_add(counter, 1);
- if (ind < maxCircles)
- {
- circles[ind] = (float4)(cx, cy, convert_float(i + minRadius), 0.0f);
- }
- }
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-#define LSIZE 256
-#define LSIZE_1 255
-#define LSIZE_2 254
-#define HF_LSIZE 128
-#define LOG_LSIZE 8
-#define LOG_NUM_BANKS 5
-#define NUM_BANKS 32
-#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
-
-
-kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
- int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- int4 src_t[2], sum_t[2];
- float4 sqsum_t[2];
- __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
- __local int* sum_p;
- __local float* sqsum_p;
- src_step = src_step >> 2;
- gid = gid << 1;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
- src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
-
- sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
- lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
-
- lm_sum[1][bf_loc] = src_t[1];
- lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
- if(lid > 0 && (i+lid) <= rows)
- {
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- lm_sqsum[0][bf_loc] += sqsum_t[0];
- lm_sqsum[1][bf_loc] += sqsum_t[1];
- sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
- sum[loc_s0 + k * dst_step / 4] = sum_p[k];
- sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
- }
- sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k + 4 >= cols + pre_invalid) break;
- sum[loc_s1 + k * dst_step / 4] = sum_p[k];
- sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-
-kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
- __global float *sqsum,int rows,int cols,int src_step,int sum_step,
- int sqsum_step,int sum_offset,int sqsum_offset)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- int4 src_t[2], sum_t[2];
- float4 sqsrc_t[2],sqsum_t[2];
- __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
- __local int *sum_p;
- __local float *sqsum_p;
- src_step = src_step >> 4;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
- sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
- src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
- sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
-
- sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
- lm_sqsum[0][bf_loc] = sqsrc_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
- lm_sqsum[1][bf_loc] = sqsrc_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(gid == 0 && (i + lid) <= rows)
- {
- sum[sum_offset + i + lid] = 0;
- sqsum[sqsum_offset + i + lid] = 0;
- }
- if(i + lid == 0)
- {
- int loc0 = gid * 2 * sum_step;
- int loc1 = gid * 2 * sqsum_step;
- for(int k = 1; k <= 8; k++)
- {
- if(gid * 8 + k > cols) break;
- sum[sum_offset + loc0 + k * sum_step / 4] = 0;
- sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
- }
- }
- int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
- int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
- if(lid > 0 && (i+lid) <= rows)
- {
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- lm_sqsum[0][bf_loc] += sqsum_t[0];
- lm_sqsum[1][bf_loc] += sqsum_t[1];
- sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + k >= cols) break;
- sum[loc_s0 + k * sum_step / 4] = sum_p[k];
- sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
- }
- sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + 4 + k >= cols) break;
- sum[loc_s1 + k * sum_step / 4] = sum_p[k];
- sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
- int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- float4 src_t[2], sum_t[2];
- float4 sqsum_t[2];
- __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
- __local float* sum_p;
- __local float* sqsum_p;
- src_step = src_step >> 2;
- gid = gid << 1;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
- src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
-
- sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
- lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
-
- lm_sum[1][bf_loc] = src_t[1];
- lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
- if(lid > 0 && (i+lid) <= rows)
- {
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- lm_sqsum[0][bf_loc] += sqsum_t[0];
- lm_sqsum[1][bf_loc] += sqsum_t[1];
- sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
- sum[loc_s0 + k * dst_step / 4] = sum_p[k];
- sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
- }
- sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k + 4 >= cols + pre_invalid) break;
- sum[loc_s1 + k * dst_step / 4] = sum_p[k];
- sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-
-kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum ,
- __global float *sqsum,int rows,int cols,int src_step,int sum_step,
- int sqsum_step,int sum_offset,int sqsum_offset)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- float4 src_t[2], sum_t[2];
- float4 sqsrc_t[2],sqsum_t[2];
- __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
- __local float *sum_p;
- __local float *sqsum_p;
- src_step = src_step >> 4;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
- sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
- src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
- sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
-
- sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
- lm_sqsum[0][bf_loc] = sqsrc_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
- lm_sqsum[1][bf_loc] = sqsrc_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
-
- lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
- lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(gid == 0 && (i + lid) <= rows)
- {
- sum[sum_offset + i + lid] = 0;
- sqsum[sqsum_offset + i + lid] = 0;
- }
- if(i + lid == 0)
- {
- int loc0 = gid * 2 * sum_step;
- int loc1 = gid * 2 * sqsum_step;
- for(int k = 1; k <= 8; k++)
- {
- if(gid * 8 + k > cols) break;
- sum[sum_offset + loc0 + k * sum_step / 4] = 0;
- sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
- }
- }
- int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
- int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
- if(lid > 0 && (i+lid) <= rows)
- {
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- lm_sqsum[0][bf_loc] += sqsum_t[0];
- lm_sqsum[1][bf_loc] += sqsum_t[1];
- sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + k >= cols) break;
- sum[loc_s0 + k * sum_step / 4] = sum_p[k];
- sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
- }
- sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
- sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + 4 + k >= cols) break;
- sum[loc_s1 + k * sum_step / 4] = sum_p[k];
- sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#define LSIZE 256
-#define LSIZE_1 255
-#define LSIZE_2 254
-#define HF_LSIZE 128
-#define LOG_LSIZE 8
-#define LOG_NUM_BANKS 5
-#define NUM_BANKS 32
-#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
-
-
-kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
- int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- int4 src_t[2], sum_t[2];
- __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local int* sum_p;
- src_step = src_step >> 2;
- gid = gid << 1;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
- src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
-
- sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid > 0 && (i+lid) <= rows)
- {
- int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
- sum[loc_s0 + k * dst_step / 4] = sum_p[k];
- }
- sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k + 4 >= cols + pre_invalid) break;
- sum[loc_s1 + k * dst_step / 4] = sum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-
-kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
- int rows,int cols,int src_step,int sum_step,
- int sum_offset)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- int4 src_t[2], sum_t[2];
- __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local int *sum_p;
- src_step = src_step >> 4;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
- src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
-
- sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(gid == 0 && (i + lid) <= rows)
- {
- sum[sum_offset + i + lid] = 0;
- }
- if(i + lid == 0)
- {
- int loc0 = gid * 2 * sum_step;
- for(int k = 1; k <= 8; k++)
- {
- if(gid * 8 + k > cols) break;
- sum[sum_offset + loc0 + k * sum_step / 4] = 0;
- }
- }
-
- if(lid > 0 && (i+lid) <= rows)
- {
- int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + k >= cols) break;
- sum[loc_s0 + k * sum_step / 4] = sum_p[k];
- }
- sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + 4 + k >= cols) break;
- sum[loc_s1 + k * sum_step / 4] = sum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
- int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- float4 src_t[2], sum_t[2];
- __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float* sum_p;
- src_step = src_step >> 2;
- gid = gid << 1;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
- src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
-
- sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid > 0 && (i+lid) <= rows)
- {
- int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
- sum[loc_s0 + k * dst_step / 4] = sum_p[k];
- }
- sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 4 + k + 4 >= cols + pre_invalid) break;
- sum[loc_s1 + k * dst_step / 4] = sum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
-
-
-kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
- int rows,int cols,int src_step,int sum_step,
- int sum_offset)
-{
- unsigned int lid = get_local_id(0);
- unsigned int gid = get_group_id(0);
- float4 src_t[2], sum_t[2];
- __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
- __local float *sum_p;
- src_step = src_step >> 4;
- for(int i = 0; i < rows; i =i + LSIZE_1)
- {
- src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
- src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
-
- sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
- sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
- lm_sum[0][bf_loc] = src_t[0];
-
- lm_sum[1][bf_loc] = src_t[1];
-
- int offset = 1;
- for(int d = LSIZE >> 1 ; d > 0; d>>=1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- }
- offset <<= 1;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lid < 2)
- {
- lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
- }
- for(int d = 1; d < LSIZE; d <<= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- offset >>= 1;
- int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
- ai += GET_CONFLICT_OFFSET(ai);
- bi += GET_CONFLICT_OFFSET(bi);
-
- if((lid & 127) < d)
- {
- lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
- lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(gid == 0 && (i + lid) <= rows)
- {
- sum[sum_offset + i + lid] = 0;
- }
- if(i + lid == 0)
- {
- int loc0 = gid * 2 * sum_step;
- for(int k = 1; k <= 8; k++)
- {
- if(gid * 8 + k > cols) break;
- sum[sum_offset + loc0 + k * sum_step / 4] = 0;
- }
- }
-
- if(lid > 0 && (i+lid) <= rows)
- {
- int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
- lm_sum[0][bf_loc] += sum_t[0];
- lm_sum[1][bf_loc] += sum_t[1];
- sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + k >= cols) break;
- sum[loc_s0 + k * sum_step / 4] = sum_p[k];
- }
- sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
- for(int k = 0; k < 4; k++)
- {
- if(gid * 8 + 4 + k >= cols) break;
- sum[loc_s1 + k * sum_step / 4] = sum_p[k];
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Pang Erping, erping@multicorewareinc.com
-// Jia Haipeng, jiahaipeng95@gmail.com
-// Peng Xiao, pengxiao@outlook.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Border index helpers used by the filter kernels below when BORDER_CONSTANT is not defined.
-// Each BORDER_* section provides the same four macros:
-// ADDR_L(i, l_edge, r_edge): remaps i when i < l_edge, otherwise yields i (r_edge is not used).
-// ADDR_R(i, r_edge, addr): remaps i when i >= r_edge, otherwise yields addr.
-// ADDR_H / ADDR_B: the same two tests written for the top and bottom edges.
-// NOTE(review): presumably exactly one BORDER_* symbol is supplied as a build option - confirm in the host code.
-#ifdef BORDER_REPLICATE
-
-//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
-// Out-of-range indices are clamped to the nearest edge (l_edge / t_edge, or r_edge-1 / b_edge-1).
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-// Out-of-range indices are mirrored about the edge, repeating the edge element
-// (i = l_edge-1 maps to l_edge, i = r_edge maps to r_edge-1).
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT_101
-//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
-// Out-of-range indices are mirrored about the edge without repeating the edge element
-// (i = l_edge-1 maps to l_edge+1, i = r_edge maps to r_edge-2).
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-// Pixel type selection. Each IMG_C_* symbol picks the element type (T_IMG), a vector holding
-// four such elements (T_IMGx4), the per-channel scalar type (T_IMG_C1) and the matching
-// conversion functions used when storing results.
-// NOTE(review): the symbol names look like IMG_C_<channels>_<depth code> (0 -> uchar, 5 -> float),
-// judging only by the types they select - confirm against the host build options.
-#ifdef IMG_C_1_0
-#define T_IMG uchar
-#define T_IMGx4 uchar4
-#define T_IMG_C1 uchar
-#define CONVERT_TYPE convert_uchar_sat
-#define CONVERT_TYPEx4 convert_uchar4_sat
-#endif
-#ifdef IMG_C_4_0
-#define T_IMG uchar4
-#define T_IMGx4 uchar16
-#define T_IMG_C1 uchar
-#define CONVERT_TYPE convert_uchar4_sat
-#define CONVERT_TYPEx4 convert_uchar16_sat
-#endif
-#ifdef IMG_C_1_5
-#define T_IMG float
-#define T_IMGx4 float4
-#define T_IMG_C1 float
-#define CONVERT_TYPE convert_float
-#define CONVERT_TYPEx4 convert_float4
-#endif
-#ifdef IMG_C_4_5
-#define T_IMG float4
-#define T_IMGx4 float16
-#define T_IMG_C1 float
-#define CONVERT_TYPE convert_float4
-#define CONVERT_TYPEx4 convert_float16
-#endif
-
-// Channel count; defaults to a single channel when the build does not define CN.
-#ifndef CN
-#define CN 1
-#endif
-
-// Accumulator types: sums are always kept in float (one float per channel).
-// T_SUM accumulates one pixel, T_SUMx4 accumulates four pixels at once.
-// SUM_ZERO / SUM_ZEROx4 are used as "(T_SUM)(SUM_ZERO)" and "(T_SUMx4)SUM_ZEROx4" by the kernels below.
-// VLOAD4 loads four pixels worth of T_IMG_C1 scalars; SX..SW select the 1st..4th pixel of an x4 vector.
-#if CN == 1
-#define T_SUM float
-#define T_SUMx4 float4
-#define CONVERT_TYPE_SUM convert_float
-#define CONVERT_TYPE_SUMx4 convert_float4
-#define SUM_ZERO (0.0f)
-#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
-#define VLOAD4 vload4
-#define SX x
-#define SY y
-#define SZ z
-#define SW w
-#elif CN == 4
-#define T_SUM float4
-#define T_SUMx4 float16
-#define CONVERT_TYPE_SUM convert_float4
-#define CONVERT_TYPE_SUMx4 convert_float16
-#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
-#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
-#define VLOAD4 vload16
-#define SX s0123
-#define SY s4567
-#define SZ s89ab
-#define SW scdef
-#endif
-
-// Filter side length; defaults to 3 when the build does not define FILTER_SIZE.
-#ifndef FILTER_SIZE
-#define FILTER_SIZE 3
-#endif
-
-// Side length used to size the local tile of the generic filter2D kernel.
-// NOTE(review): presumably the host launches filter2D with 16x16 work-groups - confirm.
-#define LOCAL_GROUP_SIZE 16
-
-// Local tile dimensions: LOCAL_GROUP_SIZE plus FILTER_SIZE/2 extra elements on each side.
-#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
-#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
-
-// Half the filter size, rounded down.
-#define FILTER_RADIUS (FILTER_SIZE >> 1)
-
- __kernel void filter2D(
-    __global T_IMG *src,
-    __global T_IMG *dst,
-    int src_step,
-    int dst_step,
-    __constant float *mat_kernel,
-    __local T_IMG *local_data,
-    int wholerows,
-    int wholecols,
-    int src_offset_x,
-    int src_offset_y,
-    int dst_offset_x,
-    int dst_offset_y,
-    int cols,
-    int rows,
-    int operate_cols
-)
-{
-    // Generic FILTER_SIZE x FILTER_SIZE filter.
-    // Phase 1: the work-group cooperatively copies a LOCAL_WIDTH x LOCAL_HEIGHT tile of src
-    //          (its own area plus a FILTER_RADIUS border) into local_data.
-    // Phase 2: every work-item inside rows x cols accumulates FILTER_SIZE^2 products of tile
-    //          elements and mat_kernel coefficients (read row by row) and stores one dst element.
-    // src_step / dst_step are used directly as row strides in T_IMG elements.
-    // operate_cols is not referenced by this kernel.
-    const int tileCol0 = get_group_id(0) * get_local_size(0);
-    const int tileRow0 = get_group_id(1) * get_local_size(1);
-    const int lx = get_local_id(0);
-    const int ly = get_local_id(1);
-
-    const int srcBase = mad24(src_offset_y, src_step, src_offset_x);
-    const int dstBase = mad24(dst_offset_y, dst_step, dst_offset_x);
-
-#ifdef BORDER_CONSTANT
-    for(int ty = ly; ty < LOCAL_HEIGHT; ty += get_local_size(1))
-    {
-        const int row = tileRow0 + ty;
-        const int rowInside = row >= FILTER_RADIUS - src_offset_y &&
-                              (row - FILTER_RADIUS) < wholerows - src_offset_y;
-
-        for(int tx = lx; tx < LOCAL_WIDTH; tx += get_local_size(0))
-        {
-            const int col = tileCol0 + tx;
-            const int colInside = col >= FILTER_RADIUS - src_offset_x &&
-                                  (col - FILTER_RADIUS) < wholecols - src_offset_x;
-
-            if(rowInside && colInside)
-            {
-                local_data[ty * LOCAL_WIDTH + tx] = src[(row - FILTER_RADIUS) * src_step + col - FILTER_RADIUS + srcBase];
-            }
-            else
-            {
-                // Positions outside the whole image contribute zero.
-                local_data[ty * LOCAL_WIDTH + tx] = 0;
-            }
-        }
-    }
-#else
-    for(int ty = ly; ty < LOCAL_HEIGHT; ty += get_local_size(1))
-    {
-        // Remap the row through the border macros selected at build time.
-        int row = tileRow0 + ty;
-        row = ADDR_H(row, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
-        row = ADDR_B(row - FILTER_RADIUS, wholerows - src_offset_y, row - FILTER_RADIUS);
-
-        // Nothing in this tile row is written when the remapped row is not below wholerows.
-        if(row >= wholerows)
-        {
-            continue;
-        }
-
-        for(int tx = lx; tx < LOCAL_WIDTH; tx += get_local_size(0))
-        {
-            int col = tileCol0 + tx;
-            col = ADDR_L(col, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
-            col = ADDR_R(col - FILTER_RADIUS, wholecols - src_offset_x, col - FILTER_RADIUS);
-
-            if(col >= wholecols)
-            {
-                continue;
-            }
-            local_data[ty * LOCAL_WIDTH + tx] = src[row * src_step + col + srcBase];
-        }
-    }
-#endif
-
-    // Every work-item must reach this barrier before anyone reads the tile.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    const int outCol = tileCol0 + lx;
-    const int outRow = tileRow0 + ly;
-    if(outRow >= rows || outCol >= cols)
-    {
-        return;
-    }
-
-    T_SUM acc = (T_SUM)(SUM_ZERO);
-    for(int ky = 0; ky < FILTER_SIZE; ky++)
-    {
-        const int tileRow = (ky + ly) * LOCAL_WIDTH;
-        for(int kx = 0; kx < FILTER_SIZE; kx++)
-        {
-            acc += CONVERT_TYPE_SUM(local_data[tileRow + kx + lx]) * mat_kernel[ky * FILTER_SIZE + kx];
-        }
-    }
-    dst[outRow * dst_step + outCol + dstBase] = CONVERT_TYPE(acc);
-}
-
-/// The following is specific to 3x3 kernels
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////Macros defining the number of elements per thread/////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Layout constants of the filter2D_3x3 kernel below.
-// ANX / ANY: number of extra columns / rows fetched on each side of the processed area
-// (the kernel subtracts them from its start column / start row).
-#define ANX 1
-#define ANY 1
-
-// Output rows produced per work-group row index; ROWS_PER_GROUP_BITS is the matching shift (4 == 1 << 2).
-#define ROWS_PER_GROUP 4
-#define ROWS_PER_GROUP_BITS 2
-// Rows copied into local memory: the processed rows plus ANY rows above and below.
-#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2)
-
-// Work-items assigned to one output row; THREADS_PER_ROW_BIT is the matching shift (64 == 1 << 6).
-#define THREADS_PER_ROW 64
-#define THREADS_PER_ROW_BIT 6
-
-// Output elements written by each work-item; ELEMENTS_PER_THREAD_BIT is the matching shift (4 == 1 << 2).
-#define ELEMENTS_PER_THREAD 4
-#define ELEMENTS_PER_THREAD_BIT 2
-
-// Row stride, in T_IMG elements, of the local tile.
-// NOTE(review): 260 matches the formula in the trailing comment for a local size of 256 - confirm the launch size.
-#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
- __kernel void filter2D_3x3(
-    __global T_IMG *src,
-    __global T_IMG *dst,
-    int src_step,
-    int dst_step,
-    __constant float *mat_kernel,
-    __local T_IMG *local_data,
-    int wholerows,
-    int wholecols,
-    int src_offset_x,
-    int src_offset_y,
-    int dst_offset_x,
-    int dst_offset_y,
-    int cols,
-    int rows,
-    int operate_cols
-)
-{
-    // Filter kernel that writes four horizontally adjacent dst elements per work-item.
-    // Phase 1: ROWS_FETCH rows of src are copied into local_data (row stride LOCAL_MEM_STEP),
-    //          one column per work-item plus 2*ANX trailing columns fetched by the first work-items.
-    // Phase 2: work-item lX handles output row (lX >> THREADS_PER_ROW_BIT) of the group and the
-    //          four columns starting at ((lX % THREADS_PER_ROW) << 2); elements falling outside
-    //          [dst_offset_x, dst_offset_x + cols) keep their previous dst value.
-    // src and dst are indexed in whole-image coordinates; src_step / dst_step are used directly
-    // as row strides in T_IMG elements.
-    // NOTE(review): the loops run over FILTER_SIZE but only ANX/ANY = 1 border elements are
-    // fetched, so this presumably requires FILTER_SIZE == 3 - confirm in the host code.
-    int gX = get_global_id(0);
-    int gY = get_global_id(1);
-
-    int lX = get_local_id(0);
-
-    int groupX_size = get_local_size(0);
-    int groupX_id = get_group_id(0);
-
-    // Number of elements by which dst_offset_x exceeds a multiple of four.
-#define dst_align (dst_offset_x & 3)
-    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
-    int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
-
-    if((gY << 2) < rows)
-    {
-        for(int i = 0; i < ROWS_FETCH; ++i)
-        {
-            if((rows_start_index - src_offset_y) + i < rows + ANY)
-            {
-#ifdef BORDER_CONSTANT
-                int selected_row = rows_start_index + i;
-                int selected_cols = cols_start_index_group + lX;
-
-                // Test the coordinates first and only touch src when they lie inside the whole
-                // image; positions outside it read as zero without issuing an out-of-range load.
-                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
-                T_IMG data = (T_IMG)(0);
-                if(con)
-                {
-                    data = src[mad24(selected_row, src_step, selected_cols)];
-                }
-                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
-
-                if(lX < (ANX << 1))
-                {
-                    // The first 2*ANX work-items also fetch the columns just past the group.
-                    selected_cols = cols_start_index_group + lX + groupX_size;
-
-                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
-                    data = (T_IMG)(0);
-                    if(con)
-                    {
-                        data = src[mad24(selected_row, src_step, selected_cols)];
-                    }
-                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
-                }
-#else
-                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
-                selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);
-
-                int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
-                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
-
-                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
-
-                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
-
-                if(lX < (ANX << 1))
-                {
-                    // The first 2*ANX work-items also fetch the columns just past the group.
-                    selected_cols = cols_start_index_group + lX + groupX_size;
-                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
-
-                    data = src[mad24(selected_row, src_step, selected_cols)];
-                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
-                }
-#endif
-            }
-        }
-    }
-    // Every work-item must reach this barrier before anyone reads the tile.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
-    if(((gY << 2) < rows) && (process_col < operate_cols))
-    {
-        int dst_cols_start = dst_offset_x;
-        int dst_cols_end = dst_offset_x + cols;
-        // Round down to a multiple of four elements so the x4 vector access below is aligned.
-        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
-
-        int dst_rows_end = dst_offset_y + rows;
-        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
-        dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
-
-        T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
-        T_IMGx4 data;
-
-        for(int i = 0; i < FILTER_SIZE; i++)
-        {
-#pragma unroll
-            for(int j = 0; j < FILTER_SIZE; j++)
-            {
-                if(dst_rows_index < dst_rows_end)
-                {
-                    int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
-                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
-
-                    data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
-                    sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
-                }
-            }
-        }
-
-        if(dst_rows_index < dst_rows_end)
-        {
-            // The previous contents are only needed (and only read) for rows that are written,
-            // so rows past dst_rows_end are never dereferenced.
-            T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
-
-            T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
-            tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
-                         tmp_dst.SX : dst_data.SX;
-            tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
-                         tmp_dst.SY : dst_data.SY;
-            tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
-                         tmp_dst.SZ : dst_data.SZ;
-            tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
-                         tmp_dst.SW : dst_data.SW;
-            *(__global T_IMGx4 *)dst = tmp_dst;
-        }
-    }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Peng Xiao, pengxiao@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// Requests AMD's printf extension; no kernel visible in this part of the file calls printf.
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-
-// Squared-sum tables are read as double when DOUBLE_SUPPORT is defined, as float otherwise.
-#if defined (DOUBLE_SUPPORT)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-
-#define TYPE_IMAGE_SQSUM double
-#else
-#define TYPE_IMAGE_SQSUM float
-#endif
-
-// CN4 is the column multiplier used by SQSUMS_PTR: 1 by default, 4 when the build defines CN4.
-// The incoming definition is dropped first so that giving CN4 a different replacement text
-// (for example the implicit 1 of "-D CN4") is not an incompatible macro redefinition.
-#ifndef CN4
-#define CN4 1
-#else
-#undef CN4
-#define CN4 4
-#endif
-
-//////////////////////////////////////////////////
-// utilities
-// Element index into a summed table at (gidx + ox, gidy + oy). Both macros expand to the
-// identifiers gidx, gidy and img_*sums_step / img_*sums_offset, which must exist under exactly
-// these names at the point of use. Arguments are parenthesized so expressions can be passed safely.
-#define SQSUMS_PTR(ox, oy) mad24(gidy + (oy), img_sqsums_step, (gidx + img_sqsums_offset + (ox)) * CN4)
-#define SUMS_PTR(ox, oy) mad24(gidy + (oy), img_sums_step, gidx + img_sums_offset + (ox))
-// The normAcc* helpers are the normalization routines used by the NORMED kernels; they exist
-// so that the GPU matchTemplate results stay consistent with the CPU implementation.
-//
-// normAcc returns num / denum while |num| < denum, snaps to +1 or -1 (sign of num) while
-// |num| < 1.125 * denum, and yields 0 in every other case.
-float normAcc(float num, float denum)
-{
-    const float magnitude = fabs(num);
-    float result = 0;
-
-    if(magnitude < denum)
-    {
-        result = num / denum;
-    }
-    else if(magnitude < denum * 1.125f)
-    {
-        // Slightly above the denominator: treat as a perfect (anti-)correlation.
-        result = (num > 0) ? 1 : -1;
-    }
-    return result;
-}
-
-// Same as normAcc except for the fallback: returns num / denum while |num| < denum, snaps to
-// +1 or -1 (sign of num) while |num| < 1.125 * denum, and yields 1 in every other case.
-float normAcc_SQDIFF(float num, float denum)
-{
-    const float magnitude = fabs(num);
-    float result = 1;
-
-    if(magnitude < denum)
-    {
-        result = num / denum;
-    }
-    else if(magnitude < denum * 1.125f)
-    {
-        result = (num > 0) ? 1 : -1;
-    }
-    return result;
-}
-//////////////////////////////////////////////////////////////////////
-// normalize
-
-__kernel
-void normalizeKernel_C1_D0
-(
-    __global const float * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    // Normalizes res in place: each element is passed through normAcc with the denominator
-    // sqrt(window squared-sum * tpl_sqsum), where the window squared-sum is the four-corner
-    // difference of img_sqsums over a tpl_cols x tpl_rows window at (gidx, gidy).
-    // gidx, gidy, img_sqsums_step and img_sqsums_offset keep these names because SQSUMS_PTR expands to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sqsums_step = img_sqsums_step / sizeof(*img_sqsums);
-    img_sqsums_offset = img_sqsums_offset / sizeof(*img_sqsums);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    const float bottom_right = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
-    const float top_right = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
-    const float bottom_left = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
-    const float top_left = img_sqsums[SQSUMS_PTR(0, 0)];
-    const float window_sqsum = (float)((bottom_right - top_right) - (bottom_left - top_left));
-
-    const int out = mad24(gidy, res_pitch, res_origin + gidx);
-    res[out] = normAcc(res[out], sqrt(window_sqsum * tpl_sqsum));
-}
-
-__kernel
-void matchTemplate_Prepared_SQDIFF_C1_D0
-(
-    __global const TYPE_IMAGE_SQSUM * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    // Rewrites res in place as: window squared-sum - 2 * res + tpl_sqsum, where the window
-    // squared-sum is the four-corner difference of img_sqsums over a tpl_cols x tpl_rows
-    // window at (gidx, gidy).
-    // gidx, gidy, img_sqsums_step and img_sqsums_offset keep these names because SQSUMS_PTR expands to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sqsums_step = img_sqsums_step / sizeof(*img_sqsums);
-    img_sqsums_offset = img_sqsums_offset / sizeof(*img_sqsums);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    // The corner arithmetic stays in the table's element type and is narrowed to float once.
-    const TYPE_IMAGE_SQSUM bottom_right = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
-    const TYPE_IMAGE_SQSUM top_right = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
-    const TYPE_IMAGE_SQSUM bottom_left = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
-    const TYPE_IMAGE_SQSUM top_left = img_sqsums[SQSUMS_PTR(0, 0)];
-    const float window_sqsum = (float)((bottom_right - top_right) - (bottom_left - top_left));
-
-    const int out = mad24(gidy, res_pitch, res_origin + gidx);
-    const float ccorr = res[out];
-    res[out] = window_sqsum - 2.f * ccorr + tpl_sqsum;
-}
-
-__kernel
-void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
-(
-    __global const float * img_sqsums,
-    __global float * res,
-    ulong tpl_sqsum,
-    int res_rows,
-    int res_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    int res_offset,
-    int res_step
-)
-{
-    // Rewrites res in place as normAcc_SQDIFF(window squared-sum - 2 * res + tpl_sqsum,
-    // sqrt(window squared-sum * tpl_sqsum)), where the window squared-sum is the four-corner
-    // difference of img_sqsums over a tpl_cols x tpl_rows window at (gidx, gidy).
-    // gidx, gidy, img_sqsums_step and img_sqsums_offset keep these names because SQSUMS_PTR expands to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sqsums_step = img_sqsums_step / sizeof(*img_sqsums);
-    img_sqsums_offset = img_sqsums_offset / sizeof(*img_sqsums);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    const float bottom_right = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
-    const float top_right = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
-    const float bottom_left = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
-    const float top_left = img_sqsums[SQSUMS_PTR(0, 0)];
-    const float window_sqsum = (float)((bottom_right - top_right) - (bottom_left - top_left));
-
-    const int out = mad24(gidy, res_pitch, res_origin + gidx);
-    const float ccorr = res[out];
-    res[out] = normAcc_SQDIFF(window_sqsum - 2.f * ccorr + tpl_sqsum,
-                              sqrt(window_sqsum * tpl_sqsum));
-}
-
-//////////////////////////////////////////////////
-// SQDIFF
-__kernel
-void matchTemplate_Naive_SQDIFF_C1_D0
-(
-    __global const uchar * img,
-    __global const uchar * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force squared difference for single-channel uchar data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates (img - tpl)^2 in an int and writes the
-    // total to one element of res. img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Only the result step/offset are divided by an element size; the uchar image and
-    // template steps/offsets are used as given.
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    int acc = 0;
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const uchar * img_row = img + mad24(row + ty, img_step, col + img_offset);
-        __global const uchar * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            const int diff = img_row[tx] - tpl_row[tx];
-            acc = mad24(diff, diff, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc;
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C1_D5
-(
-    __global const float * img,
-    __global const float * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force squared difference for single-channel float data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates (img - tpl)^2 with mad() and writes the
-    // total to one element of res. img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    float acc = 0;
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const float * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const float * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            const float diff = img_row[tx] - tpl_row[tx];
-            acc = mad(diff, diff, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc;
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C4_D0
-(
-    __global const uchar4 * img,
-    __global const uchar4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force squared difference for four-channel uchar data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates per-channel (img - tpl)^2 in an int4 and
-    // writes the sum of the four channel totals to one element of res.
-    // img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    int4 acc = (int4)(0, 0, 0, 0);
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const uchar4 * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const uchar4 * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            // Widen each operand to int4 BEFORE subtracting so negative differences survive;
-            // subtracting the uchar4 values first and converting afterwards would be wrong.
-            const int4 diff = convert_int4(img_row[tx]) - convert_int4(tpl_row[tx]);
-            acc = mad24(diff, diff, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc.x + acc.y + acc.z + acc.w;
-}
-
-__kernel
-void matchTemplate_Naive_SQDIFF_C4_D5
-(
-    __global const float4 * img,
-    __global const float4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force squared difference for four-channel float data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates per-channel (img - tpl)^2 with mad() and
-    // writes the sum of the four channel totals to one element of res.
-    // img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    float4 acc = (float4)(0, 0, 0, 0);
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const float4 * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const float4 * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            // Component-wise difference of the two pixels.
-            const float4 diff = img_row[tx] - tpl_row[tx];
-            acc = mad(diff, diff, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc.x + acc.y + acc.z + acc.w;
-}
-
-//////////////////////////////////////////////////
-// CCORR
-__kernel
-void matchTemplate_Naive_CCORR_C1_D0
-(
-    __global const uchar * img,
-    __global const uchar * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force cross-correlation for single-channel uchar data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates img * tpl in an int and writes the total,
-    // converted to float, to one element of res. img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Only the result step/offset are divided by an element size; the uchar image and
-    // template steps/offsets are used as given.
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    int acc = 0;
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const uchar * img_row = img + mad24(row + ty, img_step, col + img_offset);
-        __global const uchar * tpl_row = tpl + mad24(ty, tpl_step, tpl_offset);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            const int img_val = convert_int(img_row[tx]);
-            const int tpl_val = convert_int(tpl_row[tx]);
-            acc = mad24(img_val, tpl_val, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = (float)acc;
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C1_D5
-(
-    __global const float * img,
-    __global const float * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force cross-correlation for single-channel float data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates img * tpl with mad() and writes the total
-    // to one element of res. img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    float acc = 0;
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const float * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const float * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            acc = mad(img_row[tx], tpl_row[tx], acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc;
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C4_D0
-(
-    __global const uchar4 * img,
-    __global const uchar4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force cross-correlation for four-channel uchar data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates per-channel img * tpl in an int4 and
-    // writes the sum of the four channel totals, converted to float, to one element of res.
-    // img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    int4 acc = (int4)(0, 0, 0, 0);
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const uchar4 * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const uchar4 * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            const int4 img_val = convert_int4(img_row[tx]);
-            const int4 tpl_val = convert_int4(tpl_row[tx]);
-            acc = mad24(img_val, tpl_val, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = (float)(acc.x + acc.y + acc.z + acc.w);
-}
-
-__kernel
-void matchTemplate_Naive_CCORR_C4_D5
-(
-    __global const float4 * img,
-    __global const float4 * tpl,
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int img_offset,
-    int tpl_offset,
-    int res_offset,
-    int img_step,
-    int tpl_step,
-    int res_step
-)
-{
-    // Brute-force cross-correlation for four-channel float data: each work-item walks the
-    // whole tpl_cols x tpl_rows template, accumulates per-channel img * tpl with mad() and
-    // writes the sum of the four channel totals to one element of res.
-    // img_rows and img_cols are not referenced.
-    const int col = get_global_id(0);
-    const int row = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    const int img_pitch = img_step / sizeof(*img);
-    const int img_origin = img_offset / sizeof(*img);
-    const int tpl_pitch = tpl_step / sizeof(*tpl);
-    const int tpl_origin = tpl_offset / sizeof(*tpl);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(col >= res_cols || row >= res_rows)
-    {
-        return;
-    }
-
-    float4 acc = (float4)(0, 0, 0, 0);
-    for(int ty = 0; ty < tpl_rows; ++ty)
-    {
-        // Start of the current image window row and of the matching template row.
-        __global const float4 * img_row = img + mad24(row + ty, img_pitch, col + img_origin);
-        __global const float4 * tpl_row = tpl + mad24(ty, tpl_pitch, tpl_origin);
-        for(int tx = 0; tx < tpl_cols; ++tx)
-        {
-            const float4 img_val = convert_float4(img_row[tx]);
-            const float4 tpl_val = convert_float4(tpl_row[tx]);
-            acc = mad(img_val, tpl_val, acc);
-        }
-    }
-    res[mad24(row, res_pitch, res_origin + col)] = acc.x + acc.y + acc.z + acc.w;
-}
-
-//////////////////////////////////////////////////
-// CCOFF
-__kernel
-void matchTemplate_Prepared_CCOFF_C1_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    __global const uint * img_sums,
-    int img_sums_offset,
-    int img_sums_step,
-    float tpl_sum
-)
-{
-    // Subtracts (window sum * tpl_sum) from every element of res, where the window sum is the
-    // four-corner difference of img_sums over a tpl_cols x tpl_rows window at (gidx, gidy).
-    // img_rows and img_cols are not referenced.
-    // gidx, gidy, img_sums_step and img_sums_offset keep these names because SUMS_PTR expands to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sums_offset = img_sums_offset / sizeof(*img_sums);
-    img_sums_step = img_sums_step / sizeof(*img_sums);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    // The corner arithmetic is done on uint values and converted to float once.
-    const uint bottom_right = img_sums[SUMS_PTR(tpl_cols, tpl_rows)];
-    const uint top_right = img_sums[SUMS_PTR(tpl_cols, 0)];
-    const uint bottom_left = img_sums[SUMS_PTR(0, tpl_rows)];
-    const uint top_left = img_sums[SUMS_PTR(0, 0)];
-    const float window_sum = (float)((bottom_right - top_right) - (bottom_left - top_left));
-
-    res[mad24(gidy, res_pitch, res_origin + gidx)] -= window_sum * tpl_sum;
-}
-__kernel
-void matchTemplate_Prepared_CCOFF_C4_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    __global const uint * img_sums_c0,
-    __global const uint * img_sums_c1,
-    __global const uint * img_sums_c2,
-    __global const uint * img_sums_c3,
-    int img_sums_offset,
-    int img_sums_step,
-    float tpl_sum_c0,
-    float tpl_sum_c1,
-    float tpl_sum_c2,
-    float tpl_sum_c3
-)
-{
-    // Four-plane variant: for each of the four sum tables, subtracts (tpl_sum_cN * window sum)
-    // from the res element, where the window sum is the four-corner difference over a
-    // tpl_cols x tpl_rows window at (gidx, gidy). All four tables share one offset and step.
-    // img_rows and img_cols are not referenced.
-    // gidx, gidy, img_sums_step and img_sums_offset keep these names because SUMS_PTR expands to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sums_offset = img_sums_offset / sizeof(*img_sums_c0);
-    img_sums_step = img_sums_step / sizeof(*img_sums_c0);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    // The four corner indices are identical for every plane, so compute them once.
-    const int bottom_right = SUMS_PTR(tpl_cols, tpl_rows);
-    const int top_right = SUMS_PTR(tpl_cols, 0);
-    const int bottom_left = SUMS_PTR(0, tpl_rows);
-    const int top_left = SUMS_PTR(0, 0);
-
-    const int out = mad24(gidy, res_pitch, res_origin + gidx);
-    float ccorr = res[out];
-    ccorr -= tpl_sum_c0 * (float)(
-                 (img_sums_c0[bottom_right] - img_sums_c0[top_right])
-                 - (img_sums_c0[bottom_left] - img_sums_c0[top_left]));
-    ccorr -= tpl_sum_c1 * (float)(
-                 (img_sums_c1[bottom_right] - img_sums_c1[top_right])
-                 - (img_sums_c1[bottom_left] - img_sums_c1[top_left]));
-    ccorr -= tpl_sum_c2 * (float)(
-                 (img_sums_c2[bottom_right] - img_sums_c2[top_right])
-                 - (img_sums_c2[bottom_left] - img_sums_c2[top_left]));
-    ccorr -= tpl_sum_c3 * (float)(
-                 (img_sums_c3[bottom_right] - img_sums_c3[top_right])
-                 - (img_sums_c3[bottom_left] - img_sums_c3[top_left]));
-    res[out] = ccorr;
-}
-
-__kernel
-void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    float weight,
-    __global const uint * img_sums,
-    int img_sums_offset,
-    int img_sums_step,
-    __global const float * img_sqsums,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    float tpl_sum,
-    float tpl_sqsum
-)
-{
-    // Rewrites res in place as
-    //   normAcc(res - S * tpl_sum, sqrt(tpl_sqsum * (Q - weight * S * S)))
-    // where S and Q are the four-corner differences of img_sums and img_sqsums over a
-    // tpl_cols x tpl_rows window at (gidx, gidy). img_rows and img_cols are not referenced.
-    // gidx, gidy and the img_*sums step/offset parameters keep these names because the
-    // SUMS_PTR / SQSUMS_PTR macros expand to them.
-    const int gidx = get_global_id(0);
-    const int gidy = get_global_id(1);
-
-    // Steps and offsets are divided by the element size before being used as element indices.
-    img_sqsums_step = img_sqsums_step / sizeof(*img_sqsums);
-    img_sqsums_offset = img_sqsums_offset / sizeof(*img_sqsums);
-    img_sums_offset = img_sums_offset / sizeof(*img_sums);
-    img_sums_step = img_sums_step / sizeof(*img_sums);
-    const int res_pitch = res_step / sizeof(*res);
-    const int res_origin = res_offset / sizeof(*res);
-
-    if(gidx >= res_cols || gidy >= res_rows)
-    {
-        return;
-    }
-
-    // Window sum: corner arithmetic on uint values, converted to float once.
-    const uint sum_br = img_sums[SUMS_PTR(tpl_cols, tpl_rows)];
-    const uint sum_tr = img_sums[SUMS_PTR(tpl_cols, 0)];
-    const uint sum_bl = img_sums[SUMS_PTR(0, tpl_rows)];
-    const uint sum_tl = img_sums[SUMS_PTR(0, 0)];
-    const float window_sum = (float)((sum_br - sum_tr) - (sum_bl - sum_tl));
-
-    // Window squared-sum from the float table.
-    const float sq_br = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)];
-    const float sq_tr = img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
-    const float sq_bl = img_sqsums[SQSUMS_PTR(0, tpl_rows)];
-    const float sq_tl = img_sqsums[SQSUMS_PTR(0, 0)];
-    const float window_sqsum = (float)((sq_br - sq_tr) - (sq_bl - sq_tl));
-
-    const int out = mad24(gidy, res_pitch, res_origin + gidx);
-    res[out] = normAcc(res[out] - window_sum * tpl_sum,
-                       sqrt(tpl_sqsum * (window_sqsum - weight * window_sum * window_sum)));
-}
-/*
- * matchTemplate_Prepared_CCOFF_NORMED_C4_D0
- *
- * Normalises a 4-channel correlation result in place.  For every element
- * inside res_cols x res_rows the kernel
- *   1. forms, per channel, a four-corner difference over a tpl_cols x tpl_rows
- *      window of the sum table and of the squared-sum table,
- *   2. subtracts sum(image_sum_cN * tpl_sum_cN) from the value already in res,
- *   3. divides by sqrt(tpl_sqsum * sum(image_sqsum_cN - weight * image_sum_cN^2))
- *      through normAcc().
- *
- * Steps and offsets are divided by the element size on entry.
- * SUMS_PTR, SQSUMS_PTR and normAcc are defined outside this block; the
- * locals gidx/gidy and the converted step/offset values are presumably what
- * those macros read - keep their names unchanged.
- */
-__kernel
-void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
-(
-    __global float * res,
-    int img_rows,
-    int img_cols,
-    int tpl_rows,
-    int tpl_cols,
-    int res_rows,
-    int res_cols,
-    int res_offset,
-    int res_step,
-    float weight,
-    __global const uint * img_sums_c0,
-    __global const uint * img_sums_c1,
-    __global const uint * img_sums_c2,
-    __global const uint * img_sums_c3,
-    int img_sums_offset,
-    int img_sums_step,
-    __global const float * img_sqsums_c0,
-    __global const float * img_sqsums_c1,
-    __global const float * img_sqsums_c2,
-    __global const float * img_sqsums_c3,
-    int img_sqsums_offset,
-    int img_sqsums_step,
-    float tpl_sum_c0,
-    float tpl_sum_c1,
-    float tpl_sum_c2,
-    float tpl_sum_c3,
-    float tpl_sqsum
-)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    // Convert byte quantities into element counts of the respective buffers.
-    img_sqsums_step   /= sizeof(*img_sqsums_c0);
-    img_sqsums_offset /= sizeof(*img_sqsums_c0);
-    img_sums_offset   /= sizeof(*img_sums_c0);
-    img_sums_step     /= sizeof(*img_sums_c0);
-    res_step          /= sizeof(*res);
-    res_offset        /= sizeof(*res);
-
-    int res_idx = mad24(gidy, res_step, res_offset + gidx);
-
-    if(gidx < res_cols && gidy < res_rows)
-    {
-        // Window sums, one per channel: (BR - TR) - (BL - TL).
-        float image_sum_c0 = (float)(
-            (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
-            - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
-        float image_sum_c1 = (float)(
-            (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
-            - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
-        float image_sum_c2 = (float)(
-            (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
-            - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
-        float image_sum_c3 = (float)(
-            (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
-            - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
-
-        // Window sums of squares, one per channel, same corner pattern.
-        float image_sqsum_c0 = (float)(
-            (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c1 = (float)(
-            (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c2 = (float)(
-            (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
-        float image_sqsum_c3 = (float)(
-            (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
-            (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
-
-        float num = res[res_idx] -
-                    image_sum_c0 * tpl_sum_c0 -
-                    image_sum_c1 * tpl_sum_c1 -
-                    image_sum_c2 * tpl_sum_c2 -
-                    image_sum_c3 * tpl_sum_c3;
-        // Every channel contributes sqsum - weight * sum^2; the channel-3 term
-        // must square image_sum_c3 (it previously mixed in image_sum_c0).
-        float denom = sqrt( tpl_sqsum * (
-                    image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
-                    image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
-                    image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
-                    image_sqsum_c3 - weight * image_sum_c3 * image_sum_c3)
-                    );
-        res[res_idx] = normAcc(num, denom);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////
-// extractFirstChannel
-/*
- * Writes the .x component of every float4 element of img into the float
- * image res.  Steps and offsets are divided by the element size of their
- * buffer before being used as indices.
- */
-__kernel
-void extractFirstChannel
-(
-    const __global float4* img,
-    __global float* res,
-    int rows,
-    int cols,
-    int img_offset,
-    int res_offset,
-    int img_step,
-    int res_step
-)
-{
-    // Bytes -> elements, separately for the float4 source and float destination.
-    img_offset /= sizeof(float4);
-    img_step   /= sizeof(float4);
-    res_offset /= sizeof(float);
-    res_step   /= sizeof(float);
-
-    const __global float4* src_base = img + img_offset;
-    __global float* dst_base = res + res_offset;
-
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
-
-    // Work-items outside the image have nothing to copy.
-    if(gidx >= cols || gidy >= rows)
-        return;
-
-    float4 pixel = src_base[gidx + gidy * img_step];
-    dst_base[gidx + gidy * res_step] = pixel.x;
-}
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Niko Li, newlife20080214@gmail.com
-// Zero Lin, zero.lin@amd.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-
-/*
-__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
- int rows, int srcStep, int dstStep, int m)
-{
- int dx = get_global_id(0)-(m>>1);
- int dy = get_global_id(1)-(m>>1);
-
- short histom[256];
- for(int i=0;i<256;++i)
- histom[i]=0;
-
-
- for(int i=0;i<m;++i)
- {
- __global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
- for(int j=dx;j<dx+m;++j)
- {
- histom[data[clamp(j, 0, cols-1)]]++;
- }
- }
-
- int now=0;
- int goal=(m*m+1)>>1;
- int v;
- for(int i=0;i<256;++i)
- {
- v=(now<goal?i:v);
- now+=histom[i];
- }
-
- if(dy<rows && dx<cols)
- dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
-}
-*/
-// 3x3 median-style filter for uchar4 pixels (min/max act per component).
-//
-// The work-group stages an 18x18 tile (its block plus a one-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 3x3 neighbourhood and stores p4 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local uchar4 data[18][18];
-    __global uchar4* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus one).
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    // Flat ids 0..161 each load cell (dr,dc) and the cell nine rows below it;
-    // larger ids are clamped to 161 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    uchar4 mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 3x3 median-style filter for single-channel uchar pixels.
-//
-// The work-group stages an 18x18 tile (its block plus a one-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 3x3 neighbourhood and stores p4 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local uchar data[18][18];
-    __global uchar* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus one).
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    // Flat ids 0..161 each load cell (dr,dc) and the cell nine rows below it;
-    // larger ids are clamped to 161 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    uchar mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 3x3 median-style filter for single-channel float pixels.
-//
-// The work-group stages an 18x18 tile (its block plus a one-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 3x3 neighbourhood and stores p4 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local float data[18][18];
-    __global float* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus one).
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    // Flat ids 0..161 each load cell (dr,dc) and the cell nine rows below it;
-    // larger ids are clamped to 161 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    float mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 3x3 median-style filter for float4 pixels (min/max act per component).
-//
-// The work-group stages an 18x18 tile (its block plus a one-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 3x3 neighbourhood and stores p4 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local float4 data[18][18];
-    __global float4* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus one).
-    int dx = get_global_id(0) - get_local_id(0) -1;
-    int dy = get_global_id(1) - get_local_id(1) -1;
-
-    // Flat ids 0..161 each load cell (dr,dc) and the cell nine rows below it;
-    // larger ids are clamped to 161 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
-
-    int dr=id/18;
-    int dc=id%18;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+9, 0, rows-1);
-    data[dr+9][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
-    float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
-    float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
-    float4 mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
-    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
-    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
-    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
-    op(p4, p2); op(p6, p4); op(p4, p2);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 5x5 median-style filter for uchar4 pixels (min/max act per component).
-//
-// The work-group stages a 20x20 tile (its block plus a two-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 5x5 neighbourhood and stores p12 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local uchar4 data[20][20];
-    __global uchar4* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus two).
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    // Flat ids 0..199 each load cell (dr,dc) and the cell ten rows below it;
-    // larger ids are clamped to 199 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    uchar4 mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 5x5 median-style filter for single-channel uchar pixels.
-//
-// The work-group stages a 20x20 tile (its block plus a two-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 5x5 neighbourhood and stores p12 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local uchar data[20][20];
-    __global uchar* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus two).
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    // Flat ids 0..199 each load cell (dr,dc) and the cell ten rows below it;
-    // larger ids are clamped to 199 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    uchar mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 5x5 median-style filter for float4 pixels (min/max act per component).
-//
-// The work-group stages a 20x20 tile (its block plus a two-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 5x5 neighbourhood and stores p12 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local float4 data[20][20];
-    __global float4* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus two).
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    // Flat ids 0..199 each load cell (dr,dc) and the cell ten rows below it;
-    // larger ids are clamped to 199 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    float4 mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
-
-// 5x5 median-style filter for single-channel float pixels.
-//
-// The work-group stages a 20x20 tile (its block plus a two-pixel ring) in
-// local memory; each work-item then runs a fixed compare-exchange network over
-// its 5x5 neighbourhood and stores p12 as the result.
-//
-//   src, dst              input / output images
-//   srcOffset, dstOffset  start offsets, applied in element units
-//   srcStep, dstStep      row pitch, used directly as an element count
-//   cols, rows            image size; reads outside it are clamped to the edge
-//
-// NOTE(review): the flat id below hard-codes a 16x16 work-group - confirm
-// against the host-side launch configuration.
-//
-// op(a,b): compare-exchange, leaves the smaller value in a and the larger in b.
-// Requires a scratch variable named 'mid' in the enclosing scope.
-#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
-__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols,
-                                  int rows, int srcStep, int dstStep)
-{
-
-    __local float data[20][20];
-    __global float* source=src + srcOffset;
-
-    // Image coordinates of the tile's top-left cell (group origin minus two).
-    int dx = get_global_id(0) - get_local_id(0) -2;
-    int dy = get_global_id(1) - get_local_id(1) -2;
-
-    // Flat ids 0..199 each load cell (dr,dc) and the cell ten rows below it;
-    // larger ids are clamped to 199 and simply repeat that load.
-    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
-
-    int dr=id/20;
-    int dc=id%20;
-    int r=clamp(dy+dr, 0, rows-1);
-    int c=clamp(dx+dc, 0, cols-1);
-
-    data[dr][dc] = source[r*srcStep + c];
-    r=clamp(dy+dr+10, 0, rows-1);
-    data[dr+10][dc] = source[r*srcStep + c];
-
-    // The whole tile must be written before any neighbourhood is read.
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    int x =get_local_id(0);
-    int y =get_local_id(1);
-    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
-    float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
-    float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
-    float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
-    float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
-    float mid;
-
-    // Order-sensitive exchange network: do not reorder.
-    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
-    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
-    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
-    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
-    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
-    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
-    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
-    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
-    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
-    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
-    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
-    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
-    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
-    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
-    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
-    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
-    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
-    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
-    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
-    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
-    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
-    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
-    op(p7, p11); op(p11, p13); op(p11, p12);
-
-    if(get_global_id(1)<rows && get_global_id(0)<cols)
-        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
-}
-// #undef takes a bare identifier; a parameter list here is an error or warning.
-#undef op
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Shengen Yan,yanshengen@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Index-remapping helpers, selected by one BORDER_* build define:
-//   ADDR_L(i, l_edge, r_edge)  column index i, remapped when i <  l_edge
-//   ADDR_R(i, r_edge, addr)    remapped when i >= r_edge, otherwise yields addr
-//   ADDR_H(i, t_edge, b_edge)  row index i, remapped when i <  t_edge
-//   ADDR_B(i, b_edge, addr)    remapped when i >= b_edge, otherwise yields addr
-// Callers chain them: the ADDR_L / ADDR_H result is passed as 'addr' to
-// ADDR_R / ADDR_B.  None of them is defined when no BORDER_* below is set.
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
-
-#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-// The left/top remaps mirror around index 0 and ignore the edge argument.
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
-// The left/top remaps mirror around index 0 and ignore the edge argument.
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_WRAP
-//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
-#endif
-
-// Width of the per-work-group local arrays indexed by get_local_id(0).
-#define THREADS 256
-// elem1 when l_edge <= i < r_edge, else elem2.  The whole expansion is
-// parenthesised so the conditional cannot bind to neighbouring operators.
-#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////calcMinEigenVal///////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// calcMinEigenVal
-//
-// For each destination pixel, sums Dx*Dx, Dx*Dy and Dy*Dy over a ksX x ksY
-// window and writes the smaller eigenvalue of [[Sxx, Sxy], [Sxy, Syy]]:
-//     (a + c) - sqrt((a - c)^2 + b^2),  a = Sxx/2, b = Sxy, c = Syy/2.
-//
-// Each work-item handles one column and two consecutive output rows:
-//   1. load ksY+1 rows of Dx/Dy for its column (border handling selected by
-//      the BORDER_* define) and form the three products,
-//   2. sum them vertically for both rows (rows 1..ksY-1 are shared) into temp[],
-//   3. after the barrier, sum temp[] horizontally and write the two results.
-//
-// Dx, Dy           derivative images (float)
-// dst              output image (float)
-// *_offset, *_step byte quantities; >> 2 converts to float elements
-// *_whole_rows/cols extents used for border handling
-// dst_rows/cols    extent of the region that is written
-// k                not used by this kernel
-//
-// ksX, ksY, anX, anY are not defined in this block - presumably supplied as
-// build options (window size and anchor); TODO confirm.
-// NOTE(review): temp[][THREADS] is indexed by get_local_id(0), so the launch is
-// assumed to use at most THREADS work-items in x - confirm on the host side.
-__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
-                              int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
-                              int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
-                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
-                              float k)
-{
-    int col = get_local_id(0);
-    const int gX = get_group_id(0);
-    const int gY = get_group_id(1);
-    const int gly = get_global_id(1);
-
-    // Split every byte offset into a column (in floats) and a row.
-    int dx_x_off = (dx_offset % dx_step) >> 2;
-    int dx_y_off = dx_offset / dx_step;
-    int dy_x_off = (dy_offset % dy_step) >> 2;
-    int dy_y_off = dy_offset / dy_step;
-    int dst_x_off = (dst_offset % dst_step) >> 2;
-    int dst_y_off = dst_offset / dst_step;
-
-    // A group yields THREADS-ksX+1 output columns; inputs start anX/anY earlier.
-    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
-    int dx_startY = (gY << 1) - anY + dx_y_off;
-    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
-    int dy_startY = (gY << 1) - anY + dy_y_off;
-    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
-    int dst_startY = (gY << 1) + dst_y_off;
-
-    float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
-    __local float temp[6][THREADS];
-#ifdef BORDER_CONSTANT
-    bool dx_con,dy_con;
-    for(int i=0; i < ksY+1; i++)
-    {
-        // Load only when the coordinate is inside the image; the conditional
-        // operator keeps the out-of-range global read from being executed.
-        dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
-        dx_data[i] = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f;
-        dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
-        dy_data[i] = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f;
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#else
-    int clamped_col = min(dst_cols, col);
-
-    for(int i=0; i < ksY+1; i++)
-    {
-        int dx_selected_row;
-        int dx_selected_col;
-        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
-        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
-        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
-        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
-        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
-
-        int dy_selected_row;
-        int dy_selected_col;
-        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
-        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
-        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
-        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
-        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
-
-        data[0][i] = dx_data[i] * dx_data[i];
-        data[1][i] = dx_data[i] * dy_data[i];
-        data[2][i] = dy_data[i] * dy_data[i];
-    }
-#endif
-    // Rows 1..ksY-1 belong to both output rows; row 0 only to the first and
-    // row ksY only to the second.
-    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
-    for(int i=1; i < ksY; i++)
-    {
-        sum0 += (data[0][i]);
-        sum1 += (data[1][i]);
-        sum2 += (data[2][i]);
-    }
-    float sum01,sum02,sum11,sum12,sum21,sum22;
-    sum01 = sum0 + (data[0][0]);
-    sum02 = sum0 + (data[0][ksY]);
-    temp[0][col] = sum01;
-    temp[1][col] = sum02;
-    sum11 = sum1 + (data[1][0]);
-    sum12 = sum1 + (data[1][ksY]);
-    temp[2][col] = sum11;
-    temp[3][col] = sum12;
-    sum21 = sum2 + (data[2][0]);
-    sum22 = sum2 + (data[2][ksY]);
-    temp[4][col] = sum21;
-    temp[5][col] = sum22;
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // Only the first THREADS-ksX+1 work-items produce output; the remaining
-    // ones just contributed their column sums above.
-    if(col < (THREADS-(ksX-1)))
-    {
-        col += anX;
-        int posX = dst_startX - dst_x_off + col - anX;
-        int posY = (gly << 1);
-        // One column fewer on the right when ksX is even.
-        int till = (ksX + 1)%2;
-        float tmp_sum[6]={ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
-        for(int s=0; s<6; s++)
-            for(int i=-anX; i<=anX - till; i++)
-            {
-                tmp_sum[s] += temp[s][col+i];
-            }
-
-        if(posX < dst_cols && (posY) < dst_rows)
-        {
-            float a = tmp_sum[0] * 0.5f;
-            float b = tmp_sum[2];
-            float c = tmp_sum[4] * 0.5f;
-            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
-        }
-        if(posX < dst_cols && (posY + 1) < dst_rows)
-        {
-            float a = tmp_sum[1] * 0.5f;
-            float b = tmp_sum[3];
-            float c = tmp_sum[5] * 0.5f;
-            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
-        }
-    }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Sen Liu, swjtuls1987@126.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-// Double-precision build: enable whichever fp64 extension the compiler advertises.
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-typedef double T; // element type of the icvContourMoments output buffer
-typedef double F; // scalar type of the moment buffers (dst_m / sum)
-typedef double4 F4; // 4-wide vector of F
-#define convert_F4 convert_double4
-// Fallback when DOUBLE_SUPPORT is not defined: float moments, 64-bit integer T.
-#else
-typedef float F; // scalar type of the moment buffers (dst_m / sum)
-typedef float4 F4; // 4-wide vector of F
-typedef long T; // integer type: (T) casts of float contour coordinates drop the fractional part
-#define convert_F4 convert_float4
-#endif
-// DST_ROW_pq is the row slot used for moment m_pq in the output buffers of the kernels below.
-#define DST_ROW_00 0 // m00
-#define DST_ROW_10 1 // m10
-#define DST_ROW_01 2 // m01
-#define DST_ROW_20 3 // m20
-#define DST_ROW_11 4 // m11
-#define DST_ROW_02 5 // m02
-#define DST_ROW_30 6 // m30
-#define DST_ROW_21 7 // m21
-#define DST_ROW_12 8 // m12
-#define DST_ROW_03 9 // m03
-
-// Per-edge contribution to the ten contour moments.
-//
-// Work-item idx handles the edge from contour vertex idx to the following
-// vertex (the last vertex pairs with vertex 0).  Vertices are read as
-// interleaved (x, y) floats from reader_oclmat_data and cast to T.  The ten
-// products are stored at column idx of rows DST_ROW_pq of dst_a; dst_step is
-// the row pitch of dst_a in bytes.  Work-items outside [0, contour_total)
-// do nothing.
-__kernel void icvContourMoments(int contour_total,
-                                __global float* reader_oclmat_data,
-                                __global T* dst_a,
-                                int dst_step)
-{
-    const int idx = get_global_id(0);
-    if (!(idx >= 0 && idx < contour_total))
-        return;
-
-    // Vertex handled by this work-item.
-    const T x0 = (T)(reader_oclmat_data[get_global_id(0) << 1]);
-    const T y0 = (T)(reader_oclmat_data[(get_global_id(0) << 1) + 1]);
-
-    // Following vertex; the closing edge wraps back to the first one.
-    const int next = (idx == contour_total - 1) ? 0 : (idx + 1) * 2;
-    const T x1 = (T)(reader_oclmat_data[next]);
-    const T y1 = (T)(reader_oclmat_data[next + 1]);
-
-    const T x0sq = x0 * x0;
-    const T y0sq = y0 * y0;
-    const T x1sq = x1 * x1;
-    const T y1sq = y1 * y1;
-
-    const T cross = x0 * y1 - x1 * y0;   // cross product of the two vertices
-    const T sx = x0 + x1;
-    const T sy = y0 + y1;
-
-    // Row pitch in elements, and this work-item's column in every row.
-    const int pitch = dst_step / sizeof(T);
-    __global T* out = dst_a + idx;
-
-    *(out + DST_ROW_00 * pitch) = cross;
-    *(out + DST_ROW_10 * pitch) = cross * sx;
-    *(out + DST_ROW_01 * pitch) = cross * sy;
-    *(out + DST_ROW_20 * pitch) = cross * (x0 * sx + x1sq);
-    *(out + DST_ROW_11 * pitch) = cross * (x0 * (sy + y0) + x1 * (sy + y1));
-    *(out + DST_ROW_02 * pitch) = cross * (y0 * sy + y1sq);
-    *(out + DST_ROW_30 * pitch) = cross * sx * (x0sq + x1sq);
-    *(out + DST_ROW_03 * pitch) = cross * sy * (y0sq + y1sq);
-    *(out + DST_ROW_21 * pitch) =
-        cross * (x0sq * (3 * y0 + y1) + 2 * x1 * x0 * sy +
-                 x1sq * (y0 + 3 * y1));
-    *(out + DST_ROW_12 * pitch) =
-        cross * (y0sq * (3 * x0 + x1) + 2 * y1 * y0 * sx +
-                 y1sq * (x0 + 3 * x1));
-}
-
-// Adds up the per-tile moments stored in dst_m and writes the ten totals to
-// sum[0..9].
-//
-// The tile count is (src_rows / tile_height) * (src_cols / tile_width), each
-// factor bumped by one when the image is larger than TILE_SIZE and not a
-// multiple of it.  Work-item g (global id 0) loads tile g of every moment into
-// a 128-wide local buffer, unused slots are zeroed, and a halving reduction
-// leaves the totals in slot 0.  dst_step is the row pitch of dst_m in bytes.
-// NOTE(review): the global id is used as the local-buffer index, so this
-// presumably expects a single work-group and at most 128 tiles - confirm
-// against the host code.
-__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
-                      __global F* sum, __global F* dst_m, int dst_step)
-{
-    // Row slot of each moment, in the order the partial sums are kept.
-    const int moment_row[10] = { DST_ROW_00, DST_ROW_10, DST_ROW_01, DST_ROW_20, DST_ROW_11,
-                                 DST_ROW_02, DST_ROW_30, DST_ROW_21, DST_ROW_12, DST_ROW_03 };
-    __local F partial[10][128];
-
-    const int gid = get_global_id(0);
-
-    int tiles_y = src_rows / tile_height;
-    int tiles_x = src_cols / tile_width;
-    tiles_y += (src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) ? 1 : 0;
-    tiles_x += (src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) ? 1 : 0;
-    const int tile_count = tiles_y * tiles_x;
-
-    // Clear the slots past the last tile so they add nothing to the reduction.
-    if (gid < 128 - tile_count)
-    {
-        for (int k = 0; k < 10; ++k)
-            partial[k][gid + tile_count] = 0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    const int pitch = dst_step / sizeof(F);   // row pitch in elements
-    if (gid < tile_count)
-    {
-        for (int k = 0; k < 10; ++k)
-            partial[k][gid] = *(dst_m + mad24(moment_row[k] * tiles_y, pitch, gid));
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Halving reduction: 128 -> 64 -> ... -> 1 slots.
-    for (int half = 64; half > 0; half >>= 1)
-    {
-        if (gid < half)
-        {
-            const int partner = gid + half;
-            for (int k = 0; k < 10; ++k)
-                partial[k][gid] += partial[k][partner];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (gid == 0)
-    {
-        for (int k = 0; k < 10; ++k)
-            sum[k] = partial[k][0];
-    }
-}
-
-// CvMoments_D0: raw moments m00..m03 of one image tile, 8-bit unsigned source.
-//
-// Work-group (wgidy, wgidx) covers the tile whose top-left pixel is
-// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE); work-item lidy handles image
-// row y+lidy.  Each work-item loads its row, forms the row sums of p, x*p,
-// x^2*p and x^3*p (x relative to the tile), combines them with its row index
-// into ten partial moments, and the work-group adds those up through local
-// memory.  Work-item (0,0) then shifts the tile-relative moments to image
-// coordinates and stores them in dst_m at column wgidy*dst_cols + wgidx of
-// rows DST_ROW_pq*blocky.
-//
-//   src_step  row pitch of src_data in bytes
-//   dst_step  row pitch of dst_m in bytes
-//   cn, coi   when coi > 0 only channel coi-1 is used; the pixel stride is
-//             2 for cn == 2 and 4 otherwise
-//   binary    non-zero: every non-zero pixel counts as 1 (stored as 255 and
-//             scaled back by 1/255 at the end)
-//   depth     not used by this kernel
-//
-// Fix over the previous version: the reduction starts at TILE_SIZE/2, as in
-// CvMoments_D3/D5/D6.  Starting at bheight lost rows whenever a short tile's
-// height was not a power of two, because the halving step truncates.
-// NOTE(review): m has 128 slots, so this assumes TILE_SIZE/2 <= 128 and a
-// power of two - confirm with the host code.
-// NOTE(review): all accumulators here are 32-bit int; check the overflow
-// margin for the TILE_SIZE used by the host.
-__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
-                           __global F* dst_m,
-                           int dst_cols, int dst_step, int blocky,
-                           int depth, int cn, int coi, int binary, int TILE_SIZE)
-{
-    uchar tmp_coi[16];   // one channel of 16 consecutive pixels
-    uchar16 tmp[16];     // this work-item's row of the tile
-    int VLEN_C = 16;     // vector length of uchar
-
-    int wgidy = get_group_id(0);
-    int wgidx = get_group_id(1);
-    int lidy = get_local_id(0);
-    int lidx = get_local_id(1);
-    int y = wgidy*TILE_SIZE;   // first image row of the tile
-    int x = wgidx*TILE_SIZE;   // first image column of the tile
-    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step, TILE_SIZE);
-    int tileSize_height = min(TILE_SIZE, src_rows - y);
-    int tileSize_width = min(TILE_SIZE, src_cols - x);
-
-    if ( y+lidy < src_rows )
-    {
-        // NOTE(review): when tileSize_width < TILE_SIZE it equals src_cols - x,
-        // so (x+i) < src_cols is already false on entry and this loop does not
-        // appear to execute - kept unchanged.
-        if( tileSize_width < TILE_SIZE )
-            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
-                *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
-
-        if( coi > 0 )   // channel of interest: gather it pixel by pixel
-        {
-            for(int i = 0; i < tileSize_width; i += VLEN_C)
-            {
-                for(int j=0; j<VLEN_C; j++)
-                    tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
-                tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
-                                          tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
-            }
-        }
-        else            // plain vector loads
-        {
-            for(int i=0; i < tileSize_width; i+=VLEN_C)
-                tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
-        }
-    }
-
-    uchar16 zero = (uchar16)(0);
-    uchar16 full = (uchar16)(255);
-    if( binary )
-        for(int i=0; i < tileSize_width; i+=VLEN_C)
-            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
-
-    F mom[10];
-    __local int m[10][128];   // exchange buffer for the work-group reduction
-    if(lidy < 128)
-    {
-        for(int i=0; i<10; i++)
-            m[i][lidy]=0;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Row sums: x0 = sum p, x1 = sum x*p, x2 = sum x^2*p, x3 = sum x^3*p.
-    int lm[10] = {0};
-    int16 x0 = (int16)(0);
-    int16 x1 = (int16)(0);
-    int16 x2 = (int16)(0);
-    int16 x3 = (int16)(0);
-    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
-    {
-        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
-        int16 p = convert_int16(tmp[xt/VLEN_C]);
-        int16 xp = v_xt * p, xxp = xp *v_xt;
-        x0 += p;
-        x1 += xp;
-        x2 += xxp;
-        x3 += xxp * v_xt;
-    }
-    // Fold the 16 lanes into lane 0.
-    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
-    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
-    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
-    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
-    int py = lidy * ((int)x0.s0);
-    int sy = lidy*lidy;
-    int bheight = min(tileSize_height, TILE_SIZE/2);
-    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
-    {
-        // Rows in the upper half of a tall tile: publish to local memory.
-        m[9][lidy-bheight] = ((int)py) * sy;       // m03
-        m[8][lidy-bheight] = ((int)x1.s0) * sy;    // m12
-        m[7][lidy-bheight] = ((int)x2.s0) * lidy;  // m21
-        m[6][lidy-bheight] = x3.s0;                // m30
-        m[5][lidy-bheight] = x0.s0 * sy;           // m02
-        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
-        m[3][lidy-bheight] = x2.s0;                // m20
-        m[2][lidy-bheight] = py;                   // m01
-        m[1][lidy-bheight] = x1.s0;                // m10
-        m[0][lidy-bheight] = x0.s0;                // m00
-    }
-    else if(lidy < bheight)
-    {
-        // Rows in the lower half: keep in private memory.
-        lm[9] = ((int)py) * sy;       // m03
-        lm[8] = ((int)x1.s0) * sy;    // m12
-        lm[7] = ((int)x2.s0) * lidy;  // m21
-        lm[6] = x3.s0;                // m30
-        lm[5] = x0.s0 * sy;           // m02
-        lm[4] = x1.s0 * lidy;         // m11
-        lm[3] = x2.s0;                // m20
-        lm[2] = py;                   // m01
-        lm[1] = x1.s0;                // m10
-        lm[0] = x0.s0;                // m00
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Halving reduction.  Each step: the first j work-items add the value
-    // published for them, then the upper half of those republishes for the
-    // next step.  The start value must not depend on the tile height (see the
-    // header comment); unused slots of m are zero and contribute nothing.
-    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
-    {
-        if(lidy < j)
-            for( int i = 0; i < 10; i++ )
-                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lidy >= j/2&&lidy < j)
-            for( int i = 0; i < 10; i++ )
-                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if(lidy == 0&&lidx == 0)
-    {
-        for( int mt = 0; mt < 10; mt++ )
-            mom[mt] = (F)lm[mt];
-        if(binary)
-        {
-            F s = 1./255;
-            for( int mt = 0; mt < 10; mt++ )
-                mom[mt] *= s;
-        }
-        F xm = x * mom[0], ym = y * mom[0];
-
-        // accumulate moments computed in each tile
-        dst_step /= sizeof(F);
-        const int tile = mad24(wgidy, dst_cols, wgidx);   // this tile's column in dst_m
-
-        // + m00 ( = m00' )
-        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
-
-        // + m10 ( = m10' + x*m00' )
-        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
-
-        // + m01 ( = m01' + y*m00' )
-        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
-
-        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
-
-        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
-
-        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
-    }
-}
-
-// CvMoments_D2: raw moments m00..m03 of one image tile, 16-bit unsigned source.
-//
-// Work-group (wgidy, wgidx) covers the tile whose top-left pixel is
-// (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE); work-item lidy handles image
-// row y+lidy.  Each work-item loads its row, forms the row sums of p, x*p,
-// x^2*p and x^3*p (x relative to the tile), combines them with its row index
-// into ten partial moments, and the work-group adds those up through local
-// memory.  Work-item (0,0) then shifts the tile-relative moments to image
-// coordinates and stores them in dst_m at column wgidy*dst_cols + wgidx of
-// rows DST_ROW_pq*blocky.
-//
-//   src_step  row pitch of src_data in bytes
-//   dst_step  row pitch of dst_m in bytes
-//   cn, coi   when coi > 0 only channel coi-1 is used; the pixel stride is
-//             2 for cn == 2 and 4 otherwise
-//   binary    non-zero: every non-zero pixel counts as 1 (stored as 255 and
-//             scaled back by 1/255 at the end)
-//   depth     not used by this kernel
-//
-// Fix over the previous version: the reduction used to be two separate loops
-// (all the adds, then all the publishes) with no barrier between steps, so
-// work-items re-added stale values of m and several of them wrote the same
-// slot.  It is now the interleaved add / barrier / publish / barrier loop
-// used by CvMoments_D3.
-// NOTE(review): m has 128 slots, so this assumes TILE_SIZE/2 <= 128 and a
-// power of two - confirm with the host code.
-__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
-                           __global F* dst_m,
-                           int dst_cols, int dst_step, int blocky,
-                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
-    ushort tmp_coi[8];   // one channel of 8 consecutive pixels
-    ushort8 tmp[32];     // this work-item's row of the tile
-    int VLEN_US = 8;     // vector length of ushort
-    int wgidy = get_group_id(0);
-    int wgidx = get_group_id(1);
-    int lidy = get_local_id(0);
-    int lidx = get_local_id(1);
-    int y = wgidy*TILE_SIZE;   // real Y index of pixel
-    int x = wgidx*TILE_SIZE;   // real X index of pixel
-    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step/2, TILE_SIZE);
-    int tileSize_height = min(TILE_SIZE, src_rows - y);
-    int tileSize_width = min(TILE_SIZE, src_cols -x);
-
-    if ( y+lidy < src_rows )
-    {
-        // NOTE(review): when tileSize_width < TILE_SIZE it equals src_cols - x,
-        // so (x+i) < src_cols is already false on entry and this loop does not
-        // appear to execute - kept unchanged.
-        if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
-            for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
-                *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
-
-        if( coi > 0 )   // channel of interest: gather it pixel by pixel
-        {
-            for(int i=0; i < tileSize_width; i+=VLEN_US)
-            {
-                for(int j=0; j<VLEN_US; j++)
-                    tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
-                tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
-            }
-        }
-        else            // plain vector loads
-        {
-            for(int i=0; i < tileSize_width; i+=VLEN_US)
-                tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
-        }
-    }
-
-    ushort8 zero = (ushort8)(0);
-    ushort8 full = (ushort8)(255);
-    if( binary )
-        for(int i=0; i < tileSize_width; i+=VLEN_US)
-            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
-    F mom[10];
-    __local long m[10][128];   // exchange buffer for the work-group reduction
-    if(lidy < 128)
-        for(int i=0; i<10; i++)
-            m[i][lidy]=0;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Row sums: x0 = sum p, x1 = sum x*p, x2 = sum x^2*p (int lanes),
-    // x3 = sum x^3*p (long lanes).
-    long lm[10] = {0};
-    int8 x0 = (int8)(0);
-    int8 x1 = (int8)(0);
-    int8 x2 = (int8)(0);
-    long8 x3 = (long8)(0);
-    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
-    {
-        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
-        int8 p = convert_int8(tmp[xt/VLEN_US]);
-        int8 xp = v_xt * p, xxp = xp * v_xt;
-        x0 += p;
-        x1 += xp;
-        x2 += xxp;
-        x3 += convert_long8(xxp) *convert_long8(v_xt);
-    }
-    // Fold the 8 lanes into lane 0.
-    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
-    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
-    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
-    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
-
-    int py = lidy * x0.s0, sy = lidy*lidy;
-    int bheight = min(tileSize_height, TILE_SIZE/2);
-    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
-    {
-        // Rows in the upper half of a tall tile: publish to local memory.
-        m[9][lidy-bheight] = ((long)py) * sy;       // m03
-        m[8][lidy-bheight] = ((long)x1.s0) * sy;    // m12
-        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
-        m[6][lidy-bheight] = x3.s0;                 // m30
-        m[5][lidy-bheight] = x0.s0 * sy;            // m02
-        m[4][lidy-bheight] = x1.s0 * lidy;          // m11
-        m[3][lidy-bheight] = x2.s0;                 // m20
-        m[2][lidy-bheight] = py;                    // m01
-        m[1][lidy-bheight] = x1.s0;                 // m10
-        m[0][lidy-bheight] = x0.s0;                 // m00
-    }
-    else if(lidy < bheight)
-    {
-        // Rows in the lower half: keep in private memory.
-        lm[9] = ((long)py) * sy;       // m03
-        lm[8] = ((long)x1.s0) * sy;    // m12
-        lm[7] = ((long)x2.s0) * lidy;  // m21
-        lm[6] = x3.s0;                 // m30
-        lm[5] = x0.s0 * sy;            // m02
-        lm[4] = x1.s0 * lidy;          // m11
-        lm[3] = x2.s0;                 // m20
-        lm[2] = py;                    // m01
-        lm[1] = x1.s0;                 // m10
-        lm[0] = x0.s0;                 // m00
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Halving reduction.  Each step: the first j work-items add the value
-    // published for them, then (after a barrier) the upper half of those
-    // republishes its running sum for the next step.  Both barriers are
-    // required: the first keeps a publish from overwriting a slot that is
-    // still being read, the second makes the publish visible to the next add.
-    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
-    {
-        if(lidy < j)
-            for( int i = 0; i < 10; i++ )
-                lm[i] = lm[i] + m[i][lidy];
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if(lidy >= j/2&&lidy < j)
-            for( int i = 0; i < 10; i++ )
-                m[i][lidy-j/2] = lm[i];
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if(lidy == 0&&lidx == 0)
-    {
-        for(int mt = 0; mt < 10; mt++ )
-            mom[mt] = (F)lm[mt];
-
-        if(binary)
-        {
-            F s = 1./255;
-            for( int mt = 0; mt < 10; mt++ )
-                mom[mt] *= s;
-        }
-
-        F xm = x *mom[0], ym = y * mom[0];
-
-        // accumulate moments computed in each tile
-        dst_step /= sizeof(F);
-        const int tile = mad24(wgidy, dst_cols, wgidx);   // this tile's column in dst_m
-
-        // + m00 ( = m00' )
-        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
-
-        // + m10 ( = m10' + x*m00' )
-        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
-
-        // + m01 ( = m01' + y*m00' )
-        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
-
-        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
-
-        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
-        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
-
-        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
-        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
-        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
-        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
-    }
-}
-/* CvMoments_D3: raw moments m00..m03 of one image tile, 16-bit signed source.
- *
- * Work-group (wgidy, wgidx) covers the tile starting at pixel
- * (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE); work-item lidy handles image
- * row y+lidy.  Each work-item forms the row sums of p, x*p, x^2*p, x^3*p
- * (x relative to the tile), turns them into ten partial moments, and the
- * work-group adds those up through the local array m.  Work-item (0,0) then
- * shifts the result to image coordinates and stores it in dst_m at column
- * wgidy*dst_cols + wgidx of rows DST_ROW_pq*blocky.
- *
- *   src_step  row pitch of src_data in bytes (divided by 2 to index shorts)
- *   dst_step  row pitch of dst_m in bytes
- *   cn, coi   when coi > 0 only channel coi-1 is read, with pixel stride kcn
- *   binary    non-zero: every non-zero pixel is replaced by 255 and the final
- *             moments are scaled by 1/255
- *   depth     not referenced in this kernel
- */
-__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
- __global F* dst_m,
- int dst_cols, int dst_step, int blocky,
- int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
- short tmp_coi[8]; // one channel of 8 consecutive pixels (coi path)
- short8 tmp[32]; // this work-item's row of the tile, 8 pixels per vector
- int VLEN_S =8; // vector length of short
- int gidy = get_global_id(0); // not referenced below
- int gidx = get_global_id(1); // not referenced below
- int wgidy = get_group_id(0);
- int wgidx = get_group_id(1);
- int lidy = get_local_id(0);
- int lidx = get_local_id(1);
- int y = wgidy*TILE_SIZE; // real Y index of pixel
- int x = wgidx*TILE_SIZE; // real X index of pixel
- int kcn = (cn==2)?2:4; // pixel stride in elements for the coi path
- int rstep = min(src_step/2, TILE_SIZE);
- int tileSize_height = min(TILE_SIZE, src_rows - y);
- int tileSize_width = min(TILE_SIZE, src_cols -x);
-
- if ( y+lidy < src_rows )
- {
- if(tileSize_width < TILE_SIZE) // NOTE(review): here tileSize_width == src_cols - x, so the loop condition (x+i) < src_cols looks false on entry - confirm this loop ever runs
- for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
- *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
- if( coi > 0 ) // channel of interest: gather it element by element
- for(int i=0; i < tileSize_width; i+=VLEN_S)
- {
- for(int j=0; j<VLEN_S; j++)
- tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
- tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
- }
- else // no coi: whole-vector loads from the row
- for(int i=0; i < tileSize_width; i+=VLEN_S)
- tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
- }
-
- short8 zero = (short8)(0);
- short8 full = (short8)(255);
- if( binary ) // map every non-zero pixel to 255
- for(int i=0; i < tileSize_width; i+=(VLEN_S))
- tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
-
- F mom[10];
- __local long m[10][128]; // exchange buffer for the work-group reduction
- if(lidy < 128)
- for(int i=0; i<10; i++)
- m[i][lidy]=0;
- barrier(CLK_LOCAL_MEM_FENCE); // m fully zeroed before anyone writes partial moments
- long lm[10] = {0}; // this work-item's running moments
- int8 x0 = (int8)(0); // sum of p
- int8 x1 = (int8)(0); // sum of x*p
- int8 x2 = (int8)(0); // sum of x^2*p
- long8 x3 = (long8)(0); // sum of x^3*p, accumulated in 64 bits
- for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
- {
- int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7); // tile-relative column of each lane
- int8 p = convert_int8(tmp[xt/VLEN_S]);
- int8 xp = v_xt * p, xxp = xp * v_xt;
- x0 += p;
- x1 += xp;
- x2 += xxp;
- x3 += convert_long8(xxp) * convert_long8(v_xt);
- }
- x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7; // fold the 8 lanes into lane 0
- x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
- x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
- x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
-
- int py = lidy * x0.s0, sy = lidy*lidy; // y*sum(p) and y^2, y being the row inside the tile
- int bheight = min(tileSize_height, TILE_SIZE/2);
- if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) // rows in the upper half of a tall tile publish to m
- {
- m[9][lidy-bheight] = ((long)py) * sy; // m03
- m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12
- m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21
- m[6][lidy-bheight] = x3.s0; // m30
- m[5][lidy-bheight] = x0.s0 * sy; // m02
- m[4][lidy-bheight] = x1.s0 * lidy; // m11
- m[3][lidy-bheight] = x2.s0; // m20
- m[2][lidy-bheight] = py; // m01
- m[1][lidy-bheight] = x1.s0; // m10
- m[0][lidy-bheight] = x0.s0; // m00
- }
- else if(lidy < bheight) // rows in the lower half keep their moments in lm
- {
- lm[9] = ((long)py) * sy; // m03
- lm[8] = ((long)(x1.s0)) * sy; // m12
- lm[7] = ((long)(x2.s0)) * lidy; // m21
- lm[6] = x3.s0; // m30
- lm[5] = x0.s0 * sy; // m02
- lm[4] = x1.s0 * lidy; // m11
- lm[3] = x2.s0; // m20
- lm[2] = py; // m01
- lm[1] = x1.s0; // m10
- lm[0] = x0.s0; // m00
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- for( int j = TILE_SIZE/2; j >=1; j = j/2 ) // halving reduction; NOTE(review): m has 128 slots, so presumably TILE_SIZE/2 <= 128 - confirm
- {
- if(lidy < j) // first j work-items add the value published for them
- for( int i = 0; i < 10; i++ )
- lm[i] = lm[i] + m[i][lidy];
- barrier(CLK_LOCAL_MEM_FENCE); // all reads of m done before it is overwritten
- if(lidy >= j/2&&lidy < j) // upper half republishes its running sum for the next step
- for( int i = 0; i < 10; i++ )
- m[i][lidy-j/2] = lm[i];
- barrier(CLK_LOCAL_MEM_FENCE); // publishes visible before the next add
- }
- if(lidy ==0 &&lidx ==0) // work-item (0,0) now holds the tile totals in lm
- {
- for(int mt = 0; mt < 10; mt++ )
- mom[mt] = (F)lm[mt];
-
- if(binary)
- {
- F s = 1./255; // undo the 255 used for non-zero pixels
- for( int mt = 0; mt < 10; mt++ )
- mom[mt] *= s;
- }
-
- F xm = x * mom[0], ym = y*mom[0];
-
- // accumulate moments computed in each tile
- dst_step /= sizeof(F); // bytes -> elements
-
- // + m00 ( = m00' )
- *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
-
- // + m10 ( = m10' + x*m00' )
- *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
-
- // + m01 ( = m01' + y*m00' )
- *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
-
- // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
-
- // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
- // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
-
- // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
- // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
- // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
- // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
- }
-}
-/* CvMoments_D5: raw moments m00..m03 of one image tile, 32-bit float source.
- *
- * Work-group (wgidy, wgidx) covers the tile starting at pixel
- * (x, y) = (wgidx*TILE_SIZE, wgidy*TILE_SIZE); work-item lidy handles image
- * row y+lidy.  Each work-item forms the row sums of p, x*p, x^2*p, x^3*p
- * (x relative to the tile) in type F, turns them into ten partial moments,
- * and the work-group adds those up through the local array m.  Work-item
- * (0,0) then shifts the result to image coordinates and stores it in dst_m
- * at column wgidy*dst_cols + wgidx of rows DST_ROW_pq*blocky.
- *
- *   src_step  row pitch of src_data in bytes (divided by 4 to index floats)
- *   dst_step  row pitch of dst_m in bytes
- *   cn, coi   when coi > 0 only channel coi-1 is read, with pixel stride kcn
- *   binary    non-zero: every non-zero pixel is replaced by 255 and the final
- *             moments are scaled by 1/255
- *   depth     not referenced in this kernel
- */
-__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
- __global F* dst_m,
- int dst_cols, int dst_step, int blocky,
- int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
- float tmp_coi[4]; // one channel of 4 consecutive pixels (coi path)
- float4 tmp[64] ; // this work-item's row of the tile, 4 pixels per vector
- int VLEN_F = 4; // vector length of float
- int gidy = get_global_id(0); // not referenced below
- int gidx = get_global_id(1); // not referenced below
- int wgidy = get_group_id(0);
- int wgidx = get_group_id(1);
- int lidy = get_local_id(0);
- int lidx = get_local_id(1);
- int y = wgidy*TILE_SIZE; // real Y index of pixel
- int x = wgidx*TILE_SIZE; // real X index of pixel
- int kcn = (cn==2)?2:4; // pixel stride in elements for the coi path
- int rstep = min(src_step/4, TILE_SIZE);
- int tileSize_height = min(TILE_SIZE, src_rows - y);
- int tileSize_width = min(TILE_SIZE, src_cols -x);
- int maxIdx = mul24(src_rows, src_cols); // not referenced below
- int yOff = (y+lidy)*src_step; // not referenced below
- int index; // not referenced below
-
- if ( y+lidy < src_rows )
- {
- if(tileSize_width < TILE_SIZE) // NOTE(review): here tileSize_width == src_cols - x, so the loop condition (x+i) < src_cols looks false on entry - confirm this loop ever runs
- for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
- *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
- if( coi > 0 ) // channel of interest: gather it element by element
- for(int i=0; i < tileSize_width; i+=VLEN_F)
- {
- for(int j=0; j<4; j++)
- tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
- tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
- }
- else // no coi: four scalar loads per vector; NOTE(review): the last vector may read up to 3 floats past column src_cols-1 - confirm the row padding covers this
- for(int i=0; i < tileSize_width; i+=VLEN_F)
- tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
- }
-
- float4 zero = (float4)(0);
- float4 full = (float4)(255);
- if( binary ) // map every non-zero pixel to 255
- for(int i=0; i < tileSize_width; i+=4)
- tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
- F mom[10];
- __local F m[10][128]; // exchange buffer for the work-group reduction
- if(lidy < 128)
- for(int i = 0; i < 10; i ++)
- m[i][lidy] = 0;
- barrier(CLK_LOCAL_MEM_FENCE); // m fully zeroed before anyone writes partial moments
- F lm[10] = {0}; // this work-item's running moments
- F4 x0 = (F4)(0); // sum of p
- F4 x1 = (F4)(0); // sum of x*p
- F4 x2 = (F4)(0); // sum of x^2*p
- F4 x3 = (F4)(0); // sum of x^3*p
- for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
- {
- F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); // tile-relative column of each lane
- F4 p = convert_F4(tmp[xt/VLEN_F]);
- F4 xp = v_xt * p, xxp = xp * v_xt;
- x0 += p;
- x1 += xp;
- x2 += xxp;
- x3 += xxp * v_xt;
- }
- x0.s0 += x0.s1 + x0.s2 + x0.s3; // fold the 4 lanes into lane 0
- x1.s0 += x1.s1 + x1.s2 + x1.s3;
- x2.s0 += x2.s1 + x2.s2 + x2.s3;
- x3.s0 += x3.s1 + x3.s2 + x3.s3;
-
- F py = lidy * x0.s0, sy = lidy*lidy; // y*sum(p) and y^2, y being the row inside the tile
- int bheight = min(tileSize_height, TILE_SIZE/2);
- if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) // rows in the upper half of a tall tile publish to m
- {
- m[9][lidy-bheight] = ((F)py) * sy; // m03
- m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
- m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
- m[6][lidy-bheight] = x3.s0; // m30
- m[5][lidy-bheight] = x0.s0 * sy; // m02
- m[4][lidy-bheight] = x1.s0 * lidy; // m11
- m[3][lidy-bheight] = x2.s0; // m20
- m[2][lidy-bheight] = py; // m01
- m[1][lidy-bheight] = x1.s0; // m10
- m[0][lidy-bheight] = x0.s0; // m00
- }
-
- else if(lidy < bheight) // rows in the lower half keep their moments in lm
- {
- lm[9] = ((F)py) * sy; // m03
- lm[8] = ((F)x1.s0) * sy; // m12
- lm[7] = ((F)x2.s0) * lidy; // m21
- lm[6] = x3.s0; // m30
- lm[5] = x0.s0 * sy; // m02
- lm[4] = x1.s0 * lidy; // m11
- lm[3] = x2.s0; // m20
- lm[2] = py; // m01
- lm[1] = x1.s0; // m10
- lm[0] = x0.s0; // m00
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) // halving reduction; NOTE(review): m has 128 slots, so presumably TILE_SIZE/2 <= 128 - confirm
- {
- if(lidy < j) // first j work-items add the value published for them
- for( int i = 0; i < 10; i++ )
- lm[i] = lm[i] + m[i][lidy];
- barrier(CLK_LOCAL_MEM_FENCE); // all reads of m done before it is overwritten
- if(lidy >= j/2&&lidy < j) // upper half republishes its running sum for the next step
- for( int i = 0; i < 10; i++ )
- m[i][lidy-j/2] = lm[i];
- barrier(CLK_LOCAL_MEM_FENCE); // publishes visible before the next add
- }
- if(lidy == 0&&lidx == 0) // work-item (0,0) now holds the tile totals in lm
- {
- for( int mt = 0; mt < 10; mt++ )
- mom[mt] = (F)lm[mt];
- if(binary)
- {
- F s = 1./255; // undo the 255 used for non-zero pixels
- for( int mt = 0; mt < 10; mt++ )
- mom[mt] *= s;
- }
-
- F xm = x * mom[0], ym = y * mom[0];
-
- // accumulate moments computed in each tile
- dst_step /= sizeof(F); // bytes -> elements
-
- // + m00 ( = m00' )
- *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
-
- // + m10 ( = m10' + x*m00' )
- *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
-
- // + m01 ( = m01' + y*m00' )
- *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
-
- // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
-
- // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
- // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
-
- // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
- // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
- // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
- // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
- }
-}
-
-__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step,
- __global F* dst_m,
- int dst_cols, int dst_step, int blocky,
- int depth, int cn, int coi, int binary, const int TILE_SIZE)
-{
- F tmp_coi[4]; // get the coi data
- F4 tmp[64];
- int VLEN_D = 4; // length of vetor
- int gidy = get_global_id(0);
- int gidx = get_global_id(1);
- int wgidy = get_group_id(0);
- int wgidx = get_group_id(1);
- int lidy = get_local_id(0);
- int lidx = get_local_id(1);
- int y = wgidy*TILE_SIZE; // real Y index of pixel
- int x = wgidx*TILE_SIZE; // real X index of pixel
- int kcn = (cn==2)?2:4;
- int rstep = min(src_step/8, TILE_SIZE);
- int tileSize_height = min(TILE_SIZE, src_rows - y);
- int tileSize_width = min(TILE_SIZE, src_cols - x);
-
- if ( y+lidy < src_rows )
- {
- if(tileSize_width < TILE_SIZE)
- for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
- *((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0;
- if( coi > 0 )
- for(int i=0; i < tileSize_width; i+=VLEN_D)
- {
- for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
- tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1);
- tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
- }
- else
- for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D)
- tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3));
- }
-
- F4 zero = (F4)(0);
- F4 full = (F4)(255);
- if( binary )
- for(int i=0; i < tileSize_width; i+=VLEN_D)
- tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
- F mom[10];
- __local F m[10][128];
- if(lidy < 128)
- for(int i=0; i<10; i++)
- m[i][lidy]=0;
- barrier(CLK_LOCAL_MEM_FENCE);
- F lm[10] = {0};
- F4 x0 = (F4)(0);
- F4 x1 = (F4)(0);
- F4 x2 = (F4)(0);
- F4 x3 = (F4)(0);
- for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
- {
- F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
- F4 p = tmp[xt/VLEN_D];
- F4 xp = v_xt * p, xxp = xp * v_xt;
- x0 += p;
- x1 += xp;
- x2 += xxp;
- x3 += xxp *v_xt;
- }
- x0.s0 += x0.s1 + x0.s2 + x0.s3;
- x1.s0 += x1.s1 + x1.s2 + x1.s3;
- x2.s0 += x2.s1 + x2.s2 + x2.s3;
- x3.s0 += x3.s1 + x3.s2 + x3.s3;
-
- F py = lidy * x0.s0, sy = lidy*lidy;
- int bheight = min(tileSize_height, TILE_SIZE/2);
- if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
- {
- m[9][lidy-bheight] = ((F)py) * sy; // m03
- m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12
- m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21
- m[6][lidy-bheight] = x3.s0; // m30
- m[5][lidy-bheight] = x0.s0 * sy; // m02
- m[4][lidy-bheight] = x1.s0 * lidy; // m11
- m[3][lidy-bheight] = x2.s0; // m20
- m[2][lidy-bheight] = py; // m01
- m[1][lidy-bheight] = x1.s0; // m10
- m[0][lidy-bheight] = x0.s0; // m00
- }
- else if(lidy < bheight)
- {
- lm[9] = ((F)py) * sy; // m03
- lm[8] = ((F)x1.s0) * sy; // m12
- lm[7] = ((F)x2.s0) * lidy; // m21
- lm[6] = x3.s0; // m30
- lm[5] = x0.s0 * sy; // m02
- lm[4] = x1.s0 * lidy; // m11
- lm[3] = x2.s0; // m20
- lm[2] = py; // m01
- lm[1] = x1.s0; // m10
- lm[0] = x0.s0; // m00
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
- {
- if(lidy < j)
- for( int i = 0; i < 10; i++ )
- lm[i] = lm[i] + m[i][lidy];
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lidy >= j/2&&lidy < j)
- for( int i = 0; i < 10; i++ )
- m[i][lidy-j/2] = lm[i];
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lidy == 0&&lidx == 0)
- {
- for( int mt = 0; mt < 10; mt++ )
- mom[mt] = (F)lm[mt];
- if(binary)
- {
- F s = 1./255;
- for( int mt = 0; mt < 10; mt++ )
- mom[mt] *= s;
- }
-
- F xm = x * mom[0], ym = y * mom[0];
-
- // accumulate moments computed in each tile
- dst_step /= sizeof(F);
-
- // + m00 ( = m00' )
- *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
-
- // + m10 ( = m10' + x*m00' )
- *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
-
- // + m01 ( = m01' + y*m00' )
- *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
-
- // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
- *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
-
- // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
- *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
-
- // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
- *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
-
- // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
- *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
-
- // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
- *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
-
- // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
- *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
-
- // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
- *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
- }
-}
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Niko Li, newlife20080214@gmail.com
-// Zero Lin, zero.lin@amd.com
-// Yao Wang, bitwangyaoyao@gmail.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-
-#ifdef ERODE
-#define MORPH_OP(A,B) min((A),(B))
-#endif
-#ifdef DILATE
-#define MORPH_OP(A,B) max((A),(B))
-#endif
-//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii
-#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2)
-#ifndef GENTYPE
-
-__kernel void morph_C1_D0(__global const uchar * restrict src,
- __global uchar *dst,
- int src_offset_x, int src_offset_y,
- int cols, int rows,
- int src_step_in_pixel, int dst_step_in_pixel,
- __constant uchar * mat_kernel,
- int src_whole_cols, int src_whole_rows,
- int dst_offset_in_pixel)
-{
- int l_x = get_local_id(0);
- int l_y = get_local_id(1);
- int x = get_group_id(0)*4*LSIZE0;
- int y = get_group_id(1)*LSIZE1;
- int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
- int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
- int width = (end_x -start_x+4)>>2;
- int offset = src_offset_x-RADIUSX & 3;
- int start_y = y+src_offset_y-RADIUSY;
- int point1 = mad24(l_y,LSIZE0,l_x);
- int point2 = point1 + LSIZE0*LSIZE1;
- int tl_x = (point1 % width)<<2;
- int tl_y = point1 / width;
- int tl_x2 = (point2 % width)<<2;
- int tl_y2 = point2 / width;
- int cur_x = start_x + tl_x;
- int cur_y = start_y + tl_y;
- int cur_x2 = start_x + tl_x2;
- int cur_y2 = start_y + tl_y2;
- int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
- int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
- uchar4 temp0,temp1;
- __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
-
- int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
- //read pixels from src
- start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
- start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
- temp0 = *(__global uchar4*)&src[start_addr];
- temp1 = *(__global uchar4*)&src[start_addr2];
- //judge if read out of boundary
- temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
- temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
- temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
- temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
- temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
-
- temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
- temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
- temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
- temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
- temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
-
- LDS_DAT[point1] = temp0;
- LDS_DAT[point2] = temp1;
- barrier(CLK_LOCAL_MEM_FENCE);
- uchar4 res = (uchar4)VAL;
-
- for(int i=0; i<2*RADIUSY+1; i++)
- for(int j=0; j<2*RADIUSX+1; j++)
- {
- res =
-#ifndef RECTKERNEL
- mat_kernel[i*(2*RADIUSX+1)+j] ?
-#endif
- MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j))
-#ifndef RECTKERNEL
- :res
-#endif
- ;
- }
-
- int gidx = get_global_id(0)<<2;
- int gidy = get_global_id(1);
- int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
-
- if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
- {
- *(__global uchar4*)&dst[out_addr] = res;
- }
- else
- {
- if(gidx+3<cols && gidy<rows)
- {
- dst[out_addr] = res.x;
- dst[out_addr+1] = res.y;
- dst[out_addr+2] = res.z;
- dst[out_addr+3] = res.w;
- }
- else if(gidx+2<cols && gidy<rows)
- {
- dst[out_addr] = res.x;
- dst[out_addr+1] = res.y;
- dst[out_addr+2] = res.z;
- }
- else if(gidx+1<cols && gidy<rows)
- {
- dst[out_addr] = res.x;
- dst[out_addr+1] = res.y;
- }
- else if(gidx<cols && gidy<rows)
- {
- dst[out_addr] = res.x;
- }
- }
-}
-
-#else
-
-__kernel void morph(__global const GENTYPE * restrict src,
- __global GENTYPE *dst,
- int src_offset_x, int src_offset_y,
- int cols, int rows,
- int src_step_in_pixel, int dst_step_in_pixel,
- __constant uchar * mat_kernel,
- int src_whole_cols, int src_whole_rows,
- int dst_offset_in_pixel)
-{
- int l_x = get_local_id(0);
- int l_y = get_local_id(1);
- int x = get_group_id(0)*LSIZE0;
- int y = get_group_id(1)*LSIZE1;
- int start_x = x+src_offset_x-RADIUSX;
- int end_x = x + src_offset_x+LSIZE0+RADIUSX;
- int width = end_x -(x+src_offset_x-RADIUSX)+1;
- int start_y = y+src_offset_y-RADIUSY;
- int point1 = mad24(l_y,LSIZE0,l_x);
- int point2 = point1 + LSIZE0*LSIZE1;
- int tl_x = point1 % width;
- int tl_y = point1 / width;
- int tl_x2 = point2 % width;
- int tl_y2 = point2 / width;
- int cur_x = start_x + tl_x;
- int cur_y = start_y + tl_y;
- int cur_x2 = start_x + tl_x2;
- int cur_y2 = start_y + tl_y2;
- int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
- int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
- GENTYPE temp0,temp1;
- __local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0];
-
- int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
- //read pixels from src
- start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
- start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
- temp0 = src[start_addr];
- temp1 = src[start_addr2];
- //judge if read out of boundary
- temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0);
- temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0);
-
- temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1);
- temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1);
-
- LDS_DAT[point1] = temp0;
- LDS_DAT[point2] = temp1;
- barrier(CLK_LOCAL_MEM_FENCE);
- GENTYPE res = (GENTYPE)VAL;
- for(int i=0; i<2*RADIUSY+1; i++)
- for(int j=0; j<2*RADIUSX+1; j++)
- {
- res =
-#ifndef RECTKERNEL
- mat_kernel[i*(2*RADIUSX+1)+j] ?
-#endif
- MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)])
-#ifndef RECTKERNEL
- :res
-#endif
- ;
- }
- int gidx = get_global_id(0);
- int gidy = get_global_id(1);
- int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
- if(gidx<cols && gidy<rows)
- {
- dst[out_addr] = res;
- }
-}
-
-#endif
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Dachuan Zhao, dachuan@multicorewareinc.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-int idx_row_low(int y, int last_row)
-{
- return abs(y) % (last_row + 1);
-}
-
-int idx_row_high(int y, int last_row)
-{
- return abs(last_row - (int)abs(last_row - y)) % (last_row + 1);
-}
-
-int idx_row(int y, int last_row)
-{
- return idx_row_low(idx_row_high(y, last_row), last_row);
-}
-
-int idx_col_low(int x, int last_col)
-{
- return abs(x) % (last_col + 1);
-}
-
-int idx_col_high(int x, int last_col)
-{
- return abs(last_col - (int)abs(last_col - x)) % (last_col + 1);
-}
-
-int idx_col(int x, int last_col)
-{
- return idx_col_low(idx_col_high(x, last_col), last_col);
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_8UC1 ///////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D0(__global uchar * srcData, int srcStep, int srcRows, int srcCols, __global uchar *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float smem[256 + 4];
-
- float sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[x]);
- sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[x]);
- sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[x]);
- sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[x]);
- sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[x]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[left_x]);
- sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[left_x]);
- sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[left_x]);
- sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[left_x]);
- sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[left_x]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = 0.0625f * (((srcData + (src_y - 2) * srcStep))[right_x]);
- sum = sum + 0.25f * (((srcData + (src_y - 1) * srcStep))[right_x]);
- sum = sum + 0.375f * (((srcData + (src_y ) * srcStep))[right_x]);
- sum = sum + 0.25f * (((srcData + (src_y + 1) * srcStep))[right_x]);
- sum = sum + 0.0625f * (((srcData + (src_y + 2) * srcStep))[right_x]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
- sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
- sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
- sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
- sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = 0.0625f * (((srcData + idx_row(src_y - 2, last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y - 1, last_row) * srcStep))[col]);
- sum = sum + 0.375f * (((srcData + idx_row(src_y , last_row) * srcStep))[col]);
- sum = sum + 0.25f * (((srcData + idx_row(src_y + 1, last_row) * srcStep))[col]);
- sum = sum + 0.0625f * (((srcData + idx_row(src_y + 2, last_row) * srcStep))[col]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = 0.0625f * smem[2 + tid2 - 2];
- sum = sum + 0.25f * smem[2 + tid2 - 1];
- sum = sum + 0.375f * smem[2 + tid2 ];
- sum = sum + 0.25f * smem[2 + tid2 + 1];
- sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep + dst_x] = convert_uchar_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_8UC4 ///////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D0(__global uchar4 * srcData, int srcStep, int srcRows, int srcCols, __global uchar4 *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float4 smem[256 + 4];
-
- float4 sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- float4 co1 = 0.375f;
- float4 co2 = 0.25f;
- float4 co3 = 0.0625f;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[x]));
- sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[x]));
- sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[x]));
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[left_x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[left_x]));
- sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[left_x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[left_x]));
- sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[left_x]));
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = co3 * convert_float4((((srcData + (src_y - 2) * srcStep / 4))[right_x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y - 1) * srcStep / 4))[right_x]));
- sum = sum + co1 * convert_float4((((srcData + (src_y ) * srcStep / 4))[right_x]));
- sum = sum + co2 * convert_float4((((srcData + (src_y + 1) * srcStep / 4))[right_x]));
- sum = sum + co3 * convert_float4((((srcData + (src_y + 2) * srcStep / 4))[right_x]));
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
- sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
- sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
- sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
- sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = co3 * convert_float4((((srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]));
- sum = sum + co1 * convert_float4((((srcData + idx_row(src_y , last_row) * srcStep / 4))[col]));
- sum = sum + co2 * convert_float4((((srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]));
- sum = sum + co3 * convert_float4((((srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]));
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = co3 * smem[2 + tid2 - 2];
- sum = sum + co2 * smem[2 + tid2 - 1];
- sum = sum + co1 * smem[2 + tid2 ];
- sum = sum + co2 * smem[2 + tid2 + 1];
- sum = sum + co3 * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 4 + dst_x] = convert_uchar4_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_16UC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D2(__global ushort * srcData, int srcStep, int srcRows, int srcCols, __global ushort *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float smem[256 + 4];
-
- float sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global ushort*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global ushort*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = 0.0625f * smem[2 + tid2 - 2];
- sum = sum + 0.25f * smem[2 + tid2 - 1];
- sum = sum + 0.375f * smem[2 + tid2 ];
- sum = sum + 0.25f * smem[2 + tid2 + 1];
- sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 2 + dst_x] = convert_ushort_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_16UC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D2(__global ushort4 * srcData, int srcStep, int srcRows, int srcCols, __global ushort4 *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float4 smem[256 + 4];
-
- float4 sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- float4 co1 = 0.375f;
- float4 co2 = 0.25f;
- float4 co3 = 0.0625f;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global ushort4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = co3 * smem[2 + tid2 - 2];
- sum = sum + co2 * smem[2 + tid2 - 1];
- sum = sum + co1 * smem[2 + tid2 ];
- sum = sum + co2 * smem[2 + tid2 + 1];
- sum = sum + co3 * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 8 + dst_x] = convert_ushort4_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_16SC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D3(__global short * srcData, int srcStep, int srcRows, int srcCols, __global short *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float smem[256 + 4];
-
- float sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = 0.0625f * ((__global short*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global short*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global short*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global short*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = 0.0625f * smem[2 + tid2 - 2];
- sum = sum + 0.25f * smem[2 + tid2 - 1];
- sum = sum + 0.375f * smem[2 + tid2 ];
- sum = sum + 0.25f * smem[2 + tid2 + 1];
- sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 2 + dst_x] = convert_short_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_16SC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D3(__global short4 * srcData, int srcStep, int srcRows, int srcCols, __global short4 *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float4 smem[256 + 4];
-
- float4 sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- float4 co1 = 0.375f;
- float4 co2 = 0.25f;
- float4 co3 = 0.0625f;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col]);
- sum = sum + co1 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col]);
- sum = sum + co2 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col]);
- sum = sum + co3 * convert_float4(((__global short4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col]);
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = co3 * smem[2 + tid2 - 2];
- sum = sum + co2 * smem[2 + tid2 - 1];
- sum = sum + co1 * smem[2 + tid2 ];
- sum = sum + co2 * smem[2 + tid2 + 1];
- sum = sum + co3 * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 8 + dst_x] = convert_short4_sat_rte(sum);
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_32FC1 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C1_D5(__global float * srcData, int srcStep, int srcRows, int srcCols, __global float *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float smem[256 + 4];
-
- float sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[x];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[x];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[x];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[left_x];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[left_x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[left_x];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[left_x];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = 0.0625f * ((__global float*)((__global char*)srcData + (src_y - 2) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y - 1) * srcStep))[right_x];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + (src_y ) * srcStep))[right_x];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + (src_y + 1) * srcStep))[right_x];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + (src_y + 2) * srcStep))[right_x];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y - 2, last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y - 1, last_row) * srcStep))[col];
- sum = sum + 0.375f * ((__global float*)((__global char*)srcData + idx_row(src_y , last_row) * srcStep))[col];
- sum = sum + 0.25f * ((__global float*)((__global char*)srcData + idx_row(src_y + 1, last_row) * srcStep))[col];
- sum = sum + 0.0625f * ((__global float*)((__global char*)srcData + idx_row(src_y + 2, last_row) * srcStep))[col];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = 0.0625f * smem[2 + tid2 - 2];
- sum = sum + 0.25f * smem[2 + tid2 - 1];
- sum = sum + 0.375f * smem[2 + tid2 ];
- sum = sum + 0.25f * smem[2 + tid2 + 1];
- sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 4 + dst_x] = sum;
- }
-}
-
-///////////////////////////////////////////////////////////////////////
-////////////////////////// CV_32FC4 //////////////////////////////////
-///////////////////////////////////////////////////////////////////////
-
-__kernel void pyrDown_C4_D5(__global float4 * srcData, int srcStep, int srcRows, int srcCols, __global float4 *dst, int dstStep, int dstCols)
-{
- const int x = get_global_id(0);
- const int y = get_group_id(1);
-
- __local float4 smem[256 + 4];
-
- float4 sum;
-
- const int src_y = 2*y;
- const int last_row = srcRows - 1;
- const int last_col = srcCols - 1;
-
- float4 co1 = 0.375f;
- float4 co2 = 0.25f;
- float4 co3 = 0.0625f;
-
- if (src_y >= 2 && src_y < srcRows - 2 && x >= 2 && x < srcCols - 2)
- {
- sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[x];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[x];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[x];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[left_x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[left_x];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[left_x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[left_x];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[left_x];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- sum = co3 * ((__global float4*)((__global char4*)srcData + (src_y - 2) * srcStep / 4))[right_x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y - 1) * srcStep / 4))[right_x];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + (src_y ) * srcStep / 4))[right_x];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + (src_y + 1) * srcStep / 4))[right_x];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + (src_y + 2) * srcStep / 4))[right_x];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
- else
- {
- int col = idx_col(x, last_col);
-
- sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
- smem[2 + get_local_id(0)] = sum;
-
- if (get_local_id(0) < 2)
- {
- const int left_x = x - 2;
-
- col = idx_col(left_x, last_col);
-
- sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
- smem[get_local_id(0)] = sum;
- }
-
- if (get_local_id(0) > 253)
- {
- const int right_x = x + 2;
-
- col = idx_col(right_x, last_col);
-
- sum = co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 2, last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y - 1, last_row) * srcStep / 4))[col];
- sum = sum + co1 * ((__global float4*)((__global char4*)srcData + idx_row(src_y , last_row) * srcStep / 4))[col];
- sum = sum + co2 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 1, last_row) * srcStep / 4))[col];
- sum = sum + co3 * ((__global float4*)((__global char4*)srcData + idx_row(src_y + 2, last_row) * srcStep / 4))[col];
-
- smem[4 + get_local_id(0)] = sum;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (get_local_id(0) < 128)
- {
- const int tid2 = get_local_id(0) * 2;
-
- sum = co3 * smem[2 + tid2 - 2];
- sum = sum + co2 * smem[2 + tid2 - 1];
- sum = sum + co1 * smem[2 + tid2 ];
- sum = sum + co2 * smem[2 + tid2 + 1];
- sum = sum + co3 * smem[2 + tid2 + 2];
-
- const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
-
- if (dst_x < dstCols)
- dst[y * dstStep / 16 + dst_x] = sum;
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Wu Zailong, bullet@yeah.net
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-#endif
-
-#ifdef INTER_NEAREST
-#define convertToWT
-#endif
-
-#ifdef BORDER_CONSTANT
-#define EXTRAPOLATE(v2, v) v = scalar;
-#elif defined BORDER_REPLICATE
-#define EXTRAPOLATE(v2, v) \
- { \
- v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \
- v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
- }
-#elif defined BORDER_WRAP
-#define EXTRAPOLATE(v2, v) \
- { \
- if (v2.x < 0) \
- v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \
- if (v2.x >= src_cols) \
- v2.x %= src_cols; \
- \
- if (v2.y < 0) \
- v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
- if( v2.y >= src_rows ) \
- v2.y %= src_rows; \
- v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
- }
-#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
-#ifdef BORDER_REFLECT
-#define DELTA int delta = 0
-#else
-#define DELTA int delta = 1
-#endif
-#define EXTRAPOLATE(v2, v) \
- { \
- DELTA; \
- if (src_cols == 1) \
- v2.x = 0; \
- else \
- do \
- { \
- if( v2.x < 0 ) \
- v2.x = -v2.x - 1 + delta; \
- else \
- v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \
- } \
- while (v2.x >= src_cols || v2.x < 0); \
- \
- if (src_rows == 1) \
- v2.y = 0; \
- else \
- do \
- { \
- if( v2.y < 0 ) \
- v2.y = -v2.y - 1 + delta; \
- else \
- v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
- } \
- while (v2.y >= src_rows || v2.y < 0); \
- v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
- }
-#else
-#error No extrapolation method
-#endif
-
-#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
-
-#ifdef INTER_NEAREST
-
-__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
- __global float * map1, __global float * map2,
- int src_offset, int dst_offset, int map1_offset, int map2_offset,
- int src_step, int dst_step, int map1_step, int map2_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < dst_cols && y < dst_rows)
- {
- int dstIdx = mad24(y, dst_step, x + dst_offset);
- int map1Idx = mad24(y, map1_step, x + map1_offset);
- int map2Idx = mad24(y, map2_step, x + map2_offset);
-
- int gx = convert_int_sat_rte(map1[map1Idx]);
- int gy = convert_int_sat_rte(map2[map2Idx]);
-
- if (NEED_EXTRAPOLATION(gx, gy))
- {
- int2 gxy = (int2)(gx, gy), zero = (int2)(0);
- EXTRAPOLATE(gxy, dst[dstIdx]);
- }
- else
- {
- int srcIdx = mad24(gy, src_step, gx + src_offset);
- dst[dstIdx] = src[srcIdx];
- }
- }
-}
-
-__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1,
- int src_offset, int dst_offset, int map1_offset,
- int src_step, int dst_step, int map1_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < dst_cols && y < dst_rows)
- {
- int dstIdx = mad24(y, dst_step, x + dst_offset);
- int map1Idx = mad24(y, map1_step, x + map1_offset);
-
- int2 gxy = convert_int2_sat_rte(map1[map1Idx]);
- int gx = gxy.x, gy = gxy.y;
-
- if (NEED_EXTRAPOLATION(gx, gy))
- {
- int2 zero = (int2)(0);
- EXTRAPOLATE(gxy, dst[dstIdx]);
- }
- else
- {
- int srcIdx = mad24(gy, src_step, gx + src_offset);
- dst[dstIdx] = src[srcIdx];
- }
- }
-}
-
-__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1,
- int src_offset, int dst_offset, int map1_offset,
- int src_step, int dst_step, int map1_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < dst_cols && y < dst_rows)
- {
- int dstIdx = mad24(y, dst_step, x + dst_offset);
- int map1Idx = mad24(y, map1_step, x + map1_offset);
-
- int2 gxy = convert_int2(map1[map1Idx]);
- int gx = gxy.x, gy = gxy.y;
-
- if (NEED_EXTRAPOLATION(gx, gy))
- {
- int2 zero = (int2)(0);
- EXTRAPOLATE(gxy, dst[dstIdx]);
- }
- else
- {
- int srcIdx = mad24(gy, src_step, gx + src_offset);
- dst[dstIdx] = src[srcIdx];
- }
- }
-}
-
-#elif INTER_LINEAR
-
-__kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst,
- __global float * map1, __global float * map2,
- int src_offset, int dst_offset, int map1_offset, int map2_offset,
- int src_step, int dst_step, int map1_step, int map2_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < dst_cols && y < dst_rows)
- {
- int dstIdx = mad24(y, dst_step, x + dst_offset);
- int map1Idx = mad24(y, map1_step, x + map1_offset);
- int map2Idx = mad24(y, map2_step, x + map2_offset);
-
- float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]);
-
- int2 map_dataA = convert_int2_sat_rtn(map_data);
- int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
- int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
- int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
- int2 zero = (int2)(0);
-
- float2 _u = map_data - convert_float2(map_dataA);
- WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
- WT scalar = convertToWT(nVal);
- WT a = scalar, b = scalar, c = scalar, d = scalar;
-
- if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
- a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataA, a);
-
- if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
- b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataB, b);
-
- if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
- c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataC, c);
-
- if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
- d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataD, d);
-
- WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) +
- b * (WT)(u.x) * (WT)(1 - u.y) +
- c * (WT)(1 - u.x) * (WT)(u.y) +
- d * (WT)(u.x) * (WT)(u.y);
- dst[dstIdx] = convertToT(dst_data);
- }
-}
-
-__kernel void remap_32FC2(__global T const * restrict src, __global T * dst,
- __global float2 * map1,
- int src_offset, int dst_offset, int map1_offset,
- int src_step, int dst_step, int map1_step,
- int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < dst_cols && y < dst_rows)
- {
- int dstIdx = mad24(y, dst_step, x + dst_offset);
- int map1Idx = mad24(y, map1_step, x + map1_offset);
-
- float2 map_data = map1[map1Idx];
- int2 map_dataA = convert_int2_sat_rtn(map_data);
- int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
- int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
- int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1);
- int2 zero = (int2)(0);
-
- float2 _u = map_data - convert_float2(map_dataA);
- WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
- WT scalar = convertToWT(nVal);
- WT a = scalar, b = scalar, c = scalar, d = scalar;
-
- if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
- a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataA, a);
-
- if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
- b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataB, b);
-
- if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
- c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataC, c);
-
- if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
- d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
- else
- EXTRAPOLATE(map_dataD, d);
-
- WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) +
- b * (WT)(u.x) * (WT)(1 - u.y) +
- c * (WT)(1 - u.x) * (WT)(u.y) +
- d * (WT)(u.x) * (WT)(u.y);
- dst[dstIdx] = convertToT(dst_data);
- }
-}
-
-#endif
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
-
-// threshold type:
-// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
-// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
-
-__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
- int src_offset, int src_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step,
- uchar thresh, uchar max_val, int thresh_type
- )
-{
- int gx = get_global_id(0);
- const int gy = get_global_id(1);
-
- int offset = (dst_offset & 15);
- src_offset -= offset;
-
- int dstart = (gx << 4) - offset;
- if(dstart < dst_cols && gy < dst_rows)
- {
- uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
- uchar16 ddata;
- uchar16 zero = 0;
- switch (thresh_type)
- {
- case 0:
- ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
- break;
- case 1:
- ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val);
- break;
- case 2:
- ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
- break;
- case 3:
- ddata = ((sdata > thresh)) ? sdata : zero;
- break;
- case 4:
- ddata = ((sdata > thresh)) ? zero : sdata;
- break;
- default:
- ddata = sdata;
- }
- int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8,
- dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15);
- uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
- int16 con = dpos >= 0 && dpos < dst_cols;
- ddata = convert_uchar16(con != 0) ? ddata : dVal;
- if(dstart < dst_cols)
- {
- *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
- }
- }
-}
-
-
-__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
- int src_offset, int src_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step,
- float thresh, float max_val, int thresh_type
- )
-{
- const int gx = get_global_id(0);
- const int gy = get_global_id(1);
-
- int offset = (dst_offset & 3);
- src_offset -= offset;
-
- int dstart = (gx << 2) - offset;
- if(dstart < dst_cols && gy < dst_rows)
- {
- float4 sdata = vload4(gx, src+src_offset+gy*src_step);
- float4 ddata;
- float4 zero = 0;
- switch (thresh_type)
- {
- case 0:
- ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
- break;
- case 1:
- ddata = sdata > thresh ? zero : (float4)max_val;
- break;
- case 2:
- ddata = sdata > thresh ? (float4)thresh : sdata;
- break;
- case 3:
- ddata = sdata > thresh ? sdata : (float4)(0.f);
- break;
- case 4:
- ddata = sdata > thresh ? (float4)(0.f) : sdata;
- break;
- default:
- ddata = sdata;
- }
- int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3);
- float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart);
- int4 con = dpos >= 0 && dpos < dst_cols;
- ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
- if(dstart < dst_cols)
- {
- *(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata;
- }
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-//warpAffine kernel
-//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
-
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-typedef double F;
-typedef double4 F4;
-#define convert_F4 convert_double4
-#else
-typedef float F;
-typedef float4 F4;
-#define convert_F4 convert_float4
-#endif
-
-#define INTER_BITS 5
-#define INTER_TAB_SIZE (1 << INTER_BITS)
-#define INTER_SCALE 1.f/INTER_TAB_SIZE
-#define AB_BITS max(10, (int)INTER_BITS)
-#define AB_SCALE (1 << AB_BITS)
-#define INTER_REMAP_COEF_BITS 15
-#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
-
-inline void interpolateCubic( float x, float* coeffs )
-{
- const float A = -0.75f;
-
- coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A;
- coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
- coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f;
- coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
-}
-
-
-/**********************************************8UC1*********************************************
-***********************************************************************************************/
-__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- dx = (dx<<2) - (dst_offset&3);
-
- int round_delta = (AB_SCALE>>1);
-
- int4 X, Y;
- int4 sx, sy;
- int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
- DX = (DX << AB_BITS);
- F4 M0DX, M3DX;
- M0DX = M[0] * convert_F4(DX);
- M3DX = M[3] * convert_F4(DX);
- X = convert_int4(rint(M0DX));
- Y = convert_int4(rint(M3DX));
- int tmp1, tmp2;
- tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
- tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
-
- X += tmp1 + round_delta;
- Y += tmp2 + round_delta;
-
- sx = convert_int4(convert_short4(X >> AB_BITS));
- sy = convert_int4(convert_short4(Y >> AB_BITS));
-
- __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
- uchar4 dval = *d;
- DX = (int4)(dx, dx+1, dx+2, dx+3);
- int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
- int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
- int4 spos = src_offset + sy * srcStep + sx;
- uchar4 sval;
- sval.s0 = scon.s0 ? src[spos.s0] : 0;
- sval.s1 = scon.s1 ? src[spos.s1] : 0;
- sval.s2 = scon.s2 ? src[spos.s2] : 0;
- sval.s3 = scon.s3 ? src[spos.s3] : 0;
- dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
- *d = dval;
- }
-}
-
-__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
-
- if( dx < threadCols && dy < dst_rows)
- {
- dx = (dx<<2) - (dst_offset&3);
-
- int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
-
- int4 X, Y;
- short4 ax, ay;
- int4 sx, sy;
- int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
- DX = (DX << AB_BITS);
- F4 M0DX, M3DX;
- M0DX = M[0] * convert_F4(DX);
- M3DX = M[3] * convert_F4(DX);
- X = convert_int4(rint(M0DX));
- Y = convert_int4(rint(M3DX));
-
- int tmp1, tmp2;
- tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
- tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
-
- X += tmp1 + round_delta;
- Y += tmp2 + round_delta;
-
- X = X >> (AB_BITS - INTER_BITS);
- Y = Y >> (AB_BITS - INTER_BITS);
-
- sx = convert_int4(convert_short4(X >> INTER_BITS));
- sy = convert_int4(convert_short4(Y >> INTER_BITS));
- ax = convert_short4(X & (INTER_TAB_SIZE-1));
- ay = convert_short4(Y & (INTER_TAB_SIZE-1));
-
- uchar4 v0, v1, v2,v3;
- int4 scon0, scon1, scon2, scon3;
- int4 spos0, spos1, spos2, spos3;
-
- scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows);
- scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows);
- scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows);
- scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows);
- spos0 = src_offset + sy * srcStep + sx;
- spos1 = src_offset + sy * srcStep + sx + 1;
- spos2 = src_offset + (sy+1) * srcStep + sx;
- spos3 = src_offset + (sy+1) * srcStep + sx + 1;
-
- v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
- v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
- v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
- v3.s0 = scon3.s0 ? src[spos3.s0] : 0;
-
- v0.s1 = scon0.s1 ? src[spos0.s1] : 0;
- v1.s1 = scon1.s1 ? src[spos1.s1] : 0;
- v2.s1 = scon2.s1 ? src[spos2.s1] : 0;
- v3.s1 = scon3.s1 ? src[spos3.s1] : 0;
-
- v0.s2 = scon0.s2 ? src[spos0.s2] : 0;
- v1.s2 = scon1.s2 ? src[spos1.s2] : 0;
- v2.s2 = scon2.s2 ? src[spos2.s2] : 0;
- v3.s2 = scon3.s2 ? src[spos3.s2] : 0;
-
- v0.s3 = scon0.s3 ? src[spos0.s3] : 0;
- v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
- v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
- v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
-
- short4 itab0, itab1, itab2, itab3;
- float4 taby, tabx;
- taby = INTER_SCALE * convert_float4(ay);
- tabx = INTER_SCALE * convert_float4(ax);
-
- itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
- itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
- itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
- itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
-
-
- int4 val;
- uchar4 tval;
- val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
- + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
- tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
-
- __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
- uchar4 dval = *d;
- DX = (int4)(dx, dx+1, dx+2, dx+3);
- int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
- dval = convert_uchar4(dcon != 0) ? tval : dval;
- *d = dval;
- }
-}
-
-__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- int X = X0 >> (AB_BITS - INTER_BITS);
- int Y = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx = (short)(X >> INTER_BITS) - 1;
- short sy = (short)(Y >> INTER_BITS) - 1;
- short ay = (short)(Y & (INTER_TAB_SIZE-1));
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
- uchar v[16];
- int i, j;
-
-#pragma unroll 4
- for(i=0; i<4; i++)
- for(j=0; j<4; j++)
- {
- v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0;
- }
-
- short itab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
- int isum = 0;
-
-#pragma unroll 16
- for( i=0; i<16; i++ )
- {
- F v = tab1y[(i>>2)] * tab1x[(i&3)];
- isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
- }
-
- if( isum != INTER_REMAP_COEF_SCALE )
- {
- int k1, k2;
- int diff = isum - INTER_REMAP_COEF_SCALE;
- int Mk1=2, Mk2=2, mk1=2, mk2=2;
- for( k1 = 2; k1 < 4; k1++ )
- for( k2 = 2; k2 < 4; k2++ )
- {
- if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
- mk1 = k1, mk2 = k2;
- else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
- Mk1 = k1, Mk2 = k2;
- }
- diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- int sum=0;
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * itab[i] ;
- }
- dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
- }
- }
-}
-
-/**********************************************8UC4*********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = (AB_SCALE >> 1);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
- int sx0 = (short)(X0 >> AB_BITS);
- int sy0 = (short)(Y0 >> AB_BITS);
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0;
- }
-}
-
-__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
- src_offset = (src_offset>>2);
- srcStep = (srcStep>>2);
-
- int tmp = (dx << AB_BITS);
- int X0 = rint(M[0] * tmp);
- int Y0 = rint(M[3] * tmp);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx0 = (short)(X0 >> INTER_BITS);
- short sy0 = (short)(Y0 >> INTER_BITS);
- short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
- short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
- int4 v0, v1, v2, v3;
-
- v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0;
- v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0;
- v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0;
- v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0;
-
- int itab0, itab1, itab2, itab3;
- float taby, tabx;
- taby = 1.f/INTER_TAB_SIZE*ay0;
- tabx = 1.f/INTER_TAB_SIZE*ax0;
-
- itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
- itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
- itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
- itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
-
- int4 val;
- val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
- }
-}
-
-__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
-
- src_offset = (src_offset>>2);
- srcStep = (srcStep>>2);
- dst_offset = (dst_offset>>2);
- dstStep = (dstStep>>2);
-
- int tmp = (dx << AB_BITS);
- int X0 = rint(M[0] * tmp);
- int Y0 = rint(M[3] * tmp);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- int sx = (short)(X0 >> INTER_BITS) - 1;
- int sy = (short)(Y0 >> INTER_BITS) - 1;
- int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
- int ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
- uchar4 v[16];
- int i,j;
-#pragma unroll 4
- for(i=0; i<4; i++)
- for(j=0; j<4; j++)
- {
- v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
- }
- int itab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = INTER_SCALE * ay;
- axx = INTER_SCALE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
- int isum = 0;
-
-#pragma unroll 16
- for( i=0; i<16; i++ )
- {
- float tmp;
- tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
- itab[i] = rint(tmp);
- isum += itab[i];
- }
-
- if( isum != INTER_REMAP_COEF_SCALE )
- {
- int k1, k2;
- int diff = isum - INTER_REMAP_COEF_SCALE;
- int Mk1=2, Mk2=2, mk1=2, mk2=2;
-
- for( k1 = 2; k1 < 4; k1++ )
- for( k2 = 2; k2 < 4; k2++ )
- {
-
- if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
- mk1 = k1, mk2 = k2;
- else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
- Mk1 = k1, Mk2 = k2;
- }
-
- diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- int4 sum=0;
- for ( i =0; i<16; i++ )
- {
- sum += convert_int4(v[i]) * itab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
- }
- }
-}
-
-
-/**********************************************32FC1********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/2;
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
- short sx0 = (short)(X0 >> AB_BITS);
- short sy0 = (short)(Y0 >> AB_BITS);
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
- }
-}
-
-__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
- src_offset = (src_offset>>2);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx0 = (short)(X0 >> INTER_BITS);
- short sy0 = (short)(Y0 >> INTER_BITS);
- short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
- short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
- float v0, v1, v2, v3;
-
- v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
- v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
- v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
- v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
-
- float tab[4];
- float taby[2], tabx[2];
- taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
- taby[1] = 1.f/INTER_TAB_SIZE*ay0;
- tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
- tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
- tab[0] = taby[0] * tabx[0];
- tab[1] = taby[0] * tabx[1];
- tab[2] = taby[1] * tabx[0];
- tab[3] = taby[1] * tabx[1];
-
- float sum = 0;
- sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
- }
-}
-
-__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
- src_offset = (src_offset>>2);
- dst_offset = (dst_offset>>2);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx = (short)(X0 >> INTER_BITS) - 1;
- short sy = (short)(Y0 >> INTER_BITS) - 1;
- short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
- short ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
- float v[16];
- int i;
-
- for(i=0; i<16; i++)
- v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
-
- float tab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
- for( i=0; i<16; i++ )
- {
- tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- float sum = 0;
-#pragma unroll 4
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * tab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = sum;
-
- }
- }
-}
-
-
-/**********************************************32FC4********************************************
-***********************************************************************************************/
-
-__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/2;
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
-
- short sx0 = (short)(X0 >> AB_BITS);
- short sy0 = (short)(Y0 >> AB_BITS);
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
- }
-}
-
-__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
- src_offset = (src_offset>>4);
- dst_offset = (dst_offset>>4);
- srcStep = (srcStep>>2);
- dstStep = (dstStep>>2);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx0 = (short)(X0 >> INTER_BITS);
- short sy0 = (short)(Y0 >> INTER_BITS);
- short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
- short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
-
- float4 v0, v1, v2, v3;
-
- v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
- v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
- v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
- v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
-
- float tab[4];
- float taby[2], tabx[2];
- taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
- taby[1] = 1.f/INTER_TAB_SIZE*ay0;
- tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
- tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
- tab[0] = taby[0] * tabx[0];
- tab[1] = taby[0] * tabx[1];
- tab[2] = taby[1] * tabx[0];
- tab[3] = taby[1] * tabx[1];
-
- float4 sum = 0;
- sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[dst_offset+dy*dstStep+dx] = sum;
- }
-}
-
-__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
-
- src_offset = (src_offset>>4);
- dst_offset = (dst_offset>>4);
- srcStep = (srcStep>>2);
- dstStep = (dstStep>>2);
-
- int X0 = rint(M[0] * dx * AB_SCALE);
- int Y0 = rint(M[3] * dx * AB_SCALE);
- X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
- Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
- X0 = X0 >> (AB_BITS - INTER_BITS);
- Y0 = Y0 >> (AB_BITS - INTER_BITS);
-
- short sx = (short)(X0 >> INTER_BITS) - 1;
- short sy = (short)(Y0 >> INTER_BITS) - 1;
- short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
- short ax = (short)(X0 & (INTER_TAB_SIZE-1));
-
- float4 v[16];
- int i;
-
- for(i=0; i<16; i++)
- v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
-
- float tab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
- for( i=0; i<16; i++ )
- {
- tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- float4 sum = 0;
-#pragma unroll 4
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * tab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = sum;
-
- }
- }
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Zhang Ying, zhangying913@gmail.com
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-
-// warpPerspective kernels
-// Supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4; three interpolation methods: NN, Linear, Cubic.
-
-// F / F4 are the scalar and 4-wide types used for the transform arithmetic:
-// double when DOUBLE_SUPPORT is defined (enabling whichever fp64 extension is
-// available), float otherwise.
-#if defined (DOUBLE_SUPPORT)
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#elif defined (cl_amd_fp64)
-#pragma OPENCL EXTENSION cl_amd_fp64:enable
-#endif
-typedef double F;
-typedef double4 F4;
-#define convert_F4 convert_double4
-#else
-typedef float F;
-typedef float4 F4;
-#define convert_F4 convert_float4
-#endif
-
-
-// Fixed-point layout used by the Linear/Cubic kernels: source coordinates carry
-// INTER_BITS fractional bits, interpolation weights carry INTER_REMAP_COEF_BITS.
-#define INTER_BITS 5
-#define INTER_TAB_SIZE (1 << INTER_BITS)
-// Parenthesised so the macro expands as a single operand in any expression.
-#define INTER_SCALE (1.f/INTER_TAB_SIZE)
-#define AB_BITS max(10, (int)INTER_BITS)
-#define AB_SCALE (1 << AB_BITS)
-#define INTER_REMAP_COEF_BITS 15
-#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
-/*
- * interpolateCubic: fill coeffs[0..3] with the four cubic interpolation
- * weights for fractional position x, using A = -0.75.
- *
- *   x      - fractional offset; callers in this file pass (0..INTER_TAB_SIZE-1)/INTER_TAB_SIZE.
- *   coeffs - output array of at least 4 floats (private address space).
- *
- * The last weight is derived as 1 minus the other three, so the four values
- * sum to 1 up to float rounding.
- */
-inline void interpolateCubic( float x, float* coeffs )
-{
- const float A = -0.75f;
-
- coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; // outer-tap polynomial evaluated at distance x+1
- coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; // inner-tap polynomial evaluated at distance x
- coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; // same inner-tap polynomial at distance 1-x
- coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; // remainder, so the weights add up to 1
-}
-
-
-/**********************************************8UC1*********************************************
-***********************************************************************************************
- * warpPerspectiveNN_C1_D0: nearest-neighbour perspective warp, single uchar channel.
- *
- * Each work-item produces four horizontally adjacent destination bytes and
- * writes them with one uchar4 store.
- *
- *   src, dst               - source / destination byte buffers.
- *   src_cols, src_rows     - source bounds used for the range check.
- *   dst_cols, dst_rows     - destination bounds used for the range check.
- *   srcStep, dstStep       - row stride multiplied by the row index (byte units, since the pointers are uchar).
- *   src_offset, dst_offset - added to the computed index of the first element.
- *   M                      - 9 transform coefficients, read as M[0]..M[8].
- *   threadCols             - work-items with get_global_id(0) >= threadCols do nothing.
- *
- * Source coordinates are (M0*x+M1*y+M2)/(M6*x+M7*y+M8) and
- * (M3*x+M4*y+M5)/(M6*x+M7*y+M8), rounded with rint(); a zero denominator
- * yields coordinate 0. Source positions outside the image read as 0.
- * Destination lanes outside [0,dst_cols) x [0,dst_rows) keep their old value.
- */
-__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- dx = (dx<<2) - (dst_offset&3); // first of this work-item's 4 columns; the subtraction cancels the low 2 bits of dst_offset in the store address
-
- F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
- F4 X0 = M[0]*DX + M[1]*dy + M[2];
- F4 Y0 = M[3]*DX + M[4]*dy + M[5];
- F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
- W = (W!=zero) ? one/W : zero; // per-lane reciprocal; lanes with W == 0 become 0
- short4 X = convert_short4(rint(X0*W)); // NOTE(review): non-saturating conversion to short - confirm out-of-range behaviour is acceptable
- short4 Y = convert_short4(rint(Y0*W));
- int4 sx = convert_int4(X);
- int4 sy = convert_int4(Y);
-
- int4 DXD = (int4)(dx, dx+1, dx+2, dx+3);
- __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
- uchar4 dval = *d; // current destination bytes, kept for lanes that fall outside the image
- int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; // per-lane: destination pixel is inside the image
- int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; // per-lane: source pixel is inside the image
- int4 spos = src_offset + sy * srcStep + sx;
- uchar4 sval;
- sval.s0 = scon.s0 ? src[spos.s0] : 0; // out-of-range source reads as 0
- sval.s1 = scon.s1 ? src[spos.s1] : 0;
- sval.s2 = scon.s2 ? src[spos.s2] : 0;
- sval.s3 = scon.s3 ? src[spos.s3] : 0;
- dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; // take the new value only in lanes where dcon is set
- *d = dval;
- }
-}
-/*
- * warpPerspectiveLinear_C1_D0: bilinear perspective warp, single uchar channel,
- * one destination pixel per work-item.
- *
- * Parameters have the same meaning as in warpPerspectiveNN_C1_D0 (same
- * signature); steps and offsets index uchar elements directly.
- *
- * The projective divide is scaled by INTER_TAB_SIZE, so X and Y are fixed-point
- * source coordinates with INTER_BITS fractional bits. The 2x2 neighbourhood at
- * the integer position is fetched (out-of-range taps read as 0) and blended
- * with weights quantised to INTER_REMAP_COEF_BITS bits; the result is rounded,
- * shifted back and saturated to uchar.
- */
-__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
- int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
- int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- int sx = (short)(X >> INTER_BITS); // integer part; NOTE(review): the (short) cast does not saturate - confirm
- int sy = (short)(Y >> INTER_BITS);
- int ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- int ax = (short)(X & (INTER_TAB_SIZE-1));
-
- uchar v[4]; // 2x2 taps: bit 0 of the index selects the column, bit 1 the row
- int i;
-#pragma unroll 4
- for(i=0; i<4; i++)
- v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
-
- short itab[4];
- float tab1y[2], tab1x[2];
- tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay;
- tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
- tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
- tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
-
-#pragma unroll 4
- for(i=0; i<4; i++)
- {
- float v = tab1y[(i>>1)] * tab1x[(i&1)]; // shadows the tap array v inside this loop only
- itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); // a weight of exactly 1.0 saturates to 32767
- }
- if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- int sum = 0;
- for ( i =0; i<4; i++ )
- {
- sum += v[i] * itab[i] ;
- }
- dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; // add half, shift: round to nearest
- }
- }
-}
-/*
- * warpPerspectiveCubic_C1_D0: bicubic perspective warp, single uchar channel,
- * one destination pixel per work-item.
- *
- * Parameters have the same meaning as in the other C1_D0 kernels; steps and
- * offsets index uchar elements directly.
- *
- * X and Y are fixed-point source coordinates with INTER_BITS fractional bits.
- * A 4x4 neighbourhood starting one pixel up/left of the integer position is
- * fetched (out-of-range taps read as 0). The 16 weights are the products of
- * two interpolateCubic() tables, quantised to INTER_REMAP_COEF_BITS bits. If
- * the quantised weights do not sum to INTER_REMAP_COEF_SCALE, the difference
- * is folded into one of the four weights at rows 2..3, columns 2..3.
- */
-__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS) - 1; // top-left corner of the 4x4 window
- short sy = (short)(Y >> INTER_BITS) - 1;
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
- uchar v[16]; // 4x4 taps, row-major
- int i, j;
-
-#pragma unroll 4
- for(i=0; i<4; i++)
- for(j=0; j<4; j++)
- {
- v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
- }
-
- short itab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
-
- int isum = 0; // running sum of the quantised weights
-#pragma unroll 16
- for( i=0; i<16; i++ )
- {
- F v = tab1y[(i>>2)] * tab1x[(i&3)]; // shadows the tap array v inside this loop only
- isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) );
- }
- if( isum != INTER_REMAP_COEF_SCALE )
- {
- int k1, k2;
- int diff = isum - INTER_REMAP_COEF_SCALE;
- int Mk1=2, Mk2=2, mk1=2, mk2=2; // positions of the largest (Mk*) and smallest (mk*) weight found so far
- for( k1 = 2; k1 < 4; k1++ )
- for( k2 = 2; k2 < 4; k2++ )
- {
- if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
- mk1 = k1, mk2 = k2;
- else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
- Mk1 = k1, Mk2 = k2;
- }
- diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); // sum too small: raise the largest; too large: lower the smallest
- }
-
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- int sum=0;
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * itab[i] ;
- }
- dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; // add half, shift: round to nearest
- }
- }
-}
-
-/**********************************************8UC4*********************************************
-***********************************************************************************************/
-/*
- * warpPerspectiveNN_C4_D0: nearest-neighbour perspective warp for 4-channel
- * uchar pixels, one uchar4 pixel per work-item.
- *
- *   src, dst               - buffers addressed as uchar4 elements.
- *   src_cols, src_rows     - source bounds used for the range check.
- *   dst_cols, dst_rows     - destination bounds used for the range check.
- *   srcStep, dstStep       - row strides; shifted right by 2 before use as uchar4 indices.
- *   src_offset, dst_offset - start offsets; shifted right by 2 before use as uchar4 indices.
- *   M                      - 9 transform coefficients, read as M[0]..M[8].
- *   threadCols             - work-items with get_global_id(0) >= threadCols do nothing.
- *
- * Source positions outside the image produce a zero pixel.
- */
-__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
- int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
- int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? 1./W : 0.0; // reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
- short sx = (short)X; // NOTE(review): the (short) cast does not saturate - confirm
- short sy = (short)Y;
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
- }
-}
-/*
- * warpPerspectiveLinear_C4_D0: bilinear perspective warp for 4-channel uchar
- * pixels, one uchar4 pixel per work-item.
- *
- * Parameters have the same meaning as in warpPerspectiveNN_C4_D0; offsets and
- * steps are shifted right by 2 before being used as uchar4 indices.
- *
- * X and Y are fixed-point source coordinates with INTER_BITS fractional bits.
- * The 2x2 neighbourhood is fetched as int4 (out-of-range taps read as 0) and
- * blended with weights quantised to INTER_REMAP_COEF_BITS bits; the result is
- * rounded, shifted back and saturated to uchar4.
- */
-__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
- int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
- int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- src_offset = (src_offset>>2); // from here on in uchar4 units
- srcStep = (srcStep>>2);
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS); // integer part
- short sy = (short)(Y >> INTER_BITS);
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-
- int4 v0, v1, v2, v3; // taps: v0 = (sx,sy), v1 = (sx+1,sy), v2 = (sx,sy+1), v3 = (sx+1,sy+1)
-
- v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
- v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
- v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
- v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
-
- int itab0, itab1, itab2, itab3;
- float taby, tabx;
- taby = 1.f/INTER_TAB_SIZE*ay;
- tabx = 1.f/INTER_TAB_SIZE*ax;
-
- itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); // a weight of exactly 1.0 saturates to 32767
- itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
- itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
- itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
-
- int4 val;
- val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; // add half, shift: round to nearest
- }
-}
-/*
- * warpPerspectiveCubic_C4_D0: bicubic perspective warp for 4-channel uchar
- * pixels, one uchar4 pixel per work-item.
- *
- * Parameters have the same meaning as in the other C4_D0 kernels; offsets and
- * steps are shifted right by 2 on entry and then used as uchar4 indices.
- *
- * A 4x4 neighbourhood starting one pixel up/left of the integer source
- * position is fetched (out-of-range taps read as 0). The 16 weights are the
- * products of two interpolateCubic() tables scaled by INTER_REMAP_COEF_SCALE
- * and rounded to int. If they do not sum to INTER_REMAP_COEF_SCALE, the
- * difference is folded into one of the four weights at rows 2..3, columns 2..3.
- */
-__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
- int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
- int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- src_offset = (src_offset>>2); // from here on in uchar4 units
- srcStep = (srcStep>>2);
- dst_offset = (dst_offset>>2);
- dstStep = (dstStep>>2);
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS) - 1; // top-left corner of the 4x4 window
- short sy = (short)(Y >> INTER_BITS) - 1;
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
- uchar4 v[16]; // 4x4 taps, row-major
- int i,j;
-#pragma unroll 4
- for(i=0; i<4; i++)
- for(j=0; j<4; j++)
- {
- v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0;
- }
- int itab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = INTER_SCALE * ay;
- axx = INTER_SCALE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
- int isum = 0; // running sum of the quantised weights
-
-#pragma unroll 16
- for( i=0; i<16; i++ )
- {
- float tmp;
- tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
- itab[i] = rint(tmp);
- isum += itab[i];
- }
-
- if( isum != INTER_REMAP_COEF_SCALE )
- {
- int k1, k2;
- int diff = isum - INTER_REMAP_COEF_SCALE;
- int Mk1=2, Mk2=2, mk1=2, mk2=2; // positions of the largest (Mk*) and smallest (mk*) weight found so far
-
- for( k1 = 2; k1 < 4; k1++ )
- for( k2 = 2; k2 < 4; k2++ )
- {
-
- if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
- mk1 = k1, mk2 = k2;
- else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
- Mk1 = k1, Mk2 = k2;
- }
-
- diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); // sum too small: raise the largest; too large: lower the smallest
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- int4 sum=0;
- for ( i =0; i<16; i++ )
- {
- sum += convert_int4(v[i]) * itab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; // add half, shift: round to nearest
- }
- }
-}
-
-
-/**********************************************32FC1********************************************
-***********************************************************************************************/
-/*
- * warpPerspectiveNN_C1_D5: nearest-neighbour perspective warp, single float
- * channel, one pixel per work-item.
- *
- *   src, dst               - buffers addressed as float elements.
- *   src_cols, src_rows     - source bounds used for the range check.
- *   dst_cols, dst_rows     - destination bounds used for the range check.
- *   srcStep, dstStep       - row strides, used unshifted as float indices.
- *   src_offset, dst_offset - start offsets; shifted right by 2 before use as float indices.
- *   M                      - 9 transform coefficients, read as M[0]..M[8].
- *   threadCols             - work-items with get_global_id(0) >= threadCols do nothing.
- *
- * NOTE(review): offsets are shifted but steps are not - presumably the host
- * passes offsets in bytes and steps in elements; confirm against the caller.
- * Source positions outside the image produce 0.
- */
-__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? 1./W : 0.0; // reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
- short sx = (short)X; // NOTE(review): the (short) cast does not saturate - confirm
- short sy = (short)Y;
-
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
- }
-}
-/*
- * warpPerspectiveLinear_C1_D5: bilinear perspective warp, single float channel,
- * one pixel per work-item.
- *
- * Parameters have the same meaning as in warpPerspectiveNN_C1_D5: offsets are
- * shifted right by 2 before use as float indices, steps are used unshifted.
- *
- * X and Y are fixed-point source coordinates with INTER_BITS fractional bits.
- * The 2x2 neighbourhood is fetched (out-of-range taps read as 0) and blended
- * in float with the bilinear weights; no weight quantisation is applied.
- */
-__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- src_offset = (src_offset>>2); // from here on in float units
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS); // integer part
- short sy = (short)(Y >> INTER_BITS);
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
- float v0, v1, v2, v3; // taps: v0 = (sx,sy), v1 = (sx+1,sy), v2 = (sx,sy+1), v3 = (sx+1,sy+1)
-
- v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
- v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
- v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
- v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
-
- float tab[4];
- float taby[2], tabx[2];
- taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay;
- taby[1] = 1.f/INTER_TAB_SIZE*ay;
- tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax;
- tabx[1] = 1.f/INTER_TAB_SIZE*ax;
-
- tab[0] = taby[0] * tabx[0];
- tab[1] = taby[0] * tabx[1];
- tab[2] = taby[1] * tabx[0];
- tab[3] = taby[1] * tabx[1];
-
- float sum = 0;
- sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
- }
-}
-/*
- * warpPerspectiveCubic_C1_D5: bicubic perspective warp, single float channel,
- * one pixel per work-item.
- *
- * Parameters have the same meaning as in the other C1_D5 kernels: offsets are
- * shifted right by 2 on entry, steps are used unshifted as float indices.
- *
- * A 4x4 neighbourhood starting one pixel up/left of the integer source
- * position is fetched (out-of-range taps read as 0) and blended in float with
- * the products of two interpolateCubic() tables.
- */
-__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- src_offset = (src_offset>>2); // from here on in float units
- dst_offset = (dst_offset>>2);
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS) - 1; // top-left corner of the 4x4 window
- short sy = (short)(Y >> INTER_BITS) - 1;
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
- float v[16]; // 4x4 taps, row-major: column = i&3, row = i>>2
- int i;
-
- for(i=0; i<16; i++)
- v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
-
- float tab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
- for( i=0; i<16; i++ )
- {
- tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; // separable weight: row weight times column weight
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- float sum = 0;
-#pragma unroll 4
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * tab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = sum;
-
- }
- }
-}
-
-
-/**********************************************32FC4********************************************
-***********************************************************************************************/
-
-/*
- * warpPerspectiveNN_C4_D5: nearest-neighbour perspective warp for 4-channel
- * float pixels, one float4 pixel per work-item.
- *
- *   src, dst               - buffers addressed as float4 elements.
- *   src_cols, src_rows     - source bounds used for the range check.
- *   dst_cols, dst_rows     - destination bounds used for the range check.
- *   srcStep, dstStep       - row strides; shifted right by 2 before use as float4 indices.
- *   src_offset, dst_offset - start offsets; shifted right by 4 before use as float4 indices.
- *   M                      - 9 transform coefficients, read as M[0]..M[8].
- *   threadCols             - work-items with get_global_id(0) >= threadCols do nothing.
- *
- * Source positions outside the image produce a zero pixel.
- */
-__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W =(W != 0.0)? 1./W : 0.0; // reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
- short sx = (short)X; // NOTE(review): the (short) cast does not saturate - confirm
- short sy = (short)Y;
-
- // The fallback is spelled as float4 so both arms of the conditional have the
- // pixel type, matching the Linear/Cubic C4_D5 kernels.
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float4)0;
- }
-}
-/*
- * warpPerspectiveLinear_C4_D5: bilinear perspective warp for 4-channel float
- * pixels, one float4 pixel per work-item.
- *
- * Parameters have the same meaning as in warpPerspectiveNN_C4_D5: offsets are
- * shifted right by 4 and steps right by 2 on entry, then used as float4 indices.
- *
- * X and Y are fixed-point source coordinates with INTER_BITS fractional bits.
- * The 2x2 neighbourhood is fetched (out-of-range taps read as 0) and blended
- * in float with the bilinear weights.
- */
-__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
- int dst_cols, int dst_rows, int srcStep, int dstStep,
- int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows)
- {
- src_offset = (src_offset>>4); // from here on in float4 units
- dst_offset = (dst_offset>>4);
- srcStep = (srcStep>>2);
- dstStep = (dstStep>>2);
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx0 = (short)(X >> INTER_BITS); // integer part
- short sy0 = (short)(Y >> INTER_BITS);
- short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax0 = (short)(X & (INTER_TAB_SIZE-1));
-
-
- float4 v0, v1, v2, v3; // taps: v0 = (sx0,sy0), v1 = (sx0+1,sy0), v2 = (sx0,sy0+1), v3 = (sx0+1,sy0+1)
-
- v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
- v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
- v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
- v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
-
- float tab[4];
- float taby[2], tabx[2];
- taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0;
- taby[1] = 1.f/INTER_TAB_SIZE*ay0;
- tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0;
- tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
-
- tab[0] = taby[0] * tabx[0];
- tab[1] = taby[0] * tabx[1];
- tab[2] = taby[1] * tabx[0];
- tab[3] = taby[1] * tabx[1];
-
- float4 sum = 0;
- sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3];
- if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- dst[dst_offset+dy*dstStep+dx] = sum;
- }
-}
-/*
- * warpPerspectiveCubic_C4_D5: bicubic perspective warp for 4-channel float
- * pixels, one float4 pixel per work-item.
- *
- * Parameters have the same meaning as in the other C4_D5 kernels: offsets are
- * shifted right by 4 and steps right by 2 on entry, then used as float4 indices.
- *
- * A 4x4 neighbourhood starting one pixel up/left of the integer source
- * position is fetched (out-of-range taps read as 0) and blended in float with
- * the products of two interpolateCubic() tables.
- */
-__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
- int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
- int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols )
-{
- int dx = get_global_id(0);
- int dy = get_global_id(1);
-
- if( dx < threadCols && dy < dst_rows )
- {
- src_offset = (src_offset>>4); // from here on in float4 units
- dst_offset = (dst_offset>>4);
- srcStep = (srcStep>>2);
- dstStep = (dstStep>>2);
-
- F X0 = M[0]*dx + M[1]*dy + M[2];
- F Y0 = M[3]*dx + M[4]*dy + M[5];
- F W = M[6]*dx + M[7]*dy + M[8];
- W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; // scaled reciprocal; a zero denominator maps to coordinate 0
- int X = rint(X0*W);
- int Y = rint(Y0*W);
-
- short sx = (short)(X >> INTER_BITS)-1; // top-left corner of the 4x4 window
- short sy = (short)(Y >> INTER_BITS)-1;
- short ay = (short)(Y & (INTER_TAB_SIZE-1)); // fractional part, 0..INTER_TAB_SIZE-1
- short ax = (short)(X & (INTER_TAB_SIZE-1));
-
-
- float4 v[16]; // 4x4 taps, row-major: column = i&3, row = i>>2
- int i;
-
- for(i=0; i<16; i++)
- v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
-
- float tab[16];
- float tab1y[4], tab1x[4];
- float axx, ayy;
-
- ayy = 1.f/INTER_TAB_SIZE * ay;
- axx = 1.f/INTER_TAB_SIZE * ax;
- interpolateCubic(ayy, tab1y);
- interpolateCubic(axx, tab1x);
-
-#pragma unroll 4
- for( i=0; i<16; i++ )
- {
- tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; // separable weight: row weight times column weight
- }
-
- if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
- {
- float4 sum = 0;
-#pragma unroll 4
- for ( i =0; i<16; i++ )
- {
- sum += v[i] * tab[i];
- }
- dst[dst_offset+dy*dstStep+dx] = sum;
-
- }
- }
-}
resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR);
equalizeHist(usmallimg, uresult);
+#if 0
imshow("orig", uimg);
imshow("small", usmallimg);
imshow("equalized gray", uresult);
destroyWindow("orig");
destroyWindow("small");
destroyWindow("equalized gray");
-
+#endif
ts->set_failed_test_info(cvtest::TS::OK);
}
};
+++ /dev/null
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Niko Li, newlife20080214@gmail.com
-// Wang Weiyan, wangweiyanster@gmail.com
-// Jia Haipeng, jiahaipeng95@gmail.com
-// Nathan, liujun@multicorewareinc.com
-// Peng Xiao, pengxiao@outlook.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//
-
-#pragma OPENCL EXTENSION cl_amd_printf : enable
-// Number of rectangle slots per Haar feature (size of the p[] and weight[] arrays below).
-#define CV_HAAR_FEATURE_MAX 3
-
-// Four-corner lookup (p0 - p1 - p2 + p3) into an array named `sum` that must be
-// in scope at the expansion site. `offset` is added to every corner index;
-// calc_sum1 additionally selects element `i` of each corner array.
-// Macro arguments are parenthesised so compound expressions expand correctly.
-#define calc_sum(rect,offset) (sum[(rect).p0+(offset)] - sum[(rect).p1+(offset)] - sum[(rect).p2+(offset)] + sum[(rect).p3+(offset)])
-#define calc_sum1(rect,offset,i) (sum[(rect).p0[(i)]+(offset)] - sum[(rect).p1[(i)]+(offset)] - sum[(rect).p2[(i)]+(offset)] + sum[(rect).p3[(i)]+(offset)])
-
-// Element types of the sum and squared-sum buffers.
-typedef int sumtype;
-typedef float sqsumtype;
-
-// Build-time switch: 1 selects the stump code path in the kernel, 0 the tree
-// path. Defaults to 1 when not supplied by the build options.
-#ifndef STUMP_BASED
-#define STUMP_BASED 1
-#endif
-/*
- * GpuHidHaarTreeNode: one node of a Haar classifier as read by the cascade
- * kernel in this file. The kernel loads each p[k] row as an int4, loads
- * weight[0..2] together with threshold as one float4 (threshold is the .w
- * lane), and loads alpha[0..2] as a float3, so the field order and alignment
- * here must not change.
- */
-typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
-{
- int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64))); // per rectangle: the kernel uses .x/.z as column offsets and .y/.w as row indices
- float weight[CV_HAAR_FEATURE_MAX]; // per-rectangle weight applied to the rectangle sum
- float threshold; // multiplied by the window's variance norm factor before comparison
- float alpha[3] __attribute__((aligned (16))); // values added to the stage sum depending on the comparison outcome
- int left __attribute__((aligned (4))); // tested for non-zero in the non-stump path
- int right __attribute__((aligned (4))); // tested for non-zero in the non-stump path
-}
-GpuHidHaarTreeNode;
-
-/*
- * GpuHidHaarClassifier: a node count plus pointers to a node array and an
- * alpha array. NOTE(review): not referenced by the kernel code visible in this
- * file - presumably kept to mirror a host-side layout; confirm before changing.
- */
-typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
-{
- int count __attribute__((aligned (4)));
- GpuHidHaarTreeNode* node __attribute__((aligned (8)));
- float* alpha __attribute__((aligned (8)));
-}
-GpuHidHaarClassifier;
-
-/*
- * GpuHidHaarStageClassifier: per-stage record. The cascade kernel reads the
- * first two fields of each record as one int2: .x is used as the stage's node
- * count and .y is reinterpreted as the float stage threshold, so count and
- * threshold must stay first and adjacent.
- */
-typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
-{
- int count __attribute__((aligned (4))); // number of nodes evaluated for this stage
- float threshold __attribute__((aligned (4))); // the stage passes when the stage sum is >= this value
- int two_rects __attribute__((aligned (4)));
- int reserved0 __attribute__((aligned (8)));
- int reserved1 __attribute__((aligned (8)));
- int reserved2 __attribute__((aligned (8)));
- int reserved3 __attribute__((aligned (8)));
-}
-GpuHidHaarStageClassifier;
-
-/*
- * GpuHidHaarClassifierCascade: cascade-level record holding a stage count,
- * three flags, two groups of four integer offsets (pq0..pq3, p0..p3) and the
- * inverse window area. NOTE(review): not referenced by the kernel code visible
- * in this file (the kernel receives p, pq and correction as arguments) -
- * presumably mirrors a host-side layout; confirm before changing.
- */
-typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
-{
- int count __attribute__((aligned (4)));
- int is_stump_based __attribute__((aligned (4)));
- int has_tilted_features __attribute__((aligned (4)));
- int is_tree __attribute__((aligned (4)));
- int pq0 __attribute__((aligned (4)));
- int pq1 __attribute__((aligned (4)));
- int pq2 __attribute__((aligned (4)));
- int pq3 __attribute__((aligned (4)));
- int p0 __attribute__((aligned (4)));
- int p1 __attribute__((aligned (4)));
- int p2 __attribute__((aligned (4)));
- int p3 __attribute__((aligned (4)));
- float inv_window_area __attribute__((aligned (4)));
-} GpuHidHaarClassifierCascade;
-
-__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
- global GpuHidHaarStageClassifier * stagecascadeptr,
- global int4 * info,
- global GpuHidHaarTreeNode * nodeptr,
- global const int * restrict sum1,
- global const float * restrict sqsum1,
- global int4 * candidate,
- const int pixelstep,
- const int loopcount,
- const int start_stage,
- const int split_stage,
- const int end_stage,
- const int startnode,
- const int splitnode,
- const int4 p,
- const int4 pq,
- const float correction)
-{
- int grpszx = get_local_size(0);
- int grpszy = get_local_size(1);
- int grpnumx = get_num_groups(0);
- int grpidx = get_group_id(0);
- int lclidx = get_local_id(0);
- int lclidy = get_local_id(1);
-
- int lcl_sz = mul24(grpszx,grpszy);
- int lcl_id = mad24(lclidy,grpszx,lclidx);
-
- __local int lclshare[1024];
- __local int* lcldata = lclshare;//for save win data
- __local int* glboutindex = lcldata + 28*28;//for save global out index
- __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
- __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
- __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
- glboutindex[0]=0;
- int outputoff = mul24(grpidx,256);
-
- //assume window size is 20X20
-#define WINDOWSIZE 20+1
- //make sure readwidth is the multiple of 4
- //ystep =1, from host code
- int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
- int readheight = grpszy-1+WINDOWSIZE;
- int read_horiz_cnt = readwidth >> 2;//each read int4
- int total_read = mul24(read_horiz_cnt,readheight);
- int read_loop = (total_read + lcl_sz - 1) >> 6;
- candidate[outputoff+(lcl_id<<2)] = (int4)0;
- candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
- candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
- candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
- for(int scalei = 0; scalei <loopcount; scalei++)
- {
- int4 scaleinfo1= info[scalei];
- int width = (scaleinfo1.x & 0xffff0000) >> 16;
- int height = scaleinfo1.x & 0xffff;
- int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
- int totalgrp = scaleinfo1.y & 0xffff;
- int imgoff = scaleinfo1.z;
- float factor = as_float(scaleinfo1.w);
-
- __global const int * sum = sum1 + imgoff;
- __global const float * sqsum = sqsum1 + imgoff;
- for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
- {
- int grpidy = grploop / grpnumperline;
- int grpidx = grploop - mul24(grpidy, grpnumperline);
- int x = mad24(grpidx,grpszx,lclidx);
- int y = mad24(grpidy,grpszy,lclidy);
- int grpoffx = x-lclidx;
- int grpoffy = y-lclidy;
-
- for(int i=0; i<read_loop; i++)
- {
- int pos_id = mad24(i,lcl_sz,lcl_id);
- pos_id = pos_id < total_read ? pos_id : 0;
-
- int lcl_y = pos_id / read_horiz_cnt;
- int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
-
- int glb_x = grpoffx + (lcl_x<<2);
- int glb_y = grpoffy + lcl_y;
-
- int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
- int4 data = *(__global int4*)&sum[glb_off];
- int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
-
- vstore4(data, 0, &lcldata[lcl_off]);
- }
-
- lcloutindex[lcl_id] = 0;
- lclcount[0] = 0;
- int result = 1;
- int nodecounter= startnode;
- float mean, variance_norm_factor;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int lcl_off = mad24(lclidy,readwidth,lclidx);
- int4 cascadeinfo1, cascadeinfo2;
- cascadeinfo1 = p;
- cascadeinfo2 = pq;
-
- cascadeinfo1.x +=lcl_off;
- cascadeinfo1.z +=lcl_off;
- mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
- lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
- *correction;
-
- int p_offset = mad24(y, pixelstep, x);
-
- cascadeinfo2.x +=p_offset;
- cascadeinfo2.z +=p_offset;
- variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
- sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
-
- variance_norm_factor = variance_norm_factor * correction - mean * mean;
- variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
-
- for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
- {
- float stage_sum = 0.f;
- int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
- float stagethreshold = as_float(stageinfo.y);
- for(int nodeloop = 0; nodeloop < stageinfo.x; )
- {
- __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
-
- int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
- int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
- int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
- float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
- float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
-
- float nodethreshold = w.w * variance_norm_factor;
-
- info1.x +=lcl_off;
- info1.z +=lcl_off;
- info2.x +=lcl_off;
- info2.z +=lcl_off;
-
- float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
- lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
-
- classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
- lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
-
- info3.x +=lcl_off;
- info3.z +=lcl_off;
- classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
- lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
-
- bool passThres = classsum >= nodethreshold;
-#if STUMP_BASED
- stage_sum += passThres ? alpha3.y : alpha3.x;
- nodecounter++;
- nodeloop++;
-#else
- bool isRootNode = (nodecounter & 1) == 0;
- if(isRootNode)
- {
- if( (passThres && currentnodeptr->right) ||
- (!passThres && currentnodeptr->left))
- {
- nodecounter ++;
- }
- else
- {
- stage_sum += alpha3.x;
- nodecounter += 2;
- nodeloop ++;
- }
- }
- else
- {
- stage_sum += passThres ? alpha3.z : alpha3.y;
- nodecounter ++;
- nodeloop ++;
- }
-#endif
- }
-
- result = (stage_sum >= stagethreshold);
- }
-
- if(result && (x < width) && (y < height))
- {
- int queueindex = atomic_inc(lclcount);
- lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
- lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- int queuecount = lclcount[0];
- barrier(CLK_LOCAL_MEM_FENCE);
- nodecounter = splitnode;
- for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
- {
- lclcount[0]=0;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
- float stagethreshold = as_float(stageinfo.y);
-
- int perfscale = queuecount > 4 ? 3 : 2;
- int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
- int lcl_compute_win = lcl_sz >> perfscale;
- int lcl_compute_win_id = (lcl_id >>(6-perfscale));
- int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
- int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
- for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
- {
- float stage_sum = 0.f;
- int temp_coord = lcloutindex[lcl_compute_win_id<<1];
- float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
- int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
-
- if(lcl_compute_win_id < queuecount)
- {
- int tempnodecounter = lcl_compute_id;
- float part_sum = 0.f;
- const int stump_factor = STUMP_BASED ? 1 : 2;
- int root_offset = 0;
- for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
- {
- __global GpuHidHaarTreeNode* currentnodeptr =
- nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
-
- int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
- int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
- int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
- float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
- float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
- float nodethreshold = w.w * variance_norm_factor;
-
- info1.x +=queue_pixel;
- info1.z +=queue_pixel;
- info2.x +=queue_pixel;
- info2.z +=queue_pixel;
-
- float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
- lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
-
-
- classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
- lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
-
- info3.x +=queue_pixel;
- info3.z +=queue_pixel;
- classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
- lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
-
- bool passThres = classsum >= nodethreshold;
-#if STUMP_BASED
- part_sum += passThres ? alpha3.y : alpha3.x;
- tempnodecounter += lcl_compute_win;
- lcl_loop++;
-#else
- if(root_offset == 0)
- {
- if( (passThres && currentnodeptr->right) ||
- (!passThres && currentnodeptr->left))
- {
- root_offset = 1;
- }
- else
- {
- part_sum += alpha3.x;
- tempnodecounter += lcl_compute_win;
- lcl_loop++;
- }
- }
- else
- {
- part_sum += passThres ? alpha3.z : alpha3.y;
- tempnodecounter += lcl_compute_win;
- lcl_loop++;
- root_offset = 0;
- }
-#endif
- }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
- partialsum[lcl_id]=part_sum;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- if(lcl_compute_win_id < queuecount)
- {
- for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
- {
- stage_sum += partialsum[lcl_id+i];
- }
- if(stage_sum >= stagethreshold && (lcl_compute_id==0))
- {
- int queueindex = atomic_inc(lclcount);
- lcloutindex[queueindex<<1] = temp_coord;
- lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
- }
- lcl_compute_win_id +=(1<<perfscale);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
-
- queuecount = lclcount[0];
- barrier(CLK_LOCAL_MEM_FENCE);
- nodecounter += stageinfo.x;
- }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
-
- if(lcl_id<queuecount)
- {
- int temp = lcloutindex[lcl_id<<1];
- int x = mad24(grpidx,grpszx,temp & 0xffff);
- int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
- temp = glboutindex[0];
- int4 candidate_result;
- candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
- candidate_result.x = convert_int_rtn(x*factor);
- candidate_result.y = convert_int_rtn(y*factor);
- atomic_inc(glboutindex);
- candidate[outputoff+temp+lcl_id] = candidate_result;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
- }//end for(int scalei = 0; scalei <loopcount; scalei++)
-}
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// @Authors
-// Wu Xinglong, wxl370@126.com
-// Sen Liu, swjtuls1987@126.com
-// Peng Xiao, pengxiao@outlook.com
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors as is and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-// Enter your kernel in this window
-//#pragma OPENCL EXTENSION cl_amd_printf:enable
-#define CV_HAAR_FEATURE_MAX 3
-typedef int sumtype;
-typedef float sqsumtype;
-
-typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
-{
- int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
- float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
- float threshold /*__attribute__((aligned (4)))*/;
- float alpha[3] __attribute__((aligned(16)));
- int left __attribute__((aligned(4)));
- int right __attribute__((aligned(4)));
-}
-GpuHidHaarTreeNode;
-typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
-{
- int count __attribute__((aligned(4)));
- GpuHidHaarTreeNode *node __attribute__((aligned(8)));
- float *alpha __attribute__((aligned(8)));
-}
-GpuHidHaarClassifier;
-typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
-{
- int count __attribute__((aligned(4)));
- float threshold __attribute__((aligned(4)));
- int two_rects __attribute__((aligned(4)));
- int reserved0 __attribute__((aligned(8)));
- int reserved1 __attribute__((aligned(8)));
- int reserved2 __attribute__((aligned(8)));
- int reserved3 __attribute__((aligned(8)));
-}
-GpuHidHaarStageClassifier;
-typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
-{
- int count __attribute__((aligned(4)));
- int is_stump_based __attribute__((aligned(4)));
- int has_tilted_features __attribute__((aligned(4)));
- int is_tree __attribute__((aligned(4)));
- int pq0 __attribute__((aligned(4)));
- int pq1 __attribute__((aligned(4)));
- int pq2 __attribute__((aligned(4)));
- int pq3 __attribute__((aligned(4)));
- int p0 __attribute__((aligned(4)));
- int p1 __attribute__((aligned(4)));
- int p2 __attribute__((aligned(4)));
- int p3 __attribute__((aligned(4)));
- float inv_window_area __attribute__((aligned(4)));
-} GpuHidHaarClassifierCascade;
-
-__kernel void gpuRunHaarClassifierCascade_scaled2(
- global GpuHidHaarStageClassifier *stagecascadeptr,
- global int4 *info,
- global GpuHidHaarTreeNode *nodeptr,
- global const int *restrict sum,
- global const float *restrict sqsum,
- global int4 *candidate,
- const int rows,
- const int cols,
- const int step,
- const int loopcount,
- const int start_stage,
- const int split_stage,
- const int end_stage,
- const int startnode,
- global int4 *p,
- global float *correction,
- const int nodecount)
-{
- int grpszx = get_local_size(0);
- int grpszy = get_local_size(1);
- int grpnumx = get_num_groups(0);
- int grpidx = get_group_id(0);
- int lclidx = get_local_id(0);
- int lclidy = get_local_id(1);
- int lcl_sz = mul24(grpszx, grpszy);
- int lcl_id = mad24(lclidy, grpszx, lclidx);
- __local int glboutindex[1];
- __local int lclcount[1];
- __local int lcloutindex[64];
- glboutindex[0] = 0;
- int outputoff = mul24(grpidx, 256);
- candidate[outputoff + (lcl_id << 2)] = (int4)0;
- candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
- candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
- candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
- int max_idx = rows * cols - 1;
- for (int scalei = 0; scalei < loopcount; scalei++)
- {
- int4 scaleinfo1;
- scaleinfo1 = info[scalei];
- int width = (scaleinfo1.x & 0xffff0000) >> 16;
- int height = scaleinfo1.x & 0xffff;
- int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
- int totalgrp = scaleinfo1.y & 0xffff;
- float factor = as_float(scaleinfo1.w);
- float correction_t = correction[scalei];
- int ystep = (int)(max(2.0f, factor) + 0.5f);
-
- for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
- {
- int4 cascadeinfo = p[scalei];
- int grpidy = grploop / grpnumperline;
- int grpidx = grploop - mul24(grpidy, grpnumperline);
- int ix = mad24(grpidx, grpszx, lclidx);
- int iy = mad24(grpidy, grpszy, lclidy);
- int x = ix * ystep;
- int y = iy * ystep;
- lcloutindex[lcl_id] = 0;
- lclcount[0] = 0;
- int nodecounter;
- float mean, variance_norm_factor;
- //if((ix < width) && (iy < height))
- {
- const int p_offset = mad24(y, step, x);
- cascadeinfo.x += p_offset;
- cascadeinfo.z += p_offset;
- mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
- sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
- + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
- * correction_t;
- variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
- sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
- + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
- variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
- variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
- bool result = true;
- nodecounter = startnode + nodecount * scalei;
- for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
- {
- float stage_sum = 0.f;
- int stagecount = stagecascadeptr[stageloop].count;
- for (int nodeloop = 0; nodeloop < stagecount;)
- {
- __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
- int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
- int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
- int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
- float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
- float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
- float nodethreshold = w.w * variance_norm_factor;
-
- info1.x += p_offset;
- info1.z += p_offset;
- info2.x += p_offset;
- info2.z += p_offset;
- info3.x += p_offset;
- info3.z += p_offset;
- float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
- sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
- + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
- classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
- sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
- + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
- classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
- sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
- + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
-
- bool passThres = classsum >= nodethreshold;
-
-#if STUMP_BASED
- stage_sum += passThres ? alpha3.y : alpha3.x;
- nodecounter++;
- nodeloop++;
-#else
- bool isRootNode = (nodecounter & 1) == 0;
- if(isRootNode)
- {
- if( (passThres && currentnodeptr->right) ||
- (!passThres && currentnodeptr->left))
- {
- nodecounter ++;
- }
- else
- {
- stage_sum += alpha3.x;
- nodecounter += 2;
- nodeloop ++;
- }
- }
- else
- {
- stage_sum += (passThres ? alpha3.z : alpha3.y);
- nodecounter ++;
- nodeloop ++;
- }
-#endif
- }
- result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (result && (ix < width) && (iy < height))
- {
- int queueindex = atomic_inc(lclcount);
- lcloutindex[queueindex] = (y << 16) | x;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- int queuecount = lclcount[0];
-
- if (lcl_id < queuecount)
- {
- int temp = lcloutindex[lcl_id];
- int x = temp & 0xffff;
- int y = (temp & (int)0xffff0000) >> 16;
- temp = atomic_inc(glboutindex);
- int4 candidate_result;
- candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
- candidate_result.x = x;
- candidate_result.y = y;
- candidate[outputoff + temp + lcl_id] = candidate_result;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- }
- }
-}
-__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
-{
- int counter = get_global_id(0);
- int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
- GpuHidHaarTreeNode t1 = *(orinode + counter);
-#pragma unroll
-
- for (i = 0; i < 3; i++)
- {
- tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
- tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
- tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
- tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
- }
-
- t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
- counter += nodenum;
-#pragma unroll
-
- for (i = 0; i < 3; i++)
- {
- newnode[counter].p[i][0] = tr_x[i];
- newnode[counter].p[i][1] = tr_y[i];
- newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
- newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
- newnode[counter].weight[i] = t1.weight[i] * weight_scale;
- }
-
- newnode[counter].left = t1.left;
- newnode[counter].right = t1.right;
- newnode[counter].threshold = t1.threshold;
- newnode[counter].alpha[0] = t1.alpha[0];
- newnode[counter].alpha[1] = t1.alpha[1];
- newnode[counter].alpha[2] = t1.alpha[2];
-}