modules/imgproc/src/opencl/resize.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Zhang Ying, zhangying913@gmail.com
  19 //        Niko Li, newlife20080214@gmail.com
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44 //M*/
  45
  46 #ifdef DOUBLE_SUPPORT
  47 #ifdef cl_amd_fp64
  48 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  49 #elif defined (cl_khr_fp64)
  50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  51 #endif
  52 #endif
  53
  54 #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
  55 #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
  56 #define INC(x,l) min(x+1,l-1)
  57
  58 #define noconvert
  59
  60 #if cn != 3
  61 #define loadpix(addr)  *(__global const T *)(addr)
  62 #define storepix(val, addr)  *(__global T *)(addr) = val
  63 #define TSIZE (int)sizeof(T)
  64 #else
  65 #define loadpix(addr)  vload3(0, (__global const T1 *)(addr))
  66 #define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
  67 #define TSIZE (int)sizeof(T1)*cn
  68 #endif
  69
  70 #if defined USE_SAMPLER
  71
  72 #if cn == 1
  73 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x
  74 #define INTERMEDIATE_TYPE  float
  75 #elif cn == 2
  76 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy
  77 #define INTERMEDIATE_TYPE  float2
  78 #elif cn == 3
  79 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz
  80 #define INTERMEDIATE_TYPE  float3
  81 #elif cn == 4
  82 #define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)
  83 #define INTERMEDIATE_TYPE  float4
  84 #endif
  85
  86 #define __CAT(x, y) x##y
  87 #define CAT(x, y) __CAT(x, y)
  88 //#define INTERMEDIATE_TYPE CAT(float, cn)
  89 #define float1 float
  90
  91 #if depth == 0
  92 #define RESULT_SCALE    255.0f
  93 #elif depth == 1
  94 #define RESULT_SCALE    127.0f
  95 #elif depth == 2
  96 #define RESULT_SCALE    65535.0f
  97 #elif depth == 3
  98 #define RESULT_SCALE    32767.0f
  99 #else
 100 #define RESULT_SCALE    1.0f
 101 #endif
 102
 103 __kernel void resizeSampler(__read_only image2d_t srcImage,
 104                             __global uchar* dstptr, int dststep, int dstoffset,
 105                             int dstrows, int dstcols,
 106                             float ifx, float ify)
 107 {
 108     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
 109                               CLK_ADDRESS_CLAMP_TO_EDGE |
 110                               CLK_FILTER_LINEAR;
 111
 112     int dx = get_global_id(0);
 113     int dy = get_global_id(1);
 114
 115     float sx = ((dx+0.5f) * ifx), sy = ((dy+0.5f) * ify);
 116
 117     INTERMEDIATE_TYPE intermediate = READ_IMAGE(srcImage, sampler, (float2)(sx, sy));
 118
 119 #if depth <= 4
 120     T uval = convertToDT(round(intermediate * RESULT_SCALE));
 121 #else
 122     T uval = convertToDT(intermediate * RESULT_SCALE);
 123 #endif
 124
 125     if(dx < dstcols && dy < dstrows)
 126     {
 127         storepix(uval, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE));
 128     }
 129 }
 130
 131 #elif defined INTER_LINEAR_INTEGER
 132
 133 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
 134                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 135                        __global const uchar * buffer)
 136 {
 137     int dx = get_global_id(0);
 138     int dy = get_global_id(1);
 139
 140     if (dx < dst_cols && dy < dst_rows)
 141     {
 142         __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
 143         __global const short * ialpha = (__global const short *)(yofs + dst_rows);
 144         __global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
 145         ialpha += dx << 1;
 146
 147         int sx0 = xofs[dx], sy0 = clamp(yofs[dy], 0, src_rows - 1),
 148         sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
 149         short a0 = ialpha[0], a1 = ialpha[1];
 150         short b0 = ibeta[0], b1 = ibeta[1];
 151
 152         int src_index0 = mad24(sy0, src_step, mad24(sx0, TSIZE, src_offset)),
 153         src_index1 = mad24(sy1, src_step, mad24(sx0, TSIZE, src_offset));
 154         WT data0 = convertToWT(loadpix(srcptr + src_index0));
 155         WT data1 = convertToWT(loadpix(srcptr + src_index0 + TSIZE));
 156         WT data2 = convertToWT(loadpix(srcptr + src_index1));
 157         WT data3 = convertToWT(loadpix(srcptr + src_index1 + TSIZE));
 158
 159         WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
 160                  ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);
 161
 162         storepix(convertToDT((val + 2) >> 2),
 163                  dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
 164     }
 165 }
 166
 167 #elif defined INTER_LINEAR
 168
 169 __kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
 170                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 171                        float ifx, float ify)
 172 {
 173     int dx = get_global_id(0);
 174     int dy = get_global_id(1);
 175
 176     if (dx < dst_cols && dy < dst_rows)
 177     {
 178         float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
 179         int x = floor(sx), y = floor(sy);
 180
 181         float u = sx - x, v = sy - y;
 182
 183         if ( x<0 ) x=0,u=0;
 184         if ( x>=src_cols ) x=src_cols-1,u=0;
 185         if ( y<0 ) y=0,v=0;
 186         if ( y>=src_rows ) y=src_rows-1,v=0;
 187
 188         int y_ = INC(y, src_rows);
 189         int x_ = INC(x, src_cols);
 190
 191 #if depth <= 4
 192         u = u * INTER_RESIZE_COEF_SCALE;
 193         v = v * INTER_RESIZE_COEF_SCALE;
 194
 195         int U = rint(u);
 196         int V = rint(v);
 197         int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
 198         int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
 199
 200         WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
 201         WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
 202         WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
 203         WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
 204
 205         WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
 206                    mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);
 207
 208         T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
 209 #else
 210         float u1 = 1.f - u;
 211         float v1 = 1.f - v;
 212         WT data0 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x, TSIZE, src_offset))));
 213         WT data1 = convertToWT(loadpix(srcptr + mad24(y, src_step, mad24(x_, TSIZE, src_offset))));
 214         WT data2 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x, TSIZE, src_offset))));
 215         WT data3 = convertToWT(loadpix(srcptr + mad24(y_, src_step, mad24(x_, TSIZE, src_offset))));
 216
 217         T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
 218 #endif
 219         storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
 220     }
 221 }
 222
 223 #elif defined INTER_NEAREST
 224
 225 __kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
 226                        __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 227                        float ifx, float ify)
 228 {
 229     int dx = get_global_id(0);
 230     int dy = get_global_id(1);
 231
 232     if (dx < dst_cols && dy < dst_rows)
 233     {
 234         float s1 = dx * ifx;
 235         float s2 = dy * ify;
 236         int sx = min(convert_int_rtz(s1), src_cols - 1);
 237         int sy = min(convert_int_rtz(s2), src_rows - 1);
 238
 239         storepix(loadpix(srcptr + mad24(sy, src_step, mad24(sx, TSIZE, src_offset))),
 240                  dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
 241     }
 242 }
 243
 244 #elif defined INTER_AREA
 245
 246 #ifdef INTER_AREA_FAST
 247
 248 __kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 249                               __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
 250 {
 251     int dx = get_global_id(0);
 252     int dy = get_global_id(1);
 253
 254     if (dx < dst_cols && dy < dst_rows)
 255     {
 256         int dst_index = mad24(dy, dst_step, dst_offset);
 257
 258         int sx = XSCALE * dx;
 259         int sy = YSCALE * dy;
 260         WTV sum = (WTV)(0);
 261
 262         #pragma unroll
 263         for (int py = 0; py < YSCALE; ++py)
 264         {
 265             int y = min(sy + py, src_rows - 1);
 266             int src_index = mad24(y, src_step, src_offset);
 267             #pragma unroll
 268             for (int px = 0; px < XSCALE; ++px)
 269             {
 270                 int x = min(sx + px, src_cols - 1);
 271                 sum += convertToWTV(loadpix(src + src_index + x*TSIZE));
 272             }
 273         }
 274
 275         storepix(convertToT(convertToWT2V(sum) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_index));
 276     }
 277 }
 278
 279 #else
 280
 281 __kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
 282                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
 283                          float ifx, float ify, __global const int * ofs_tab,
 284                          __global const int * map_tab, __global const float * alpha_tab)
 285 {
 286     int dx = get_global_id(0);
 287     int dy = get_global_id(1);
 288
 289     if (dx < dst_cols && dy < dst_rows)
 290     {
 291         int dst_index = mad24(dy, dst_step, dst_offset);
 292
 293         __global const int * xmap_tab = map_tab;
 294         __global const int * ymap_tab = (__global const int *)(map_tab + (src_cols << 1));
 295         __global const float * xalpha_tab = alpha_tab;
 296         __global const float * yalpha_tab = (__global const float *)(alpha_tab + (src_cols << 1));
 297         __global const int * xofs_tab = ofs_tab;
 298         __global const int * yofs_tab = (__global const int *)(ofs_tab + dst_cols + 1);
 299
 300         int xk0 = xofs_tab[dx], xk1 = xofs_tab[dx + 1];
 301         int yk0 = yofs_tab[dy], yk1 = yofs_tab[dy + 1];
 302
 303         int sy0 = ymap_tab[yk0], sy1 = ymap_tab[yk1 - 1];
 304         int sx0 = xmap_tab[xk0], sx1 = xmap_tab[xk1 - 1];
 305
 306         WTV sum = (WTV)(0), buf;
 307         int src_index = mad24(sy0, src_step, src_offset);
 308
 309         for (int sy = sy0, yk = yk0; sy <= sy1; ++sy, src_index += src_step, ++yk)
 310         {
 311             WTV beta = (WTV)(yalpha_tab[yk]);
 312             buf = (WTV)(0);
 313
 314             for (int sx = sx0, xk = xk0; sx <= sx1; ++sx, ++xk)
 315             {
 316                 WTV alpha = (WTV)(xalpha_tab[xk]);
 317                 buf += convertToWTV(loadpix(src + mad24(sx, TSIZE, src_index))) * alpha;
 318             }
 319             sum += buf * beta;
 320         }
 321
 322         storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_index));
 323     }
 324 }
 325
 326 #endif
 327
 328 #endif