modules/imgproc/src/opencl/filterSmall.cl

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 // Copyright (C) 2014, Itseez, Inc., all rights reserved.
   6 // Third party copyrights are property of their respective owners.
   7
   8 #ifdef BORDER_REPLICATE
   9 //BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
  10 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
  11 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
  12 #define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
  13 #define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
  14 #endif
  15
  16 #ifdef BORDER_REFLECT
  17 //BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
  18 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
  19 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
  20 #define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
  21 #define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
  22 #endif
  23
  24 #ifdef BORDER_REFLECT_101
  25 //BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
  26 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
  27 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
  28 #define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
  29 #define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
  30 #endif
  31
  32 //blur function does not support BORDER_WRAP
  33 #ifdef BORDER_WRAP
  34 //BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
  35 #define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
  36 #define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
  37 #define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
  38 #define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
  39 #endif
  40
  41 #ifdef BORDER_ISOLATED
  42 #define ISOLATED_MIN(VAL) (VAL)
  43 #else
  44 #define ISOLATED_MIN(VAL) 0
  45 #endif
  46
  47 #ifdef EXTRA_EXTRAPOLATION // border > src image size
  48 #ifdef BORDER_CONSTANT
  49 // None
  50 #elif defined BORDER_REPLICATE
  51 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
  52     { \
  53         x = max(min(x, maxX - 1), minX); \
  54         y = max(min(y, maxY - 1), minY); \
  55     }
  56 #elif defined BORDER_WRAP
  57 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
  58     { \
  59         if (x < minX) \
  60             x -= ((x - maxX + 1) / maxX) * maxX; \
  61         if (x >= maxX) \
  62             x %= maxX; \
  63         if (y < minY) \
  64             y -= ((y - maxY + 1) / maxY) * maxY; \
  65         if (y >= maxY) \
  66             y %= maxY; \
  67     }
  68 #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
  69 #define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
  70     { \
  71         if (maxX - minX == 1) \
  72             x = minX; \
  73         else \
  74             do \
  75             { \
  76                 if (x < minX) \
  77                     x = minX - (x - minX) - 1 + delta; \
  78                 else \
  79                     x = maxX - 1 - (x - maxX) - delta; \
  80             } \
  81             while (x >= maxX || x < minX); \
  82         \
  83         if (maxY - minY == 1) \
  84             y = minY; \
  85         else \
  86             do \
  87             { \
  88                 if (y < minY) \
  89                     y = minY - (y - minY) - 1 + delta; \
  90                 else \
  91                     y = maxY - 1 - (y - maxY) - delta; \
  92             } \
  93             while (y >= maxY || y < minY); \
  94     }
  95 #ifdef BORDER_REFLECT
  96 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
  97 #elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)
  98 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
  99 #endif
 100 #else
 101 #error No extrapolation method
 102 #endif
 103 #else
 104 #define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
 105     { \
 106         int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \
 107         _row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \
 108         _row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \
 109         y = _row + ISOLATED_MIN(minY); \
 110         \
 111         _col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \
 112         _col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \
 113         x = _col + ISOLATED_MIN(minX); \
 114     }
 115 #endif
 116
 117 #ifdef DOUBLE_SUPPORT
 118 #ifdef cl_amd_fp64
 119 #pragma OPENCL EXTENSION cl_amd_fp64:enable
 120 #elif defined (cl_khr_fp64)
 121 #pragma OPENCL EXTENSION cl_khr_fp64:enable
 122 #endif
 123 #endif
 124
 125 #if cn != 3
 126 #define loadpix(addr) *(__global const srcT *)(addr)
 127 #define storepix(val, addr)  *(__global dstT *)(addr) = val
 128 #define SRCSIZE (int)sizeof(srcT)
 129 #define DSTSIZE (int)sizeof(dstT)
 130 #else
 131 #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
 132 #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
 133 #define SRCSIZE (int)sizeof(srcT1) * cn
 134 #define DSTSIZE (int)sizeof(dstT1) * cn
 135 #endif
 136
 137 #define noconvert
 138
 139 struct RectCoords
 140 {
 141     int x1, y1, x2, y2;
 142 };
 143
 144 #ifdef BORDER_ISOLATED
 145 inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
 146 {
 147     return coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;
 148 }
 149 #else
 150 inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
 151 {
 152     return coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2;
 153 }
 154 #endif
 155
 156 #define float1 float
 157 #define uchar1 uchar
 158 #define int1 int
 159 #define uint1 unit
 160
 161 #define __CAT(x, y) x##y
 162 #define CAT(x, y) __CAT(x, y)
 163
 164 #define vload1(OFFSET, PTR) (*(PTR + OFFSET))
 165 #define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)
 166 #define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)
 167
 168 #if PX_LOAD_VEC_SIZE == 1
 169 #define PX_LOAD_FLOAT_VEC_CONV (float)
 170 #elif PX_LOAD_VEC_SIZE == 2
 171 #define PX_LOAD_FLOAT_VEC_CONV convert_float2
 172 #elif PX_LOAD_VEC_SIZE == 3
 173 #define PX_LOAD_FLOAT_VEC_CONV convert_float3
 174 #elif PX_LOAD_VEC_SIZE == 4
 175 #define PX_LOAD_FLOAT_VEC_CONV convert_float4
 176 #endif
 177
 178 //#define PX_LOAD_FLOAT_VEC_CONV CAT(convert_, PX_LOAD_FLOAT_VEC_TYPE)
 179 #define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)
 180
 181
 182 inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar * srcptr,
 183                                                 int srcstep, const struct RectCoords srcCoords)
 184 {
 185     __global const srcT1 * ptr = (__global const srcT1 *)
 186                                 (srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE));
 187     return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));
 188 }
 189
 190 // Macros to ensure unrolled loops
 191 #define LOOP1(VAR, STMT) (STMT); (VAR)++;
 192 #define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
 193 #define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
 194 #define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
 195 #define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
 196 #define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
 197 #define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
 198 #define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
 199 #define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
 200 #define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
 201 #define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
 202 #define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
 203 #define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
 204
 205 #define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
 206
 207 #ifdef OP_BOX_FILTER
 208 #define PROCESS_ELEM \
 209     WT total_sum = (WT)(0); \
 210     int sy = 0; \
 211     LOOP(KERNEL_SIZE_Y, sy, \
 212     { \
 213         int sx = 0; \
 214         LOOP(KERNEL_SIZE_X, sx, \
 215         { \
 216             total_sum += privateData[py + sy][px + sx]; \
 217         }); \
 218     })
 219
 220 #elif defined OP_FILTER2D
 221
 222 #define DIG(a) a,
 223 __constant WT1 kernelData[] = { COEFF };
 224
 225 #define PROCESS_ELEM \
 226     WT total_sum = 0; \
 227     int sy = 0; \
 228     int kernelIndex = 0; \
 229     LOOP(KERNEL_SIZE_Y, sy, \
 230     { \
 231         int sx = 0; \
 232         LOOP(KERNEL_SIZE_X, sx, \
 233         { \
 234             total_sum = fma(kernelData[kernelIndex++], privateData[py + sy][px + sx], total_sum); \
 235         }); \
 236     })
 237
 238 #elif defined OP_ERODE || defined OP_DILATE
 239
 240 #ifdef DEPTH_0
 241 #define MIN_VAL 0
 242 #define MAX_VAL UCHAR_MAX
 243 #elif defined DEPTH_1
 244 #define MIN_VAL SCHAR_MIN
 245 #define MAX_VAL SCHAR_MAX
 246 #elif defined DEPTH_2
 247 #define MIN_VAL 0
 248 #define MAX_VAL USHRT_MAX
 249 #elif defined DEPTH_3
 250 #define MIN_VAL SHRT_MIN
 251 #define MAX_VAL SHRT_MAX
 252 #elif defined DEPTH_4
 253 #define MIN_VAL INT_MIN
 254 #define MAX_VAL INT_MAX
 255 #elif defined DEPTH_5
 256 #define MIN_VAL (-FLT_MAX)
 257 #define MAX_VAL FLT_MAX
 258 #elif defined DEPTH_6
 259 #define MIN_VAL (-DBL_MAX)
 260 #define MAX_VAL DBL_MAX
 261 #endif
 262
 263 #ifdef OP_ERODE
 264 #define VAL (WT)MAX_VAL
 265 #elif defined OP_DILATE
 266 #define VAL (WT)MIN_VAL
 267 #else
 268 #error "Unknown operation"
 269 #endif
 270
 271 #define convert_float1 convert_float
 272 #define convert_uchar1 convert_uchar
 273 #define convert_int1 convert_int
 274 #define convert_uint1 convert_uint
 275
 276 #ifdef OP_ERODE
 277 #if defined INTEL_DEVICE && defined DEPTH_0
 278 // workaround for bug in Intel HD graphics drivers (10.18.10.3496 or older)
 279 #define WA_CONVERT_1 CAT(convert_uint, cn)
 280 #define WA_CONVERT_2 CAT(convert_, srcT)
 281 #define MORPH_OP(A, B) ((A) < (B) ? (A) : (B))
 282 #else
 283 #define MORPH_OP(A, B) min((A), (B))
 284 #endif
 285 #endif
 286 #ifdef OP_DILATE
 287 #define MORPH_OP(A, B) max((A), (B))
 288 #endif
 289
 290 #define PROCESS(_y, _x) \
 291     total_sum = convertToWT(MORPH_OP(convertToWT(total_sum), convertToWT(privateData[py + _y][px + _x])));
 292
 293 #define PROCESS_ELEM \
 294     WT total_sum = convertToWT(VAL); \
 295     PROCESS_ELEM_
 296
 297 #else
 298 #error "No processing is specified"
 299 #endif
 300
 301 #if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
 302 #define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset
 303 #else
 304 #define EXTRA_PARAMS
 305 #endif
 306
 307 inline WT getBorderPixel(const struct RectCoords bounds, int2 coord,
 308     __global const uchar * srcptr, int srcstep)
 309 {
 310 #ifdef BORDER_CONSTANT
 311 #ifdef OP_ERODE
 312     return (WT)(MAX_VAL);
 313 #elif defined OP_DILATE
 314     return (WT)(MIN_VAL);
 315 #else
 316     return (WT)(0);
 317 #endif
 318 #else
 319
 320     int selected_col = coord.x;
 321     int selected_row = coord.y;
 322
 323     EXTRAPOLATE(selected_col, selected_row,
 324         bounds.x1, bounds.y1,
 325         bounds.x2, bounds.y2);
 326
 327     __global const uchar* ptr = srcptr + mad24(selected_row, srcstep, selected_col * SRCSIZE);
 328     return convertToWT(loadpix(ptr));
 329 #endif
 330 }
 331
 332 inline WT readSrcPixelSingle(int2 pos, __global const uchar * srcptr,
 333     int srcstep, const struct RectCoords srcCoords)
 334 {
 335     if (!isBorder(srcCoords, pos, 1))
 336     {
 337         __global const uchar * ptr = srcptr + mad24(pos.y, srcstep, pos.x * SRCSIZE);
 338         return convertToWT(loadpix(ptr));
 339     }
 340     else
 341         return getBorderPixel(srcCoords, pos, srcptr, srcstep);
 342 }
 343
 344
 345 __kernel void filterSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
 346                           __global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols
 347 #ifdef NORMALIZE
 348                           , float alpha
 349 #endif
 350                           EXTRA_PARAMS )
 351 {
 352     // for non-isolated border: offsetX, offsetY, wholeX, wholeY
 353     const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY };
 354
 355     const int startX = get_global_id(0) * PX_PER_WI_X;
 356     const int startY = get_global_id(1) * PX_PER_WI_Y;
 357
 358     if (startX >= cols || startY >= rows)
 359         return;
 360
 361     WT privateData[PX_PER_WI_Y + KERNEL_SIZE_Y - 1][PRIV_DATA_WIDTH];
 362
 363     // Load all of the pixels needed for the calculation
 364     int py = 0;
 365     LOOP(PX_LOAD_Y_ITERATIONS, py,
 366     {
 367         int y = startY + py;
 368         int px = 0;
 369         LOOP(PX_LOAD_X_ITERATIONS, px,
 370         {
 371             int x = startX + (px * PX_LOAD_NUM_PX);
 372             int2 srcPos = (int2)(srcCoords.x1 + x - ANCHOR_X, srcCoords.y1 + y - ANCHOR_Y);
 373
 374             if (!isBorder(srcCoords, srcPos, PX_LOAD_NUM_PX))
 375             {
 376                 PX_LOAD_FLOAT_VEC_TYPE p = readSrcPixelGroup(srcPos, srcptr, src_step, srcCoords);
 377 #ifdef SQR
 378                 *((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p * p;
 379 #else
 380                 *((PX_LOAD_FLOAT_VEC_TYPE *)&privateData[py][px * PX_LOAD_NUM_PX]) = p;
 381 #endif
 382             }
 383             else
 384             {
 385                 int lx = 0;
 386                 LOOP(PX_LOAD_NUM_PX, lx,
 387                 {
 388                     WT p = readSrcPixelSingle(srcPos, srcptr, src_step, srcCoords);
 389 #ifdef SQR
 390                     *((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p * p;
 391 #else
 392                     *((WT*)&privateData[py][px * PX_LOAD_NUM_PX + lx]) = p;
 393 #endif
 394                     srcPos.x++;
 395                 });
 396             }
 397         });
 398     });
 399
 400     // Use the stored pixels to compute the results
 401     py = 0;
 402     LOOP(PX_PER_WI_Y, py,
 403     {
 404         int y = startY + py;
 405         int px = 0;
 406         LOOP(PX_PER_WI_X, px,
 407         {
 408             int x = startX + px;
 409             PROCESS_ELEM;
 410             int dst_index = mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset));
 411             __global dstT * dstPtr = (__global dstT *)(dstptr + dst_index);
 412 #ifdef NORMALIZE
 413             total_sum *= (WT)(alpha);
 414 #endif
 415 #if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
 416             //for this type of operations SRCSIZE == DSTSIZE
 417             int mat_index = mad24(y, mat_step, mad24(x, SRCSIZE, mat_offset));
 418             WT value = convertToWT(loadpix(matptr + mat_index));
 419
 420 #ifdef OP_GRADIENT
 421             storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );
 422 #elif defined OP_TOPHAT
 423             storepix(convertToDstT(convertToWT(value) - convertToWT(total_sum)), dstPtr );
 424 #elif defined OP_BLACKHAT
 425             storepix(convertToDstT(convertToWT(total_sum) - convertToWT(value)), dstPtr );
 426 #endif
 427 #else // erode or dilate, or open-close
 428             storepix(convertToDstT(total_sum), dstPtr);
 429 #endif
 430         });
 431     });
 432 }