modules/core/src/opencl/reduce.cl

   1 ////////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // @Authors
  18 //    Shengen Yan,yanshengen@gmail.com
  19 //
  20 // Redistribution and use in source and binary forms, with or without modification,
  21 // are permitted provided that the following conditions are met:
  22 //
  23 //   * Redistribution's of source code must retain the above copyright notice,
  24 //     this list of conditions and the following disclaimer.
  25 //
  26 //   * Redistribution's in binary form must reproduce the above copyright notice,
  27 //     this list of conditions and the following disclaimer in the documentation
  28 //     and/or other materials provided with the distribution.
  29 //
  30 //   * The name of the copyright holders may not be used to endorse or promote products
  31 //     derived from this software without specific prior written permission.
  32 //
  33 // This software is provided by the copyright holders and contributors as is and
  34 // any express or implied warranties, including, but not limited to, the implied
  35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  36 // In no event shall the Intel Corporation or contributors be liable for any direct,
  37 // indirect, incidental, special, exemplary, or consequential damages
  38 // (including, but not limited to, procurement of substitute goods or services;
  39 // loss of use, data, or profits; or business interruption) however caused
  40 // and on any theory of liability, whether in contract, strict liability,
  41 // or tort (including negligence or otherwise) arising in any way out of
  42 // the use of this software, even if advised of the possibility of such damage.
  43 //
  44
  45 #ifdef DOUBLE_SUPPORT
  46 #ifdef cl_amd_fp64
  47 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  48 #elif defined (cl_khr_fp64)
  49 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  50 #endif
  51 #endif
  52
  53 #if defined OP_NORM_INF_MASK
  54
  55 #ifdef DEPTH_0
  56 #define MIN_VAL 0
  57 #define MAX_VAL 255
  58 #elif defined DEPTH_1
  59 #define MIN_VAL -128
  60 #define MAX_VAL 127
  61 #elif defined DEPTH_2
  62 #define MIN_VAL 0
  63 #define MAX_VAL 65535
  64 #elif defined DEPTH_3
  65 #define MIN_VAL -32768
  66 #define MAX_VAL 32767
  67 #elif defined DEPTH_4
  68 #define MIN_VAL INT_MIN
  69 #define MAX_VAL INT_MAX
  70 #elif defined DEPTH_5
  71 #define MIN_VAL (-FLT_MAX)
  72 #define MAX_VAL FLT_MAX
  73 #elif defined DEPTH_6
  74 #define MIN_VAL (-DBL_MAX)
  75 #define MAX_VAL DBL_MAX
  76 #endif
  77
  78 #define dstT srcT
  79 #define dstT1 srcT1
  80
  81 #endif // min/max stuff
  82
  83 #define noconvert
  84
  85 #ifndef kercn
  86 #define kercn 1
  87 #endif
  88
  89 #ifdef HAVE_MASK_CONT
  90 #define MASK_INDEX int mask_index = id + mask_offset;
  91 #else
  92 #define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols))
  93 #endif
  94
  95 #if cn != 3
  96 #define loadpix(addr) *(__global const srcT *)(addr)
  97 #define storepix(val, addr)  *(__global dstT *)(addr) = val
  98 #if kercn == 1
  99 #define srcTSIZE (int)sizeof(srcT)
 100 #else
 101 #define srcTSIZE (int)sizeof(srcT1)
 102 #endif
 103 #define dstTSIZE (int)sizeof(dstT)
 104 #else
 105 #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
 106 #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
 107 #define srcTSIZE ((int)sizeof(srcT1)*3)
 108 #define dstTSIZE ((int)sizeof(dstT1)*3)
 109 #endif
 110
 111 #ifdef HAVE_MASK
 112 #ifdef HAVE_SRC2
 113 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset
 114 #else
 115 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
 116 #endif
 117 #else
 118 #ifdef HAVE_SRC2
 119 #define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset
 120 #else
 121 #define EXTRA_PARAMS
 122 #endif
 123 #endif
 124
 125 // accumulative reduction stuff
 126 #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT
 127
 128 #ifdef OP_DOT
 129 #if ddepth <= 4
 130 #define FUNC(a, b, c) a = mad24(b, c, a)
 131 #else
 132 #define FUNC(a, b, c) a = mad(b, c, a)
 133 #endif
 134
 135 #elif defined OP_SUM
 136 #define FUNC(a, b) a += b
 137
 138 #elif defined OP_SUM_ABS
 139 #define FUNC(a, b) a += b >= (dstT)(0) ? b : -b
 140
 141 #elif defined OP_SUM_SQR
 142 #if ddepth <= 4
 143 #define FUNC(a, b) a = mad24(b, b, a)
 144 #else
 145 #define FUNC(a, b) a = mad(b, b, a)
 146 #endif
 147 #endif
 148
 149 #ifdef OP_CALC2
 150 #define DECLARE_LOCAL_MEM \
 151     __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED]
 152 #define DEFINE_ACCUMULATOR \
 153     dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0)
 154 #else
 155 #define DECLARE_LOCAL_MEM \
 156     __local dstT localmem[WGS2_ALIGNED]
 157 #define DEFINE_ACCUMULATOR \
 158     dstT accumulator = (dstT)(0)
 159 #endif
 160
 161 #ifdef HAVE_SRC2
 162 #ifdef OP_CALC2
 163 #define PROCESS_ELEMS \
 164     dstT temp = convertToDT(loadpix(srcptr + src_index)); \
 165     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 166     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 167     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 168     FUNC(accumulator2, temp2); \
 169     FUNC(accumulator, temp)
 170 #else
 171 #define PROCESS_ELEMS \
 172     dstT temp = convertToDT(loadpix(srcptr + src_index)); \
 173     dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 174     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 175     FUNC(accumulator, temp)
 176 #endif
 177 #else
 178 #define PROCESS_ELEMS \
 179     dstT temp = convertToDT(loadpix(srcptr + src_index)); \
 180     FUNC(accumulator, temp)
 181 #endif
 182
 183 #ifdef HAVE_MASK
 184 #define REDUCE_GLOBAL \
 185     MASK_INDEX; \
 186     if (mask[mask_index]) \
 187     { \
 188         PROCESS_ELEMS; \
 189     }
 190 #elif defined OP_DOT
 191
 192 #ifdef HAVE_SRC2_CONT
 193 #define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset);
 194 #else
 195 #define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))
 196 #endif
 197
 198 #if kercn == 1
 199 #define REDUCE_GLOBAL \
 200     SRC2_INDEX; \
 201     dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 202     FUNC(accumulator, temp, temp2)
 203 #elif kercn == 2
 204 #define REDUCE_GLOBAL \
 205     SRC2_INDEX; \
 206     dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 207     FUNC(accumulator, temp.s0, temp2.s0); \
 208     FUNC(accumulator, temp.s1, temp2.s1)
 209 #elif kercn == 4
 210 #define REDUCE_GLOBAL \
 211     SRC2_INDEX; \
 212     dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 213     FUNC(accumulator, temp.s0, temp2.s0); \
 214     FUNC(accumulator, temp.s1, temp2.s1); \
 215     FUNC(accumulator, temp.s2, temp2.s2); \
 216     FUNC(accumulator, temp.s3, temp2.s3)
 217 #elif kercn == 8
 218 #define REDUCE_GLOBAL \
 219     SRC2_INDEX; \
 220     dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 221     FUNC(accumulator, temp.s0, temp2.s0); \
 222     FUNC(accumulator, temp.s1, temp2.s1); \
 223     FUNC(accumulator, temp.s2, temp2.s2); \
 224     FUNC(accumulator, temp.s3, temp2.s3); \
 225     FUNC(accumulator, temp.s4, temp2.s4); \
 226     FUNC(accumulator, temp.s5, temp2.s5); \
 227     FUNC(accumulator, temp.s6, temp2.s6); \
 228     FUNC(accumulator, temp.s7, temp2.s7)
 229 #elif kercn == 16
 230 #define REDUCE_GLOBAL \
 231     SRC2_INDEX; \
 232     dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 233     FUNC(accumulator, temp.s0, temp2.s0); \
 234     FUNC(accumulator, temp.s1, temp2.s1); \
 235     FUNC(accumulator, temp.s2, temp2.s2); \
 236     FUNC(accumulator, temp.s3, temp2.s3); \
 237     FUNC(accumulator, temp.s4, temp2.s4); \
 238     FUNC(accumulator, temp.s5, temp2.s5); \
 239     FUNC(accumulator, temp.s6, temp2.s6); \
 240     FUNC(accumulator, temp.s7, temp2.s7); \
 241     FUNC(accumulator, temp.s8, temp2.s8); \
 242     FUNC(accumulator, temp.s9, temp2.s9); \
 243     FUNC(accumulator, temp.sA, temp2.sA); \
 244     FUNC(accumulator, temp.sB, temp2.sB); \
 245     FUNC(accumulator, temp.sC, temp2.sC); \
 246     FUNC(accumulator, temp.sD, temp2.sD); \
 247     FUNC(accumulator, temp.sE, temp2.sE); \
 248     FUNC(accumulator, temp.sF, temp2.sF)
 249 #endif
 250
 251 #else // sum or norm with 2 args
 252 #ifdef HAVE_SRC2
 253 #ifdef OP_CALC2 // norm relative
 254 #if kercn == 1
 255 #define REDUCE_GLOBAL \
 256     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 257     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 258     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 259     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 260     FUNC(accumulator, temp); \
 261     FUNC(accumulator2, temp2)
 262 #elif kercn == 2
 263 #define REDUCE_GLOBAL \
 264     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 265     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 266     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 267     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 268     FUNC(accumulator, temp.s0); \
 269     FUNC(accumulator, temp.s1); \
 270     FUNC(accumulator2, temp2.s0); \
 271     FUNC(accumulator2, temp2.s1)
 272 #elif kercn == 4
 273 #define REDUCE_GLOBAL \
 274     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 275     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 276     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 277     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 278     FUNC(accumulator, temp.s0); \
 279     FUNC(accumulator, temp.s1); \
 280     FUNC(accumulator, temp.s2); \
 281     FUNC(accumulator, temp.s3); \
 282     FUNC(accumulator2, temp2.s0); \
 283     FUNC(accumulator2, temp2.s1); \
 284     FUNC(accumulator2, temp2.s2); \
 285     FUNC(accumulator2, temp2.s3)
 286 #elif kercn == 8
 287 #define REDUCE_GLOBAL \
 288     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 289     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 290     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 291     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 292     FUNC(accumulator, temp.s0); \
 293     FUNC(accumulator, temp.s1); \
 294     FUNC(accumulator, temp.s2); \
 295     FUNC(accumulator, temp.s3); \
 296     FUNC(accumulator, temp.s4); \
 297     FUNC(accumulator, temp.s5); \
 298     FUNC(accumulator, temp.s6); \
 299     FUNC(accumulator, temp.s7); \
 300     FUNC(accumulator2, temp2.s0); \
 301     FUNC(accumulator2, temp2.s1); \
 302     FUNC(accumulator2, temp2.s2); \
 303     FUNC(accumulator2, temp2.s3); \
 304     FUNC(accumulator2, temp2.s4); \
 305     FUNC(accumulator2, temp2.s5); \
 306     FUNC(accumulator2, temp2.s6); \
 307     FUNC(accumulator2, temp2.s7)
 308 #elif kercn == 16
 309 #define REDUCE_GLOBAL \
 310     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 311     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 312     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 313     temp2 = temp2 >= (dstT)(0) ? temp2 : -temp2; \
 314     FUNC(accumulator, temp.s0); \
 315     FUNC(accumulator, temp.s1); \
 316     FUNC(accumulator, temp.s2); \
 317     FUNC(accumulator, temp.s3); \
 318     FUNC(accumulator, temp.s4); \
 319     FUNC(accumulator, temp.s5); \
 320     FUNC(accumulator, temp.s6); \
 321     FUNC(accumulator, temp.s7); \
 322     FUNC(accumulator, temp.s8); \
 323     FUNC(accumulator, temp.s9); \
 324     FUNC(accumulator, temp.sA); \
 325     FUNC(accumulator, temp.sB); \
 326     FUNC(accumulator, temp.sC); \
 327     FUNC(accumulator, temp.sD); \
 328     FUNC(accumulator, temp.sE); \
 329     FUNC(accumulator, temp.sF); \
 330     FUNC(accumulator2, temp2.s0); \
 331     FUNC(accumulator2, temp2.s1); \
 332     FUNC(accumulator2, temp2.s2); \
 333     FUNC(accumulator2, temp2.s3); \
 334     FUNC(accumulator2, temp2.s4); \
 335     FUNC(accumulator2, temp2.s5); \
 336     FUNC(accumulator2, temp2.s6); \
 337     FUNC(accumulator2, temp2.s7); \
 338     FUNC(accumulator2, temp2.s8); \
 339     FUNC(accumulator2, temp2.s9); \
 340     FUNC(accumulator2, temp2.sA); \
 341     FUNC(accumulator2, temp2.sB); \
 342     FUNC(accumulator2, temp2.sC); \
 343     FUNC(accumulator2, temp2.sD); \
 344     FUNC(accumulator2, temp2.sE); \
 345     FUNC(accumulator2, temp2.sF)
 346 #endif
 347 #else // norm with 2 args
 348 #if kercn == 1
 349 #define REDUCE_GLOBAL \
 350     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 351     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 352     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 353     FUNC(accumulator, temp)
 354 #elif kercn == 2
 355 #define REDUCE_GLOBAL \
 356     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 357     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 358     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 359     FUNC(accumulator, temp.s0); \
 360     FUNC(accumulator, temp.s1)
 361 #elif kercn == 4
 362 #define REDUCE_GLOBAL \
 363     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 364     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 365     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 366     FUNC(accumulator, temp.s0); \
 367     FUNC(accumulator, temp.s1); \
 368     FUNC(accumulator, temp.s2); \
 369     FUNC(accumulator, temp.s3)
 370 #elif kercn == 8
 371 #define REDUCE_GLOBAL \
 372     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 373     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 374     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 375     FUNC(accumulator, temp.s0); \
 376     FUNC(accumulator, temp.s1); \
 377     FUNC(accumulator, temp.s2); \
 378     FUNC(accumulator, temp.s3); \
 379     FUNC(accumulator, temp.s4); \
 380     FUNC(accumulator, temp.s5); \
 381     FUNC(accumulator, temp.s6); \
 382     FUNC(accumulator, temp.s7)
 383 #elif kercn == 16
 384 #define REDUCE_GLOBAL \
 385     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 386     dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
 387     temp = temp > temp2 ? temp - temp2 : (temp2 - temp); \
 388     FUNC(accumulator, temp.s0); \
 389     FUNC(accumulator, temp.s1); \
 390     FUNC(accumulator, temp.s2); \
 391     FUNC(accumulator, temp.s3); \
 392     FUNC(accumulator, temp.s4); \
 393     FUNC(accumulator, temp.s5); \
 394     FUNC(accumulator, temp.s6); \
 395     FUNC(accumulator, temp.s7); \
 396     FUNC(accumulator, temp.s8); \
 397     FUNC(accumulator, temp.s9); \
 398     FUNC(accumulator, temp.sA); \
 399     FUNC(accumulator, temp.sB); \
 400     FUNC(accumulator, temp.sC); \
 401     FUNC(accumulator, temp.sD); \
 402     FUNC(accumulator, temp.sE); \
 403     FUNC(accumulator, temp.sF)
 404 #endif
 405 #endif
 406
 407 #else // sum
 408 #if kercn == 1
 409 #define REDUCE_GLOBAL \
 410     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 411     FUNC(accumulator, temp)
 412 #elif kercn == 2
 413 #define REDUCE_GLOBAL \
 414     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 415     FUNC(accumulator, temp.s0); \
 416     FUNC(accumulator, temp.s1)
 417 #elif kercn == 4
 418 #define REDUCE_GLOBAL \
 419     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 420     FUNC(accumulator, temp.s0); \
 421     FUNC(accumulator, temp.s1); \
 422     FUNC(accumulator, temp.s2); \
 423     FUNC(accumulator, temp.s3)
 424 #elif kercn == 8
 425 #define REDUCE_GLOBAL \
 426     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 427     FUNC(accumulator, temp.s0); \
 428     FUNC(accumulator, temp.s1); \
 429     FUNC(accumulator, temp.s2); \
 430     FUNC(accumulator, temp.s3); \
 431     FUNC(accumulator, temp.s4); \
 432     FUNC(accumulator, temp.s5); \
 433     FUNC(accumulator, temp.s6); \
 434     FUNC(accumulator, temp.s7)
 435 #elif kercn == 16
 436 #define REDUCE_GLOBAL \
 437     dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
 438     FUNC(accumulator, temp.s0); \
 439     FUNC(accumulator, temp.s1); \
 440     FUNC(accumulator, temp.s2); \
 441     FUNC(accumulator, temp.s3); \
 442     FUNC(accumulator, temp.s4); \
 443     FUNC(accumulator, temp.s5); \
 444     FUNC(accumulator, temp.s6); \
 445     FUNC(accumulator, temp.s7); \
 446     FUNC(accumulator, temp.s8); \
 447     FUNC(accumulator, temp.s9); \
 448     FUNC(accumulator, temp.sA); \
 449     FUNC(accumulator, temp.sB); \
 450     FUNC(accumulator, temp.sC); \
 451     FUNC(accumulator, temp.sD); \
 452     FUNC(accumulator, temp.sE); \
 453     FUNC(accumulator, temp.sF)
 454 #endif
 455 #endif
 456 #endif
 457
 458 #ifdef OP_CALC2
 459 #define SET_LOCAL_1 \
 460     localmem[lid] = accumulator; \
 461     localmem2[lid] = accumulator2
 462 #define REDUCE_LOCAL_1 \
 463     localmem[lid - WGS2_ALIGNED] += accumulator; \
 464     localmem2[lid - WGS2_ALIGNED] += accumulator2
 465 #define REDUCE_LOCAL_2 \
 466     localmem[lid] += localmem[lid2]; \
 467     localmem2[lid] += localmem2[lid2]
 468 #define CALC_RESULT \
 469     storepix(localmem[0], dstptr + dstTSIZE * gid); \
 470     storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid))
 471 #else
 472 #define SET_LOCAL_1 \
 473     localmem[lid] = accumulator
 474 #define REDUCE_LOCAL_1 \
 475     localmem[lid - WGS2_ALIGNED] += accumulator
 476 #define REDUCE_LOCAL_2 \
 477     localmem[lid] += localmem[lid2]
 478 #define CALC_RESULT \
 479     storepix(localmem[0], dstptr + dstTSIZE * gid)
 480 #endif
 481
 482 // countNonZero stuff
 483 #elif defined OP_COUNT_NON_ZERO
 484 #define dstT int
 485 #define DECLARE_LOCAL_MEM \
 486     __local dstT localmem[WGS2_ALIGNED]
 487 #define DEFINE_ACCUMULATOR \
 488     dstT accumulator = (dstT)(0); \
 489     srcT1 zero = (srcT1)(0), one = (srcT1)(1)
 490 #if kercn == 1
 491 #define REDUCE_GLOBAL \
 492     accumulator += loadpix(srcptr + src_index) == zero ? zero : one
 493 #elif kercn == 2
 494 #define REDUCE_GLOBAL \
 495     srcT value = loadpix(srcptr + src_index); \
 496     accumulator += value.s0 == zero ? zero : one; \
 497     accumulator += value.s1 == zero ? zero : one
 498 #elif kercn == 4
 499 #define REDUCE_GLOBAL \
 500     srcT value = loadpix(srcptr + src_index); \
 501     accumulator += value.s0 == zero ? zero : one; \
 502     accumulator += value.s1 == zero ? zero : one; \
 503     accumulator += value.s2 == zero ? zero : one; \
 504     accumulator += value.s3 == zero ? zero : one
 505 #elif kercn == 8
 506 #define REDUCE_GLOBAL \
 507     srcT value = loadpix(srcptr + src_index); \
 508     accumulator += value.s0 == zero ? zero : one; \
 509     accumulator += value.s1 == zero ? zero : one; \
 510     accumulator += value.s2 == zero ? zero : one; \
 511     accumulator += value.s3 == zero ? zero : one; \
 512     accumulator += value.s4 == zero ? zero : one; \
 513     accumulator += value.s5 == zero ? zero : one; \
 514     accumulator += value.s6 == zero ? zero : one; \
 515     accumulator += value.s7 == zero ? zero : one
 516 #elif kercn == 16
 517 #define REDUCE_GLOBAL \
 518     srcT value = loadpix(srcptr + src_index); \
 519     accumulator += value.s0 == zero ? zero : one; \
 520     accumulator += value.s1 == zero ? zero : one; \
 521     accumulator += value.s2 == zero ? zero : one; \
 522     accumulator += value.s3 == zero ? zero : one; \
 523     accumulator += value.s4 == zero ? zero : one; \
 524     accumulator += value.s5 == zero ? zero : one; \
 525     accumulator += value.s6 == zero ? zero : one; \
 526     accumulator += value.s7 == zero ? zero : one; \
 527     accumulator += value.s8 == zero ? zero : one; \
 528     accumulator += value.s9 == zero ? zero : one; \
 529     accumulator += value.sA == zero ? zero : one; \
 530     accumulator += value.sB == zero ? zero : one; \
 531     accumulator += value.sC == zero ? zero : one; \
 532     accumulator += value.sD == zero ? zero : one; \
 533     accumulator += value.sE == zero ? zero : one; \
 534     accumulator += value.sF == zero ? zero : one
 535 #endif
 536
 537 #define SET_LOCAL_1 \
 538     localmem[lid] = accumulator
 539 #define REDUCE_LOCAL_1 \
 540     localmem[lid - WGS2_ALIGNED] += accumulator
 541 #define REDUCE_LOCAL_2 \
 542     localmem[lid] += localmem[lid2]
 543 #define CALC_RESULT \
 544     storepix(localmem[0], dstptr + dstTSIZE * gid)
 545
 546 // norm (NORM_INF) with cn > 1 and mask
 547 #elif defined OP_NORM_INF_MASK
 548
 549 #define DECLARE_LOCAL_MEM \
 550     __local srcT localmem_max[WGS2_ALIGNED]
 551 #define DEFINE_ACCUMULATOR \
 552     srcT maxval = MIN_VAL, temp
 553 #define REDUCE_GLOBAL \
 554     MASK_INDEX; \
 555     if (mask[mask_index]) \
 556     { \
 557         temp = loadpix(srcptr + src_index); \
 558         maxval = max(maxval, (srcT)(temp >= (srcT)(0) ? temp : -temp)); \
 559     }
 560 #define SET_LOCAL_1 \
 561     localmem_max[lid] = maxval
 562 #define REDUCE_LOCAL_1 \
 563     localmem_max[lid - WGS2_ALIGNED] = max(maxval, localmem_max[lid - WGS2_ALIGNED])
 564 #define REDUCE_LOCAL_2 \
 565     localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2])
 566 #define CALC_RESULT \
 567     storepix(localmem_max[0], dstptr + dstTSIZE * gid)
 568
 569 #else
 570 #error "No operation"
 571 #endif // end of norm (NORM_INF) with cn > 1 and mask
 572
 573 #ifdef OP_DOT
 574 #undef EXTRA_PARAMS
 575 #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset
 576 #endif
 577
 578 __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int cols,
 579                      int total, int groupnum, __global uchar * dstptr EXTRA_PARAMS)
 580 {
 581     int lid = get_local_id(0);
 582     int gid = get_group_id(0);
 583     int  id = get_global_id(0) * kercn;
 584
 585     srcptr += src_offset;
 586 #ifdef HAVE_SRC2
 587     src2ptr += src2_offset;
 588 #endif
 589
 590     DECLARE_LOCAL_MEM;
 591     DEFINE_ACCUMULATOR;
 592
 593     for (int grain = groupnum * WGS * kercn; id < total; id += grain)
 594     {
 595 #ifdef HAVE_SRC_CONT
 596         int src_index = mul24(id, srcTSIZE);
 597 #else
 598         int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
 599 #endif
 600 #ifdef HAVE_SRC2
 601 #ifdef HAVE_SRC2_CONT
 602         int src2_index = mul24(id, srcTSIZE);
 603 #else
 604         int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE));
 605 #endif
 606 #endif
 607         REDUCE_GLOBAL;
 608     }
 609
 610     if (lid < WGS2_ALIGNED)
 611     {
 612         SET_LOCAL_1;
 613     }
 614     barrier(CLK_LOCAL_MEM_FENCE);
 615
 616     if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
 617     {
 618         REDUCE_LOCAL_1;
 619     }
 620     barrier(CLK_LOCAL_MEM_FENCE);
 621
 622     for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
 623     {
 624         if (lid < lsize)
 625         {
 626            int lid2 = lsize + lid;
 627            REDUCE_LOCAL_2;
 628         }
 629         barrier(CLK_LOCAL_MEM_FENCE);
 630     }
 631
 632     if (lid == 0)
 633     {
 634         CALC_RESULT;
 635     }
 636 }