modules/core/src/opencl/arithm.cl

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
  14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Third party copyrights are property of their respective owners.
  17 //
  18 // @Authors
  19 //    Jia Haipeng, jiahaipeng95@gmail.com
  20 //
  21 //
  22 // Redistribution and use in source and binary forms, with or without modification,
  23 // are permitted provided that the following conditions are met:
  24 //
  25 //   * Redistribution's of source code must retain the above copyright notice,
  26 //     this list of conditions and the following disclaimer.
  27 //
  28 //   * Redistribution's in binary form must reproduce the above copyright notice,
  29 //     this list of conditions and the following disclaimer in the documentation
  30 //     and/or other materials provided with the distribution.
  31 //
  32 //   * The name of the copyright holders may not be used to endorse or promote products
  33 //     derived from this software without specific prior written permission.
  34 //
  35 // This software is provided by the copyright holders and contributors as is and
  36 // any express or implied warranties, including, but not limited to, the implied
  37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  38 // In no event shall the copyright holders or contributors be liable for any direct,
  39 // indirect, incidental, special, exemplary, or consequential damages
  40 // (including, but not limited to, procurement of substitute goods or services;
  41 // loss of use, data, or profits; or business interruption) however caused
  42 // and on any theory of liability, whether in contract, strict liability,
  43 // or tort (including negligence or otherwise) arising in any way out of
  44 // the use of this software, even if advised of the possibility of such damage.
  45 //
  46 //M*/
  47
  48 /*
  49   Usage:
  50      after compiling this program user gets a single kernel called KF.
  51      the following flags should be passed:
  52      1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
  53      2) the actual operation performed, one of "-D OP_...", see below the list of operations.
  54      2a) "-D dstDepth=<destination depth> [-D cn=<num channels]"
  55          for some operations, like min/max/and/or/xor it's enough
  56      2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
  57           -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
  58 */
  59
  60 #ifdef DOUBLE_SUPPORT
  61 #ifdef cl_amd_fp64
  62 #pragma OPENCL EXTENSION cl_amd_fp64:enable
  63 #elif defined cl_khr_fp64
  64 #pragma OPENCL EXTENSION cl_khr_fp64:enable
  65 #endif
  66 #endif
  67
  68 #ifdef INTEL_DEVICE
  69 #pragma OPENCL FP_CONTRACT : on
  70 #pragma OPENCL FP_FAST_FMAF : on
  71 #pragma OPENCL FP_FAST_FMA : on
  72 #endif
  73
  74 #if depth <= 5
  75 #define CV_PI M_PI_F
  76 #else
  77 #define CV_PI M_PI
  78 #endif
  79
  80 #ifndef cn
  81 #define cn 1
  82 #endif
  83
  84 #if cn == 1
  85 #undef srcT1_C1
  86 #undef srcT2_C1
  87 #undef dstT_C1
  88 #define srcT1_C1 srcT1
  89 #define srcT2_C1 srcT2
  90 #define dstT_C1 dstT
  91 #endif
  92
  93 #if cn != 3
  94     #define storedst(val) *(__global dstT *)(dstptr + dst_index) = val
  95     #define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val
  96 #else
  97     #define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index))
  98     #define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2))
  99 #endif
 100
 101 #define noconvert
 102
 103 #ifndef workT
 104
 105     #ifndef srcT1
 106     #define srcT1 dstT
 107     #endif
 108
 109     #ifndef srcT1_C1
 110     #define srcT1_C1 dstT_C1
 111     #endif
 112
 113     #ifndef srcT2
 114     #define srcT2 dstT
 115     #endif
 116
 117     #ifndef srcT2_C1
 118     #define srcT2_C1 dstT_C1
 119     #endif
 120
 121     #define workT dstT
 122     #if cn != 3
 123         #define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index)
 124         #define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index)
 125     #else
 126         #define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))
 127         #define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))
 128     #endif
 129     #ifndef convertToDT
 130     #define convertToDT noconvert
 131     #endif
 132
 133 #else
 134
 135     #ifndef convertToWT2
 136     #define convertToWT2 convertToWT1
 137     #endif
 138     #if cn != 3
 139         #define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index))
 140         #define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index))
 141     #else
 142         #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)))
 143         #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)))
 144     #endif
 145
 146 #endif
 147
 148 #ifndef workST
 149 #define workST workT
 150 #endif
 151
 152 #define EXTRA_PARAMS
 153 #define EXTRA_INDEX
 154 #define EXTRA_INDEX_ADD
 155
 156 #if defined OP_ADD
 157 #define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
 158
 159 #elif defined OP_SUB
 160 #define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))
 161
 162 #elif defined OP_RSUB
 163 #define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))
 164
 165 #elif defined OP_ABSDIFF
 166 #define PROCESS_ELEM \
 167     workT v = srcelem1 - srcelem2; \
 168     storedst(convertToDT(v >= (workT)(0) ? v : -v))
 169
 170 #elif defined OP_AND
 171 #define PROCESS_ELEM storedst(srcelem1 & srcelem2)
 172
 173 #elif defined OP_OR
 174 #define PROCESS_ELEM storedst(srcelem1 | srcelem2)
 175
 176 #elif defined OP_XOR
 177 #define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)
 178
 179 #elif defined OP_NOT
 180 #define PROCESS_ELEM storedst(~srcelem1)
 181
 182 #elif defined OP_MIN
 183 #define PROCESS_ELEM storedst(min(srcelem1, srcelem2))
 184
 185 #elif defined OP_MAX
 186 #define PROCESS_ELEM storedst(max(srcelem1, srcelem2))
 187
 188 #elif defined OP_MUL
 189 #define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))
 190
 191 #elif defined OP_MUL_SCALE
 192 #undef EXTRA_PARAMS
 193 #ifdef UNARY_OP
 194 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
 195 #undef srcelem2
 196 #define srcelem2 srcelem2_
 197 #else
 198 #define EXTRA_PARAMS , scaleT scale
 199 #endif
 200 #define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))
 201
 202 #elif defined OP_DIV
 203 #define PROCESS_ELEM \
 204         workT e2 = srcelem2, zero = (workT)(0); \
 205         storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))
 206
 207 #elif defined OP_DIV_SCALE
 208 #undef EXTRA_PARAMS
 209 #ifdef UNARY_OP
 210 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
 211 #undef srcelem2
 212 #define srcelem2 srcelem2_
 213 #else
 214 #define EXTRA_PARAMS , scaleT scale
 215 #endif
 216 #define PROCESS_ELEM \
 217         workT e2 = srcelem2, zero = (workT)(0); \
 218         storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))
 219
 220 #elif defined OP_RDIV_SCALE
 221 #undef EXTRA_PARAMS
 222 #ifdef UNARY_OP
 223 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
 224 #undef srcelem2
 225 #define srcelem2 srcelem2_
 226 #else
 227 #define EXTRA_PARAMS , scaleT scale
 228 #endif
 229 #define PROCESS_ELEM \
 230         workT e1 = srcelem1, zero = (workT)(0); \
 231         storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))
 232
 233 #elif defined OP_RECIP_SCALE
 234 #undef EXTRA_PARAMS
 235 #define EXTRA_PARAMS , scaleT scale
 236 #define PROCESS_ELEM \
 237         workT e1 = srcelem1, zero = (workT)(0); \
 238         storedst(convertToDT(e1 != zero ? scale / e1 : zero))
 239
 240 #elif defined OP_ADDW
 241 #undef EXTRA_PARAMS
 242 #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma
 243 #if wdepth <= 4
 244 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))
 245 #else
 246 #define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, mad(srcelem2, beta, gamma))))
 247 #endif
 248
 249 #elif defined OP_MAG
 250 #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
 251
 252 #elif defined OP_ABS_NOSAT
 253 #define PROCESS_ELEM \
 254     dstT v = convertToDT(srcelem1); \
 255     storedst(v >= 0 ? v : -v)
 256
 257 #elif defined OP_PHASE_RADIANS
 258 #define PROCESS_ELEM \
 259         workT tmp = atan2(srcelem2, srcelem1); \
 260         if (tmp < 0) \
 261             tmp += 6.283185307179586232f; \
 262         storedst(tmp)
 263
 264 #elif defined OP_PHASE_DEGREES
 265     #define PROCESS_ELEM \
 266     workT tmp = degrees(atan2(srcelem2, srcelem1)); \
 267     if (tmp < 0) \
 268         tmp += 360; \
 269     storedst(tmp)
 270
 271 #elif defined OP_EXP
 272 #if wdepth == 5
 273 #define PROCESS_ELEM storedst(native_exp(srcelem1))
 274 #else
 275 #define PROCESS_ELEM storedst(exp(srcelem1))
 276 #endif
 277
 278 #elif defined OP_POW
 279 #define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
 280
 281 #elif defined OP_POWN
 282 #undef workT
 283 #define workT int
 284 #define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
 285
 286 #elif defined OP_SQRT
 287 #define PROCESS_ELEM storedst(native_sqrt(srcelem1))
 288
 289 #elif defined OP_LOG
 290 #define PROCESS_ELEM \
 291     dstT v = (dstT)(srcelem1); \
 292     storedst(v > (dstT)(0) ? log(v) : log(-v))
 293
 294 #elif defined OP_CMP
 295 #define srcT2 srcT1
 296 #ifndef convertToWT1
 297 #define convertToWT1
 298 #endif
 299 #define PROCESS_ELEM \
 300     workT s1 = srcelem1, s2 = srcelem2; \
 301     storedst(s1 CMP_OPERATOR s2 ? (dstT)(255) : (dstT)(0))
 302
 303 #elif defined OP_CONVERT_SCALE_ABS
 304 #undef EXTRA_PARAMS
 305 #define EXTRA_PARAMS , workT1 alpha, workT1 beta
 306 #if wdepth <= 4
 307 #define PROCESS_ELEM \
 308     workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \
 309     storedst(convertToDT(value >= 0 ? value : -value))
 310 #else
 311 #define PROCESS_ELEM \
 312     workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \
 313     storedst(convertToDT(value >= 0 ? value : -value))
 314 #endif
 315
 316 #elif defined OP_SCALE_ADD
 317 #undef EXTRA_PARAMS
 318 #define EXTRA_PARAMS , workT1 alpha
 319 #if wdepth <= 4
 320 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))
 321 #else
 322 #define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2)))
 323 #endif
 324
 325 #elif defined OP_CTP_AD || defined OP_CTP_AR
 326 #if depth <= 5
 327 #define CV_EPSILON FLT_EPSILON
 328 #else
 329 #define CV_EPSILON DBL_EPSILON
 330 #endif
 331 #ifdef OP_CTP_AD
 332 #define TO_DEGREE cartToPolar = degrees(cartToPolar);
 333 #elif defined OP_CTP_AR
 334 #define TO_DEGREE
 335 #endif
 336 #define PROCESS_ELEM \
 337     dstT x = srcelem1, y = srcelem2; \
 338     dstT x2 = x * x, y2 = y * y; \
 339     dstT magnitude = sqrt(x2 + y2); \
 340     dstT tmp = y >= 0 ? 0 : CV_PI * 2; \
 341     tmp = x < 0 ? CV_PI : tmp; \
 342     dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
 343     dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \
 344     TO_DEGREE \
 345     storedst(magnitude); \
 346     storedst2(cartToPolar)
 347
 348 #elif defined OP_PTC_AD || defined OP_PTC_AR
 349 #ifdef OP_PTC_AD
 350 #define FROM_DEGREE y = radians(y)
 351 #else
 352 #define FROM_DEGREE
 353 #endif
 354 #define PROCESS_ELEM \
 355     dstT x = srcelem1, y = srcelem2, cosval; \
 356     FROM_DEGREE; \
 357     storedst2(sincos(y, &srcelem2) * x); \
 358     storedst(cosval * x);
 359
 360 #elif defined OP_PATCH_NANS
 361 #undef EXTRA_PARAMS
 362 #define EXTRA_PARAMS , dstT val
 363 #define PROCESS_ELEM \
 364     if (isnan(srcelem1)) \
 365         storedst(val)
 366
 367 #else
 368 #error "unknown op type"
 369 #endif
 370
 371 #if defined OP_CTP_AD || defined OP_CTP_AR || defined OP_PTC_AD || defined OP_PTC_AR
 372     #undef EXTRA_PARAMS
 373     #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
 374     #undef EXTRA_INDEX
 375     #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
 376     #undef EXTRA_INDEX_ADD
 377     #define EXTRA_INDEX_ADD dst_index2 += dststep2
 378 #endif
 379
 380 #if defined UNARY_OP || defined MASK_UNARY_OP
 381
 382 #if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \
 383     defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \
 384     defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \
 385     defined OP_MUL || defined OP_DIV || defined OP_POWN
 386     #undef EXTRA_PARAMS
 387     #define EXTRA_PARAMS , workST srcelem2_
 388     #undef srcelem2
 389     #define srcelem2 srcelem2_
 390 #endif
 391
 392 #if cn == 3
 393 #undef srcelem2
 394 #define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)
 395 #endif
 396
 397 #endif
 398
 399 #if defined BINARY_OP
 400
 401 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
 402                  __global const uchar * srcptr2, int srcstep2, int srcoffset2,
 403                  __global uchar * dstptr, int dststep, int dstoffset,
 404                  int rows, int cols EXTRA_PARAMS )
 405 {
 406     int x = get_global_id(0);
 407     int y0 = get_global_id(1) * rowsPerWI;
 408
 409     if (x < cols)
 410     {
 411         int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
 412 #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
 413         int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
 414 #endif
 415         int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
 416         EXTRA_INDEX;
 417
 418         for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
 419         {
 420             PROCESS_ELEM;
 421 #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
 422             src2_index += srcstep2;
 423 #endif
 424             EXTRA_INDEX_ADD;
 425         }
 426     }
 427 }
 428
 429 #elif defined MASK_BINARY_OP
 430
 431 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
 432                  __global const uchar * srcptr2, int srcstep2, int srcoffset2,
 433                  __global const uchar * mask, int maskstep, int maskoffset,
 434                  __global uchar * dstptr, int dststep, int dstoffset,
 435                  int rows, int cols EXTRA_PARAMS )
 436 {
 437     int x = get_global_id(0);
 438     int y0 = get_global_id(1) * rowsPerWI;
 439
 440     if (x < cols)
 441     {
 442         int mask_index = mad24(y0, maskstep, x + maskoffset);
 443         int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
 444         int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
 445         int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
 446
 447         for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,
 448                                                                 mask_index += maskstep, dst_index += dststep)
 449             if (mask[mask_index])
 450             {
 451                 PROCESS_ELEM;
 452             }
 453     }
 454 }
 455
 456 #elif defined UNARY_OP
 457
 458 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
 459                  __global uchar * dstptr, int dststep, int dstoffset,
 460                  int rows, int cols EXTRA_PARAMS )
 461 {
 462     int x = get_global_id(0);
 463     int y0 = get_global_id(1) * rowsPerWI;
 464
 465     if (x < cols)
 466     {
 467         int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
 468         int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
 469
 470         for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
 471         {
 472             PROCESS_ELEM;
 473         }
 474     }
 475 }
 476
 477 #elif defined MASK_UNARY_OP
 478
 479 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
 480                  __global const uchar * mask, int maskstep, int maskoffset,
 481                  __global uchar * dstptr, int dststep, int dstoffset,
 482                  int rows, int cols EXTRA_PARAMS )
 483 {
 484     int x = get_global_id(0);
 485     int y0 = get_global_id(1) * rowsPerWI;
 486
 487     if (x < cols)
 488     {
 489         int mask_index = mad24(y0, maskstep, x + maskoffset);
 490         int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
 491         int dst_index  = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
 492
 493         for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)
 494             if (mask[mask_index])
 495             {
 496                 PROCESS_ELEM;
 497             }
 498     }
 499 }
 500
 501 #else
 502
 503 #error "Unknown operation type"
 504
 505 #endif