1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
19 // Jia Haipeng, jiahaipeng95@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
25 // * Redistribution's of source code must retain the above copyright notice,
26 // this list of conditions and the following disclaimer.
28 // * Redistribution's in binary form must reproduce the above copyright notice,
29 // this list of conditions and the following disclaimer in the documentation
30 // and/or other materials provided with the distribution.
32 // * The name of the copyright holders may not be used to endorse or promote products
33 // derived from this software without specific prior written permission.
35 // This software is provided by the copyright holders and contributors as is and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the copyright holders or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
50 after compiling this program user gets a single kernel called KF.
51 the following flags should be passed:
52 1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
53 2) the actual operation performed, one of "-D OP_...", see below the list of operations.
54 2a) "-D dstDepth=<destination depth> [-D cn=<num channels]"
55 for some operations, like min/max/and/or/xor it's enough
56 2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
57 -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
62 #pragma OPENCL EXTENSION cl_amd_fp64:enable
63 #elif defined cl_khr_fp64
64 #pragma OPENCL EXTENSION cl_khr_fp64:enable
69 #pragma OPENCL FP_CONTRACT : on
70 #pragma OPENCL FP_FAST_FMAF : on
71 #pragma OPENCL FP_FAST_FMA : on
88 #define srcT1_C1 srcT1
89 #define srcT2_C1 srcT2
94 #define storedst(val) *(__global dstT *)(dstptr + dst_index) = val
95 #define storedst2(val) *(__global dstT *)(dstptr2 + dst_index2) = val
97 #define storedst(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr + dst_index))
98 #define storedst2(val) vstore3(val, 0, (__global dstT_C1 *)(dstptr2 + dst_index2))
110 #define srcT1_C1 dstT_C1
118 #define srcT2_C1 dstT_C1
123 #define srcelem1 *(__global srcT1 *)(srcptr1 + src1_index)
124 #define srcelem2 *(__global srcT2 *)(srcptr2 + src2_index)
126 #define srcelem1 vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index))
127 #define srcelem2 vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index))
130 #define convertToDT noconvert
136 #define convertToWT2 convertToWT1
139 #define srcelem1 convertToWT1(*(__global srcT1 *)(srcptr1 + src1_index))
140 #define srcelem2 convertToWT2(*(__global srcT2 *)(srcptr2 + src2_index))
142 #define srcelem1 convertToWT1(vload3(0, (__global srcT1_C1 *)(srcptr1 + src1_index)))
143 #define srcelem2 convertToWT2(vload3(0, (__global srcT2_C1 *)(srcptr2 + src2_index)))
154 #define EXTRA_INDEX_ADD
157 #define PROCESS_ELEM storedst(convertToDT(srcelem1 + srcelem2))
160 #define PROCESS_ELEM storedst(convertToDT(srcelem1 - srcelem2))
162 #elif defined OP_RSUB
163 #define PROCESS_ELEM storedst(convertToDT(srcelem2 - srcelem1))
165 #elif defined OP_ABSDIFF
166 #define PROCESS_ELEM \
167 workT v = srcelem1 - srcelem2; \
168 storedst(convertToDT(v >= (workT)(0) ? v : -v))
171 #define PROCESS_ELEM storedst(srcelem1 & srcelem2)
174 #define PROCESS_ELEM storedst(srcelem1 | srcelem2)
177 #define PROCESS_ELEM storedst(srcelem1 ^ srcelem2)
180 #define PROCESS_ELEM storedst(~srcelem1)
183 #define PROCESS_ELEM storedst(min(srcelem1, srcelem2))
186 #define PROCESS_ELEM storedst(max(srcelem1, srcelem2))
189 #define PROCESS_ELEM storedst(convertToDT(srcelem1 * srcelem2))
191 #elif defined OP_MUL_SCALE
194 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
196 #define srcelem2 srcelem2_
198 #define EXTRA_PARAMS , scaleT scale
200 #define PROCESS_ELEM storedst(convertToDT(srcelem1 * scale * srcelem2))
203 #define PROCESS_ELEM \
204 workT e2 = srcelem2, zero = (workT)(0); \
205 storedst(convertToDT(e2 != zero ? srcelem1 / e2 : zero))
207 #elif defined OP_DIV_SCALE
210 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
212 #define srcelem2 srcelem2_
214 #define EXTRA_PARAMS , scaleT scale
216 #define PROCESS_ELEM \
217 workT e2 = srcelem2, zero = (workT)(0); \
218 storedst(convertToDT(e2 == zero ? zero : (srcelem1 * (workT)(scale) / e2)))
220 #elif defined OP_RDIV_SCALE
223 #define EXTRA_PARAMS , workST srcelem2_, scaleT scale
225 #define srcelem2 srcelem2_
227 #define EXTRA_PARAMS , scaleT scale
229 #define PROCESS_ELEM \
230 workT e1 = srcelem1, zero = (workT)(0); \
231 storedst(convertToDT(e1 == zero ? zero : (srcelem2 * (workT)(scale) / e1)))
233 #elif defined OP_RECIP_SCALE
235 #define EXTRA_PARAMS , scaleT scale
236 #define PROCESS_ELEM \
237 workT e1 = srcelem1, zero = (workT)(0); \
238 storedst(convertToDT(e1 != zero ? scale / e1 : zero))
240 #elif defined OP_ADDW
242 #define EXTRA_PARAMS , scaleT alpha, scaleT beta, scaleT gamma
244 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, alpha, mad24(srcelem2, beta, gamma))))
246 #define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, alpha, mad(srcelem2, beta, gamma))))
250 #define PROCESS_ELEM storedst(hypot(srcelem1, srcelem2))
252 #elif defined OP_ABS_NOSAT
253 #define PROCESS_ELEM \
254 dstT v = convertToDT(srcelem1); \
255 storedst(v >= 0 ? v : -v)
257 #elif defined OP_PHASE_RADIANS
258 #define PROCESS_ELEM \
259 workT tmp = atan2(srcelem2, srcelem1); \
261 tmp += 6.283185307179586232f; \
264 #elif defined OP_PHASE_DEGREES
265 #define PROCESS_ELEM \
266 workT tmp = degrees(atan2(srcelem2, srcelem1)); \
273 #define PROCESS_ELEM storedst(native_exp(srcelem1))
275 #define PROCESS_ELEM storedst(exp(srcelem1))
279 #define PROCESS_ELEM storedst(pow(srcelem1, srcelem2))
281 #elif defined OP_POWN
284 #define PROCESS_ELEM storedst(pown(srcelem1, srcelem2))
286 #elif defined OP_SQRT
287 #define PROCESS_ELEM storedst(native_sqrt(srcelem1))
290 #define PROCESS_ELEM \
291 dstT v = (dstT)(srcelem1); \
292 storedst(v > (dstT)(0) ? log(v) : log(-v))
299 #define PROCESS_ELEM \
300 workT s1 = srcelem1, s2 = srcelem2; \
301 storedst(s1 CMP_OPERATOR s2 ? (dstT)(255) : (dstT)(0))
303 #elif defined OP_CONVERT_SCALE_ABS
305 #define EXTRA_PARAMS , workT1 alpha, workT1 beta
307 #define PROCESS_ELEM \
308 workT value = mad24(srcelem1, (workT)(alpha), (workT)(beta)); \
309 storedst(convertToDT(value >= 0 ? value : -value))
311 #define PROCESS_ELEM \
312 workT value = fma(srcelem1, (workT)(alpha), (workT)(beta)); \
313 storedst(convertToDT(value >= 0 ? value : -value))
316 #elif defined OP_SCALE_ADD
318 #define EXTRA_PARAMS , workT1 alpha
320 #define PROCESS_ELEM storedst(convertToDT(mad24(srcelem1, (workT)(alpha), srcelem2)))
322 #define PROCESS_ELEM storedst(convertToDT(fma(srcelem1, (workT)(alpha), srcelem2)))
325 #elif defined OP_CTP_AD || defined OP_CTP_AR
327 #define CV_EPSILON FLT_EPSILON
329 #define CV_EPSILON DBL_EPSILON
332 #define TO_DEGREE cartToPolar = degrees(cartToPolar);
333 #elif defined OP_CTP_AR
336 #define PROCESS_ELEM \
337 dstT x = srcelem1, y = srcelem2; \
338 dstT x2 = x * x, y2 = y * y; \
339 dstT magnitude = sqrt(x2 + y2); \
340 dstT tmp = y >= 0 ? 0 : CV_PI * 2; \
341 tmp = x < 0 ? CV_PI : tmp; \
342 dstT tmp1 = y >= 0 ? CV_PI * 0.5f : CV_PI * 1.5f; \
343 dstT cartToPolar = y2 <= x2 ? x * y / mad((dstT)(0.28f), y2, x2 + CV_EPSILON) + tmp : (tmp1 - x * y / mad((dstT)(0.28f), x2, y2 + CV_EPSILON)); \
345 storedst(magnitude); \
346 storedst2(cartToPolar)
348 #elif defined OP_PTC_AD || defined OP_PTC_AR
350 #define FROM_DEGREE y = radians(y)
354 #define PROCESS_ELEM \
355 dstT x = srcelem1, y = srcelem2, cosval; \
357 storedst2(sincos(y, &srcelem2) * x); \
358 storedst(cosval * x);
360 #elif defined OP_PATCH_NANS
362 #define EXTRA_PARAMS , dstT val
363 #define PROCESS_ELEM \
364 if (isnan(srcelem1)) \
368 #error "unknown op type"
371 #if defined OP_CTP_AD || defined OP_CTP_AR || defined OP_PTC_AD || defined OP_PTC_AR
373 #define EXTRA_PARAMS , __global uchar* dstptr2, int dststep2, int dstoffset2
375 #define EXTRA_INDEX int dst_index2 = mad24(y0, dststep2, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset2))
376 #undef EXTRA_INDEX_ADD
377 #define EXTRA_INDEX_ADD dst_index2 += dststep2
380 #if defined UNARY_OP || defined MASK_UNARY_OP
382 #if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_SAT_ADD || \
383 defined OP_SUB || defined OP_SAT_SUB || defined OP_RSUB || defined OP_SAT_RSUB || \
384 defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX || defined OP_POW || \
385 defined OP_MUL || defined OP_DIV || defined OP_POWN
387 #define EXTRA_PARAMS , workST srcelem2_
389 #define srcelem2 srcelem2_
394 #define srcelem2 (workT)(srcelem2_.x, srcelem2_.y, srcelem2_.z)
399 #if defined BINARY_OP
401 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
402 __global const uchar * srcptr2, int srcstep2, int srcoffset2,
403 __global uchar * dstptr, int dststep, int dstoffset,
404 int rows, int cols EXTRA_PARAMS )
406 int x = get_global_id(0);
407 int y0 = get_global_id(1) * rowsPerWI;
411 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
412 #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
413 int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
415 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
418 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
421 #if !(defined(OP_RECIP_SCALE) || defined(OP_NOT))
422 src2_index += srcstep2;
429 #elif defined MASK_BINARY_OP
431 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
432 __global const uchar * srcptr2, int srcstep2, int srcoffset2,
433 __global const uchar * mask, int maskstep, int maskoffset,
434 __global uchar * dstptr, int dststep, int dstoffset,
435 int rows, int cols EXTRA_PARAMS )
437 int x = get_global_id(0);
438 int y0 = get_global_id(1) * rowsPerWI;
442 int mask_index = mad24(y0, maskstep, x + maskoffset);
443 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
444 int src2_index = mad24(y0, srcstep2, mad24(x, (int)sizeof(srcT2_C1) * cn, srcoffset2));
445 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
447 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, src2_index += srcstep2,
448 mask_index += maskstep, dst_index += dststep)
449 if (mask[mask_index])
456 #elif defined UNARY_OP
458 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
459 __global uchar * dstptr, int dststep, int dstoffset,
460 int rows, int cols EXTRA_PARAMS )
462 int x = get_global_id(0);
463 int y0 = get_global_id(1) * rowsPerWI;
467 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
468 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
470 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, dst_index += dststep)
477 #elif defined MASK_UNARY_OP
479 __kernel void KF(__global const uchar * srcptr1, int srcstep1, int srcoffset1,
480 __global const uchar * mask, int maskstep, int maskoffset,
481 __global uchar * dstptr, int dststep, int dstoffset,
482 int rows, int cols EXTRA_PARAMS )
484 int x = get_global_id(0);
485 int y0 = get_global_id(1) * rowsPerWI;
489 int mask_index = mad24(y0, maskstep, x + maskoffset);
490 int src1_index = mad24(y0, srcstep1, mad24(x, (int)sizeof(srcT1_C1) * cn, srcoffset1));
491 int dst_index = mad24(y0, dststep, mad24(x, (int)sizeof(dstT_C1) * cn, dstoffset));
493 for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src1_index += srcstep1, mask_index += maskstep, dst_index += dststep)
494 if (mask[mask_index])
503 #error "Unknown operation type"