2 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 * Copyright (c) 2016-2020 ARM Limited.
20 * SPDX-License-Identifier: MIT
22 * Permission is hereby granted, free of charge, to any person obtaining a copy
23 * of this software and associated documentation files (the "Software"), to
24 * deal in the Software without restriction, including without limitation the
25 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
26 * sell copies of the Software, and to permit persons to whom the Software is
27 * furnished to do so, subject to the following conditions:
29 * The above copyright notice and this permission notice shall be included in all
30 * copies or substantial portions of the Software.
32 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
37 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 #ifndef ARM_COMPUTE_HELPER_H
41 #define ARM_COMPUTE_HELPER_H
43 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
44 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
45 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
47 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
48 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
49 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
51 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
52 defined(cl_arm_integer_dot_product_accumulate_int8)
53 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
54 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
55 // defined(cl_arm_integer_dot_product_accumulate_int8)
57 #if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
58 #pragma OPENCL EXTENSION cl_arm_printf : enable
59 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
61 #define GPU_ARCH_MIDGARD 0x100
62 #define GPU_ARCH_BIFROST 0x200
64 /** Concatenate two inputs.
66 * @param[in] a The first input to be concatenated
67 * @param[in] b The second input to be concatenated
69 * @return The concatenated output
71 #define CONCAT(a, b) a##b
73 /** Expand the given vector
75 * @param[in] x The vector to be expanded
77 * @return The expanded output
81 /** Clamp the given value between an upper and lower bound.
83 * @param[in] x The value to be clamped
84 * @param[in] min_val The lower bound
85 * @param[in] max_val The upper bound
87 * @return The clamped value.
89 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
91 /** REVn reverses the given vector whose size is n.
94 * @param[in] x The vector to be reversed
96 * @return The reversed vector
100 #define REV2(x) ((x).s10)
101 #define REV3(x) ((x).s210)
102 #define REV4(x) ((x).s3210)
103 #define REV8(x) ((x).s76543210)
104 #define REV16(x) ((x).sFEDCBA9876543210)
105 /** @} */ // end of group REVn
107 /** Reverse the given vector.
110 * @param[in] x The vector to be reversed
111 * @param[in] s The size of the vector
113 * @return The reversed vector
116 #define REVERSE_STR(x, s) REV##s((x))
117 #define REVERSE(x, s) REVERSE_STR(x, s)
118 /** @} */ // end of group REVERSE
120 /** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
123 * @param[in] x The vector to be shifted
125 * @return The shifted vector
128 #define ROT1_0(x) ((x))
130 #define ROT2_0(x) ((x))
131 #define ROT2_1(x) ((x).s10)
133 #define ROT3_0(x) ((x))
134 #define ROT3_1(x) ((x).s201)
135 #define ROT3_2(x) ((x).s120)
137 #define ROT4_0(x) ((x))
138 #define ROT4_1(x) ((x).s3012)
139 #define ROT4_2(x) ((x).s2301)
140 #define ROT4_3(x) ((x).s1230)
142 #define ROT8_0(x) ((x))
143 #define ROT8_1(x) ((x).s70123456)
144 #define ROT8_2(x) ((x).s67012345)
145 #define ROT8_3(x) ((x).s56701234)
146 #define ROT8_4(x) ((x).s45670123)
147 #define ROT8_5(x) ((x).s34567012)
148 #define ROT8_6(x) ((x).s23456701)
149 #define ROT8_7(x) ((x).s12345670)
151 #define ROT16_0(x) ((x))
152 #define ROT16_1(x) ((x).sF0123456789ABCDE)
153 #define ROT16_2(x) ((x).sEF0123456789ABCD)
154 #define ROT16_3(x) ((x).sDEF0123456789ABC)
155 #define ROT16_4(x) ((x).sCDEF0123456789AB)
156 #define ROT16_5(x) ((x).sBCDEF0123456789A)
157 #define ROT16_6(x) ((x).sABCDEF0123456789)
158 #define ROT16_7(x) ((x).s9ABCDEF012345678)
159 #define ROT16_8(x) ((x).s89ABCDEF01234567)
160 #define ROT16_9(x) ((x).s789ABCDEF0123456)
161 #define ROT16_10(x) ((x).s6789ABCDEF012345)
162 #define ROT16_11(x) ((x).s56789ABCDEF01234)
163 #define ROT16_12(x) ((x).s456789ABCDEF0123)
164 #define ROT16_13(x) ((x).s3456789ABCDEF012)
165 #define ROT16_14(x) ((x).s23456789ABCDEF01)
166 #define ROT16_15(x) ((x).s123456789ABCDEF0)
167 /** @} */ // end of group ROTs_n
169 /** Circular-right-shift (rotate-right) the given vector by the given amount.
172 * @param[in] x The vector to be shifted
173 * @param[in] s The size of the vector
174 * @param[in] n The amount to be shifted
176 * @return The shifted vector
179 #define ROTATE_STR(x, s, n) ROT##s##_##n(x)
180 #define ROTATE(x, s, n) ROTATE_STR(x, s, n)
181 /** @} */ // end of group ROTATE
183 /** Creates a vector of size n filled with offset values corresponding to the location of each
187 * @param[in] dt The data type of the output vector
189 * @return The vector filled with offset values
192 #define V_OFFS1(dt) (dt)(0)
193 #define V_OFFS2(dt) (dt)(0, 1)
194 #define V_OFFS3(dt) (dt)(0, 1, 3)
195 #define V_OFFS4(dt) (dt)(0, 1, 2, 3)
196 #define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
197 #define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
198 /** @} */ // end of group V_OFFSn
200 /** Create a vector filled with offset values corresponding to the location of each element.
203 * @param[in] dt The data type of the output vector
204 * @param[in] s The size of the output vector
206 * @return The vector filled with offset values
209 #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
210 #define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
211 /** @} */ // end of group VEC_OFFS
213 #define VLOAD_STR(size) vload##size
214 #define VLOAD(size) VLOAD_STR(size)
216 #define VSTORE_STR(size) vstore##size
217 #define VSTORE(size) VSTORE_STR(size)
224 #define ushort1 ushort
229 #define double1 double
231 #define vload1(OFFSET, PTR) *(OFFSET + PTR)
232 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
234 // Convert built-in functions with _sat modifier are not supported in floating point so we create
236 // without _sat to overcome this issue
237 #define convert_float_sat convert_float
238 #define convert_float1_sat convert_float
239 #define convert_float2_sat convert_float2
240 #define convert_float3_sat convert_float3
241 #define convert_float4_sat convert_float4
242 #define convert_float8_sat convert_float8
243 #define convert_float16_sat convert_float16
244 #define convert_half_sat convert_float
245 #define convert_half1_sat convert_half
246 #define convert_half2_sat convert_half2
247 #define convert_half3_sat convert_half3
248 #define convert_half4_sat convert_half4
249 #define convert_half8_sat convert_half8
250 #define convert_half16_sat convert_half16
252 #define convert_float1 convert_float
253 #define convert_half1 convert_half
254 #define convert_char1 convert_char
255 #define convert_uchar1 convert_uchar
256 #define convert_short1 convert_short
257 #define convert_ushort1 convert_ushort
258 #define convert_int1 convert_int
259 #define convert_uint1 convert_uint
260 #define convert_long1 convert_long
261 #define convert_ulong1 convert_ulong
262 #define convert_double1 convert_double
264 #define convert_char1_sat convert_char_sat
265 #define convert_uchar1_sat convert_uchar_sat
266 #define convert_short1_sat convert_short_sat
267 #define convert_ushort1_sat convert_ushort_sat
268 #define convert_int1_sat convert_int_sat
269 #define convert_uint1_sat convert_uint_sat
270 #define convert_long1_sat convert_long_sat
271 #define convert_ulong1_sat convert_ulong_sat
272 #define convert_double1_sat convert_double_sat
274 #define VEC_DATA_TYPE_STR(type, size) type##size
275 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
277 #define CL_VEC_DATA_TYPE_STR(type, size) type##size
278 #define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
280 #define CONVERT_STR(x, type) (convert_##type((x)))
281 #define CONVERT(x, type) CONVERT_STR(x, type)
283 #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
284 #define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
286 #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
287 #define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
289 #define VECTOR_DECLARATION(name) \
290 __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
291 uint name##_offset_first_element_in_bytes
293 #define IMAGE_DECLARATION(name) \
294 __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
295 uint name##_step_y, uint name##_offset_first_element_in_bytes
297 #define TENSOR3D_DECLARATION(name) \
298 __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
299 uint name##_step_y, uint name##_stride_z, uint name##_step_z, \
300 uint name##_offset_first_element_in_bytes
302 #define TENSOR4D_DECLARATION(name) \
303 __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
304 uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \
305 uint name##_step_w, uint name##_offset_first_element_in_bytes
307 #define CONVERT_TO_VECTOR_STRUCT(name) \
308 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
311 #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
312 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
314 #define CONVERT_TO_IMAGE_STRUCT(name) \
315 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
316 name##_step_x, name##_stride_y, name##_step_y)
318 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
319 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
322 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
323 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
324 name##_stride_x, name##_step_x, name##_stride_y, \
325 name##_step_y, name##_stride_z, name##_step_z)
327 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
328 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
329 name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
332 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
333 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
334 name##_stride_x, name##_step_x, name##_stride_y, \
335 name##_step_y, name##_stride_z, name##_step_z)
337 #define CONVERT_TO_TENSOR3D_STRUCT(name) \
338 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
339 name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
342 #define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
343 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
344 0, name##_stride_y, 0, name##_stride_z, 0)
346 #define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
347 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
348 name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \
349 name##_step_z, name##_stride_w, name##_step_w, mod_size)
351 #define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
352 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
353 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \
356 /** Structure to hold Vector information */
357 typedef struct Vector
359 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
360 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
361 int stride_x; /**< Stride of the image in X dimension (in bytes) */
364 /** Structure to hold Image information */
367 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
368 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
369 int stride_x; /**< Stride of the image in X dimension (in bytes) */
370 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
373 /** Structure to hold 3D tensor information */
374 typedef struct Tensor3D
376 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
377 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
378 int stride_x; /**< Stride of the image in X dimension (in bytes) */
379 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
380 int stride_z; /**< Stride of the image in Z dimension (in bytes) */
383 /** Structure to hold 4D tensor information */
384 typedef struct Tensor4D
386 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
387 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
388 int stride_x; /**< Stride of the image in X dimension (in bytes) */
389 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
390 int stride_z; /**< Stride of the image in Z dimension (in bytes) */
391 int stride_w; /**< Stride of the image in W dimension (in bytes) */
394 /** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
397 * @param[in] ptr Pointer to the starting postion of the buffer
398 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
399 * @param[in] stride_x Stride of the vector in X dimension (in bytes)
400 * @param[in] step_x stride_x * number of elements along X processed per
403 * @return An image object
405 inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
406 uint stride_x, uint step_x)
410 .offset_first_element_in_bytes = offset_first_element_in_bytes,
411 .stride_x = stride_x,
413 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
417 /** Wrap image information into an Image structure, and make the pointer point at this workitem's
420 * @param[in] ptr Pointer to the starting postion of the buffer
421 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
422 * @param[in] stride_x Stride of the image in X dimension (in bytes)
423 * @param[in] step_x stride_x * number of elements along X processed per
425 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
426 * @param[in] step_y stride_y * number of elements along Y processed per
429 * @return An image object
431 inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
432 uint stride_x, uint step_x, uint stride_y, uint step_y)
434 Image img = {.ptr = ptr,
435 .offset_first_element_in_bytes = offset_first_element_in_bytes,
436 .stride_x = stride_x,
437 .stride_y = stride_y};
439 img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
443 /** Wrap 3D tensor information into an image structure, and make the pointer point at this
446 * @param[in] ptr Pointer to the starting postion of the buffer
447 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
448 * @param[in] stride_x Stride of the image in X dimension (in bytes)
449 * @param[in] step_x stride_x * number of elements along X processed per
451 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
452 * @param[in] step_y stride_y * number of elements along Y processed per
454 * @param[in] stride_z Stride of the image in Z dimension (in bytes)
455 * @param[in] step_z stride_z * number of elements along Z processed per
458 * @return A 3D tensor object
460 inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
461 uint offset_first_element_in_bytes,
462 uint stride_x, uint step_x, uint stride_y,
463 uint step_y, uint stride_z, uint step_z)
465 Image img = {.ptr = ptr,
466 .offset_first_element_in_bytes = offset_first_element_in_bytes,
467 .stride_x = stride_x,
468 .stride_y = stride_y};
469 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
470 get_global_id(1) * step_y + get_global_id(2) * step_z;
474 /** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
477 * @param[in] ptr Pointer to the starting postion of the buffer
478 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
479 * @param[in] stride_x Stride of the image in X dimension (in bytes)
480 * @param[in] step_x stride_x * number of elements along X processed per
482 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
483 * @param[in] step_y stride_y * number of elements along Y processed per
485 * @param[in] stride_z Stride of the image in Z dimension (in bytes)
486 * @param[in] step_z stride_z * number of elements along Z processed per
489 * @return A 3D tensor object
491 inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
492 uint offset_first_element_in_bytes, uint stride_x,
493 uint step_x, uint stride_y, uint step_y, uint stride_z,
496 Tensor3D tensor = {.ptr = ptr,
497 .offset_first_element_in_bytes = offset_first_element_in_bytes,
498 .stride_x = stride_x,
499 .stride_y = stride_y,
500 .stride_z = stride_z};
501 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
502 get_global_id(1) * step_y + get_global_id(2) * step_z;
506 inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
507 uint offset_first_element_in_bytes, uint stride_x,
508 uint step_x, uint stride_y, uint step_y, uint stride_z,
509 uint step_z, uint stride_w, uint step_w, uint mod_size)
511 Tensor4D tensor = {.ptr = ptr,
512 .offset_first_element_in_bytes = offset_first_element_in_bytes,
513 .stride_x = stride_x,
514 .stride_y = stride_y,
515 .stride_z = stride_z,
516 .stride_w = stride_w};
518 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
519 get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
520 (get_global_id(2) / mod_size) * step_w;
524 /** Get the pointer position of a Vector
526 * @param[in] vec Pointer to the starting position of the buffer
527 * @param[in] x Relative X position
529 inline __global const uchar *vector_offset(const Vector *vec, int x)
531 return vec->ptr + x * vec->stride_x;
534 /** Get the pointer position of a Image
536 * @param[in] img Pointer to the starting position of the buffer
537 * @param[in] x Relative X position
538 * @param[in] y Relative Y position
540 inline __global uchar *offset(const Image *img, int x, int y)
542 return img->ptr + x * img->stride_x + y * img->stride_y;
545 /** Get the pointer position of a Tensor3D
547 * @param[in] tensor Pointer to the starting position of the buffer
548 * @param[in] x Relative X position
549 * @param[in] y Relative Y position
550 * @param[in] z Relative Z position
552 inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
554 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
557 /** Get the pointer position of a Tensor4D
559 * @param[in] tensor Pointer to the starting position of the buffer
560 * @param[in] x Relative X position
561 * @param[in] y Relative Y position
562 * @param[in] z Relative Z position
563 * @param[in] w Relative W position
565 inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
567 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
568 w * tensor->stride_w;