compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h

   1 /*
   2  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Copyright (c) 2016-2020 ARM Limited.
  19  *
  20  * SPDX-License-Identifier: MIT
  21  *
  22  * Permission is hereby granted, free of charge, to any person obtaining a copy
  23  * of this software and associated documentation files (the "Software"), to
  24  * deal in the Software without restriction, including without limitation the
  25  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26  * sell copies of the Software, and to permit persons to whom the Software is
  27  * furnished to do so, subject to the following conditions:
  28  *
  29  * The above copyright notice and this permission notice shall be included in all
  30  * copies or substantial portions of the Software.
  31  *
  32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  37  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  38  * SOFTWARE.
  39  */
  40 #ifndef ARM_COMPUTE_HELPER_H
  41 #define ARM_COMPUTE_HELPER_H
  42
  43 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
  44 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  45 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
  46
  47 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
  48 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
  49 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
  50
  51 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
  52   defined(cl_arm_integer_dot_product_accumulate_int8)
  53 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
  54 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
  55        // defined(cl_arm_integer_dot_product_accumulate_int8)
  56
  57 #if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
  58 #pragma OPENCL EXTENSION cl_arm_printf : enable
  59 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
  60
  61 #define GPU_ARCH_MIDGARD 0x100
  62 #define GPU_ARCH_BIFROST 0x200
  63
  64 /** Concatenate two inputs.
  65  *
  66  * @param[in] a The first input to be concatenated
  67  * @param[in] b The second input to be concatenated
  68  *
  69  * @return The concatenated output
  70  */
  71 #define CONCAT(a, b) a##b
  72
  73 /** Expand the given vector
  74  *
  75  * @param[in] x The vector to be expanded
  76  *
  77  * @return The expanded output
  78  */
  79 #define EXPAND(x) x
  80
  81 /** Clamp the given value between an upper and lower bound.
  82  *
  83  * @param[in] x       The value to be clamped
  84  * @param[in] min_val The lower bound
  85  * @param[in] max_val The upper bound
  86  *
  87  * @return The clamped value.
  88  */
  89 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
  90
  91 /** REVn reverses the given vector whose size is n.
  92  * @name REVn
  93  *
  94  * @param[in] x The vector to be reversed
  95  *
  96  * @return The reversed vector
  97  * @{
  98  */
  99 #define REV1(x) ((x))
 100 #define REV2(x) ((x).s10)
 101 #define REV3(x) ((x).s210)
 102 #define REV4(x) ((x).s3210)
 103 #define REV8(x) ((x).s76543210)
 104 #define REV16(x) ((x).sFEDCBA9876543210)
 105 /** @} */ // end of group REVn
 106
 107 /** Reverse the given vector.
 108  * @name REVERSE
 109  *
 110  * @param[in] x The vector to be reversed
 111  * @param[in] s The size of the vector
 112  *
 113  * @return The reversed vector
 114  * @{
 115  */
 116 #define REVERSE_STR(x, s) REV##s((x))
 117 #define REVERSE(x, s) REVERSE_STR(x, s)
 118 /** @} */ // end of group REVERSE
 119
 120 /** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
 121  * @name ROTs_n
 122  *
 123  * @param[in] x The vector to be shifted
 124  *
 125  * @return The shifted vector
 126  * @{
 127  */
 128 #define ROT1_0(x) ((x))
 129
 130 #define ROT2_0(x) ((x))
 131 #define ROT2_1(x) ((x).s10)
 132
 133 #define ROT3_0(x) ((x))
 134 #define ROT3_1(x) ((x).s201)
 135 #define ROT3_2(x) ((x).s120)
 136
 137 #define ROT4_0(x) ((x))
 138 #define ROT4_1(x) ((x).s3012)
 139 #define ROT4_2(x) ((x).s2301)
 140 #define ROT4_3(x) ((x).s1230)
 141
 142 #define ROT8_0(x) ((x))
 143 #define ROT8_1(x) ((x).s70123456)
 144 #define ROT8_2(x) ((x).s67012345)
 145 #define ROT8_3(x) ((x).s56701234)
 146 #define ROT8_4(x) ((x).s45670123)
 147 #define ROT8_5(x) ((x).s34567012)
 148 #define ROT8_6(x) ((x).s23456701)
 149 #define ROT8_7(x) ((x).s12345670)
 150
 151 #define ROT16_0(x) ((x))
 152 #define ROT16_1(x) ((x).sF0123456789ABCDE)
 153 #define ROT16_2(x) ((x).sEF0123456789ABCD)
 154 #define ROT16_3(x) ((x).sDEF0123456789ABC)
 155 #define ROT16_4(x) ((x).sCDEF0123456789AB)
 156 #define ROT16_5(x) ((x).sBCDEF0123456789A)
 157 #define ROT16_6(x) ((x).sABCDEF0123456789)
 158 #define ROT16_7(x) ((x).s9ABCDEF012345678)
 159 #define ROT16_8(x) ((x).s89ABCDEF01234567)
 160 #define ROT16_9(x) ((x).s789ABCDEF0123456)
 161 #define ROT16_10(x) ((x).s6789ABCDEF012345)
 162 #define ROT16_11(x) ((x).s56789ABCDEF01234)
 163 #define ROT16_12(x) ((x).s456789ABCDEF0123)
 164 #define ROT16_13(x) ((x).s3456789ABCDEF012)
 165 #define ROT16_14(x) ((x).s23456789ABCDEF01)
 166 #define ROT16_15(x) ((x).s123456789ABCDEF0)
 167 /** @} */ // end of group ROTs_n
 168
 169 /** Circular-right-shift (rotate-right) the given vector by the given amount.
 170  * @name ROTATE
 171  *
 172  * @param[in] x The vector to be shifted
 173  * @param[in] s The size of the vector
 174  * @param[in] n The amount to be shifted
 175  *
 176  * @return The shifted vector
 177  * @{
 178  */
 179 #define ROTATE_STR(x, s, n) ROT##s##_##n(x)
 180 #define ROTATE(x, s, n) ROTATE_STR(x, s, n)
 181 /** @} */ // end of group ROTATE
 182
 183 /** Creates a vector of size n filled with offset values corresponding to the location of each
 184  * element.
 185  * @name V_OFFSn
 186  *
 187  * @param[in] dt The data type of the output vector
 188  *
 189  * @return The vector filled with offset values
 190  * @{
 191  */
 192 #define V_OFFS1(dt) (dt)(0)
 193 #define V_OFFS2(dt) (dt)(0, 1)
 194 #define V_OFFS3(dt) (dt)(0, 1, 3)
 195 #define V_OFFS4(dt) (dt)(0, 1, 2, 3)
 196 #define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
 197 #define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 198 /** @} */ // end of group V_OFFSn
 199
 200 /** Create a vector filled with offset values corresponding to the location of each element.
 201  * @name VEC_OFFS
 202  *
 203  * @param[in] dt The data type of the output vector
 204  * @param[in] s  The size of the output vector
 205  *
 206  * @return The vector filled with offset values
 207  * @{
 208  */
 209 #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
 210 #define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
 211 /** @} */ // end of group VEC_OFFS
 212
 213 #define VLOAD_STR(size) vload##size
 214 #define VLOAD(size) VLOAD_STR(size)
 215
 216 #define VSTORE_STR(size) vstore##size
 217 #define VSTORE(size) VSTORE_STR(size)
 218
 219 #define float1 float
 220 #define half1 half
 221 #define char1 char
 222 #define uchar1 uchar
 223 #define short1 short
 224 #define ushort1 ushort
 225 #define int1 int
 226 #define uint1 uint
 227 #define long1 long
 228 #define ulong1 ulong
 229 #define double1 double
 230
 231 #define vload1(OFFSET, PTR) *(OFFSET + PTR)
 232 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
 233
 234 // Convert built-in functions with _sat modifier are not supported in floating point so we create
 235 // defines
 236 // without _sat to overcome this issue
 237 #define convert_float_sat convert_float
 238 #define convert_float1_sat convert_float
 239 #define convert_float2_sat convert_float2
 240 #define convert_float3_sat convert_float3
 241 #define convert_float4_sat convert_float4
 242 #define convert_float8_sat convert_float8
 243 #define convert_float16_sat convert_float16
 244 #define convert_half_sat convert_float
 245 #define convert_half1_sat convert_half
 246 #define convert_half2_sat convert_half2
 247 #define convert_half3_sat convert_half3
 248 #define convert_half4_sat convert_half4
 249 #define convert_half8_sat convert_half8
 250 #define convert_half16_sat convert_half16
 251
 252 #define convert_float1 convert_float
 253 #define convert_half1 convert_half
 254 #define convert_char1 convert_char
 255 #define convert_uchar1 convert_uchar
 256 #define convert_short1 convert_short
 257 #define convert_ushort1 convert_ushort
 258 #define convert_int1 convert_int
 259 #define convert_uint1 convert_uint
 260 #define convert_long1 convert_long
 261 #define convert_ulong1 convert_ulong
 262 #define convert_double1 convert_double
 263
 264 #define convert_char1_sat convert_char_sat
 265 #define convert_uchar1_sat convert_uchar_sat
 266 #define convert_short1_sat convert_short_sat
 267 #define convert_ushort1_sat convert_ushort_sat
 268 #define convert_int1_sat convert_int_sat
 269 #define convert_uint1_sat convert_uint_sat
 270 #define convert_long1_sat convert_long_sat
 271 #define convert_ulong1_sat convert_ulong_sat
 272 #define convert_double1_sat convert_double_sat
 273
 274 #define VEC_DATA_TYPE_STR(type, size) type##size
 275 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 276
 277 #define CL_VEC_DATA_TYPE_STR(type, size) type##size
 278 #define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
 279
 280 #define CONVERT_STR(x, type) (convert_##type((x)))
 281 #define CONVERT(x, type) CONVERT_STR(x, type)
 282
 283 #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
 284 #define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
 285
 286 #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
 287 #define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
 288
 289 #define VECTOR_DECLARATION(name)                                        \
 290   __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \
 291     uint name##_offset_first_element_in_bytes
 292
 293 #define IMAGE_DECLARATION(name)                                                               \
 294   __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
 295     uint name##_step_y, uint name##_offset_first_element_in_bytes
 296
 297 #define TENSOR3D_DECLARATION(name)                                                            \
 298   __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
 299     uint name##_step_y, uint name##_stride_z, uint name##_step_z,                             \
 300     uint name##_offset_first_element_in_bytes
 301
 302 #define TENSOR4D_DECLARATION(name)                                                            \
 303   __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \
 304     uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w,       \
 305     uint name##_step_w, uint name##_offset_first_element_in_bytes
 306
 307 #define CONVERT_TO_VECTOR_STRUCT(name)                                                          \
 308   update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 309                              name##_step_x)
 310
 311 #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
 312   update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
 313
 314 #define CONVERT_TO_IMAGE_STRUCT(name)                                                          \
 315   update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 316                             name##_step_x, name##_stride_y, name##_step_y)
 317
 318 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)                                                     \
 319   update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
 320                             name##_stride_y, 0)
 321
 322 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                              \
 323   update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
 324                                           name##_stride_x, name##_step_x, name##_stride_y,  \
 325                                           name##_step_y, name##_stride_z, name##_step_z)
 326
 327 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)                                             \
 328   update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes,        \
 329                                           name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \
 330                                           name##_step_z)
 331
 332 #define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                              \
 333   update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
 334                                           name##_stride_x, name##_step_x, name##_stride_y,  \
 335                                           name##_step_y, name##_stride_z, name##_step_z)
 336
 337 #define CONVERT_TO_TENSOR3D_STRUCT(name)                                                          \
 338   update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 339                                name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
 340                                name##_step_z)
 341
 342 #define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                                  \
 343   update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 344                                0, name##_stride_y, 0, name##_stride_z, 0)
 345
 346 #define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                \
 347   update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 348                                name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
 349                                name##_step_z, name##_stride_w, name##_step_w, mod_size)
 350
 351 #define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                                        \
 352   update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
 353                                0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0,     \
 354                                mod_size)
 355
 356 /** Structure to hold Vector information */
 357 typedef struct Vector
 358 {
 359   __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
 360   int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
 361   int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
 362 } Vector;
 363
 364 /** Structure to hold Image information */
 365 typedef struct Image
 366 {
 367   __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
 368   int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
 369   int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
 370   int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
 371 } Image;
 372
 373 /** Structure to hold 3D tensor information */
 374 typedef struct Tensor3D
 375 {
 376   __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
 377   int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
 378   int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
 379   int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
 380   int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
 381 } Tensor3D;
 382
 383 /** Structure to hold 4D tensor information */
 384 typedef struct Tensor4D
 385 {
 386   __global uchar *ptr;               /**< Pointer to the starting postion of the buffer */
 387   int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
 388   int stride_x;                      /**< Stride of the image in X dimension (in bytes) */
 389   int stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
 390   int stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
 391   int stride_w;                      /**< Stride of the image in W dimension (in bytes) */
 392 } Tensor4D;
 393
 394 /** Wrap vector information into an Vector structure, and make the pointer point at this workitem's
 395  * data.
 396  *
 397  * @param[in] ptr                           Pointer to the starting postion of the buffer
 398  * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
 399  * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
 400  * @param[in] step_x                        stride_x * number of elements along X processed per
 401  * workitem(in bytes)
 402  *
 403  * @return An image object
 404  */
 405 inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
 406                                          uint stride_x, uint step_x)
 407 {
 408   Vector vector = {
 409     .ptr = ptr,
 410     .offset_first_element_in_bytes = offset_first_element_in_bytes,
 411     .stride_x = stride_x,
 412   };
 413   vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
 414   return vector;
 415 }
 416
 417 /** Wrap image information into an Image structure, and make the pointer point at this workitem's
 418  * data.
 419  *
 420  * @param[in] ptr                           Pointer to the starting postion of the buffer
 421  * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 422  * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 423  * @param[in] step_x                        stride_x * number of elements along X processed per
 424  * workitem(in bytes)
 425  * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 426  * @param[in] step_y                        stride_y * number of elements along Y processed per
 427  * workitem(in bytes)
 428  *
 429  * @return An image object
 430  */
 431 inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes,
 432                                        uint stride_x, uint step_x, uint stride_y, uint step_y)
 433 {
 434   Image img = {.ptr = ptr,
 435                .offset_first_element_in_bytes = offset_first_element_in_bytes,
 436                .stride_x = stride_x,
 437                .stride_y = stride_y};
 438   img.ptr +=
 439     img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
 440   return img;
 441 }
 442
 443 /** Wrap 3D tensor information into an image structure, and make the pointer point at this
 444  * workitem's data.
 445  *
 446  * @param[in] ptr                           Pointer to the starting postion of the buffer
 447  * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 448  * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 449  * @param[in] step_x                        stride_x * number of elements along X processed per
 450  * workitem(in bytes)
 451  * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 452  * @param[in] step_y                        stride_y * number of elements along Y processed per
 453  * workitem(in bytes)
 454  * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
 455  * @param[in] step_z                        stride_z * number of elements along Z processed per
 456  * workitem(in bytes)
 457  *
 458  * @return A 3D tensor object
 459  */
 460 inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
 461                                                      uint offset_first_element_in_bytes,
 462                                                      uint stride_x, uint step_x, uint stride_y,
 463                                                      uint step_y, uint stride_z, uint step_z)
 464 {
 465   Image img = {.ptr = ptr,
 466                .offset_first_element_in_bytes = offset_first_element_in_bytes,
 467                .stride_x = stride_x,
 468                .stride_y = stride_y};
 469   img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x +
 470              get_global_id(1) * step_y + get_global_id(2) * step_z;
 471   return img;
 472 }
 473
 474 /** Wrap 3D tensor information into an tensor structure, and make the pointer point at this
 475  * workitem's data.
 476  *
 477  * @param[in] ptr                           Pointer to the starting postion of the buffer
 478  * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
 479  * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
 480  * @param[in] step_x                        stride_x * number of elements along X processed per
 481  * workitem(in bytes)
 482  * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
 483  * @param[in] step_y                        stride_y * number of elements along Y processed per
 484  * workitem(in bytes)
 485  * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
 486  * @param[in] step_z                        stride_z * number of elements along Z processed per
 487  * workitem(in bytes)
 488  *
 489  * @return A 3D tensor object
 490  */
 491 inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
 492                                              uint offset_first_element_in_bytes, uint stride_x,
 493                                              uint step_x, uint stride_y, uint step_y, uint stride_z,
 494                                              uint step_z)
 495 {
 496   Tensor3D tensor = {.ptr = ptr,
 497                      .offset_first_element_in_bytes = offset_first_element_in_bytes,
 498                      .stride_x = stride_x,
 499                      .stride_y = stride_y,
 500                      .stride_z = stride_z};
 501   tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
 502                 get_global_id(1) * step_y + get_global_id(2) * step_z;
 503   return tensor;
 504 }
 505
 506 inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
 507                                              uint offset_first_element_in_bytes, uint stride_x,
 508                                              uint step_x, uint stride_y, uint step_y, uint stride_z,
 509                                              uint step_z, uint stride_w, uint step_w, uint mod_size)
 510 {
 511   Tensor4D tensor = {.ptr = ptr,
 512                      .offset_first_element_in_bytes = offset_first_element_in_bytes,
 513                      .stride_x = stride_x,
 514                      .stride_y = stride_y,
 515                      .stride_z = stride_z,
 516                      .stride_w = stride_w};
 517
 518   tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x +
 519                 get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z +
 520                 (get_global_id(2) / mod_size) * step_w;
 521   return tensor;
 522 }
 523
 524 /** Get the pointer position of a Vector
 525  *
 526  * @param[in] vec Pointer to the starting position of the buffer
 527  * @param[in] x   Relative X position
 528  */
 529 inline __global const uchar *vector_offset(const Vector *vec, int x)
 530 {
 531   return vec->ptr + x * vec->stride_x;
 532 }
 533
 534 /** Get the pointer position of a Image
 535  *
 536  * @param[in] img Pointer to the starting position of the buffer
 537  * @param[in] x   Relative X position
 538  * @param[in] y   Relative Y position
 539  */
 540 inline __global uchar *offset(const Image *img, int x, int y)
 541 {
 542   return img->ptr + x * img->stride_x + y * img->stride_y;
 543 }
 544
 545 /** Get the pointer position of a Tensor3D
 546  *
 547  * @param[in] tensor Pointer to the starting position of the buffer
 548  * @param[in] x      Relative X position
 549  * @param[in] y      Relative Y position
 550  * @param[in] z      Relative Z position
 551  */
 552 inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
 553 {
 554   return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
 555 }
 556
 557 /** Get the pointer position of a Tensor4D
 558  *
 559  * @param[in] tensor Pointer to the starting position of the buffer
 560  * @param[in] x      Relative X position
 561  * @param[in] y      Relative Y position
 562  * @param[in] z      Relative Z position
 563  * @param[in] w      Relative W position
 564  */
 565 inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
 566 {
 567   return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
 568          w * tensor->stride_w;
 569 }
 570
 571 #endif // _HELPER_H