compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Copyright (c) 2019-2020 ARM Limited.
  19  *
  20  * SPDX-License-Identifier: MIT
  21  *
  22  * Permission is hereby granted, free of charge, to any person obtaining a copy
  23  * of this software and associated documentation files (the "Software"), to
  24  * deal in the Software without restriction, including without limitation the
  25  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26  * sell copies of the Software, and to permit persons to whom the Software is
  27  * furnished to do so, subject to the following conditions:
  28  *
  29  * The above copyright notice and this permission notice shall be included in all
  30  * copies or substantial portions of the Software.
  31  *
  32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  37  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  38  * SOFTWARE.
  39  */
  40 #include "helpers.h"
  41
  42 #if defined(FLOAT_DATA_TYPE)
  43 #define ISGREATER(x, y) isgreater(x, y)
  44 #define ISLESS(x, y) isless(x, y)
  45 #else // !FLOAT_DATA_TYPE
  46 #if defined(WIDTH)
  47 #define ISGREATER(x, y) (x > y) ? 1 : 0
  48 #define ISLESS(x, y) (x < y) ? 1 : 0
  49 #else // !defined(WIDTH)
  50 #define ISGREATER(x, y) \
  51   select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
  52 #define ISLESS(x, y) \
  53   select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
  54 #endif // defined(WIDTH)
  55 #endif // defined(FLOAT_DATA_TYPE)
  56
  57 #if defined(ARG_MAX)
  58 #define CONDITION_TO_USE(x, y) ISGREATER(x, y)
  59 #elif defined(ARG_MIN)
  60 #define CONDITION_TO_USE(x, y) ISLESS(x, y)
  61 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
  62 #error "Unsupported reduction operation!"
  63 #endif // defined(ARG_MAX)
  64
  65 #if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
  66 #if defined(WIDTH)
  67 #if defined(ARG_MIN)
  68 #if defined(PREV_OUTPUT)
  69 /** Find index minimum value of a vector
  70  *
  71  * @param[in] input Pointer to the first value.
  72  *
  73  * @return index of the vector.
  74  */
  75 inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
  76                                              __global const DATA_TYPE_OUTPUT *prev_res,
  77                                              const int x_idx)
  78 {
  79   int end_elem = (x_idx + 1) * 16;
  80   if (end_elem > WIDTH)
  81   {
  82     end_elem = WIDTH - x_idx * 16;
  83   }
  84   DATA_TYPE_OUTPUT res = prev_res[0];
  85   for (int x_v = 1; x_v < end_elem; ++x_v)
  86   {
  87     res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
  88   }
  89   return res;
  90 }
  91 #else // !defined(PREV_OUTPUT)
  92 /** Find index minimum value of a vector
  93  *
  94  * @param[in] input Pointer to the first value.
  95  *
  96  * @return index of the vector.
  97  */
  98 inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
  99 {
 100 #if WIDTH < 16
 101   DATA_TYPE_OUTPUT res = 0;
 102   for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
 103   {
 104     res = select(res, x_v, *(input + x_v) < *(input + res));
 105   }
 106   return res;
 107 #else  // WIDTH >= 16
 108   int x_elem = x_idx * 16;
 109   const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
 110   x_elem -= x_goback;
 111
 112   VEC_DATA_TYPE(DATA_TYPE, 16)
 113   in = vload16(0, input - x_goback);
 114   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 115   res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 116
 117   VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
 118   idx_sel = (in.s01234567 <= in.s89abcdef);
 119   in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
 120   res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
 121
 122   idx_sel.s0123 = (in.s0123 < in.s4567) ||
 123                   (in.s0123 == in.s4567 &&
 124                    CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
 125   in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
 126   res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
 127
 128   idx_sel.s01 =
 129       (in.s01 < in.s23) ||
 130       (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
 131   in.s01 = select(in.s23, in.s01, idx_sel.s01);
 132   res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
 133
 134   idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
 135   res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
 136
 137   return res.s0 + x_elem;
 138 #endif // WIDTH < 16
 139 }
 140 #endif // defined(PREV_OUTPUT)
 141 #endif // defined(ARG_MIN)
 142 #if defined(ARG_MAX)
 143 #if defined(PREV_OUTPUT)
 144 /** Find index maximum value of a vector
 145  *
 146  * @param[in] input Pointer to the first value.
 147  *
 148  * @return index of the vector.
 149  */
 150 inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
 151                                              __global const DATA_TYPE_OUTPUT *prev_res,
 152                                              const int x_idx)
 153 {
 154   int end_elem = (x_idx + 1) * 16;
 155   if (end_elem > WIDTH)
 156   {
 157     end_elem = WIDTH - x_idx * 16;
 158   }
 159   DATA_TYPE_OUTPUT res = prev_res[0];
 160   unsigned int res_int = res;
 161   DATA_TYPE_OUTPUT condition_check2;
 162   for (int x_v = 1; x_v < end_elem; ++x_v)
 163   {
 164     int i1 = prev_res[x_v];
 165     condition_check2 = *(input + i1) > *(input + res_int);
 166     res = select(res, prev_res[x_v], condition_check2);
 167   }
 168   return res;
 169 }
 170 #else // !defined(PREV_OUTPUT)
 171 /** Find index maximum value of a vector
 172  *
 173  * @param[in] input Pointer to the first value.
 174  *
 175  * @return index of the vector.
 176  */
 177 inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
 178 {
 179 #if WIDTH < 16
 180   DATA_TYPE_OUTPUT res = 0;
 181   unsigned int i1;
 182   unsigned int i2;
 183   DATA_TYPE_OUTPUT condition_check;
 184   for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
 185   {
 186     i1 = x_v;
 187     i2 = res;
 188     condition_check = *(input + i1) > *(input + i2);
 189     res = select(res, x_v, condition_check);
 190   }
 191   return res;
 192 #else  // WIDTH >= 16
 193   int x_elem = x_idx * 16;
 194   const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
 195   x_elem -= x_goback;
 196
 197   VEC_DATA_TYPE(DATA_TYPE, 16)
 198   in = vload16(0, input - x_goback);
 199   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 200   res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 201
 202   VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
 203   idx_sel = (in.s01234567 >= in.s89abcdef);
 204   in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
 205   res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
 206
 207   idx_sel.s0123 = (in.s0123 > in.s4567) ||
 208                   (in.s0123 == in.s4567 &&
 209                    CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
 210   in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
 211   res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
 212
 213   idx_sel.s01 =
 214       (in.s01 > in.s23) ||
 215       (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
 216   in.s01 = select(in.s23, in.s01, idx_sel.s01);
 217   res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
 218
 219   idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
 220   res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
 221
 222   return res.s0 + x_elem;
 223 #endif // WIDTH < 16
 224 }
 225 #endif // defined(PREV_OUTPUT)
 226 #endif // defined(ARG_MAX)
 227
 228 /** This kernel performs parallel reduction given an operation on x-axis.
 229  *
 230  * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
 231  * using -DPREV_OUTPUT
 232  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 233  * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
 234  * -DDATA_TYPE_OUTPUT=uint
 235  * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
 236  * ArgMax
 237  * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
 238  * ArgMin
 239  *
 240  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data
 241  * types: S32/F16/F32
 242  * @param[in] src_stride_x                              Stride of the source tensor in X dimension
 243  * (in bytes)
 244  * @param[in] src_step_x                                src_stride_x * number of elements along X
 245  * processed per workitem(in bytes)
 246  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension
 247  * (in bytes)
 248  * @param[in] src_step_y                                src_stride_y * number of elements along Y
 249  * processed per workitem(in bytes)
 250  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the
 251  * source tensor
 252  * @param[in] prev_res_ptr                              (Optional) Pointer to previous results
 253  * tensor. Supported data types: U32/S32
 254  * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X
 255  * dimension (in bytes)
 256  * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of
 257  * elements along X processed per workitem(in bytes)
 258  * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y
 259  * dimension (in bytes)
 260  * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of
 261  * elements along Y processed per workitem(in bytes)
 262  * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element
 263  * in the previous results tensor
 264  * @param[in] partial_res_ptr                           The local buffer to hold partial result
 265  * values. Supported data types: U32/S32
 266  * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension
 267  * (in bytes)
 268  * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements
 269  * along X processed per workitem(in bytes)
 270  * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension
 271  * (in bytes)
 272  * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements
 273  * along Y processed per workitem(in bytes)
 274  * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
 275  * source tensor
 276  * @param[in] local_results                             Local buffer for storing the partial result
 277  */
 278 __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
 279 #if defined(PREV_OUTPUT)
 280                                IMAGE_DECLARATION(prev_res),
 281 #endif // defined(PREV_OUTPUT)
 282                                IMAGE_DECLARATION(partial_res),
 283                                __local DATA_TYPE_OUTPUT *local_results)
 284 {
 285 #if defined(PREV_OUTPUT)
 286   Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
 287   Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
 288 #else  // !defined(PREV_OUTPUT)
 289   Image src = CONVERT_TO_IMAGE_STRUCT(src);
 290 #endif // defined(PREV_OUTPUT)
 291   Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
 292
 293   unsigned int lsize = get_local_size(0);
 294   unsigned int lid = get_local_id(0);
 295
 296   const uint x_idx = get_global_id(0);
 297   const uint y_idx = get_global_id(1);
 298   const __global DATA_TYPE *src_in_row =
 299       (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes +
 300                                    y_idx * src_step_y);
 301
 302   for (unsigned int y = 0; y < get_local_size(1); ++y)
 303   {
 304 #if defined(ARG_MAX)
 305 #if defined(PREV_OUTPUT)
 306     local_results[lid] = arg_idx_max_prev_out(
 307         src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
 308 #else  // !defined(PREV_OUTPUT)
 309     local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
 310 #endif // defined(PREV_OUTPUT)
 311 #else  // defined(ARG_MIN)
 312 #if defined(PREV_OUTPUT)
 313     local_results[lid] = arg_idx_min_prev_out(
 314         src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
 315 #else  // !defined(PREV_OUTPUT)
 316     local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
 317 #endif // defined(PREV_OUTPUT)
 318 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 319
 320     barrier(CLK_LOCAL_MEM_FENCE);
 321
 322     // Looking for the next highest power of 2 (maximum value of lsize is 8)
 323     unsigned int middle = lsize - 1;
 324     middle |= middle >> 1;
 325     middle |= middle >> 2;
 326     middle += 1;
 327     // Perform parallel reduction
 328     DATA_TYPE_OUTPUT condition_check3;
 329     for (unsigned int i = middle; i > 0; i >>= 1)
 330     {
 331       if (lid < i && lid + i < lsize)
 332       {
 333         DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
 334         DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
 335 #if defined(ARG_MAX)
 336         condition_check3 =
 337             ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
 338         local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
 339 #else  // defined(ARG_MIN)
 340         local_results[lid] = select(
 341             local_results[lid], local_results[lid + i],
 342             ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
 343 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 344       }
 345       barrier(CLK_LOCAL_MEM_FENCE);
 346     }
 347
 348     if (lid == 0)
 349     {
 350       ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
 351     }
 352   }
 353 }
 354 #endif // defined(WIDTH)
 355
 356 #if defined(HEIGHT)
 357 /** This kernel performs reduction on y-axis.
 358  *
 359  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
 360  * -DDATA_TYPE=float
 361  * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
 362  * -DDATA_TYPE_OUTPUT=uint
 363  * @note The data type of the select results must be passed at compile time using
 364  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 365  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
 366  *
 367  * @param[in] src_ptr                              Pointer to the source tensor. Supported data
 368  * types: S32/F16/F32
 369  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in
 370  * bytes)
 371  * @param[in] src_step_x                           src_stride_x * number of elements along X
 372  * processed per workitem(in bytes)
 373  * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in
 374  * bytes)
 375  * @param[in] src_step_y                           src_stride_y * number of elements along Y
 376  * processed per workitem(in bytes)
 377  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source
 378  * tensor
 379  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 380  * data types: U32/S32
 381  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 382  * bytes)
 383  * @param[in] output_step_x                        output_stride_x * number of elements along X
 384  * processed per workitem(in bytes)
 385  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 386  * bytes)
 387  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 388  * processed per workitem(in bytes)
 389  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 390  * tensor
 391  */
 392 __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
 393 {
 394   Image src = CONVERT_TO_IMAGE_STRUCT(src);
 395   Image output = CONVERT_TO_IMAGE_STRUCT(output);
 396
 397   VEC_DATA_TYPE(DATA_TYPE, 16)
 398   res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
 399
 400   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 401   indx = 0;
 402   for (unsigned int y = 1; y < HEIGHT; ++y)
 403   {
 404     VEC_DATA_TYPE(DATA_TYPE, 16)
 405     in =
 406         CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
 407
 408     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 409     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 410     indx = select(indx, y, cond_conv);
 411     res = select(res, in, CONDITION_TO_USE(in, res));
 412   }
 413
 414   // Store result
 415   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 416 }
 417 #endif // defined(HEIGHT)
 418
 419 #if defined(DEPTH)
 420 /** This kernel performs reduction on z-axis.
 421  *
 422  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 423  * @note The data type of the select results must be passed at compile time using
 424  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 425  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 426  *
 427  * @param[in] input_ptr                            Pointer to the source tensor. Supported data
 428  * types: S32/F16/F32
 429  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
 430  * bytes)
 431  * @param[in] input_step_x                         input_stride_x * number of elements along X
 432  * processed per workitem(in bytes)
 433  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
 434  * bytes)
 435  * @param[in] input_step_y                         input_stride_y * number of elements along Y
 436  * processed per workitem(in bytes)
 437  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
 438  * bytes)
 439  * @param[in] input_step_z                         input_stride_z * number of elements along Z
 440  * processed per workitem(in bytes)
 441  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
 442  * tensor
 443  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 444  * data types: U32/S32
 445  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 446  * bytes)
 447  * @param[in] output_step_x                        output_stride_x * number of elements along X
 448  * processed per workitem(in bytes)
 449  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 450  * bytes)
 451  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 452  * processed per workitem(in bytes)
 453  * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
 454  * bytes)
 455  * @param[in] output_step_z                        output_stride_z * number of elements along Z
 456  * processed per workitem(in bytes)
 457  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 458  * tensor
 459  */
 460 __kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 461 {
 462   Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
 463   Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 464
 465   VEC_DATA_TYPE(DATA_TYPE, 16)
 466   res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
 467                 VEC_DATA_TYPE(DATA_TYPE, 16));
 468
 469   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 470   indx = 0;
 471   for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
 472   {
 473     VEC_DATA_TYPE(DATA_TYPE, 16)
 474     in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
 475                  VEC_DATA_TYPE(DATA_TYPE, 16));
 476
 477     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 478     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 479     indx = select(indx, z, cond_conv);
 480     res = select(res, in, CONDITION_TO_USE(in, res));
 481   }
 482
 483   // Store result
 484   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 485 }
 486 #endif /* defined(DEPTH) */
 487
 488 #if defined(BATCH) && defined(DEPTH)
 489 /** This kernel performs reduction on w-axis.
 490  *
 491  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 492  * @note The data type of the select results must be passed at compile time using
 493  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 494  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
 495  * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
 496  *
 497  * @param[in] input_ptr                            Pointer to the source tensor. Supported data
 498  * types: S32/F16/F32
 499  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
 500  * bytes)
 501  * @param[in] input_step_x                         input_stride_x * number of elements along X
 502  * processed per workitem(in bytes)
 503  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
 504  * bytes)
 505  * @param[in] input_step_y                         input_stride_y * number of elements along Y
 506  * processed per workitem(in bytes)
 507  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
 508  * bytes)
 509  * @param[in] input_step_z                         input_stride_z * number of elements along Z
 510  * processed per workitem(in bytes)
 511  * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in
 512  * bytes)
 513  * @param[in] input_step_w                         input_stride_w * number of elements along W
 514  * processed per workitem(in bytes)
 515  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
 516  * tensor
 517  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 518  * data types: U32/S32
 519  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 520  * bytes)
 521  * @param[in] output_step_x                        output_stride_x * number of elements along X
 522  * processed per workitem(in bytes)
 523  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 524  * bytes)
 525  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 526  * processed per workitem(in bytes)
 527  * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
 528  * bytes)
 529  * @param[in] output_step_z                        output_stride_z * number of elements along Z
 530  * processed per workitem(in bytes)
 531  * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in
 532  * bytes)
 533  * @param[in] output_step_w                        output_stride_w * number of elements along W
 534  * processed per workitem(in bytes)
 535  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 536  * tensor
 537  */
 538 __kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
 539 {
 540   Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
 541   Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
 542
 543   VEC_DATA_TYPE(DATA_TYPE, 16)
 544   res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
 545                 VEC_DATA_TYPE(DATA_TYPE, 16));
 546
 547   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 548   indx = 0;
 549   for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
 550   {
 551     VEC_DATA_TYPE(DATA_TYPE, 16)
 552     in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
 553                  VEC_DATA_TYPE(DATA_TYPE, 16));
 554
 555     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 556     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 557     indx = select(indx, w, cond_conv);
 558     res = select(res, in, CONDITION_TO_USE(in, res));
 559   }
 560
 561   // Store result
 562   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 563 }
 564 #endif /* defined(BATCH) && defined(DEPTH) */
 565 #endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */