compute/ARMComputeEx/src/core/CL/cl_kernels/arg_min_max_ex.cl

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * Copyright (c) 2019-2020 ARM Limited.
  19  *
  20  * SPDX-License-Identifier: MIT
  21  *
  22  * Permission is hereby granted, free of charge, to any person obtaining a copy
  23  * of this software and associated documentation files (the "Software"), to
  24  * deal in the Software without restriction, including without limitation the
  25  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  26  * sell copies of the Software, and to permit persons to whom the Software is
  27  * furnished to do so, subject to the following conditions:
  28  *
  29  * The above copyright notice and this permission notice shall be included in all
  30  * copies or substantial portions of the Software.
  31  *
  32  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  33  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  34  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  35  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  36  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  37  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  38  * SOFTWARE.
  39  */
  40 #include "helpers.h"
  41
  42 #if defined(FLOAT_DATA_TYPE)
  43 #define ISGREATER(x, y) isgreater(x, y)
  44 #define ISLESS(x, y) isless(x, y)
  45 #else // !FLOAT_DATA_TYPE
  46 #if defined(WIDTH)
  47 #define ISGREATER(x, y) (x > y) ? 1 : 0
  48 #define ISLESS(x, y) (x < y) ? 1 : 0
  49 #else // !defined(WIDTH)
  50 #define ISGREATER(x, y) \
  51   select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x > y)
  52 #define ISLESS(x, y) \
  53   select((VEC_DATA_TYPE(DATA_TYPE_SELECT, 16))0, (VEC_DATA_TYPE(DATA_TYPE_SELECT, 16)) - 1, x < y)
  54 #endif // defined(WIDTH)
  55 #endif // defined(FLOAT_DATA_TYPE)
  56
  57 #if defined(ARG_MAX)
  58 #define CONDITION_TO_USE(x, y) ISGREATER(x, y)
  59 #elif defined(ARG_MIN)
  60 #define CONDITION_TO_USE(x, y) ISLESS(x, y)
  61 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
  62 #error "Unsupported reduction operation!"
  63 #endif // defined(ARG_MAX)
  64
  65 #if defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT)
  66 #if defined(WIDTH)
  67 #if defined(ARG_MIN)
  68 #if defined(PREV_OUTPUT)
  69 /** Find index minimum value of a vector
  70  *
  71  * @param[in] input Pointer to the first value.
  72  *
  73  * @return index of the vector.
  74  */
  75 inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input,
  76                                              __global const DATA_TYPE_OUTPUT *prev_res,
  77                                              const int x_idx)
  78 {
  79   int end_elem = (x_idx + 1) * 16;
  80   if (end_elem > WIDTH)
  81   {
  82     end_elem = WIDTH - x_idx * 16;
  83   }
  84   DATA_TYPE_OUTPUT res = prev_res[0];
  85   for (int x_v = 1; x_v < end_elem; ++x_v)
  86   {
  87     res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
  88   }
  89   return res;
  90 }
  91 #else // !defined(PREV_OUTPUT)
  92 /** Find index minimum value of a vector
  93  *
  94  * @param[in] input Pointer to the first value.
  95  *
  96  * @return index of the vector.
  97  */
  98 inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
  99 {
 100 #if WIDTH < 16
 101   DATA_TYPE_OUTPUT res = 0;
 102   for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
 103   {
 104     res = select(res, x_v, *(input + x_v) < *(input + res));
 105   }
 106   return res;
 107 #else  // WIDTH >= 16
 108   int x_elem = x_idx * 16;
 109   const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
 110   x_elem -= x_goback;
 111
 112   VEC_DATA_TYPE(DATA_TYPE, 16)
 113   in = vload16(0, input - x_goback);
 114   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 115   res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 116
 117   VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
 118   idx_sel = (in.s01234567 <= in.s89abcdef);
 119   in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
 120   res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
 121
 122   idx_sel.s0123 =
 123     (in.s0123 < in.s4567) ||
 124     (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
 125   in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
 126   res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
 127
 128   idx_sel.s01 =
 129     (in.s01 < in.s23) ||
 130     (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
 131   in.s01 = select(in.s23, in.s01, idx_sel.s01);
 132   res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
 133
 134   idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
 135   res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
 136
 137   return res.s0 + x_elem;
 138 #endif // WIDTH < 16
 139 }
 140 #endif // defined(PREV_OUTPUT)
 141 #endif // defined(ARG_MIN)
 142 #if defined(ARG_MAX)
 143 #if defined(PREV_OUTPUT)
 144 /** Find index maximum value of a vector
 145  *
 146  * @param[in] input Pointer to the first value.
 147  *
 148  * @return index of the vector.
 149  */
 150 inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input,
 151                                              __global const DATA_TYPE_OUTPUT *prev_res,
 152                                              const int x_idx)
 153 {
 154   int end_elem = (x_idx + 1) * 16;
 155   if (end_elem > WIDTH)
 156   {
 157     end_elem = WIDTH - x_idx * 16;
 158   }
 159   DATA_TYPE_OUTPUT res = prev_res[0];
 160   unsigned int res_int = res;
 161   DATA_TYPE_OUTPUT condition_check2;
 162   for (int x_v = 1; x_v < end_elem; ++x_v)
 163   {
 164     int i1 = prev_res[x_v];
 165     condition_check2 = *(input + i1) > *(input + res_int);
 166     res = select(res, prev_res[x_v], condition_check2);
 167   }
 168   return res;
 169 }
 170 #else // !defined(PREV_OUTPUT)
 171 /** Find index maximum value of a vector
 172  *
 173  * @param[in] input Pointer to the first value.
 174  *
 175  * @return index of the vector.
 176  */
 177 inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx)
 178 {
 179 #if WIDTH < 16
 180   DATA_TYPE_OUTPUT res = 0;
 181   unsigned int i1;
 182   unsigned int i2;
 183   DATA_TYPE_OUTPUT condition_check;
 184   for (DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
 185   {
 186     i1 = x_v;
 187     i2 = res;
 188     condition_check = *(input + i1) > *(input + i2);
 189     res = select(res, x_v, condition_check);
 190   }
 191   return res;
 192 #else  // WIDTH >= 16
 193   int x_elem = x_idx * 16;
 194   const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
 195   x_elem -= x_goback;
 196
 197   VEC_DATA_TYPE(DATA_TYPE, 16)
 198   in = vload16(0, input - x_goback);
 199   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 200   res = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 201
 202   VEC_DATA_TYPE(DATA_TYPE_SELECT, 8)
 203   idx_sel = (in.s01234567 >= in.s89abcdef);
 204   in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel);
 205   res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
 206
 207   idx_sel.s0123 =
 208     (in.s0123 > in.s4567) ||
 209     (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), VEC_DATA_TYPE(DATA_TYPE_SELECT, 4)));
 210   in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123);
 211   res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
 212
 213   idx_sel.s01 =
 214     (in.s01 > in.s23) ||
 215     (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), VEC_DATA_TYPE(DATA_TYPE_SELECT, 2)));
 216   in.s01 = select(in.s23, in.s01, idx_sel.s01);
 217   res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
 218
 219   idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), DATA_TYPE_SELECT));
 220   res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
 221
 222   return res.s0 + x_elem;
 223 #endif // WIDTH < 16
 224 }
 225 #endif // defined(PREV_OUTPUT)
 226 #endif // defined(ARG_MAX)
 227
 228 /** This kernel performs parallel reduction given an operation on x-axis.
 229  *
 230  * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed
 231  * using -DPREV_OUTPUT
 232  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 233  * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
 234  * -DDATA_TYPE_OUTPUT=uint
 235  * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the
 236  * ArgMax
 237  * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the
 238  * ArgMin
 239  *
 240  * @param[in] src_ptr                                   Pointer to the source tensor. Supported data
 241  * types: S32/F16/F32
 242  * @param[in] src_stride_x                              Stride of the source tensor in X dimension
 243  * (in bytes)
 244  * @param[in] src_step_x                                src_stride_x * number of elements along X
 245  * processed per workitem(in bytes)
 246  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension
 247  * (in bytes)
 248  * @param[in] src_step_y                                src_stride_y * number of elements along Y
 249  * processed per workitem(in bytes)
 250  * @param[in] src_offset_first_element_in_bytes         The offset of the first element in the
 251  * source tensor
 252  * @param[in] prev_res_ptr                              (Optional) Pointer to previous results
 253  * tensor. Supported data types: U32/S32
 254  * @param[in] prev_res_stride_x                         (Optional) Stride of the output tensor in X
 255  * dimension (in bytes)
 256  * @param[in] prev_res_step_x                           (Optional) prev_res_stride_x * number of
 257  * elements along X processed per workitem(in bytes)
 258  * @param[in] prev_res_stride_y                         (Optional) Stride of the output tensor in Y
 259  * dimension (in bytes)
 260  * @param[in] prev_res_step_y                           (Optional) prev_res_stride_y * number of
 261  * elements along Y processed per workitem(in bytes)
 262  * @param[in] prev_res_offset_first_element_in_bytes    (Optional) The offset of the first element
 263  * in the previous results tensor
 264  * @param[in] partial_res_ptr                           The local buffer to hold partial result
 265  * values. Supported data types: U32/S32
 266  * @param[in] partial_res_stride_x                      Stride of the output tensor in X dimension
 267  * (in bytes)
 268  * @param[in] partial_res_step_x                        partial_res_stride_x * number of elements
 269  * along X processed per workitem(in bytes)
 270  * @param[in] partial_res_stride_y                      Stride of the output tensor in Y dimension
 271  * (in bytes)
 272  * @param[in] partial_res_step_y                        partial_res_stride_y * number of elements
 273  * along Y processed per workitem(in bytes)
 274  * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the
 275  * source tensor
 276  * @param[in] local_results                             Local buffer for storing the partial result
 277  */
 278 __kernel void arg_min_max_ex_x(IMAGE_DECLARATION(src),
 279 #if defined(PREV_OUTPUT)
 280                                IMAGE_DECLARATION(prev_res),
 281 #endif // defined(PREV_OUTPUT)
 282                                IMAGE_DECLARATION(partial_res),
 283                                __local DATA_TYPE_OUTPUT *local_results)
 284 {
 285 #if defined(PREV_OUTPUT)
 286   Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src);
 287   Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res);
 288 #else  // !defined(PREV_OUTPUT)
 289   Image src = CONVERT_TO_IMAGE_STRUCT(src);
 290 #endif // defined(PREV_OUTPUT)
 291   Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
 292
 293   unsigned int lsize = get_local_size(0);
 294   unsigned int lid = get_local_id(0);
 295
 296   const uint x_idx = get_global_id(0);
 297   const uint y_idx = get_global_id(1);
 298   const __global DATA_TYPE *src_in_row =
 299     (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
 300
 301   for (unsigned int y = 0; y < get_local_size(1); ++y)
 302   {
 303 #if defined(ARG_MAX)
 304 #if defined(PREV_OUTPUT)
 305     local_results[lid] =
 306       arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
 307 #else  // !defined(PREV_OUTPUT)
 308     local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
 309 #endif // defined(PREV_OUTPUT)
 310 #else  // defined(ARG_MIN)
 311 #if defined(PREV_OUTPUT)
 312     local_results[lid] =
 313       arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx);
 314 #else  // !defined(PREV_OUTPUT)
 315     local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx);
 316 #endif // defined(PREV_OUTPUT)
 317 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 318
 319     barrier(CLK_LOCAL_MEM_FENCE);
 320
 321     // Looking for the next highest power of 2 (maximum value of lsize is 8)
 322     unsigned int middle = lsize - 1;
 323     middle |= middle >> 1;
 324     middle |= middle >> 2;
 325     middle += 1;
 326     // Perform parallel reduction
 327     DATA_TYPE_OUTPUT condition_check3;
 328     for (unsigned int i = middle; i > 0; i >>= 1)
 329     {
 330       if (lid < i && lid + i < lsize)
 331       {
 332         DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
 333         DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
 334 #if defined(ARG_MAX)
 335         condition_check3 =
 336           ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1);
 337         local_results[lid] = select(local_results[lid], local_results[lid + i], condition_check3);
 338 #else  // defined(ARG_MIN)
 339         local_results[lid] = select(
 340           local_results[lid], local_results[lid + i],
 341           ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
 342 #endif // defined(ARG_MAX) || defined(ARG_MIN)
 343       }
 344       barrier(CLK_LOCAL_MEM_FENCE);
 345     }
 346
 347     if (lid == 0)
 348     {
 349       ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
 350     }
 351   }
 352 }
 353 #endif // defined(WIDTH)
 354
 355 #if defined(HEIGHT)
 356 /** This kernel performs reduction on y-axis.
 357  *
 358  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g.
 359  * -DDATA_TYPE=float
 360  * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g.
 361  * -DDATA_TYPE_OUTPUT=uint
 362  * @note The data type of the select results must be passed at compile time using
 363  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 364  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
 365  *
 366  * @param[in] src_ptr                              Pointer to the source tensor. Supported data
 367  * types: S32/F16/F32
 368  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in
 369  * bytes)
 370  * @param[in] src_step_x                           src_stride_x * number of elements along X
 371  * processed per workitem(in bytes)
 372  * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in
 373  * bytes)
 374  * @param[in] src_step_y                           src_stride_y * number of elements along Y
 375  * processed per workitem(in bytes)
 376  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source
 377  * tensor
 378  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 379  * data types: U32/S32
 380  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 381  * bytes)
 382  * @param[in] output_step_x                        output_stride_x * number of elements along X
 383  * processed per workitem(in bytes)
 384  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 385  * bytes)
 386  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 387  * processed per workitem(in bytes)
 388  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 389  * tensor
 390  */
 391 __kernel void arg_min_max_ex_y(IMAGE_DECLARATION(src), IMAGE_DECLARATION(output))
 392 {
 393   Image src = CONVERT_TO_IMAGE_STRUCT(src);
 394   Image output = CONVERT_TO_IMAGE_STRUCT(output);
 395
 396   VEC_DATA_TYPE(DATA_TYPE, 16)
 397   res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE, 16));
 398
 399   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 400   indx = 0;
 401   for (unsigned int y = 1; y < HEIGHT; ++y)
 402   {
 403     VEC_DATA_TYPE(DATA_TYPE, 16)
 404     in =
 405       CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE, 16));
 406
 407     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 408     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 409     indx = select(indx, y, cond_conv);
 410     res = select(res, in, CONDITION_TO_USE(in, res));
 411   }
 412
 413   // Store result
 414   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 415 }
 416 #endif // defined(HEIGHT)
 417
 418 #if defined(DEPTH)
 419 /** This kernel performs reduction on z-axis.
 420  *
 421  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 422  * @note The data type of the select results must be passed at compile time using
 423  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 424  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 425  *
 426  * @param[in] input_ptr                            Pointer to the source tensor. Supported data
 427  * types: S32/F16/F32
 428  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
 429  * bytes)
 430  * @param[in] input_step_x                         input_stride_x * number of elements along X
 431  * processed per workitem(in bytes)
 432  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
 433  * bytes)
 434  * @param[in] input_step_y                         input_stride_y * number of elements along Y
 435  * processed per workitem(in bytes)
 436  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
 437  * bytes)
 438  * @param[in] input_step_z                         input_stride_z * number of elements along Z
 439  * processed per workitem(in bytes)
 440  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
 441  * tensor
 442  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 443  * data types: U32/S32
 444  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 445  * bytes)
 446  * @param[in] output_step_x                        output_stride_x * number of elements along X
 447  * processed per workitem(in bytes)
 448  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 449  * bytes)
 450  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 451  * processed per workitem(in bytes)
 452  * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
 453  * bytes)
 454  * @param[in] output_step_z                        output_stride_z * number of elements along Z
 455  * processed per workitem(in bytes)
 456  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 457  * tensor
 458  */
 459 __kernel void arg_min_max_ex_z(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
 460 {
 461   Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
 462   Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
 463
 464   VEC_DATA_TYPE(DATA_TYPE, 16)
 465   res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)),
 466                 VEC_DATA_TYPE(DATA_TYPE, 16));
 467
 468   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 469   indx = 0;
 470   for (DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
 471   {
 472     VEC_DATA_TYPE(DATA_TYPE, 16)
 473     in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)),
 474                  VEC_DATA_TYPE(DATA_TYPE, 16));
 475
 476     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 477     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 478     indx = select(indx, z, cond_conv);
 479     res = select(res, in, CONDITION_TO_USE(in, res));
 480   }
 481
 482   // Store result
 483   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 484 }
 485 #endif /* defined(DEPTH) */
 486
 487 #if defined(BATCH) && defined(DEPTH)
 488 /** This kernel performs reduction on w-axis.
 489  *
 490  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 491  * @note The data type of the select results must be passed at compile time using
 492  * -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
 493  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
 494  * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
 495  *
 496  * @param[in] input_ptr                            Pointer to the source tensor. Supported data
 497  * types: S32/F16/F32
 498  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in
 499  * bytes)
 500  * @param[in] input_step_x                         input_stride_x * number of elements along X
 501  * processed per workitem(in bytes)
 502  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in
 503  * bytes)
 504  * @param[in] input_step_y                         input_stride_y * number of elements along Y
 505  * processed per workitem(in bytes)
 506  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in
 507  * bytes)
 508  * @param[in] input_step_z                         input_stride_z * number of elements along Z
 509  * processed per workitem(in bytes)
 510  * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in
 511  * bytes)
 512  * @param[in] input_step_w                         input_stride_w * number of elements along W
 513  * processed per workitem(in bytes)
 514  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source
 515  * tensor
 516  * @param[in] output_ptr                           The local buffer to hold sumed values. Supported
 517  * data types: U32/S32
 518  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in
 519  * bytes)
 520  * @param[in] output_step_x                        output_stride_x * number of elements along X
 521  * processed per workitem(in bytes)
 522  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in
 523  * bytes)
 524  * @param[in] output_step_y                        output_stride_y * number of elements along Y
 525  * processed per workitem(in bytes)
 526  * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in
 527  * bytes)
 528  * @param[in] output_step_z                        output_stride_z * number of elements along Z
 529  * processed per workitem(in bytes)
 530  * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in
 531  * bytes)
 532  * @param[in] output_step_w                        output_stride_w * number of elements along W
 533  * processed per workitem(in bytes)
 534  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source
 535  * tensor
 536  */
 537 __kernel void arg_min_max_ex_w(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
 538 {
 539   Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
 540   Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
 541
 542   VEC_DATA_TYPE(DATA_TYPE, 16)
 543   res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)),
 544                 VEC_DATA_TYPE(DATA_TYPE, 16));
 545
 546   VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 547   indx = 0;
 548   for (DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
 549   {
 550     VEC_DATA_TYPE(DATA_TYPE, 16)
 551     in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)),
 552                  VEC_DATA_TYPE(DATA_TYPE, 16));
 553
 554     VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
 555     cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16));
 556     indx = select(indx, w, cond_conv);
 557     res = select(res, in, CONDITION_TO_USE(in, res));
 558   }
 559
 560   // Store result
 561   vstore16(indx, 0, (__global DATA_TYPE_OUTPUT *)output.ptr);
 562 }
 563 #endif /* defined(BATCH) && defined(DEPTH) */
 564 #endif /* defined(DATA_TYPE_OUTPUT) && defined(DATA_TYPE_SELECT) */