src/core/CL/cl_kernels/softmax_layer.cl

   1 /*
   2  * Copyright (c) 2017-2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 #ifdef FIXED_POINT_POSITION
  27
  28 #include "fixed_point.h"
  29 #define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
  30 #define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
  31 #define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
  32 #define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
  33 #define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
  34 #define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
  35
  36 #define MIN_VAL_EXPAND(type) type##_MIN
  37 #define MIN_VAL(type) MIN_VAL_EXPAND(type)
  38 #define MINVAL MIN_VAL(DATA_TYPE)
  39 #define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
  40
  41 #else /* FIXED_POINT_POSITION */
  42
  43 #define MAX_OP(x, y, type, size) max((x), (y))
  44 #define ADD_OP(x, y, type, size) ((x) + (y))
  45 #define SUB_OP(x, y, type, size) ((x) - (y))
  46 #define MUL_OP(x, y, type, size) ((x) * (y))
  47 #define DIV_OP(x, y, type, size) ((x) / (y))
  48 #define EXP_OP(x, type, size) exp((x))
  49
  50 #ifdef USE_F16
  51 #define MINVAL -HALF_MAX
  52 #define SELECT_DATA_TYPE short
  53 #else /* USE_F16 */
  54 #define MINVAL -FLT_MAX
  55 #define SELECT_DATA_TYPE int
  56 #endif /* USE_F16 */
  57
  58 #endif /* FIXED_POINT_POSITION */
  59
  60 /* Number of workitems in dimension 0. */
  61 #if !defined(GRID_SIZE)
  62 #define GRID_SIZE 1
  63 #endif /* !defined(GRID_SIZE) */
  64
  65 /* Vector size, i.e. number of vector elements. */
  66 #if VECTOR_SIZE == 2
  67 __constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
  68 __constant uint2 idx__ = (uint2)(0, 1);
  69
  70 #elif VECTOR_SIZE == 4
  71 __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
  72 __constant uint4 idx__ = (uint4)(0, 1, 2, 3);
  73
  74 #elif VECTOR_SIZE == 8
  75 __constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
  76 __constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
  77
  78 #else /* VECTOR_SIZE DEFAULT */
  79 #define VECTOR_SIZE 16
  80 #define LOG_VECTOR_SIZE 4
  81 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
  82 __constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  83
  84 #endif /* VECTOR_SIZE END */
  85
  86 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
  87 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  88 __constant uint4 idx4   = (uint4)(0, 1, 2, 3);
  89
  90 /** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
  91  *
  92  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  93  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  94  *
  95  * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  96  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  97  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  98  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  99  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 100  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 101  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 102  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 103  * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 104  * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 105  * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 106  * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 107  * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 108  * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 109  * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
 110  * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 111  * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 112  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 113  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 114  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 115  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 116  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 117  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 118  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 119  */
 120 __kernel void softmax_layer_norm(
 121     TENSOR3D_DECLARATION(src),
 122     TENSOR3D_DECLARATION(sum),
 123     TENSOR3D_DECLARATION(dst))
 124 {
 125     Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 126     Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 127     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
 128
 129     // Load max value of 1D logits vector (row)
 130     DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
 131     VEC_DATA_TYPE(DATA_TYPE, 16)
 132     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
 133     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 134 }
 135
 136 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 137  * then gets the exponent of each element as sums all elements across each row.
 138  *
 139  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 140  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
 141  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 142  * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 143  *
 144  * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
 145  * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 146  * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 147  * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 148  * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 149  * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 150  * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 151  * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 152  * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 153  * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 154  * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
 155  * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 156  * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
 157  * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 158  * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
 159  * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 160  * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 161  * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 162  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 163  * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 164  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 165  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 166  * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 167  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 168  * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 169  * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 170  * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
 171  * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 172  * @param[in]  sum_step_y                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 173  * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 174  * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 175  * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 176  * @param[in]  width                              Input image width
 177  */
 178 __kernel void softmax_layer_max_shift_exp_sum_serial(
 179     TENSOR3D_DECLARATION(src),
 180     TENSOR3D_DECLARATION(maxo),
 181     TENSOR3D_DECLARATION(dst),
 182     TENSOR3D_DECLARATION(sum),
 183     uint width)
 184 {
 185     Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 186     Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 187     Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
 188     Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 189
 190 #ifdef BETA
 191     // Initialize beta
 192     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 193     beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
 194 #endif /* BETA */
 195
 196     // Initialize local maximum
 197     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 198     max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
 199
 200     // Calculate max of row
 201     const uint width_ = width >> LOG_VECTOR_SIZE;
 202     for(uint i = 0; i < width_; i++)
 203     {
 204         VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 205         data_max    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
 206         max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
 207     }
 208
 209 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
 210     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 211     data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
 212     VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
 213     widx        = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
 214     max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
 215 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 216
 217     // Perform max reduction
 218 #if VECTOR_SIZE == 16
 219     max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
 220 #endif /* VECTOR SIZE 16 END */
 221 #if VECTOR_SIZE >= 8
 222     max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
 223 #endif /* VECTOR SIZE 8 END */
 224 #if VECTOR_SIZE >= 4
 225     max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
 226 #endif /* VECTOR SIZE 4 END */
 227     max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
 228     // Store result
 229     *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
 230
 231     /* Second section */
 232
 233     // Load max value of 1D logits vector (row)
 234     DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
 235
 236     // Set sum vector
 237     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 238     sum1D = 0;
 239
 240     // Shift values, exp and sum
 241     for(uint i = 0; i < width_; i++)
 242     {
 243         VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 244         data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
 245         data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
 246 #ifdef BETA
 247         data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
 248 #endif /* BETA */
 249         data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
 250         VSTORE(VECTOR_SIZE)
 251         (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
 252         sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
 253     }
 254
 255 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
 256     VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
 257     data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
 258     data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
 259 #ifdef BETA
 260     data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
 261 #endif /* BETA */
 262     data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
 263     widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
 264     data = select(0, data, widx);
 265     VSTORE(VECTOR_SIZE)
 266     (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
 267     sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
 268 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 269
 270     // Perform sum reduction
 271 #if VECTOR_SIZE == 16
 272     sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
 273 #endif /* VECTOR SIZE 16 END */
 274 #if VECTOR_SIZE >= 8
 275     sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
 276 #endif /* VECTOR SIZE 8 END */
 277 #if VECTOR_SIZE >= 4
 278     sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
 279 #endif /* VECTOR SIZE 4 END */
 280     sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
 281
 282     // Calculate and store result
 283     *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
 284 }
 285
 286 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 287  * then gets the exponent of each element as sums all elements across each row.
 288  *
 289  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 290  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
 291  * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 292  * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 293  *
 294  * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
 295  * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 296  * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 297  * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 298  * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 299  * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 300  * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
 301  * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 302  * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 303  * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 304  * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
 305  * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 306  * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
 307  * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 308  * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
 309  * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 310  * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 311  * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 312  * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 313  * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 314  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 315  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 316  * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
 317  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 318  * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 319  * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 320  * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
 321  * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 322  * @param[in]  sum_step_y                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 323  * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 324  * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
 325  * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 326  * @param[in]  width                              Input image width
 327  */
 328 __kernel void softmax_layer_max_shift_exp_sum_parallel(
 329     TENSOR3D_DECLARATION(src),
 330     TENSOR3D_DECLARATION(maxo),
 331     TENSOR3D_DECLARATION(dst),
 332     TENSOR3D_DECLARATION(sum),
 333     uint width)
 334 {
 335     Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
 336     Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
 337     Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
 338     Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 339
 340     const uint lid = get_local_id(0);
 341
 342 #ifdef BETA
 343     // Initialize beta
 344     VEC_DATA_TYPE(DATA_TYPE, 4)
 345     beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
 346 #endif /* BETA */
 347
 348     // Define one temporary vector per work-item.
 349     __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
 350     __local DATA_TYPE max_local;
 351
 352     __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
 353     VEC_DATA_TYPE(DATA_TYPE, 4)
 354     max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
 355     // Number of elements per work-item.
 356     const uint row = width / GRID_SIZE;
 357     // Number of iterations per work-item.
 358     const uint width_ = row >> 2;
 359     // Calculate max of row
 360     uint i = 0;
 361     for(; i < width_; i++)
 362     {
 363         VEC_DATA_TYPE(DATA_TYPE, 4)
 364         data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
 365         max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
 366     }
 367 #ifdef NON_MULTIPLE_OF_GRID_SIZE
 368     // How many work-items needed to complete the computation.
 369     int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
 370     if(lid < boundary_workitems)
 371     {
 372         VEC_DATA_TYPE(DATA_TYPE, 4)
 373         data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
 374         max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
 375     }
 376 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
 377     if(boundary_workitems == 0)
 378     {
 379         boundary_workitems = GRID_SIZE;
 380         i--;
 381     }
 382     if(lid == (boundary_workitems - 1))
 383     {
 384         // Handle non multiple of 4
 385         VEC_DATA_TYPE(DATA_TYPE, 4)
 386         data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
 387         VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
 388         widx        = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
 389         max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
 390     }
 391 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 392 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
 393     tmp_local[lid] = max_val_vec;
 394
 395     barrier(CLK_LOCAL_MEM_FENCE);
 396
 397     if(GRID_SIZE >= 256)
 398     {
 399         if(lid < 128)
 400         {
 401             tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
 402         }
 403         barrier(CLK_LOCAL_MEM_FENCE);
 404     }
 405     if(GRID_SIZE >= 128)
 406     {
 407         if(lid < 64)
 408         {
 409             tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
 410         }
 411         barrier(CLK_LOCAL_MEM_FENCE);
 412     }
 413     if(GRID_SIZE >= 64)
 414     {
 415         if(lid < 32)
 416         {
 417             tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
 418         }
 419         barrier(CLK_LOCAL_MEM_FENCE);
 420     }
 421     if(GRID_SIZE >= 32)
 422     {
 423         if(lid < 16)
 424         {
 425             tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
 426         }
 427         barrier(CLK_LOCAL_MEM_FENCE);
 428     }
 429     if(GRID_SIZE >= 16)
 430     {
 431         if(lid < 8)
 432         {
 433             tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
 434         }
 435         barrier(CLK_LOCAL_MEM_FENCE);
 436     }
 437     if(GRID_SIZE >= 8)
 438     {
 439         if(lid < 4)
 440         {
 441             tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
 442         }
 443         barrier(CLK_LOCAL_MEM_FENCE);
 444     }
 445     if(GRID_SIZE >= 4)
 446     {
 447         if(lid < 2)
 448         {
 449             tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
 450         }
 451         barrier(CLK_LOCAL_MEM_FENCE);
 452     }
 453     if(lid == 0)
 454     {
 455         max_val_vec     = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
 456         max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
 457         max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
 458         max_local       = max_val_vec.s0;
 459     }
 460     barrier(CLK_LOCAL_MEM_FENCE);
 461
 462     /* Second section */
 463
 464     // Set sum vector
 465     VEC_DATA_TYPE(DATA_TYPE, 4)
 466     sum1D             = 0;
 467     DATA_TYPE max_val = max_local;
 468
 469     // Shift values, exp and sum
 470     for(i = 0; i < width_; i++)
 471     {
 472         VEC_DATA_TYPE(DATA_TYPE, 4)
 473         data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
 474         data = SUB_OP(data, max_val, DATA_TYPE, 4);
 475 #ifdef BETA
 476         data = MUL_OP(data, beta, DATA_TYPE, 4);
 477 #endif /* BETA */
 478         data = EXP_OP(data, DATA_TYPE, 4);
 479         VSTORE(4)
 480         (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
 481         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
 482     }
 483 #ifdef NON_MULTIPLE_OF_GRID_SIZE
 484     boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
 485     if(lid < boundary_workitems)
 486     {
 487         VEC_DATA_TYPE(DATA_TYPE, 4)
 488         data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
 489         data = SUB_OP(data, max_val, DATA_TYPE, 4);
 490 #ifdef BETA
 491         data = MUL_OP(data, beta, DATA_TYPE, 4);
 492 #endif /* BETA */
 493         data = EXP_OP(data, DATA_TYPE, 4);
 494         VSTORE(4)
 495         (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
 496         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
 497     }
 498 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
 499     if(boundary_workitems == 0)
 500     {
 501         boundary_workitems = GRID_SIZE;
 502         i--;
 503     }
 504     if(lid == (boundary_workitems - 1))
 505     {
 506         // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
 507         VEC_DATA_TYPE(DATA_TYPE, 4)
 508         data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
 509         data = SUB_OP(data, max_val, DATA_TYPE, 4);
 510 #ifdef BETA
 511         data = MUL_OP(data, beta, DATA_TYPE, 4);
 512 #endif /* BETA */
 513         data = EXP_OP(data, DATA_TYPE, 4);
 514         VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
 515         widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
 516         data = select(0, data, widx);
 517         VSTORE(4)
 518         (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
 519         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
 520     }
 521 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 522 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
 523     tmp_local[lid] = sum1D;
 524
 525     barrier(CLK_LOCAL_MEM_FENCE);
 526
 527     if(GRID_SIZE >= 256)
 528     {
 529         if(lid < 128)
 530         {
 531             tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
 532         }
 533         barrier(CLK_LOCAL_MEM_FENCE);
 534     }
 535     if(GRID_SIZE >= 128)
 536     {
 537         if(lid < 64)
 538         {
 539             tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
 540         }
 541         barrier(CLK_LOCAL_MEM_FENCE);
 542     }
 543     if(GRID_SIZE >= 64)
 544     {
 545         if(lid < 32)
 546         {
 547             tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
 548         }
 549         barrier(CLK_LOCAL_MEM_FENCE);
 550     }
 551     if(GRID_SIZE >= 32)
 552     {
 553         if(lid < 16)
 554         {
 555             tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
 556         }
 557         barrier(CLK_LOCAL_MEM_FENCE);
 558     }
 559     if(GRID_SIZE >= 16)
 560     {
 561         if(lid < 8)
 562         {
 563             tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
 564         }
 565         barrier(CLK_LOCAL_MEM_FENCE);
 566     }
 567     if(GRID_SIZE >= 8)
 568     {
 569         if(lid < 4)
 570         {
 571             tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
 572         }
 573         barrier(CLK_LOCAL_MEM_FENCE);
 574     }
 575     if(GRID_SIZE >= 4)
 576     {
 577         if(lid < 2)
 578         {
 579             tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
 580         }
 581         barrier(CLK_LOCAL_MEM_FENCE);
 582     }
 583     if(lid == 0)
 584     {
 585         sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
 586         // Perform max reduction
 587         sum1D.s01                        = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
 588         sum1D.s0                         = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
 589         *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
 590     }
 591 }