src/core/CL/cl_kernels/gemm.cl

   1 /*
   2  * Copyright (c) 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #include "helpers.h"
  25
  26 /** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
  27  *
  28  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
  29  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  30  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  31  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  32  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  33  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  34  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F32
  35  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
  36  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
  37  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
  38  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  39  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  40  */
  41 __kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
  42                                     IMAGE_DECLARATION(dst))
  43 {
  44     uint x = get_global_id(0);
  45     uint y = get_global_id(1);
  46
  47     /* Compute address for Matrix B - source */
  48     Image src = CONVERT_TO_IMAGE_STRUCT(src);
  49
  50     /* Compute address for Matrix B transposed - destination. X and Y are swapped */
  51     uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
  52
  53     float4 b0 = vload4(0, (__global float *)src.ptr);
  54
  55     vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
  56 }
  57
  58 /** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
  59  *
  60  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
  61  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  62  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  63  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  64  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  65  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  66  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F16
  67  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
  68  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
  69  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
  70  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  71  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
  72  */
  73 __kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
  74                                     IMAGE_DECLARATION(dst))
  75 {
  76     uint x = get_global_id(0);
  77     uint y = get_global_id(1);
  78
  79     /* Compute address for Matrix B - source */
  80     Image src = CONVERT_TO_IMAGE_STRUCT(src);
  81
  82     /* Compute address for Matrix B transposed - destination. X and Y are swapped */
  83     uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
  84
  85     half8 b0 = vload8(0, (__global half *)src.ptr);
  86
  87     vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
  88 }
  89
  90 /** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
  91  *
  92  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8
  93  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  94  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  95  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
  96  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  97  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
  98  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U8
  99  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 100  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 101  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 102  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 103  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 104  */
 105 __kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
 106                                     IMAGE_DECLARATION(dst))
 107 {
 108     uint x = get_global_id(0);
 109     uint y = get_global_id(1);
 110
 111     /* Compute address for Matrix B - source */
 112     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 113
 114     /* Compute address for Matrix B transposed - destination. X and Y are swapped */
 115     uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
 116
 117     uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
 118
 119     vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
 120 }
 121
 122 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
 123  *
 124  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
 125  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 126  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 127  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 128  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 129  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 130  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F32
 131  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 132  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 133  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 134  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 135  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 136  */
 137 __kernel void gemm_interleave4x4_f32(IMAGE_DECLARATION(src),
 138                                      IMAGE_DECLARATION(dst))
 139 {
 140     /* Compute source and destination addresses */
 141     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 142     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 143
 144     /* Load values from Matrix A */
 145     float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
 146     float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
 147     float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
 148     float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
 149
 150     float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
 151     vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
 152
 153     val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
 154     vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
 155
 156     val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
 157     vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
 158
 159     val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
 160     vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
 161 }
 162
 163 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
 164  *
 165  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
 166  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 167  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 168  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 169  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 170  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 171  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F16
 172  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 173  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 174  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 175  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 176  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 177  */
 178 __kernel void gemm_interleave4x4_f16(IMAGE_DECLARATION(src),
 179                                      IMAGE_DECLARATION(dst))
 180 {
 181     /* Compute source and destination addresses */
 182     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 183     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 184
 185     /* Load values from Matrix A */
 186     half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
 187     half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
 188     half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
 189     half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
 190
 191     half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
 192     vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
 193
 194     val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
 195     vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
 196
 197     val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
 198     vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
 199
 200     val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
 201     vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
 202 }
 203
 204 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
 205  *
 206  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8
 207  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 208  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 209  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 210  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 211  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 212  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: U8
 213  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 214  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 215  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 216  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 217  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 218  */
 219 __kernel void gemm_interleave4x4_u8(IMAGE_DECLARATION(src),
 220                                     IMAGE_DECLARATION(dst))
 221 {
 222     /* Compute source and destination addresses */
 223     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 224     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 225
 226     /* Load values from Matrix A */
 227     uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
 228     uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
 229     uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
 230     uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
 231
 232     uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
 233                              (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
 234     vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
 235
 236     val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
 237                      (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
 238     vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
 239
 240     val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
 241                      (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
 242     vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
 243
 244     val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
 245                      (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
 246     vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
 247 }
 248
 249 /** This kernel accumulates each row with the biases vector
 250  *
 251  * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
 252  * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
 253  * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
 254  * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
 255  * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 256  * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
 257  * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
 258  * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 259  * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 260  * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
 261  */
 262 __kernel void gemm_accumulate_biases_f32(
 263     IMAGE_DECLARATION(accum),
 264     VECTOR_DECLARATION(biases))
 265 {
 266     Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
 267     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
 268
 269     float4 accum_value  = vload4(0, (__global float *)accum.ptr);
 270     float4 biases_value = vload4(0, (__global float *)biases.ptr);
 271     accum_value         = biases_value + accum_value;
 272
 273     // Store result in the accummulate buffer
 274     vstore4(accum_value, 0, (__global float *)accum.ptr);
 275 }
 276
 277 /** This kernel accumulates each row with the biases vector
 278  *
 279  * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
 280  * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
 281  * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
 282  * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
 283  * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 284  * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
 285  * @param[in]      biases_ptr                           Pointer to the biases vector. Same as input.
 286  * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 287  * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 288  * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
 289  */
 290 __kernel void gemm_accumulate_biases_f16(
 291     IMAGE_DECLARATION(accum),
 292     VECTOR_DECLARATION(biases))
 293 {
 294     Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
 295     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
 296
 297     half8 accum_value  = vload8(0, (__global half *)accum.ptr);
 298     half8 biases_value = vload8(0, (__global half *)biases.ptr);
 299     accum_value        = biases_value + accum_value;
 300
 301     // Store result in the accummulate buffer
 302     vstore8(accum_value, 0, (__global half *)accum.ptr);
 303 }
 304
 305 #if(defined WIDTH_MATRIX_B)
 306 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 307  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
 308  *
 309  * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
 310  *
 311  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported formats: U8
 312  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 313  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 314  * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 315  * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 316  * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 317  * @param[in]  src1_ptr                           Pointer to the source matrix. Supported formats: U8
 318  * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 319  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 320  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 321  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 322  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 323  * @param[out] dst_ptr                            Pointer to the destination matrix Supported formats: U8
 324  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 325  * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 326  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 327  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 328  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 329  * @param[in]  a_offset                           Offset to be added to each element of the matrix A
 330  * @param[in]  b_offset                           Offset to be added to each element of the matrix B.
 331  * @param[in]  c_offset                           Offset to be added to each element of the matrix C.
 332  * @param[in]  c_mult_int                         Multiplied with each element of the matrix C.
 333  * @param[in]  shift                              Number of bits to shift right the result.
 334  */
 335 __kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
 336                          IMAGE_DECLARATION(src1),
 337                          IMAGE_DECLARATION(dst),
 338                          int a_offset,
 339                          int b_offset,
 340                          int c_offset,
 341                          int c_mult_int,
 342                          int shift)
 343 {
 344     /* src_addr.s0 = address of matrix A */
 345     /* src_addr.s1 = address of matrix B */
 346
 347     /* Compute address for matrix A and B */
 348     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
 349                                                                         (src1_stride_y));
 350
 351     /* Add offset_first_element_in_bytes */
 352     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 353
 354     /* Compute end row address for matrix B */
 355     int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
 356
 357     /* Reset accumulators */
 358     int16 c00 = 0.0f;
 359     int16 c10 = 0.0f;
 360     int16 c20 = 0.0f;
 361     int16 c30 = 0.0f;
 362
 363     for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
 364     {
 365         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 366         int8 a0  = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
 367         int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
 368
 369         c00 += (int16)a0.s0 * b0;
 370         c10 += (int16)a0.s1 * b0;
 371         c20 += (int16)a0.s2 * b0;
 372         c30 += (int16)a0.s3 * b0;
 373
 374         int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
 375
 376         c00 += (int16)a0.s4 * b1;
 377         c10 += (int16)a0.s5 * b1;
 378         c20 += (int16)a0.s6 * b1;
 379         c30 += (int16)a0.s7 * b1;
 380     }
 381
 382     for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
 383     {
 384         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 385         int4 a0  = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
 386         int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
 387
 388         c00 += (int16)a0.s0 * b0;
 389         c10 += (int16)a0.s1 * b0;
 390         c20 += (int16)a0.s2 * b0;
 391         c30 += (int16)a0.s3 * b0;
 392     }
 393
 394     /* Compute destination address */
 395     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 396
 397     /* Multiply by the weight of matrix product */
 398     c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
 399     c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
 400     c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
 401     c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
 402
 403     /* Store 4x16 block */
 404     vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
 405     vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
 406     vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
 407     vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
 408 }
 409 #endif
 410
 411 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 412  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
 413  *
 414  * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
 415  *
 416  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 417  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 418  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 419  * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 420  * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 421  * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 422  * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
 423  * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 424  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 425  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 426  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 427  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 428  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F32
 429  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 430  * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 431  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 432  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 433  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 434  */
 435 #if(defined WIDTH_MATRIX_B && defined ALPHA)
 436 __kernel void gemm_mm_f32(IMAGE_DECLARATION(src0),
 437                           IMAGE_DECLARATION(src1),
 438                           IMAGE_DECLARATION(dst))
 439 {
 440     /* src_addr.s0 = address of matrix A */
 441     /* src_addr.s1 = address of matrix B */
 442
 443     /* Compute address for matrix A and B */
 444     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
 445                                                                         (src1_stride_y));
 446
 447     /* Add offset_first_element_in_bytes */
 448     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 449
 450     /* Divide by 4 in order to get the src_addr in unit of float */
 451     src_addr = src_addr >> 2;
 452
 453     /* Compute end row address for matrix B */
 454     int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
 455
 456     /* Reset accumulators */
 457     float4 c00 = 0.0f;
 458     float4 c10 = 0.0f;
 459     float4 c20 = 0.0f;
 460     float4 c30 = 0.0f;
 461
 462     for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
 463     {
 464         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 465         float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
 466         float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
 467
 468         c00 += (float4)a0.s0 * b0;
 469         c10 += (float4)a0.s1 * b0;
 470         c20 += (float4)a0.s2 * b0;
 471         c30 += (float4)a0.s3 * b0;
 472
 473         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 474         a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
 475         b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
 476
 477         c00 += (float4)a0.s0 * b0;
 478         c10 += (float4)a0.s1 * b0;
 479         c20 += (float4)a0.s2 * b0;
 480         c30 += (float4)a0.s3 * b0;
 481     }
 482
 483     for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
 484     {
 485         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 486         float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
 487         float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
 488
 489         c00 += (float4)a0.s0 * b0;
 490         c10 += (float4)a0.s1 * b0;
 491         c20 += (float4)a0.s2 * b0;
 492         c30 += (float4)a0.s3 * b0;
 493     }
 494
 495     /* Compute destination address */
 496     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 497
 498     /* Multiply by the weight of matrix product */
 499     c00 = c00 * (float4)ALPHA;
 500     c10 = c10 * (float4)ALPHA;
 501     c20 = c20 * (float4)ALPHA;
 502     c30 = c30 * (float4)ALPHA;
 503
 504     /* Store 4x4 block */
 505     vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
 506     vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
 507     vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
 508     vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
 509 }
 510
 511 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 512  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
 513  *
 514  * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
 515  *
 516  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 517  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 518  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 519  * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 520  * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 521  * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 522  * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F16
 523  * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 524  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 525  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 526  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 527  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 528  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F16
 529  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 530  * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 531  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 532  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 533  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 534  */
 535 __kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
 536                           IMAGE_DECLARATION(src1),
 537                           IMAGE_DECLARATION(dst))
 538 {
 539     /* src_addr.s0 = address of matrix A */
 540     /* src_addr.s1 = address of matrix B */
 541
 542     /* Compute address for matrix A and B */
 543     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
 544                                                                         (src1_stride_y));
 545
 546     /* Add offset_first_element_in_bytes */
 547     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 548
 549     /* Divide by 2 in order to get the src_addr in unit of half */
 550     src_addr = src_addr >> 1;
 551
 552     /* Compute end row address for matrix B */
 553     int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
 554
 555     /* Reset accumulators */
 556     half8 c00 = 0.0f;
 557     half8 c10 = 0.0f;
 558     half8 c20 = 0.0f;
 559     half8 c30 = 0.0f;
 560
 561     for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
 562     {
 563         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 564         half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
 565         half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
 566
 567         c00 += (half8)a0.s0 * b0;
 568         c10 += (half8)a0.s1 * b0;
 569         c20 += (half8)a0.s2 * b0;
 570         c30 += (half8)a0.s3 * b0;
 571
 572         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 573         a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
 574         b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
 575
 576         c00 += (half8)a0.s0 * b0;
 577         c10 += (half8)a0.s1 * b0;
 578         c20 += (half8)a0.s2 * b0;
 579         c30 += (half8)a0.s3 * b0;
 580     }
 581
 582     for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
 583     {
 584         /* Load values from matrix A (interleaved) and matrix B (transposed) */
 585         half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
 586         half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
 587
 588         c00 += (half8)a0.s0 * b0;
 589         c10 += (half8)a0.s1 * b0;
 590         c20 += (half8)a0.s2 * b0;
 591         c30 += (half8)a0.s3 * b0;
 592     }
 593
 594     /* Compute destination address */
 595     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 596
 597     /* Multiply by the weight of matrix product */
 598     c00 = c00 * (half8)ALPHA;
 599     c10 = c10 * (half8)ALPHA;
 600     c20 = c20 * (half8)ALPHA;
 601     c30 = c30 * (half8)ALPHA;
 602
 603     /* Store 4x8 block */
 604     vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
 605     vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
 606     vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
 607     vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
 608 }
 609
 610 /** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
 611  *
 612  * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
 613  *
 614  * @attention The input vector A and matrix B must not be reshaped
 615  *
 616  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 617  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 618  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 619  * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 620  * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 621  * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 622  * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F32
 623  * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 624  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 625  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 626  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 627  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 628  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F32
 629  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 630  * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 631  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 632  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 633  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 634  */
 635 #if(defined WIDTH_VECTOR_A)
 636 __kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
 637                           IMAGE_DECLARATION(src1),
 638                           IMAGE_DECLARATION(dst))
 639 {
 640     int idx = get_global_id(0) * 4;
 641
 642     /* Compute the address for the vector A and matrix B */
 643     int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 644     src_addr.s1 += idx * sizeof(float);
 645
 646     int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
 647
 648     float4 acc = 0.0f;
 649
 650     for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
 651     {
 652         float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
 653         float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
 654         float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
 655
 656         acc += b0 * (float4)a0.s0;
 657         acc += b1 * (float4)a0.s1;
 658     }
 659
 660     for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
 661     {
 662         float  a0 = *((__global float *)(src0_ptr + src_addr.s0));
 663         float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
 664
 665         acc += b0 * (float4)a0;
 666     }
 667
 668     /* Compute destination address */
 669     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 670
 671     /* Multiply by the weight of vector-matrix product */
 672     acc = acc * (float4)ALPHA;
 673
 674     vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
 675 }
 676
 677 /** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
 678  *
 679  * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
 680  *
 681  * @attention The input vector A and matrix B must not be reshaped
 682  *
 683  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
 684  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 685  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 686  * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 687  * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 688  * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 689  * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: F16
 690  * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 691  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 692  * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 693  * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 694  * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 695  * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: F16
 696  * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 697  * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 698  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 699  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 700  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 701  */
 702 __kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
 703                           IMAGE_DECLARATION(src1),
 704                           IMAGE_DECLARATION(dst))
 705 {
 706     int idx = get_global_id(0) * 8;
 707
 708     /* Compute the address for the vector A and matrix B */
 709     int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 710     src_addr.s1 += idx * sizeof(half);
 711
 712     int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
 713
 714     half8 acc = 0.0f;
 715
 716     for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
 717     {
 718         half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
 719         half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
 720         half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
 721         half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
 722         half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
 723
 724         acc += b0 * (half8)a0.s0;
 725         acc += b1 * (half8)a0.s1;
 726         acc += b2 * (half8)a0.s2;
 727         acc += b3 * (half8)a0.s3;
 728     }
 729
 730     for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
 731     {
 732         half a0  = *((__global half *)(src0_ptr + src_addr.s0));
 733         half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
 734
 735         acc += b0 * (half8)a0;
 736     }
 737
 738     /* Compute destination address */
 739     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 740
 741     /* Multiply by the weight of vector-matrix product */
 742     acc = acc * (half8)ALPHA;
 743
 744     vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
 745 }
 746 #endif /* (defined WIDTH_VECTOR_A) */
 747 #endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
 748
 749 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 750  *
 751  * @attention The beta's value need to be passed at compile time using -DBETA
 752  *
 753  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
 754  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 755  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 756  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 757  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 758  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 759  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F32
 760  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 761  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 762  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 763  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 764  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 765  */
 766 #if(defined BETA)
 767 __kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
 768                           IMAGE_DECLARATION(dst))
 769 {
 770     /* Compute source and destination addresses */
 771     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 772     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 773
 774     /* Load values from A x B */
 775     float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
 776
 777     /* Load values from Matrix C */
 778     float4 c = vload4(0, (__global float *)src.ptr);
 779
 780     /* Computes alpha * axb + beta * c */
 781     float4 out = alpha_ab + (float4)BETA * c;
 782
 783     /* Store final result in axb matrix */
 784     vstore4(out, 0, (__global float *)dst.ptr);
 785 }
 786
 787 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 788  *
 789  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
 790  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 791  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 792  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 793  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 794  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 795  * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: F16
 796  * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 797  * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 798  * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 799  * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 800  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 801  */
 802 __kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
 803                           IMAGE_DECLARATION(dst))
 804 {
 805     /* Compute source and destination addresses */
 806     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 807     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 808
 809     /* Load values from A x B */
 810     half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
 811
 812     /* Load values from Matrix C */
 813     half8 c = vload8(0, (__global half *)src.ptr);
 814
 815     /* Computes alpha * axb + beta * c */
 816     half8 out = alpha_ab + (half8)BETA * c;
 817
 818     /* Store final result in axb matrix */
 819     vstore8(out, 0, (__global half *)dst.ptr);
 820 }
 821 #endif /* (defined BETA) */