2 * Copyright (c) 2017 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
28 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
29 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
30 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
31 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
32 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
33 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
34 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
35 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
36 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
37 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
38 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
39 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
41 __kernel void gemm_transpose1x4_f32(IMAGE_DECLARATION(src),
42 IMAGE_DECLARATION(dst))
44 uint x = get_global_id(0);
45 uint y = get_global_id(1);
47 /* Compute address for Matrix B - source */
48 Image src = CONVERT_TO_IMAGE_STRUCT(src);
50 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
51 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
53 float4 b0 = vload4(0, (__global float *)src.ptr);
55 vstore4(b0, 0, (__global float *)(dst_ptr + dst_addr_in_bytes));
58 /** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
60 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
61 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
62 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
63 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
64 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
65 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
66 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
67 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
68 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
69 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
70 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
71 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
73 __kernel void gemm_transpose1x8_f16(IMAGE_DECLARATION(src),
74 IMAGE_DECLARATION(dst))
76 uint x = get_global_id(0);
77 uint y = get_global_id(1);
79 /* Compute address for Matrix B - source */
80 Image src = CONVERT_TO_IMAGE_STRUCT(src);
82 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
83 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
85 half8 b0 = vload8(0, (__global half *)src.ptr);
87 vstore8(b0, 0, (__global half *)(dst_ptr + dst_addr_in_bytes));
90 /** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
92 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8
93 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
94 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
96 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
97 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
98 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8
99 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
100 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
101 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
102 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
103 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
105 __kernel void gemm_transpose1x16_u8(IMAGE_DECLARATION(src),
106 IMAGE_DECLARATION(dst))
108 uint x = get_global_id(0);
109 uint y = get_global_id(1);
111 /* Compute address for Matrix B - source */
112 Image src = CONVERT_TO_IMAGE_STRUCT(src);
114 /* Compute address for Matrix B transposed - destination. X and Y are swapped */
115 uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
117 uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
119 vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
122 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
124 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
125 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
126 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
127 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
128 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
129 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
130 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U32/S32/F32
131 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
132 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
133 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
134 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
135 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
137 __kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
138 IMAGE_DECLARATION(dst))
140 /* Compute source and destination addresses */
141 Image src = CONVERT_TO_IMAGE_STRUCT(src);
142 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
144 /* Load values from Matrix A */
145 float4 a0 = vload4(0, (__global float *)(offset(&src, 0, 0)));
146 float4 a1 = vload4(0, (__global float *)(offset(&src, 0, 1)));
147 float4 a2 = vload4(0, (__global float *)(offset(&src, 0, 2)));
148 float4 a3 = vload4(0, (__global float *)(offset(&src, 0, 3)));
150 float4 val0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
151 vstore4(val0, 0, ((__global float *)dst.ptr) + 0);
153 val0 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
154 vstore4(val0, 0, ((__global float *)dst.ptr) + 4);
156 val0 = (float4)(a0.s2, a1.s2, a2.s2, a3.s2);
157 vstore4(val0, 0, ((__global float *)dst.ptr) + 8);
159 val0 = (float4)(a0.s3, a1.s3, a2.s3, a3.s3);
160 vstore4(val0, 0, ((__global float *)dst.ptr) + 12);
163 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
165 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/F16
166 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
167 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
168 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
169 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
170 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
171 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U16/S16/F16
172 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
173 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
174 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
175 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
176 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
178 __kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
179 IMAGE_DECLARATION(dst))
181 /* Compute source and destination addresses */
182 Image src = CONVERT_TO_IMAGE_STRUCT(src);
183 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
185 /* Load values from Matrix A */
186 half8 a0 = vload8(0, (__global half *)(offset(&src, 0, 0)));
187 half8 a1 = vload8(0, (__global half *)(offset(&src, 0, 1)));
188 half8 a2 = vload8(0, (__global half *)(offset(&src, 0, 2)));
189 half8 a3 = vload8(0, (__global half *)(offset(&src, 0, 3)));
191 half8 val0 = (half8)((half4)(a0.s0, a1.s0, a2.s0, a3.s0), (half4)(a0.s1, a1.s1, a2.s1, a3.s1));
192 vstore8(val0, 0, ((__global half *)dst.ptr) + 0);
194 val0 = (half8)((half4)(a0.s2, a1.s2, a2.s2, a3.s2), (half4)(a0.s3, a1.s3, a2.s3, a3.s3));
195 vstore8(val0, 0, ((__global half *)dst.ptr) + 8);
197 val0 = (half8)((half4)(a0.s4, a1.s4, a2.s4, a3.s4), (half4)(a0.s5, a1.s5, a2.s5, a3.s5));
198 vstore8(val0, 0, ((__global half *)dst.ptr) + 16);
200 val0 = (half8)((half4)(a0.s6, a1.s6, a2.s6, a3.s6), (half4)(a0.s7, a1.s7, a2.s7, a3.s7));
201 vstore8(val0, 0, ((__global half *)dst.ptr) + 24);
204 /** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
206 * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8
207 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
208 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
209 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
210 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
211 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
212 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: U8/S8
213 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
214 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
215 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
216 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
217 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
219 __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
220 IMAGE_DECLARATION(dst))
222 /* Compute source and destination addresses */
223 Image src = CONVERT_TO_IMAGE_STRUCT(src);
224 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
226 /* Load values from Matrix A */
227 uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
228 uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
229 uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
230 uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
232 uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
233 (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
234 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
236 val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
237 (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
238 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
240 val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
241 (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
242 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
244 val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
245 (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
246 vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
249 /** This kernel accumulates each row with the biases vector
251 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
252 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
253 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
254 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
255 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
256 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
257 * @param[in] biases_ptr Pointer to the biases vector. Same as input.
258 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
259 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
260 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
262 __kernel void gemm_accumulate_biases_f32(
263 IMAGE_DECLARATION(accum),
264 VECTOR_DECLARATION(biases))
266 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
267 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
269 float4 accum_value = vload4(0, (__global float *)accum.ptr);
270 float4 biases_value = vload4(0, (__global float *)biases.ptr);
271 accum_value = biases_value + accum_value;
273 // Store result in the accummulate buffer
274 vstore4(accum_value, 0, (__global float *)accum.ptr);
277 /** This kernel accumulates each row with the biases vector
279 * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
280 * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
281 * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
282 * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
283 * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
284 * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
285 * @param[in] biases_ptr Pointer to the biases vector. Same as input.
286 * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
287 * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
288 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
290 __kernel void gemm_accumulate_biases_f16(
291 IMAGE_DECLARATION(accum),
292 VECTOR_DECLARATION(biases))
294 Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
295 Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
297 half8 accum_value = vload8(0, (__global half *)accum.ptr);
298 half8 biases_value = vload8(0, (__global half *)biases.ptr);
299 accum_value = biases_value + accum_value;
301 // Store result in the accummulate buffer
302 vstore8(accum_value, 0, (__global half *)accum.ptr);
305 #if(defined WIDTH_MATRIX_B)
306 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
307 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication
309 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B
311 * @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8
312 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
313 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
314 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
315 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
316 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
317 * @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8
318 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
319 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
320 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
321 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
322 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
323 * @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8
324 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
325 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
326 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
327 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
328 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
329 * @param[in] a_offset Offset to be added to each element of the matrix A
330 * @param[in] b_offset Offset to be added to each element of the matrix B.
331 * @param[in] c_offset Offset to be added to each element of the matrix C.
332 * @param[in] c_mult_int Multiplied with each element of the matrix C.
333 * @param[in] shift Number of bits to shift right the result.
335 __kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),
336 IMAGE_DECLARATION(src1),
337 IMAGE_DECLARATION(dst),
344 /* src_addr.s0 = address of matrix A */
345 /* src_addr.s1 = address of matrix B */
347 /* Compute address for matrix A and B */
348 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
351 /* Add offset_first_element_in_bytes */
352 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
354 /* Compute end row address for matrix B */
355 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
357 /* Reset accumulators */
363 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
365 /* Load values from matrix A (interleaved) and matrix B (transposed) */
366 int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
367 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
369 c00 += (int16)a0.s0 * b0;
370 c10 += (int16)a0.s1 * b0;
371 c20 += (int16)a0.s2 * b0;
372 c30 += (int16)a0.s3 * b0;
374 int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
376 c00 += (int16)a0.s4 * b1;
377 c10 += (int16)a0.s5 * b1;
378 c20 += (int16)a0.s6 * b1;
379 c30 += (int16)a0.s7 * b1;
382 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
384 /* Load values from matrix A (interleaved) and matrix B (transposed) */
385 int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
386 int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
388 c00 += (int16)a0.s0 * b0;
389 c10 += (int16)a0.s1 * b0;
390 c20 += (int16)a0.s2 * b0;
391 c30 += (int16)a0.s3 * b0;
394 /* Compute destination address */
395 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
397 /* Multiply by the weight of matrix product */
398 c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
399 c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
400 c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
401 c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
403 /* Store 4x16 block */
404 vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
405 vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
406 vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
407 vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
411 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
412 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication
414 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
416 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
417 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
418 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
419 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
420 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
421 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
422 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
423 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
424 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
425 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
426 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
427 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
428 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
429 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
430 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
431 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
432 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
433 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
435 #if(defined WIDTH_MATRIX_B && defined ALPHA)
436 __kernel void gemm_mm_f32(IMAGE_DECLARATION(src0),
437 IMAGE_DECLARATION(src1),
438 IMAGE_DECLARATION(dst))
440 /* src_addr.s0 = address of matrix A */
441 /* src_addr.s1 = address of matrix B */
443 /* Compute address for matrix A and B */
444 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
447 /* Add offset_first_element_in_bytes */
448 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
450 /* Divide by 4 in order to get the src_addr in unit of float */
451 src_addr = src_addr >> 2;
453 /* Compute end row address for matrix B */
454 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
456 /* Reset accumulators */
462 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
464 /* Load values from matrix A (interleaved) and matrix B (transposed) */
465 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
466 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
468 c00 += (float4)a0.s0 * b0;
469 c10 += (float4)a0.s1 * b0;
470 c20 += (float4)a0.s2 * b0;
471 c30 += (float4)a0.s3 * b0;
473 /* Load values from matrix A (interleaved) and matrix B (transposed) */
474 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
475 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
477 c00 += (float4)a0.s0 * b0;
478 c10 += (float4)a0.s1 * b0;
479 c20 += (float4)a0.s2 * b0;
480 c30 += (float4)a0.s3 * b0;
483 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
485 /* Load values from matrix A (interleaved) and matrix B (transposed) */
486 float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
487 float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
489 c00 += (float4)a0.s0 * b0;
490 c10 += (float4)a0.s1 * b0;
491 c20 += (float4)a0.s2 * b0;
492 c30 += (float4)a0.s3 * b0;
495 /* Compute destination address */
496 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
498 /* Multiply by the weight of matrix product */
499 c00 = c00 * (float4)ALPHA;
500 c10 = c10 * (float4)ALPHA;
501 c20 = c20 * (float4)ALPHA;
502 c30 = c30 * (float4)ALPHA;
504 /* Store 4x4 block */
505 vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
506 vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
507 vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
508 vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
511 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
512 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication
514 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA
516 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
517 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
518 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
519 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
520 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
521 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
522 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
523 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
524 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
525 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
526 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
527 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
528 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
529 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
530 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
531 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
532 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
533 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
535 __kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),
536 IMAGE_DECLARATION(src1),
537 IMAGE_DECLARATION(dst))
539 /* src_addr.s0 = address of matrix A */
540 /* src_addr.s1 = address of matrix B */
542 /* Compute address for matrix A and B */
543 int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
546 /* Add offset_first_element_in_bytes */
547 src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
549 /* Divide by 2 in order to get the src_addr in unit of half */
550 src_addr = src_addr >> 1;
552 /* Compute end row address for matrix B */
553 int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;
555 /* Reset accumulators */
561 for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))
563 /* Load values from matrix A (interleaved) and matrix B (transposed) */
564 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
565 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
567 c00 += (half8)a0.s0 * b0;
568 c10 += (half8)a0.s1 * b0;
569 c20 += (half8)a0.s2 * b0;
570 c30 += (half8)a0.s3 * b0;
572 /* Load values from matrix A (interleaved) and matrix B (transposed) */
573 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
574 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
576 c00 += (half8)a0.s0 * b0;
577 c10 += (half8)a0.s1 * b0;
578 c20 += (half8)a0.s2 * b0;
579 c30 += (half8)a0.s3 * b0;
582 for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
584 /* Load values from matrix A (interleaved) and matrix B (transposed) */
585 half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
586 half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
588 c00 += (half8)a0.s0 * b0;
589 c10 += (half8)a0.s1 * b0;
590 c20 += (half8)a0.s2 * b0;
591 c30 += (half8)a0.s3 * b0;
594 /* Compute destination address */
595 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
597 /* Multiply by the weight of matrix product */
598 c00 = c00 * (half8)ALPHA;
599 c10 = c10 * (half8)ALPHA;
600 c20 = c20 * (half8)ALPHA;
601 c30 = c30 * (half8)ALPHA;
603 /* Store 4x8 block */
604 vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
605 vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
606 vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
607 vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
610 /** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
612 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
614 * @attention The input vector A and matrix B must not be reshaped
616 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
617 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
618 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
619 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
620 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
621 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
622 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32
623 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
624 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
625 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
626 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
627 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
628 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
629 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
630 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
631 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
632 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
633 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
635 #if(defined WIDTH_VECTOR_A)
636 __kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),
637 IMAGE_DECLARATION(src1),
638 IMAGE_DECLARATION(dst))
640 int idx = get_global_id(0) * 4;
642 /* Compute the address for the vector A and matrix B */
643 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
644 src_addr.s1 += idx * sizeof(float);
646 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
650 for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
652 float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
653 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
654 float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
656 acc += b0 * (float4)a0.s0;
657 acc += b1 * (float4)a0.s1;
660 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
662 float a0 = *((__global float *)(src0_ptr + src_addr.s0));
663 float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
665 acc += b0 * (float4)a0;
668 /* Compute destination address */
669 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
671 /* Multiply by the weight of vector-matrix product */
672 acc = acc * (float4)ALPHA;
674 vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
677 /** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)
679 * @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA
681 * @attention The input vector A and matrix B must not be reshaped
683 * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
684 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
685 * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
686 * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
687 * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
688 * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
689 * @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16
690 * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
691 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
692 * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
693 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
694 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
695 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
696 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
697 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
698 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
699 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
700 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
702 __kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),
703 IMAGE_DECLARATION(src1),
704 IMAGE_DECLARATION(dst))
706 int idx = get_global_id(0) * 8;
708 /* Compute the address for the vector A and matrix B */
709 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
710 src_addr.s1 += idx * sizeof(half);
712 int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));
716 for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))
718 half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));
719 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
720 half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
721 half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
722 half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
724 acc += b0 * (half8)a0.s0;
725 acc += b1 * (half8)a0.s1;
726 acc += b2 * (half8)a0.s2;
727 acc += b3 * (half8)a0.s3;
730 for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))
732 half a0 = *((__global half *)(src0_ptr + src_addr.s0));
733 half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
735 acc += b0 * (half8)a0;
738 /* Compute destination address */
739 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
741 /* Multiply by the weight of vector-matrix product */
742 acc = acc * (half8)ALPHA;
744 vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));
746 #endif /* (defined WIDTH_VECTOR_A) */
747 #endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */
749 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
751 * @attention The beta's value need to be passed at compile time using -DBETA
753 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
754 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
755 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
756 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
757 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
758 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
759 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32
760 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
761 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
762 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
763 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
764 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
767 __kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
768 IMAGE_DECLARATION(dst))
770 /* Compute source and destination addresses */
771 Image src = CONVERT_TO_IMAGE_STRUCT(src);
772 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
774 /* Load values from A x B */
775 float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
777 /* Load values from Matrix C */
778 float4 c = vload4(0, (__global float *)src.ptr);
780 /* Computes alpha * axb + beta * c */
781 float4 out = alpha_ab + (float4)BETA * c;
783 /* Store final result in axb matrix */
784 vstore4(out, 0, (__global float *)dst.ptr);
787 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
789 * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
790 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
791 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
792 * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
793 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
794 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
795 * @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16
796 * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
797 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
798 * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
799 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
800 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
802 __kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
803 IMAGE_DECLARATION(dst))
805 /* Compute source and destination addresses */
806 Image src = CONVERT_TO_IMAGE_STRUCT(src);
807 Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
809 /* Load values from A x B */
810 half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
812 /* Load values from Matrix C */
813 half8 c = vload8(0, (__global half *)src.ptr);
815 /* Computes alpha * axb + beta * c */
816 half8 out = alpha_ab + (half8)BETA * c;
818 /* Store final result in axb matrix */
819 vstore8(out, 0, (__global half *)dst.ptr);
821 #endif /* (defined BETA) */