// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/mmad.cl"
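
// Maps a linear output offset onto the padded output buffer: the offset is
// decomposed into block-interleaved coordinates (feature-within-32,
// batch-within-4, x, y, batch slice, feature slice) and re-linearized with
// the padded OUT_* pitches.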
inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset)
{
    uint tmp_idx = cOffset;
    uint f_val_idx = tmp_idx % 32;
    tmp_idx /= 32;
    uint b_val_idx = tmp_idx % 4;
    tmp_idx /= 4;
    uint x_idx = tmp_idx % OUTPUT_SIZE_X;
    tmp_idx /= OUTPUT_SIZE_X;
    uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
    tmp_idx /= OUTPUT_SIZE_Y;
    uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
    tmp_idx /= (OUTPUT_BATCH_NUM / 4);
    uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);

    uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH;
    padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH;
    padded_offset += y_idx * OUT_Y_PITCH;
    padded_offset += x_idx * OUT_X_PITCH;
    padded_offset += b_val_idx * 32;
    padded_offset += f_val_idx;
    padded_offset += OUT_OFFSET;

    return padded_offset;
}
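
// Multiplies this sub-group's SG_TILE_M x MATRIX_SMALL_K slab of tileA by a
// MATRIX_SMALL_K x SG_TILE_N slice of tileB, accumulating into regC as four
// column blocks of (SG_TILE_M / 8) 8x8 MMADs. colB acts as a two-entry ring
// buffer so the SLM read of the next column block overlaps the current MMADs.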
inline void FUNC(mmad_32x32_int8)(__local uint* l_tileA, const uint l_offsetTileA,
                                  __local int8* l_tileB, const uint l_offsetTileB_col0,
                                  const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
                                  const uint l_offsetTileB_col3, int8* rowA, int8* colB,
                                  int8* regC)
{
    // Read tile A from SLM to regA
    uint l_offsetTileATemp = l_offsetTileA;
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
        l_offsetTileATemp += 8 * SG_SIZE;
    }

    // Read tile B from SLM to regB and compute mmad
    colB[0] = l_tileB[l_offsetTileB_col0];
    colB[1] = l_tileB[l_offsetTileB_col1];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C for column block 0
        regC[0 * (SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[0 * (SIMD_LANE_M / 8) + j]);
    }
    colB[0] = l_tileB[l_offsetTileB_col2]; // prefetch column block 2 into the freed register
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C for column block 1
        regC[1 * (SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[1 * (SIMD_LANE_M / 8) + j]);
    }
    colB[1] = l_tileB[l_offsetTileB_col3]; // prefetch column block 3
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C for column block 2
        regC[2 * (SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2 * (SIMD_LANE_M / 8) + j]);
    }
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C for column block 3
        regC[3 * (SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3 * (SIMD_LANE_M / 8) + j]);
    }
}
/*
 * \brief GEMM kernel to compute an MxN int8 matrix product using SLM
 * \param g_inA  - input matrix A (signed 8-bit)
 * \param g_inB  - input matrix B (signed 8-bit)
 * \param g_outC - output matrix
 */
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8)(
    __global char* const g_inA,
    __global char* const g_inB,
    __global char* g_outC, // assumed char*: the kernel only uses it re-cast to int8* / uchar* below
    __global BIAS_TYPE* biases,
    __global float* quantizations,
    __global float* calibrations)
{
    __global int4* const g_matrixA = (__global int4*)g_inA;
    __global int4* const g_matrixB = (__global int4*)g_inB;
    __global int8* g_matrixC = (__global int8*)g_outC;
    // 1) All work-items in the work-group fill SLM with tileA and tileB.
    // 2) Each sub-group computes a 32x32 tileC (stored in regC).
    //    Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
    // 3) Repeat until tileC is fully computed (while moving the tileA and tileB "windows").
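
    // Two copies of each SLM tile (_0 / _1) implement double buffering: the
    // "_live" pointers select the buffer currently consumed by the MMADs
    // while the other buffer is refilled from global memory.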
    __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0;

    __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1;

    __local int8* l_workGroupTileA_live = l_workGroupTileA_0;
    __local int8* l_workGroupTileB_live = l_workGroupTileB_0;
    __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;

    __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0;
    __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0;
    __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1;
    __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1;
    const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y);

    const uint g_tidY = get_global_id(DIM_Y);
    const uint g_tidX = get_global_id(DIM_X);
    const uint l_tidX = get_local_id(DIM_X);
    const uint l_tidY = get_local_id(DIM_Y);
    const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX;

    const uint sg_tid = get_sub_group_local_id();
    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
    const uint sg_global_idY = g_tidY;
    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
    const uint sg_local_idY = l_tidY;
    const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX;

    const uint sub_group_id = get_sub_group_id();
    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // each work-item accumulates a 32x4 chunk of int results
    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item holds 1/8 of tileA
    int8 colB[2]; // each lane holds a 32x4 piece of matrixB (two column blocks in flight)
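
    // SLM offsets: each sub-group row owns an SG_TILE_M-row slab of tileA;
    // each 32x32 tileB slice is stored as four consecutive 32x8 column blocks,
    // and a work-item reads one int8 per block, selected by its lane id.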
    const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
    const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
    const uint numElements32x8TileB = numElements32x32TileB / 4;
    const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
    const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
    const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;
#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB)
    uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid;
    uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid;
#else // Row (matrixA) and Col (matrixB) major layout
    uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
    uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
#endif
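
    // In the tiled layout consecutive work-items fetch consecutive int4s of a
    // 32-deep slab; in the row/col-major layout each pair of work-items covers
    // the MATRIX_SMALL_K-byte slab of one row.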
    uint g_idxATemp = g_idxA;
    for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
    {
        l_workGroupTileA_0_int4[i] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
        g_idxATemp += WG_SIZE;
#else
        g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
    }

    uint g_idxBTemp = g_idxB;
    for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
    {
        l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
        g_idxBTemp += WG_SIZE;
#else
        g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
    }
#ifdef TILED_GLOBAL_LAYOUT
    g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
    g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
    g_idxA += MATRIX_SMALL_K / sizeof(int4);
    g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif

    barrier(CLK_LOCAL_MEM_FENCE);
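
    // Registers that stage the next K-slab from global memory before it is
    // committed to the idle SLM buffer; the ?: below keeps the array size at
    // least 1 when a single strided pass of the work-group covers the tile.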
    int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];
    int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++)
    {
        /*
         * SLM setup - HDC read only
         */
#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        uint g_idxATemp = g_idxA;
        for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueA[j] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxATemp += WG_SIZE;
#else
            g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

        uint g_idxBTemp = g_idxB;
        for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueB[j] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxBTemp += WG_SIZE;
#else
            g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }
#ifdef TILED_GLOBAL_LAYOUT
        g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
        g_idxA += MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif
#endif // (MATRIX_K / MATRIX_SMALL_K) > 1

        /*
         * MMAD compute on the live SLM tiles
         */
        FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                                   l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                                   l_offsetTileB_col3, rowA, colB, regC);
        /*
         * SLM setup - SLM write only
         */
#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        if (k % 2 == 0) // even k: the MMADs above consumed buffer _0, so stage the next slab in _1 (and vice versa)
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
            {
                l_workGroupTileA_1_int4[i] = hdcReadValueA[j];
            }
            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
            {
                l_workGroupTileB_1_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_1;
            l_workGroupTileB_live = l_workGroupTileB_1;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_1;
        }
        else
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
            {
                l_workGroupTileA_0_int4[i] = hdcReadValueA[j];
            }
            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
            {
                l_workGroupTileB_0_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_0;
            l_workGroupTileB_live = l_workGroupTileB_0;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
    }
    /*
     * Last MMAD compute iteration (avoids branching in the main loop)
     */
    FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                               l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                               l_offsetTileB_col3, rowA, colB, regC);
#ifdef OUTPUT_TILED_GLOBAL_LAYOUT
    // Write out in swizzled manner after quantizing
    __global uchar* g_outC_uchar = (__global uchar*)g_outC;
    uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) +
                   sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar));

    uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))];
    uint offset_uc8 = 0; // next free slot in regC_uchar8; advances after every block write below
    const uint workgroup_id_x = get_group_id(0);
    uint feature_off = 32 * (sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N * workgroup_id_x; // = 32 * {0,1,2,3} + WG_TILE_N * workgroup_id_x
    uint feature = get_sub_group_local_id() + feature_off;

    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*)(quantizations + feature)));
    float4 bias_f  = as_float4(intel_sub_group_block_read4((__global uint*)(biases + feature)));
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*)(calibrations + feature)));
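
    // quant_f / bias_f / calib_f hold per-feature factors for this sub-group's
    // 32-feature block; assuming the usual block-read layout, component sN is
    // the factor for the N-th group of SG_SIZE features, matching the column
    // block of regC it multiplies below.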
#if MMAD_SUPPORTED == 1
    __attribute__((opencl_unroll_hint(SG_TILE_M / (sizeof(int8) / sizeof(int)))))
#endif
    for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++)
    {
        // begin of account for output PADDING
        uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
        // end of account for padding

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
        // recompute the padded offset, since the next store may land at a different x
        padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
    }
#else
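
    // Untiled output: each work-item stores its SIMD_LANE_M-row slice of one
    // tileC column ((SIMD_LANE_M / 8) int8 accumulators), then steps SG_SIZE
    // columns ahead, repeating SIMD_LANE_N times.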
    // Write final accumulated values
    uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) +
                   sg_tid * (MATRIX_M / 8);
    __attribute__((opencl_unroll_hint(SIMD_LANE_N)))
    for (uint i = 0; i < SIMD_LANE_N; ++i)
    {
        __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8)))
        for (uint j = 0; j < (SIMD_LANE_M / 8); ++j)
        {
            g_matrixC[cOffset + j] = regC[i * (SIMD_LANE_M / 8) + j];
        }
        cOffset += SG_SIZE * (MATRIX_M / 8);
    }
#endif
}