// inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/mmad.cl"

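// Helper: decomposes a linear offset into the unpadded output (ordered as
// 32 features -> 4 batches -> X -> Y -> batch slices -> feature slices, per the
// modulo chain below) and recomposes a padded offset using the OUT_*_PITCH macros.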
inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset)
{
    uint tmp_idx = cOffset;
    uint f_val_idx = tmp_idx % 32;
    tmp_idx /= 32;
    uint b_val_idx = tmp_idx % 4;
    tmp_idx /= 4;
    uint x_idx = tmp_idx % OUTPUT_SIZE_X;
    tmp_idx /= OUTPUT_SIZE_X;
    uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
    tmp_idx /= OUTPUT_SIZE_Y;
    uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
    tmp_idx /= (OUTPUT_BATCH_NUM / 4);
    uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);

    uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH;
    padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH;
    padded_offset += y_idx * OUT_Y_PITCH;
    padded_offset += x_idx * OUT_X_PITCH;
    padded_offset += b_val_idx * 32;
    padded_offset += f_val_idx;
    padded_offset += OUT_OFFSET;

    return padded_offset;
}

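// Computes one sub-group's 32x32 tile of C: tile A rows are block-read from SLM,
// tile B columns are read per-lane from SLM, and partial results are accumulated
// into regC with 8x8 MMAD operations. colB[0]/colB[1] act as a two-entry rolling
// buffer, so the next B column is loaded while the previous one is still consumed.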
inline void FUNC(mmad_32x32_int8)(  __local uint* l_tileA, const uint l_offsetTileA,
                                    __local int8* l_tileB, const uint l_offsetTileB_col0,
                                    const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
                                    const uint l_offsetTileB_col3, int8* rowA, int8* colB,
                                    int8* regC)
{
    // Read tile A from SLM to regA
    uint l_offsetTileATemp = l_offsetTileA;
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
        l_offsetTileATemp += 8 * SG_SIZE;
    }
    // Read tile B from SLM to regB and compute mmad
    colB[0] = l_tileB[l_offsetTileB_col0];
    colB[1] = l_tileB[l_offsetTileB_col1];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]);
    }
    colB[0] = l_tileB[l_offsetTileB_col2];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j]);
    }
    colB[1] = l_tileB[l_offsetTileB_col3];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
    }
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
    }
}

/*
 *  \brief GEMM kernel that computes an MxN matrix using SLM
 *  \param g_inA  - Input matrix A
 *  \param g_inB  - Input matrix B
 *  \param g_outC - Output matrix C
 */

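// Tiling (as implied by the kernel name and macro parameters): each work-group
// computes a WG_TILE_M x WG_TILE_N block of C, each sub-group a SG_TILE_M x SG_TILE_N
// (32x32) block, and the K dimension is consumed in MATRIX_SMALL_K-wide slices.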
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8)(
    __global char* const g_inA,
    __global int* g_outC,
    __global char* const g_inB,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
    __global float* quantizations,
#if CALIBRATION_TERM
    __global float* calibrations,
#endif
    uint split_idx
)
{

    __global int4* const g_matrixA = (__global int4*)g_inA;
    __global int4* const g_matrixB = (__global int4*)g_inB;
    __global int8* g_matrixC = (__global int8*)g_outC;

    // 1) All work-items in work-group fill SLM with tileA and tileB.
    // 2) Each sub-group works to compute a 32x32 tileC (stored in regC).
    //    Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
    __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0;

    __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1;

    __local int8* l_workGroupTileA_live = l_workGroupTileA_0;
    __local int8* l_workGroupTileB_live = l_workGroupTileB_0;
    __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;

    __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0;
    __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0;
    __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1;
    __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1;

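    // Two copies (_0/_1) of each SLM tile are kept so the work-group can ping-pong:
    // while the MMADs consume the "live" tile, the next K-slice is staged into the
    // other copy (see the k % 2 swap in the main loop below).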
    const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y);

    // Thread IDs
    const uint g_tidY = get_global_id(DIM_Y);
    const uint g_tidX = get_global_id(DIM_X);
    const uint l_tidX = get_local_id(DIM_X);
    const uint l_tidY = get_local_id(DIM_Y);
    const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX;

    // SubGroup IDs
    const uint sg_tid = get_sub_group_local_id();
    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
    const uint sg_global_idY = g_tidY;
    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
    const uint sg_local_idY = l_tidY;
    const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX;

    const uint sub_group_id = get_sub_group_id();

    // Registers
    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // each work-item is responsible for a 32x4 block of int elements
    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
    int8 colB[2]; // each lane will store a 32x4 piece of matrixB

    // SLM indices
    const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
    const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
    const uint numElements32x8TileB = numElements32x32TileB / 4;
    const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
    const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
    const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;

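    // The four l_offsetTileB_colX offsets select the four consecutive 8-column chunks
    // of this sub-group's B tile in SLM; each lane adds sg_tid to read its own int8.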
    // Global indices
#ifdef TILED_GLOBAL_LAYOUT  // 32-row major (matrixA) and 32-col major (matrixB)
    uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid;
    uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid;
#else  // Row (matrixA) and Col (matrixB) major layout
    uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
    uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
#endif

    // Initial SLM setup
    {
        uint g_idxATemp = g_idxA;
        for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
        {
            l_workGroupTileA_0_int4[i] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxATemp += WG_SIZE;
#else
            g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

        uint g_idxBTemp = g_idxB;
        for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
        {
            l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxBTemp += WG_SIZE;
#else
            g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

#ifdef TILED_GLOBAL_LAYOUT
        g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
        g_idxA += MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif

        barrier(CLK_LOCAL_MEM_FENCE);
    }

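    // Per-work-item registers that stage the next K-slice read from global memory;
    // the main loop issues these reads before the MMADs and only spills them to SLM
    // afterwards, overlapping global-memory latency with compute.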
    int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];
    int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];

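    // Main K loop: all but the last MATRIX_SMALL_K slice are processed here. Each
    // iteration prefetches the next slice, runs the MMADs on the current "live" SLM
    // tiles, then writes the prefetched data into the other tile copy.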
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++)
    {
        /*
         * SLM setup - HDC read only
         */

#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        uint g_idxATemp = g_idxA;
        for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueA[j] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxATemp += WG_SIZE;
#else
            g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

        uint g_idxBTemp = g_idxB;
        for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueB[j] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxBTemp += WG_SIZE;
#else
            g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

#ifdef TILED_GLOBAL_LAYOUT
        g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
        g_idxA += MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif
#endif

        /*
         * MMAD compute
         */

        FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                                   l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                                   l_offsetTileB_col3, rowA, colB, regC);

        /*
         * SLM setup - SLM write only
         */

#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        if (k % 2 == 0)
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileA_1_int4[i] = hdcReadValueA[j];
            }

            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileB_1_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_1;
            l_workGroupTileB_live = l_workGroupTileB_1;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_1;
        }
        else
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileA_0_int4[i] = hdcReadValueA[j];
            }

            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileB_0_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_0;
            l_workGroupTileB_live = l_workGroupTileB_0;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
    }

    /*
     * Last MMAD compute iteration (avoids branching in main loop)
     */
    FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                               l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                               l_offsetTileB_col3, rowA, colB, regC);

#ifdef OUTPUT_TILED_GLOBAL_LAYOUT
    // Write out in swizzled manner after quantizing
    __global uchar* g_outC_uchar = (__global uchar*)g_outC;
    uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) +
                   sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar));

    uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))];
    uint offset_uc8 = 0;

    const uint workgroup_id_x = get_group_id(0);
    uint feature_off = 32 * (sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N * workgroup_id_x; // = 32*{0,1,2,3} + WG_TILE_N * workgroup_id_x
    uint feature = get_sub_group_local_id() + feature_off;

    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*)(quantizations + feature)));
    float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*)(biases + feature)));
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*)(calibrations + feature)));

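    // Each accumulator is requantized per output feature before being stored:
    // out = ACTIVATION(convert_char(round((acc * quant * I_QF + bias) * calib)), NL_M, NL_N),
    // bit-cast to uchar and block-written into the padded, swizzled output layout.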
#if MMAD_SUPPORTED == 1
    __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) )))
#endif
    for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++)
    {
        // begin of account for output PADDING
        uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
        // end of account for padding

        // B0 F0..31
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));
        // B1 F0..31
        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        // B2 F0..31
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));
        // B3 F0..31
        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        // now we need to calculate again for other x
        padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
    }
#else
    // Write final accumulated values
    uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) +
                   sg_tid * (MATRIX_M / 8);
    __attribute__((opencl_unroll_hint(SIMD_LANE_N)))
    for (uint i = 0; i < (SIMD_LANE_N); ++i)
    {
        __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8)))
        for (uint j = 0; j < (SIMD_LANE_M / 8); ++j)
        {
            g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j];
        }
        cOffset += SG_SIZE * (MATRIX_M / 8);
    }
#endif

}