// inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/mmad.cl"

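// Helper: decomposes a linear offset into the unpadded output (ordered as
// 32 features -> 4 batches -> X -> Y -> batch slices -> feature slices, per the
// modulo chain below) and recomposes a padded offset using the OUT_*_PITCH macros.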
inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset)
{
    uint tmp_idx = cOffset;
    uint f_val_idx = tmp_idx % 32;
    tmp_idx /= 32;
    uint b_val_idx = tmp_idx % 4;
    tmp_idx /= 4;
    uint x_idx = tmp_idx % OUTPUT_SIZE_X;
    tmp_idx /= OUTPUT_SIZE_X;
    uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
    tmp_idx /= OUTPUT_SIZE_Y;
    uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
    tmp_idx /= (OUTPUT_BATCH_NUM / 4);
    uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);

    uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH;
    padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH;
    padded_offset += y_idx * OUT_Y_PITCH;
    padded_offset += x_idx * OUT_X_PITCH;
    padded_offset += b_val_idx * 32;
    padded_offset += f_val_idx;
    padded_offset += OUT_OFFSET;

    return padded_offset;
}

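// Computes one sub-group's 32x32 tile of C: tile A rows are block-read from SLM,
// tile B columns are read per-lane from SLM, and partial results are accumulated
// into regC with 8x8 MMAD operations. colB[0]/colB[1] act as a two-entry rolling
// buffer, so the next B column is loaded while the previous one is still consumed.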
inline void FUNC(mmad_32x32_int8)(  __local uint* l_tileA, const uint l_offsetTileA,
                                    __local int8* l_tileB, const uint l_offsetTileB_col0,
                                    const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
                                    const uint l_offsetTileB_col3, int8* rowA, int8* colB,
                                    int8* regC)
{
    // Read tile A from SLM to regA
    uint l_offsetTileATemp = l_offsetTileA;
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
        l_offsetTileATemp += 8 * SG_SIZE;
    }
    // Read tile B from SLM to regB and compute mmad
    colB[0] = l_tileB[l_offsetTileB_col0];
    colB[1] = l_tileB[l_offsetTileB_col1];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]);
    }
    colB[0] = l_tileB[l_offsetTileB_col2];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j]);
    }
    colB[1] = l_tileB[l_offsetTileB_col3];
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
    }
    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
    {
        // Compute partial C
        regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
    }
}

/*
 *  \brief GEMM kernel that computes an MxN matrix using SLM
 *  \param g_inA  - Input matrix A
 *  \param g_inB  - Input matrix B
 *  \param g_outC - Output matrix C
 */

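// Tiling (as implied by the kernel name and macro parameters): each work-group
// computes a WG_TILE_M x WG_TILE_N block of C, each sub-group a SG_TILE_M x SG_TILE_N
// (32x32) block, and the K dimension is consumed in MATRIX_SMALL_K-wide slices.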
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8)(
    __global char* const g_inA,
    __global int* g_outC,
    __global char* const g_inB,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
    __global float* quantizations,
#if CALIBRATION_TERM
    __global float* calibrations,
#endif
    uint split_idx
)
{

    __global int4* const g_matrixA = (__global int4*)g_inA;
    __global int4* const g_matrixB = (__global int4*)g_inB;
    __global int8* g_matrixC = (__global int8*)g_outC;

    // 1) All work-items in work-group fill SLM with tileA and tileB.
    // 2) Each sub-group works to compute a 32x32 tileC (stored in regC).
    //    Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
    __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0;

    __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
    __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
    __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1;

    __local int8* l_workGroupTileA_live = l_workGroupTileA_0;
    __local int8* l_workGroupTileB_live = l_workGroupTileB_0;
    __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;

    __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0;
    __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0;
    __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1;
    __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1;

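    // Two copies (_0/_1) of each SLM tile are kept so the work-group can ping-pong:
    // while the MMADs consume the "live" tile, the next K-slice is staged into the
    // other copy (see the k % 2 swap in the main loop below).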
    const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y);

    // Thread IDs
    const uint g_tidY = get_global_id(DIM_Y);
    const uint g_tidX = get_global_id(DIM_X);
    const uint l_tidX = get_local_id(DIM_X);
    const uint l_tidY = get_local_id(DIM_Y);
    const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX;

    // SubGroup IDs
    const uint sg_tid = get_sub_group_local_id();
    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
    const uint sg_global_idY = g_tidY;
    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
    const uint sg_local_idY = l_tidY;
    const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX;

    const uint sub_group_id = get_sub_group_id();

    // Registers
    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // each work-item is responsible for a 32x4 block of int elements
    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
    int8 colB[2]; // each lane will store a 32x4 piece of matrixB

    // SLM indices
    const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
    const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
    const uint numElements32x8TileB = numElements32x32TileB / 4;
    const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
    const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
    const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
    const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;

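    // The four l_offsetTileB_colX offsets select the four consecutive 8-column chunks
    // of this sub-group's B tile in SLM; each lane adds sg_tid to read its own int8.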
    // Global indices
#ifdef TILED_GLOBAL_LAYOUT  // 32-row major (matrixA) and 32-col major (matrixB)
    uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid;
    uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid;
#else  // Row (matrixA) and Col (matrixB) major layout
    uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
    uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) +
                  (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
#endif

    // Initial SLM setup
    {
        uint g_idxATemp = g_idxA;
        for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
        {
            l_workGroupTileA_0_int4[i] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxATemp += WG_SIZE;
#else
            g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

        uint g_idxBTemp = g_idxB;
        for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE)
        {
            l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxBTemp += WG_SIZE;
#else
            g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

#ifdef TILED_GLOBAL_LAYOUT
        g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
        g_idxA += MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif

        barrier(CLK_LOCAL_MEM_FENCE);
    }

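    // Per-work-item registers that stage the next K-slice read from global memory;
    // the main loop issues these reads before the MMADs and only spills them to SLM
    // afterwards, overlapping global-memory latency with compute.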
    int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];
    int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1
                           ? 1
                           : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE];

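    // Main K loop: all but the last MATRIX_SMALL_K slice are processed here. Each
    // iteration prefetches the next slice, runs the MMADs on the current "live" SLM
    // tiles, then writes the prefetched data into the other tile copy.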
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++)
    {
        /*
         * SLM setup - HDC read only
         */

#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        uint g_idxATemp = g_idxA;
        for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueA[j] = g_matrixA[g_idxATemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxATemp += WG_SIZE;
#else
            g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

        uint g_idxBTemp = g_idxB;
        for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j)
        {
            hdcReadValueB[j] = g_matrixB[g_idxBTemp];
#ifdef TILED_GLOBAL_LAYOUT
            g_idxBTemp += WG_SIZE;
#else
            g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4));
#endif
        }

#ifdef TILED_GLOBAL_LAYOUT
        g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
#else
        g_idxA += MATRIX_SMALL_K / sizeof(int4);
        g_idxB += MATRIX_SMALL_K / sizeof(int4);
#endif
#endif

        /*
         * MMAD compute
         */

        FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                                   l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                                   l_offsetTileB_col3, rowA, colB, regC);

        /*
         * SLM setup - SLM write only
         */

#if ((MATRIX_K / MATRIX_SMALL_K) > 1)
        if (k % 2 == 0)
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileA_1_int4[i] = hdcReadValueA[j];
            }

            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileB_1_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_1;
            l_workGroupTileB_live = l_workGroupTileB_1;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_1;
        }
        else
        {
            for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileA_0_int4[i] = hdcReadValueA[j];
            }

            for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4));
                 i += WG_SIZE, ++j)
            {
                l_workGroupTileB_0_int4[i] = hdcReadValueB[j];
            }

            l_workGroupTileA_live = l_workGroupTileA_0;
            l_workGroupTileB_live = l_workGroupTileB_0;
            l_workGroupTileA_live_uint = l_workGroupTileA_uint_0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
    }

    /*
     * Last MMAD compute iteration (avoids branching in main loop)
     */
    FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live,
                               l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
                               l_offsetTileB_col3, rowA, colB, regC);

#ifdef OUTPUT_TILED_GLOBAL_LAYOUT
    // Write out in swizzled manner after quantizing
    __global uchar* g_outC_uchar = (__global uchar*)g_outC;
    uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) +
                   sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar));

    uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))];
    uint offset_uc8 = 0;

    const uint workgroup_id_x = get_group_id(0);
    uint feature_off = 32 * (sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N * workgroup_id_x; // = 32*{0,1,2,3} + WG_TILE_N * workgroup_id_x
    uint feature = get_sub_group_local_id() + feature_off;

    float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*)(quantizations + feature)));
    float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*)(biases + feature)));
    float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*)(calibrations + feature)));

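    // Each accumulator is requantized per output feature before being stored:
    // out = ACTIVATION(convert_char(round((acc * quant * I_QF + bias) * calib)), NL_M, NL_N),
    // bit-cast to uchar and block-written into the padded, swizzled output layout.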
#if MMAD_SUPPORTED == 1
    __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) )))
#endif
    for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++)
    {
        // begin of account for output PADDING
        uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
        // end of account for padding

        // B0 F0..31
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));
        // B1 F0..31
        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        // B2 F0..31
        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));
        // B3 F0..31
        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        // now we need to calculate again for other x
        padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        padded_offset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;

        regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));
        regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));

        FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]);
        cOffset += sizeof(uchar8) * SG_SIZE;
        offset_uc8++;
    }
#else
    // Write final accumulated values
    uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) +
                   sg_tid * (MATRIX_M / 8);
    __attribute__((opencl_unroll_hint(SIMD_LANE_N)))
    for (uint i = 0; i < (SIMD_LANE_N); ++i)
    {
        __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8)))
        for (uint j = 0; j < (SIMD_LANE_M / 8); ++j)
        {
            g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j];
        }
        cOffset += SG_SIZE * (MATRIX_M / 8);
    }
#endif

}