inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl

   1 // Copyright (c) 2018 Intel Corporation
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //      http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 #include "include/mmad.cl"
  16
  17 #define SCALE 0.11f
  18
  19 #ifdef LIGHTWEIGHT_QUANTIZATION
  20
  21 #define QUANTIZATION(idx) \
  22     {\
  23         for(uint z = 0; z < 4; z++)\
  24         {\
  25             regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\
  26             regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\
  27             regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\
  28             regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\
  29         }\
  30     }
  31
  32 #elif NO_QUANTIZATION
  33
  34 #define QUANTIZATION(idx) \
  35     regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\
  36     regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\
  37     regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\
  38     regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\
  39     \
  40     regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\
  41     regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\
  42     regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\
  43     regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\
  44     \
  45     regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\
  46     regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\
  47     regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\
  48     regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\
  49     \
  50     regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\
  51     regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\
  52     regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\
  53     regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]);
  54
  55 #else
  56
  57 #define QUANTIZATION(idx) \
  58     regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
  59     regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
  60     regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
  61     regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
  62     \
  63     regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
  64     regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
  65     regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
  66     regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
  67     \
  68     regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
  69     regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
  70     regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
  71     regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));\
  72     \
  73     regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), NL_M, NL_N));\
  74     regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), NL_M, NL_N));\
  75     regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), NL_M, NL_N));\
  76     regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), NL_M, NL_N));
  77
  78 #endif
  79
  80 inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset)
  81 {
  82 #if OUT_WITH_PADDING == 1
  83     uint tmp_idx = cOffset;
  84     uint f_val_idx = tmp_idx % 32;
  85     tmp_idx /= 32;
  86     uint b_val_idx = tmp_idx % 4;
  87     tmp_idx /= 4;
  88     uint x_idx = tmp_idx % OUTPUT_SIZE_X;
  89     tmp_idx /= OUTPUT_SIZE_X;
  90     uint y_idx = tmp_idx % OUTPUT_SIZE_Y;
  91     tmp_idx /= OUTPUT_SIZE_Y;
  92     uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4);
  93     tmp_idx /= (OUTPUT_BATCH_NUM / 4);
  94     uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32);
  95
  96     uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH;
  97     padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH;
  98     padded_offset += y_idx * OUT_Y_PITCH;
  99     padded_offset += x_idx * OUT_X_PITCH;
 100     padded_offset += b_val_idx * 32;
 101     padded_offset += f_val_idx;
 102     padded_offset += OUT_OFFSET;
 103
 104     return padded_offset;
 105 #else
 106     return cOffset;
 107 #endif
 108 }
 109
 110 inline void FUNC(mmad_32x32_int8)(  __local uint* l_tileA, const uint l_offsetTileA,
 111                                     __local int8* l_tileB, const uint l_offsetTileB_col0,
 112                                     const uint l_offsetTileB_col1, const uint l_offsetTileB_col2,
 113                                     const uint l_offsetTileB_col3, int8* rowA, int8* colB,
 114                                     int8* regC)
 115 {
 116     // Read tile A from SLM to regA
 117     uint l_offsetTileATemp = l_offsetTileA;
 118     __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
 119     for (uint j = 0; j < (SG_TILE_M / 8); ++j)
 120     {
 121                 rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp]));
 122         l_offsetTileATemp += 8 * SG_SIZE;
 123     }
 124     // Read tile B from SLM to regB and compute mmad
 125     colB[0] = l_tileB[l_offsetTileB_col0];
 126     colB[1] = l_tileB[l_offsetTileB_col1];
 127     __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
 128     for (uint j = 0; j < (SG_TILE_M / 8); ++j)
 129     {
 130         // Compute partial C
 131         regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]);
 132     }
 133     colB[0] = l_tileB[l_offsetTileB_col2];
 134     __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
 135     for (uint j = 0; j < (SG_TILE_M / 8); ++j)
 136     {
 137         // Compute partial C
 138         regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] );
 139         }
 140     colB[1] = l_tileB[l_offsetTileB_col3];
 141     __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
 142     for (uint j = 0; j < (SG_TILE_M / 8); ++j)
 143     {
 144         // Compute partial C
 145         regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
 146     }
 147     __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
 148     for (uint j = 0; j < (SG_TILE_M / 8); ++j)
 149     {
 150         // Compute partial C
 151         regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
 152     }
 153 }
 154
 155 /*
 156  *  \brief GEMM kernel to compute MxN matrix using SLM
 157  *  \param g_inA  - Input matrix
 158  *  \param g_inB  - Input matrix
 159  *  \param g_outC - Output matrix
 160  */
 161
 162 __attribute__((intel_reqd_sub_group_size(SG_SIZE)))
 163 KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8)
 164                                                                                                                   (__global char* const g_inA,
 165                                                                                                                   __global int* g_outC,
 166                                                                                                                   __global char* const g_inB,
 167                                                                                                                     #if BIAS_TERM
 168                                                                                                                                 __global BIAS_TYPE* biases,
 169                                                                                                                         #endif
 170                                                                                                                                 __global float* quantizations,
 171                                                                                                                         #if CALIBRATION_TERM
 172                                                                                                                                 __global float* calibrations,
 173                                                                                                                         #endif
 174                                                                                                                                 uint split_idx
 175
 176                                                                                                                    )
 177 {
 178
 179     __global int4* const g_matrixA = (__global int4*)g_inA;
 180     __global int4* const g_matrixB = (__global int4*)g_inB;
 181     __global int8* g_matrixC = (__global int8*)g_outC;
 182
 183     // Each work-group works to compute 128x128 tile.
 184     // Each work-group contains 16 sub-groups.
 185     // Each sub-group within the work-group works to compute a 32x32 tile.
 186     // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128).
 187     // 2) Each sub-group works to compute 32x32 tileC (stored in regC).
 188     //    Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
 189     // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
 190     __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
 191     __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
 192
 193     __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
 194     __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
 195     __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
 196
 197     const uint l_groupSize = get_local_size(DIM_X) * get_local_size(DIM_Y);
 198
 199     const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
 200     const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
 201     const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
 202     const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
 203
 204     // Thread IDs
 205     const uint g_tidY = get_global_id(DIM_Y);
 206     const uint g_tidX = get_global_id(DIM_X);
 207     const uint l_tidX = get_local_id(DIM_X);
 208     const uint l_tidY = get_local_id(DIM_Y);
 209     const uint l_tid = l_tidY * get_local_size(DIM_X) + l_tidX;
 210
 211     // SubGroup IDs
 212     const uint sg_tid = get_sub_group_local_id();
 213     const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
 214     const uint sg_global_idY = g_tidY;
 215     const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
 216     const uint sg_local_idY = l_tidY;
 217     const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX;
 218
 219         const uint sub_group_id = get_sub_group_id();
 220
 221     // Registers
 222     int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts
 223     int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
 224     int8 colB[2];  // each lane will store 32x4 piece of matrixB
 225
 226     // SLM indices
 227     const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
 228     const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
 229     const uint numElements32x8TileB = numElements32x32TileB / 4;
 230     const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
 231     const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
 232     const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
 233     const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
 234     const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;
 235
 236     // Global indices
 237     uint g_idxA[2];
 238     uint g_idxB[2];
 239 #ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB)
 240     g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * get_group_id(DIM_Y) + l_tid;
 241     g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * get_group_id(DIM_X) + l_tid;
 242     g_idxA[1] = g_idxA[0] + l_groupSize;
 243     g_idxB[1] = g_idxB[0] + l_groupSize;
 244 #else // Row (matrixA) and Col (matrixB) major layout
 245     g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_Y) +
 246                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
 247     g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * get_group_id(DIM_X) +
 248                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
 249     g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4));
 250     g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4));
 251 #endif
 252     // Initial SLM setup
 253     {
 254         l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]];
 255         l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]];
 256
 257         l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]];
 258         if (l_tid < 32)
 259         {
 260             // Not all work-items will be needed to fetch the remaining matrix B
 261             l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]];
 262         }
 263 #ifdef TILED_GLOBAL_LAYOUT
 264         g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
 265         g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
 266         g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
 267         g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
 268 #else
 269         g_idxA[0] += MATRIX_SMALL_K / sizeof(int4);
 270         g_idxB[0] += MATRIX_SMALL_K / sizeof(int4);
 271         g_idxA[1] += MATRIX_SMALL_K / sizeof(int4);
 272         g_idxB[1] += MATRIX_SMALL_K / sizeof(int4);
 273 #endif
 274
 275         barrier(CLK_LOCAL_MEM_FENCE);
 276     }
 277     int4 hdcReadValueA[2];
 278     int4 hdcReadValueB[2];
 279
 280     __attribute__((opencl_unroll_hint(1)))
 281     for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++)
 282     {
 283         hdcReadValueA[0] = g_matrixA[g_idxA[0]];
 284         hdcReadValueB[0] = g_matrixB[g_idxB[0]];
 285         hdcReadValueA[1] = g_matrixA[g_idxA[1]];
 286         if (l_tid < 32)
 287         {
 288             // Not all work-items will be needed to fetch the remaining matrix B
 289             hdcReadValueB[1] = g_matrixB[g_idxB[1]];
 290         }
 291 #ifdef TILED_GLOBAL_LAYOUT
 292         g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
 293         g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
 294         g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4);
 295         g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4);
 296 #else
 297         g_idxA[0] += MATRIX_SMALL_K / sizeof(int4);
 298         g_idxB[0] += MATRIX_SMALL_K / sizeof(int4);
 299         g_idxA[1] += MATRIX_SMALL_K / sizeof(int4);
 300         g_idxB[1] += MATRIX_SMALL_K / sizeof(int4);
 301 #endif
 302
 303
 304         //MMAD compute
 305         FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint],
 306                                 l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8],
 307                                 l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2,
 308                                 l_offsetTileB_col3, rowA, colB, regC);
 309
 310         //SLM setup - SLM write only
 311         l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0];
 312         l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0];
 313         l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1];
 314         if (l_tid < 32)
 315         {
 316             // Not all work-items will be needed to fetch the remaining matrix B
 317             l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1];
 318         }
 319         barrier(CLK_LOCAL_MEM_FENCE);
 320     } // main outer loop
 321
 322     //Last MMAD compute iteration (avoids branching in main loop)
 323         FUNC_CALL(mmad_32x32_int8)(
 324         &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint],
 325         l_offsetTileA,
 326         &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8],
 327         l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB,
 328         regC);
 329
 330
 331 #ifdef OUTPUT_TILED_GLOBAL_LAYOUT
 332
 333     // Write out in swizzled manner after quantizing
 334     __global uchar* g_outC_uchar = (__global uchar*)g_outC;
 335     uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) +
 336                    sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar));
 337
 338     uchar16 regC_uchar16;
 339     uint offset_uc16 = 0;
 340
 341         const uint workgroup_id_x = get_group_id(0);
 342         uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x
 343         uint feature = get_sub_group_local_id()*4 + feature_off;
 344
 345     float4 quant_f = vload4(0, quantizations + feature);
 346     float4 bias_f = vload4(0, biases + feature);
 347     float4 calib_f = vload4(0, calibrations + feature);
 348
 349 #if MMAD_SUPPORTED == 1
 350     __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) )))
 351 #endif
 352     for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++)
 353     {
 354         uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
 355         {
 356             // B0..3, F0..31
 357             QUANTIZATION(0);
 358         }
 359
 360         intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16));
 361                 cOffset += sizeof(uchar16) * SG_SIZE;
 362
 363         // now we need to calculate again for other x
 364         padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset);
 365         {
 366             // B0..3, F0..31
 367             QUANTIZATION(4);
 368         }
 369
 370         intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) );
 371         cOffset += sizeof(uchar16) * SG_SIZE;
 372     }
 373
 374 #else
 375     // Write final accumulated values
 376     uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) +
 377                    sg_tid * (MATRIX_M / 8);
 378     __attribute__((opencl_unroll_hint(SIMD_LANE_N)))
 379     for (uint i = 0; i < (SIMD_LANE_N); ++i)
 380     {
 381         __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8)))
 382         for (uint j = 0; j < (SIMD_LANE_M / 8); ++j)
 383         {
 384             g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j];
 385         }
 386         cOffset += SG_SIZE * (MATRIX_M / 8);
 387     }
 388 #endif
 389 }